Commit | Line | Data |
---|---|---|
d47a3fec AB |
1 | /* |
2 | * Blktrace replay utility - Play traces back | |
3 | * | |
4 | * Copyright (C) 2007 Alan D. Brunelle <Alan.Brunelle@hp.com> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | * This program is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with this program; if not, write to the Free Software | |
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
19 | */ | |
20 | ||
/* Compile-time stamp of the form "<date> at <time>" */
static char build_date[] = __DATE__ " at " __TIME__;
22 | ||
23 | #include <assert.h> | |
24 | #include <errno.h> | |
25 | #include <fcntl.h> | |
26 | #include <libaio.h> | |
27 | #include <pthread.h> | |
28 | #include <sched.h> | |
29 | #include <signal.h> | |
30 | #include <stdio.h> | |
31 | #include <stdlib.h> | |
32 | #include <string.h> | |
33 | #include <time.h> | |
34 | #include <unistd.h> | |
35 | #include <sys/param.h> | |
36 | #include <sys/stat.h> | |
37 | #include <sys/time.h> | |
38 | #include <sys/types.h> | |
39 | #include <dirent.h> | |
65a7043b | 40 | #include <stdarg.h> |
d47a3fec AB |
41 | |
42 | #if !defined(_GNU_SOURCE) | |
43 | # define _GNU_SOURCE | |
44 | #endif | |
45 | #include <getopt.h> | |
46 | ||
47 | #include "list.h" | |
48 | #include "btrecord.h" | |
49 | ||
50 | /* | |
51 | * ======================================================================== | |
52 | * ==== STRUCTURE DEFINITIONS ============================================= | |
53 | * ======================================================================== | |
54 | */ | |
55 | ||
56 | /** | |
57 | * Each device map has one of these: | |
58 | * | |
59 | * @head: Linked on to map_devs | |
60 | * @from_dev: Device name as seen on recorded system | |
61 | * @to_dev: Device name to be used on replay system | |
62 | */ | |
63 | struct map_dev { | |
64 | struct list_head head; | |
65 | char *from_dev, *to_dev; | |
66 | }; | |
67 | ||
68 | /** | |
69 | * Each device name specified has one of these (until threads are created) | |
70 | * | |
71 | * @head: Linked onto input_devs | |
72 | * @devnm: Device name -- 'sd*' | |
73 | */ | |
74 | struct dev_info { | |
75 | struct list_head head; | |
76 | char *devnm; | |
77 | }; | |
78 | ||
79 | /* | |
80 | * Per input file information | |
81 | * | |
82 | * @head: Used to link up on input_files | |
83 | * @free_iocbs: List of free iocb's available for use | |
84 | * @used_iocbs: List of iocb's currently outstanding | |
85 | * @mutex: Mutex used with condition variable to protect volatile values | |
86 | * @cond: Condition variable used when waiting on a volatile value change | |
87 | * @naios_out: Current number of AIOs outstanding on this context | |
88 | * @naios_free: Number of AIOs on the free list (short cut for list_len) | |
89 | * @send_wait: Boolean: When true, the sub thread is waiting on free IOCBs | |
90 | * @reap_wait: Boolean: When true, the rec thread is waiting on used IOCBs | |
91 | * @send_done: Boolean: When true, the sub thread has completed work | |
92 | * @reap_done: Boolean: When true, the rec thread has completed work | |
93 | * @sub_thread: Thread used to submit IOs. | |
94 | * @rec_thread: Thread used to reclaim IOs. | |
95 | * @ctx: IO context | |
96 | * @devnm: Copy of the device name being managed by this thread | |
97 | * @file_name: Full name of the input file | |
98 | * @cpu: CPU this thread is pinned to | |
99 | * @ifd: Input file descriptor | |
100 | * @ofd: Output file descriptor | |
101 | * @iterations: Remaining iterations to process | |
102 | * @vfp: For verbose dumping of actions performed | |
103 | */ | |
104 | struct thr_info { | |
105 | struct list_head head, free_iocbs, used_iocbs; | |
106 | pthread_mutex_t mutex; | |
107 | pthread_cond_t cond; | |
108 | volatile long naios_out, naios_free; | |
109 | volatile int send_wait, reap_wait, send_done, reap_done; | |
110 | pthread_t sub_thread, rec_thread; | |
111 | io_context_t ctx; | |
112 | char *devnm, *file_name; | |
113 | int cpu, ifd, ofd, iterations; | |
114 | FILE *vfp; | |
115 | }; | |
116 | ||
117 | /* | |
118 | * Every Asynchronous IO used has one of these (naios per file/device). | |
119 | * | |
120 | * @iocb: IOCB sent down via io_submit | |
121 | * @head: Linked onto file_list.free_iocbs or file_list.used_iocbs | |
122 | * @tip: Pointer to per-thread information this IO is associated with | |
123 | * @nbytes: Number of bytes in buffer associated with iocb | |
124 | */ | |
125 | struct iocb_pkt { | |
126 | struct iocb iocb; | |
127 | struct list_head head; | |
128 | struct thr_info *tip; | |
129 | int nbytes; | |
130 | }; | |
131 | ||
132 | /* | |
133 | * ======================================================================== | |
134 | * ==== GLOBAL VARIABLES ================================================== | |
135 | * ======================================================================== | |
136 | */ | |
137 | ||
138 | static volatile int signal_done = 0; // Boolean: Signal'ed, need to quit | |
139 | ||
140 | static char *ibase = "replay"; // Input base name | |
141 | static char *idir = "."; // Input directory base | |
142 | static int cpus_to_use = -1; // Number of CPUs to use | |
143 | static int def_iterations = 1; // Default number of iterations | |
144 | static int naios = 512; // Number of AIOs per thread | |
145 | static int ncpus = 0; // Number of CPUs in the system | |
146 | static int verbose = 0; // Boolean: Output some extra info | |
147 | static int write_enabled = 0; // Boolean: Enable writing | |
148 | static __u64 genesis = ~0; // Earliest time seen | |
149 | static __u64 rgenesis; // Our start time | |
150 | static size_t pgsize; // System Page size | |
151 | static int nb_sec = 512; // Number of bytes per sector | |
152 | static LIST_HEAD(input_devs); // List of devices to handle | |
153 | static LIST_HEAD(input_files); // List of input files to handle | |
154 | static LIST_HEAD(map_devs); // List of device maps | |
155 | static int nfiles = 0; // Number of files to handle | |
156 | static int no_stalls = 0; // Boolean: Disable pre-stalls | |
4a7968cc | 157 | static unsigned acc_factor = 1; // Int: Acceleration factor |
d47a3fec AB |
158 | static int find_records = 0; // Boolean: Find record files auto |
159 | ||
/*
 * Counters managed under control of condition variables. Each counter
 * has its own mutex and condition variable.
 */

/* n_reclaims_done: Counts number of reclaim threads that have completed. */
static volatile int n_reclaims_done = 0;
static pthread_mutex_t reclaim_done_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reclaim_done_cond = PTHREAD_COND_INITIALIZER;

/* n_replays_done: Counts number of replay threads that have completed. */
static volatile int n_replays_done = 0;
static pthread_mutex_t replay_done_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t replay_done_cond = PTHREAD_COND_INITIALIZER;

/* n_replays_ready: Counts number of replay threads ready to start. */
static volatile int n_replays_ready = 0;
static pthread_mutex_t replay_ready_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t replay_ready_cond = PTHREAD_COND_INITIALIZER;

/* n_iters_done: Counts number of replay threads done one iteration. */
static volatile int n_iters_done = 0;
static pthread_mutex_t iter_done_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t iter_done_cond = PTHREAD_COND_INITIALIZER;

/* iter_start: Starts an iteration for the replay threads. */
static volatile int iter_start = 0;
static pthread_mutex_t iter_start_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t iter_start_cond = PTHREAD_COND_INITIALIZER;
188 | ||
189 | /* | |
190 | * ======================================================================== | |
191 | * ==== FORWARD REFERENCES ================================================ | |
192 | * ======================================================================== | |
193 | */ | |
194 | ||
/* Thread entry points: submitter and reclaimer */
static void *replay_sub(void *arg);
static void *replay_rec(void *arg);

/* Usage text printed by usage() */
static char usage_str[];
198 | ||
199 | /* | |
200 | * ======================================================================== | |
201 | * ==== INLINE ROUTINES =================================================== | |
202 | * ======================================================================== | |
203 | */ | |
204 | ||
/*
 * Exit codes used with fatal(): bad arguments vs. failed system call.
 */
#define ERR_ARGS	1
#define ERR_SYSCALL	2

/**
 * fatal - Report an error and terminate the program
 * @errstring:	If non-NULL, passed to perror() before the message
 * @exitval:	Exit status handed to exit()
 * @fmt:	printf-style format for the message written to stderr
 *
 * Does not return.
 */
static inline void fatal(const char *errstring, const int exitval,
			 const char *fmt, ...)
{
	va_list args;

	if (errstring != NULL)
		perror(errstring);

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);

	exit(exitval);
	/*NOTREACHED*/
}
d47a3fec AB |
227 | |
/**
 * du64_to_sec - Whole seconds contained in a nanosecond count
 * @du64:	Time value in nanoseconds
 */
static inline long long unsigned du64_to_sec(__u64 du64)
{
	long long unsigned nsecs = (long long unsigned)du64;

	return nsecs / (1000 * 1000 * 1000);
}
232 | ||
/**
 * du64_to_nsec - Sub-second remainder (in nanoseconds) of a nanosecond count
 * @du64:	Time value in nanoseconds
 *
 * The value is reinterpreted as signed and its magnitude is used.
 */
static inline long long unsigned du64_to_nsec(__u64 du64)
{
	long long magnitude = llabs((long long)du64);

	return magnitude % (1000 * 1000 * 1000);
}
237 | ||
/**
 * min - Return the smaller of two integers
 * @a:	First value
 * @b:	Second value (returned when the two are equal)
 */
static inline int min(int a, int b)
{
	if (a < b)
		return a;
	return b;
}
245 | ||
/**
 * minl - Return the smaller of two longs
 * @a:	First value
 * @b:	Second value (returned when the two are equal)
 */
static inline long minl(long a, long b)
{
	if (a < b)
		return a;
	return b;
}
253 | ||
254 | /** | |
255 | * usage - Display usage string and version | |
256 | */ | |
257 | static inline void usage(void) | |
258 | { | |
259 | fprintf(stderr, "Usage: btreplay -- version %s\n%s", | |
260 | my_btversion, usage_str); | |
261 | } | |
262 | ||
263 | /** | |
264 | * is_send_done - Returns true if sender should quit early | |
265 | * @tip: Per-thread information | |
266 | */ | |
267 | static inline int is_send_done(struct thr_info *tip) | |
268 | { | |
269 | return signal_done || tip->send_done; | |
270 | } | |
271 | ||
272 | /** | |
273 | * is_reap_done - Returns true if reaper should quit early | |
274 | * @tip: Per-thread information | |
275 | */ | |
276 | static inline int is_reap_done(struct thr_info *tip) | |
277 | { | |
747f0e28 | 278 | return signal_done || (tip->send_done && tip->naios_out == 0); |
d47a3fec AB |
279 | } |
280 | ||
/* Nanoseconds per second, as a 64-bit quantity */
#define NS_TICKS		((__u64)1000 * (__u64)1000 * (__u64)1000)

/**
 * ts2ns - Convert timespec values to a nanosecond value
 * @ts:	Time as seconds + nanoseconds
 */
static inline __u64 ts2ns(struct timespec *ts)
{
	return ((__u64)(ts->tv_sec) * NS_TICKS) + (__u64)(ts->tv_nsec);
}

/**
 * tv2ns - Convert timeval values to a nanosecond value
 * @tp:	Time as seconds + microseconds
 *
 * Seconds are scaled by NS_TICKS and microseconds by 1000, so the result
 * is in the same unit as ts2ns() and the two clock sources used by the
 * program produce comparable values.
 */
static inline __u64 tv2ns(struct timeval *tp)
{
	return ((__u64)(tp->tv_sec) * NS_TICKS) +
	       ((__u64)(tp->tv_usec) * (__u64)1000);
}
297 | ||
298 | /** | |
299 | * touch_memory - Force physical memory to be allocating it | |
300 | * | |
301 | * For malloc()ed memory we need to /touch/ it to make it really | |
302 | * exist. Otherwise, for write's (to storage) things may not work | |
303 | * as planned - we see Linux just use a single area to /read/ from | |
304 | * (as there isn't any memory that has been associated with the | |
305 | * allocated virtual addresses yet). | |
306 | */ | |
307 | static inline void touch_memory(char *buf, size_t bsize) | |
308 | { | |
309 | #if defined(PREP_BUFS) | |
310 | memset(buf, 0, bsize); | |
311 | #else | |
312 | size_t i; | |
313 | ||
314 | for (i = 0; i < bsize; i += pgsize) | |
315 | buf[i] = 0; | |
316 | #endif | |
317 | } | |
318 | ||
319 | /** | |
320 | * buf_alloc - Returns a page-aligned buffer of the specified size | |
321 | * @nbytes: Number of bytes to allocate | |
322 | */ | |
323 | static inline void *buf_alloc(size_t nbytes) | |
324 | { | |
325 | void *buf; | |
326 | ||
327 | if (posix_memalign(&buf, pgsize, nbytes)) { | |
328 | fatal("posix_memalign", ERR_SYSCALL, "Allocation failed\n"); | |
329 | /*NOTREACHED*/ | |
330 | } | |
331 | ||
332 | return buf; | |
333 | } | |
334 | ||
335 | /** | |
336 | * gettime - Returns current time | |
337 | */ | |
338 | static inline __u64 gettime(void) | |
339 | { | |
340 | static int use_clock_gettime = -1; // Which clock to use | |
341 | ||
342 | if (use_clock_gettime < 0) { | |
343 | use_clock_gettime = clock_getres(CLOCK_MONOTONIC, NULL) == 0; | |
344 | if (use_clock_gettime) { | |
345 | struct timespec ts = { | |
346 | .tv_sec = 0, | |
347 | .tv_nsec = 0 | |
348 | }; | |
349 | clock_settime(CLOCK_MONOTONIC, &ts); | |
350 | } | |
351 | } | |
352 | ||
353 | if (use_clock_gettime) { | |
354 | struct timespec ts; | |
355 | clock_gettime(CLOCK_MONOTONIC, &ts); | |
356 | return ts2ns(&ts); | |
357 | } | |
358 | else { | |
359 | struct timeval tp; | |
360 | gettimeofday(&tp, NULL); | |
361 | return tv2ns(&tp); | |
362 | } | |
363 | } | |
364 | ||
365 | /** | |
366 | * setup_signal - Set up a signal handler for the specified signum | |
367 | */ | |
368 | static inline void setup_signal(int signum, sighandler_t handler) | |
369 | { | |
370 | if (signal(signum, handler) == SIG_ERR) { | |
371 | fatal("signal", ERR_SYSCALL, "Failed to set signal %d\n", | |
372 | signum); | |
373 | /*NOTREACHED*/ | |
374 | } | |
375 | } | |
376 | ||
377 | /* | |
378 | * ======================================================================== | |
379 | * ==== CONDITION VARIABLE ROUTINES ======================================= | |
380 | * ======================================================================== | |
381 | */ | |
382 | ||
/**
 * __set_cv - Increments a variable under condition variable control.
 * @pmp:	Mutex guarding the variable
 * @pcp:	Condition variable signalled after the increment
 * @vp:		Variable to increment
 * @mxv:	Upper bound for the variable (only checked when asserts are on)
 */
static inline void __set_cv(pthread_mutex_t *pmp, pthread_cond_t *pcp,
			    volatile int *vp,
			    __attribute__((__unused__))int mxv)
{
	pthread_mutex_lock(pmp);

	assert(*vp < mxv);
	*vp = *vp + 1;
	pthread_cond_signal(pcp);

	pthread_mutex_unlock(pmp);
}
400 | ||
/**
 * __wait_cv - Waits for a variable under cond var control to hit a value
 * @pmp:	Mutex guarding the variable
 * @pcp:	Condition variable to sleep on
 * @vp:		Variable being watched
 * @mxv:	Value to wait for
 *
 * Once the variable is at least @mxv it is reset to zero before the
 * mutex is released.
 */
static inline void __wait_cv(pthread_mutex_t *pmp, pthread_cond_t *pcp,
			     volatile int *vp, int mxv)
{
	pthread_mutex_lock(pmp);

	for (;;) {
		if (*vp >= mxv)
			break;
		pthread_cond_wait(pcp, pmp);
	}
	*vp = 0;

	pthread_mutex_unlock(pmp);
}
417 | ||
418 | static inline void set_reclaim_done(void) | |
419 | { | |
420 | __set_cv(&reclaim_done_mutex, &reclaim_done_cond, &n_reclaims_done, | |
421 | nfiles); | |
422 | } | |
423 | ||
424 | static inline void wait_reclaims_done(void) | |
425 | { | |
426 | __wait_cv(&reclaim_done_mutex, &reclaim_done_cond, &n_reclaims_done, | |
427 | nfiles); | |
428 | } | |
429 | ||
430 | static inline void set_replay_ready(void) | |
431 | { | |
432 | __set_cv(&replay_ready_mutex, &replay_ready_cond, &n_replays_ready, | |
433 | nfiles); | |
434 | } | |
435 | ||
436 | static inline void wait_replays_ready(void) | |
437 | { | |
438 | __wait_cv(&replay_ready_mutex, &replay_ready_cond, &n_replays_ready, | |
439 | nfiles); | |
440 | } | |
441 | ||
442 | static inline void set_replay_done(void) | |
443 | { | |
444 | __set_cv(&replay_done_mutex, &replay_done_cond, &n_replays_done, | |
445 | nfiles); | |
446 | } | |
447 | ||
448 | static inline void wait_replays_done(void) | |
449 | { | |
450 | __wait_cv(&replay_done_mutex, &replay_done_cond, &n_replays_done, | |
451 | nfiles); | |
452 | } | |
453 | ||
454 | static inline void set_iter_done(void) | |
455 | { | |
456 | __set_cv(&iter_done_mutex, &iter_done_cond, &n_iters_done, | |
457 | nfiles); | |
458 | } | |
459 | ||
460 | static inline void wait_iters_done(void) | |
461 | { | |
462 | __wait_cv(&iter_done_mutex, &iter_done_cond, &n_iters_done, | |
463 | nfiles); | |
464 | } | |
465 | ||
466 | /** | |
467 | * wait_iter_start - Wait for an iteration to start | |
468 | * | |
469 | * This is /slightly/ different: we are waiting for a value to become | |
470 | * non-zero, and then we decrement it and go on. | |
471 | */ | |
472 | static inline void wait_iter_start(void) | |
473 | { | |
474 | pthread_mutex_lock(&iter_start_mutex); | |
475 | while (iter_start == 0) | |
476 | pthread_cond_wait(&iter_start_cond, &iter_start_mutex); | |
477 | assert(1 <= iter_start && iter_start <= nfiles); | |
478 | iter_start--; | |
479 | pthread_mutex_unlock(&iter_start_mutex); | |
480 | } | |
481 | ||
482 | /** | |
483 | * start_iter - Start an iteration at the replay thread level | |
484 | */ | |
485 | static inline void start_iter(void) | |
486 | { | |
487 | pthread_mutex_lock(&iter_start_mutex); | |
488 | assert(iter_start == 0); | |
489 | iter_start = nfiles; | |
490 | pthread_cond_broadcast(&iter_start_cond); | |
491 | pthread_mutex_unlock(&iter_start_mutex); | |
492 | } | |
493 | ||
494 | /* | |
495 | * ======================================================================== | |
496 | * ==== CPU RELATED ROUTINES ============================================== | |
497 | * ======================================================================== | |
498 | */ | |
499 | ||
500 | /** | |
501 | * get_ncpus - Sets up the global 'ncpus' value | |
502 | */ | |
503 | static void get_ncpus(void) | |
504 | { | |
80c4041b AA |
505 | #ifdef _SC_NPROCESSORS_ONLN |
506 | ncpus = sysconf(_SC_NPROCESSORS_ONLN); | |
fb697494 | 507 | #else |
0a915aab NZ |
508 | int nrcpus = 4096; |
509 | cpu_set_t * cpus; | |
510 | ||
511 | realloc: | |
512 | cpus = CPU_ALLOC(nrcpus); | |
513 | size = CPU_ALLOC_SIZE(nrcpus); | |
514 | CPU_ZERO_S(size, cpus); | |
515 | ||
7338236a | 516 | if (sched_getaffinity(0, size, cpus)) { |
0a915aab NZ |
517 | if( errno == EINVAL && nrcpus < (4096<<4) ) { |
518 | CPU_FREE(cpus); | |
2564a602 | 519 | nrcpus <<= 1; |
0a915aab NZ |
520 | goto realloc; |
521 | } | |
d47a3fec AB |
522 | fatal("sched_getaffinity", ERR_SYSCALL, "Can't get CPU info\n"); |
523 | /*NOTREACHED*/ | |
524 | } | |
525 | ||
0c2df13d NZ |
526 | ncpus = -1; |
527 | for (last_cpu = 0; last_cpu < CPU_SETSIZE && CPU_ISSET(last_cpu, &cpus); last_cpu++) | |
528 | if (CPU_ISSET( last_cpu, &cpus) ) | |
529 | ncpus = last_cpu; | |
530 | ncpus++; | |
0a915aab | 531 | CPU_FREE(cpus); |
fb697494 | 532 | #endif |
d47a3fec AB |
533 | if (ncpus == 0) { |
534 | fatal(NULL, ERR_SYSCALL, "Insufficient number of CPUs\n"); | |
535 | /*NOTREACHED*/ | |
536 | } | |
537 | } | |
538 | ||
539 | /** | |
540 | * pin_to_cpu - Pin this thread to a specific CPU | |
541 | * @tip: Thread information | |
542 | */ | |
543 | static void pin_to_cpu(struct thr_info *tip) | |
544 | { | |
0a915aab NZ |
545 | cpu_set_t *cpus; |
546 | size_t size; | |
547 | ||
548 | cpus = CPU_ALLOC(ncpus); | |
549 | size = CPU_ALLOC_SIZE(ncpus); | |
d47a3fec AB |
550 | |
551 | assert(0 <= tip->cpu && tip->cpu < ncpus); | |
552 | ||
f6541f75 | 553 | CPU_ZERO_S(size, cpus); |
0a915aab | 554 | CPU_SET_S(tip->cpu, size, cpus); |
7338236a | 555 | if (sched_setaffinity(0, size, cpus)) { |
d47a3fec AB |
556 | fatal("sched_setaffinity", ERR_SYSCALL, "Failed to pin CPU\n"); |
557 | /*NOTREACHED*/ | |
558 | } | |
7338236a | 559 | assert(tip->cpu == sched_getcpu()); |
d47a3fec AB |
560 | |
561 | if (verbose > 1) { | |
562 | int i; | |
0a915aab | 563 | cpu_set_t *now = CPU_ALLOC(ncpus); |
d47a3fec | 564 | |
7338236a | 565 | (void)sched_getaffinity(0, size, now); |
d47a3fec AB |
566 | fprintf(tip->vfp, "Pinned to CPU %02d ", tip->cpu); |
567 | for (i = 0; i < ncpus; i++) | |
0a915aab | 568 | fprintf(tip->vfp, "%1d", CPU_ISSET_S(i, size, now)); |
d47a3fec AB |
569 | fprintf(tip->vfp, "\n"); |
570 | } | |
571 | } | |
572 | ||
573 | /* | |
574 | * ======================================================================== | |
575 | * ==== INPUT DEVICE HANDLERS ============================================= | |
576 | * ======================================================================== | |
577 | */ | |
578 | ||
579 | /** | |
580 | * add_input_dev - Add a device ('sd*') to the list of devices to handle | |
581 | */ | |
582 | static void add_input_dev(char *devnm) | |
583 | { | |
584 | struct list_head *p; | |
585 | struct dev_info *dip; | |
586 | ||
587 | __list_for_each(p, &input_devs) { | |
588 | dip = list_entry(p, struct dev_info, head); | |
589 | if (strcmp(dip->devnm, devnm) == 0) | |
590 | return; | |
591 | } | |
592 | ||
593 | dip = malloc(sizeof(*dip)); | |
594 | dip->devnm = strdup(devnm); | |
595 | list_add_tail(&dip->head, &input_devs); | |
596 | } | |
597 | ||
598 | /** | |
599 | * rem_input_dev - Remove resources associated with this device | |
600 | */ | |
601 | static void rem_input_dev(struct dev_info *dip) | |
602 | { | |
603 | list_del(&dip->head); | |
604 | free(dip->devnm); | |
605 | free(dip); | |
606 | } | |
607 | ||
608 | static void find_input_devs(char *idir) | |
609 | { | |
610 | struct dirent *ent; | |
611 | DIR *dir = opendir(idir); | |
612 | ||
613 | if (dir == NULL) { | |
614 | fatal(idir, ERR_ARGS, "Unable to open %s\n", idir); | |
615 | /*NOTREACHED*/ | |
616 | } | |
617 | ||
618 | while ((ent = readdir(dir)) != NULL) { | |
6ca1e530 | 619 | char *p, *dsf; |
d47a3fec AB |
620 | |
621 | if (strstr(ent->d_name, ".replay.") == NULL) | |
622 | continue; | |
623 | ||
624 | dsf = strdup(ent->d_name); | |
625 | p = index(dsf, '.'); | |
626 | assert(p != NULL); | |
627 | *p = '\0'; | |
628 | add_input_dev(dsf); | |
629 | free(dsf); | |
630 | } | |
631 | ||
632 | closedir(dir); | |
633 | } | |
634 | ||
635 | /* | |
636 | * ======================================================================== | |
637 | * ==== MAP DEVICE INTERFACES ============================================= | |
638 | * ======================================================================== | |
639 | */ | |
640 | ||
641 | /** | |
642 | * read_map_devs - Read in a set of device mapping from the provided file. | |
643 | * @file_name: File containing device maps | |
644 | * | |
645 | * We support the notion of multiple such files being specifed on the cmd line | |
646 | */ | |
647 | static void read_map_devs(char *file_name) | |
648 | { | |
649 | FILE *fp; | |
dd093eb1 | 650 | char from_dev[256], to_dev[256]; |
d47a3fec AB |
651 | |
652 | fp = fopen(file_name, "r"); | |
653 | if (!fp) { | |
654 | fatal(file_name, ERR_SYSCALL, "Could not open map devs file\n"); | |
655 | /*NOTREACHED*/ | |
656 | } | |
657 | ||
dd093eb1 | 658 | while (fscanf(fp, "%s %s", from_dev, to_dev) == 2) { |
d47a3fec AB |
659 | struct map_dev *mdp = malloc(sizeof(*mdp)); |
660 | ||
661 | mdp->from_dev = from_dev; | |
662 | mdp->to_dev = to_dev; | |
663 | list_add_tail(&mdp->head, &map_devs); | |
664 | } | |
665 | ||
666 | fclose(fp); | |
667 | } | |
668 | ||
669 | /** | |
670 | * release_map_devs - Release resources associated with device mappings. | |
671 | */ | |
672 | static void release_map_devs(void) | |
673 | { | |
674 | struct list_head *p, *q; | |
675 | ||
676 | list_for_each_safe(p, q, &map_devs) { | |
677 | struct map_dev *mdp = list_entry(p, struct map_dev, head); | |
678 | ||
679 | list_del(&mdp->head); | |
680 | ||
681 | free(mdp->from_dev); | |
682 | free(mdp->to_dev); | |
683 | free(mdp); | |
684 | } | |
685 | } | |
686 | ||
687 | /** | |
688 | * map_dev - Return the mapped device for that specified | |
689 | * @from_dev: Device name as seen on recorded system | |
690 | * | |
691 | * Note: If there is no such mapping, we return the same name. | |
692 | */ | |
693 | static char *map_dev(char *from_dev) | |
694 | { | |
695 | struct list_head *p; | |
696 | ||
697 | __list_for_each(p, &map_devs) { | |
698 | struct map_dev *mdp = list_entry(p, struct map_dev, head); | |
699 | ||
700 | if (strcmp(from_dev, mdp->from_dev) == 0) | |
701 | return mdp->to_dev; | |
702 | } | |
703 | ||
704 | return from_dev; | |
705 | } | |
706 | ||
707 | /* | |
708 | * ======================================================================== | |
709 | * ==== IOCB MANAGEMENT ROUTINES ========================================== | |
710 | * ======================================================================== | |
711 | */ | |
712 | ||
713 | /** | |
714 | * iocb_init - Initialize the fields of an IOCB | |
715 | * @tip: Per-thread information | |
716 | * iocbp: IOCB pointer to update | |
717 | */ | |
718 | static void iocb_init(struct thr_info *tip, struct iocb_pkt *iocbp) | |
719 | { | |
720 | iocbp->tip = tip; | |
721 | iocbp->nbytes = 0; | |
722 | iocbp->iocb.u.c.buf = NULL; | |
723 | } | |
724 | ||
725 | /** | |
726 | * iocb_setup - Set up an iocb with this AIOs information | |
727 | * @iocbp: IOCB pointer to update | |
728 | * @rw: Direction (0 == write, 1 == read) | |
729 | * @n: Number of bytes to transfer | |
730 | * @off: Offset (in bytes) | |
731 | */ | |
732 | static void iocb_setup(struct iocb_pkt *iocbp, int rw, int n, long long off) | |
733 | { | |
734 | char *buf; | |
735 | struct iocb *iop = &iocbp->iocb; | |
736 | ||
737 | assert(rw == 0 || rw == 1); | |
738 | assert(0 < n && (n % nb_sec) == 0); | |
739 | assert(0 <= off); | |
740 | ||
741 | if (iocbp->nbytes) { | |
742 | if (iocbp->nbytes >= n) { | |
743 | buf = iop->u.c.buf; | |
744 | goto prep; | |
745 | } | |
746 | ||
747 | assert(iop->u.c.buf); | |
748 | free(iop->u.c.buf); | |
749 | } | |
750 | ||
751 | buf = buf_alloc(n); | |
752 | iocbp->nbytes = n; | |
753 | ||
754 | prep: | |
755 | if (rw) | |
756 | io_prep_pread(iop, iocbp->tip->ofd, buf, n, off); | |
757 | else { | |
758 | assert(write_enabled); | |
759 | io_prep_pwrite(iop, iocbp->tip->ofd, buf, n, off); | |
760 | touch_memory(buf, n); | |
761 | } | |
762 | ||
763 | iop->data = iocbp; | |
764 | } | |
765 | ||
766 | /* | |
767 | * ======================================================================== | |
768 | * ==== PER-THREAD SET UP & TEAR DOWN ===================================== | |
769 | * ======================================================================== | |
770 | */ | |
771 | ||
772 | /** | |
773 | * tip_init - Per thread initialization function | |
774 | */ | |
775 | static void tip_init(struct thr_info *tip) | |
776 | { | |
777 | int i; | |
778 | ||
779 | INIT_LIST_HEAD(&tip->free_iocbs); | |
780 | INIT_LIST_HEAD(&tip->used_iocbs); | |
781 | ||
782 | pthread_mutex_init(&tip->mutex, NULL); | |
783 | pthread_cond_init(&tip->cond, NULL); | |
784 | ||
785 | if (io_setup(naios, &tip->ctx)) { | |
786 | fatal("io_setup", ERR_SYSCALL, "io_setup failed\n"); | |
787 | /*NOTREACHED*/ | |
788 | } | |
789 | ||
790 | tip->ofd = -1; | |
791 | tip->naios_out = 0; | |
792 | tip->send_done = tip->reap_done = 0; | |
793 | tip->send_wait = tip->reap_wait = 0; | |
794 | ||
795 | memset(&tip->sub_thread, 0, sizeof(tip->sub_thread)); | |
796 | memset(&tip->rec_thread, 0, sizeof(tip->rec_thread)); | |
797 | ||
798 | for (i = 0; i < naios; i++) { | |
799 | struct iocb_pkt *iocbp = buf_alloc(sizeof(*iocbp)); | |
800 | ||
801 | iocb_init(tip, iocbp); | |
802 | list_add_tail(&iocbp->head, &tip->free_iocbs); | |
803 | } | |
804 | tip->naios_free = naios; | |
805 | ||
806 | if (verbose > 1) { | |
807 | char fn[MAXPATHLEN]; | |
808 | ||
809 | sprintf(fn, "%s/%s.%s.%d.rep", idir, tip->devnm, ibase, | |
810 | tip->cpu); | |
811 | tip->vfp = fopen(fn, "w"); | |
812 | if (!tip->vfp) { | |
813 | fatal(fn, ERR_SYSCALL, "Failed to open report\n"); | |
814 | /*NOTREACHED*/ | |
815 | } | |
816 | ||
817 | setlinebuf(tip->vfp); | |
818 | } | |
819 | ||
820 | if (pthread_create(&tip->sub_thread, NULL, replay_sub, tip)) { | |
821 | fatal("pthread_create", ERR_SYSCALL, | |
822 | "thread create failed\n"); | |
823 | /*NOTREACHED*/ | |
824 | } | |
825 | ||
826 | if (pthread_create(&tip->rec_thread, NULL, replay_rec, tip)) { | |
827 | fatal("pthread_create", ERR_SYSCALL, | |
828 | "thread create failed\n"); | |
829 | /*NOTREACHED*/ | |
830 | } | |
831 | } | |
832 | ||
833 | /** | |
834 | * tip_release - Release resources associated with this thread | |
835 | */ | |
836 | static void tip_release(struct thr_info *tip) | |
837 | { | |
838 | struct list_head *p, *q; | |
839 | ||
840 | assert(tip->send_done); | |
841 | assert(tip->reap_done); | |
842 | assert(list_len(&tip->used_iocbs) == 0); | |
843 | assert(tip->naios_free == naios); | |
844 | ||
845 | if (pthread_join(tip->sub_thread, NULL)) { | |
846 | fatal("pthread_join", ERR_SYSCALL, "pthread sub join failed\n"); | |
847 | /*NOTREACHED*/ | |
848 | } | |
849 | if (pthread_join(tip->rec_thread, NULL)) { | |
850 | fatal("pthread_join", ERR_SYSCALL, "pthread rec join failed\n"); | |
851 | /*NOTREACHED*/ | |
852 | } | |
853 | ||
854 | io_destroy(tip->ctx); | |
855 | ||
856 | list_splice(&tip->used_iocbs, &tip->free_iocbs); | |
857 | list_for_each_safe(p, q, &tip->free_iocbs) { | |
858 | struct iocb_pkt *iocbp = list_entry(p, struct iocb_pkt, head); | |
859 | ||
860 | list_del(&iocbp->head); | |
861 | if (iocbp->nbytes) | |
862 | free(iocbp->iocb.u.c.buf); | |
863 | free(iocbp); | |
864 | } | |
865 | ||
866 | pthread_cond_destroy(&tip->cond); | |
867 | pthread_mutex_destroy(&tip->mutex); | |
868 | } | |
869 | ||
870 | /** | |
871 | * add_input_file - Allocate and initialize per-input file structure | |
872 | * @cpu: CPU for this file | |
873 | * @devnm: Device name for this file | |
874 | * @file_name: Fully qualifed input file name | |
875 | */ | |
876 | static void add_input_file(int cpu, char *devnm, char *file_name) | |
877 | { | |
878 | struct stat buf; | |
879 | struct io_file_hdr hdr; | |
880 | struct thr_info *tip = buf_alloc(sizeof(*tip)); | |
881 | __u64 my_version = mk_btversion(btver_mjr, btver_mnr, btver_sub); | |
882 | ||
883 | assert(0 <= cpu && cpu < ncpus); | |
884 | ||
885 | memset(&hdr, 0, sizeof(hdr)); | |
886 | memset(tip, 0, sizeof(*tip)); | |
887 | tip->cpu = cpu % cpus_to_use; | |
888 | tip->iterations = def_iterations; | |
889 | ||
890 | tip->ifd = open(file_name, O_RDONLY); | |
891 | if (tip->ifd < 0) { | |
892 | fatal(file_name, ERR_ARGS, "Unable to open\n"); | |
893 | /*NOTREACHED*/ | |
894 | } | |
895 | if (fstat(tip->ifd, &buf) < 0) { | |
896 | fatal(file_name, ERR_SYSCALL, "fstat failed\n"); | |
897 | /*NOTREACHED*/ | |
898 | } | |
899 | if (buf.st_size < (off_t)sizeof(hdr)) { | |
900 | if (verbose) | |
901 | fprintf(stderr, "\t%s empty\n", file_name); | |
902 | goto empty_file; | |
903 | } | |
904 | ||
905 | if (read(tip->ifd, &hdr, sizeof(hdr)) != sizeof(hdr)) { | |
906 | fatal(file_name, ERR_ARGS, "Header read failed\n"); | |
907 | /*NOTREACHED*/ | |
908 | } | |
909 | ||
910 | if (hdr.version != my_version) { | |
911 | fprintf(stderr, "%llx %llx %llx %llx\n", | |
912 | (long long unsigned)hdr.version, | |
913 | (long long unsigned)hdr.genesis, | |
914 | (long long unsigned)hdr.nbunches, | |
915 | (long long unsigned)hdr.total_pkts); | |
916 | fatal(NULL, ERR_ARGS, | |
917 | "BT version mismatch: %lx versus my %lx\n", | |
918 | (long)hdr.version, (long)my_version); | |
919 | ||
920 | } | |
921 | ||
922 | if (hdr.nbunches == 0) { | |
923 | empty_file: | |
924 | close(tip->ifd); | |
925 | free(tip); | |
926 | return; | |
927 | } | |
928 | ||
929 | if (hdr.genesis < genesis) { | |
930 | if (verbose > 1) | |
931 | fprintf(stderr, "Setting genesis to %llu.%llu\n", | |
932 | du64_to_sec(hdr.genesis), | |
933 | du64_to_nsec(hdr.genesis)); | |
934 | genesis = hdr.genesis; | |
935 | } | |
936 | ||
937 | tip->devnm = strdup(devnm); | |
938 | tip->file_name = strdup(file_name); | |
939 | ||
940 | list_add_tail(&tip->head, &input_files); | |
941 | ||
942 | if (verbose) | |
943 | fprintf(stderr, "Added %s %llu\n", file_name, | |
944 | (long long)hdr.genesis); | |
945 | } | |
946 | ||
947 | /** | |
948 | * rem_input_file - Release resources associated with an input file | |
949 | * @tip: Per-input file information | |
950 | */ | |
951 | static void rem_input_file(struct thr_info *tip) | |
952 | { | |
953 | list_del(&tip->head); | |
954 | ||
955 | tip_release(tip); | |
956 | ||
957 | close(tip->ofd); | |
958 | close(tip->ifd); | |
959 | free(tip->file_name); | |
960 | free(tip->devnm); | |
961 | free(tip); | |
962 | } | |
963 | ||
964 | /** | |
965 | * rem_input_files - Remove all input files | |
966 | */ | |
967 | static void rem_input_files(void) | |
968 | { | |
969 | struct list_head *p, *q; | |
970 | ||
971 | list_for_each_safe(p, q, &input_files) { | |
972 | rem_input_file(list_entry(p, struct thr_info, head)); | |
973 | } | |
974 | } | |
975 | ||
976 | /** | |
977 | * __find_input_files - Find input files associated with this device (per cpu) | |
978 | */ | |
979 | static void __find_input_files(struct dev_info *dip) | |
980 | { | |
981 | int cpu = 0; | |
982 | ||
983 | for (;;) { | |
984 | char full_name[MAXPATHLEN]; | |
985 | ||
986 | sprintf(full_name, "%s/%s.%s.%d", idir, dip->devnm, ibase, cpu); | |
987 | if (access(full_name, R_OK) != 0) | |
988 | break; | |
989 | ||
990 | add_input_file(cpu, dip->devnm, full_name); | |
991 | cpu++; | |
992 | } | |
993 | ||
994 | if (!cpu) { | |
995 | fatal(NULL, ERR_ARGS, "No traces found for %s\n", dip->devnm); | |
996 | /*NOTREACHED*/ | |
997 | } | |
998 | ||
999 | rem_input_dev(dip); | |
1000 | } | |
1001 | ||
1002 | ||
1003 | /** | |
1004 | * find_input_files - Find input files for all devices | |
1005 | */ | |
1006 | static void find_input_files(void) | |
1007 | { | |
1008 | struct list_head *p, *q; | |
1009 | ||
1010 | list_for_each_safe(p, q, &input_devs) { | |
1011 | __find_input_files(list_entry(p, struct dev_info, head)); | |
1012 | } | |
1013 | } | |
1014 | ||
1015 | /* | |
1016 | * ======================================================================== | |
1017 | * ==== RECLAIM ROUTINES ================================================== | |
1018 | * ======================================================================== | |
1019 | */ | |
1020 | ||
1021 | /** | |
1022 | * reap_wait_aios - Wait for and return number of outstanding AIOs | |
1023 | * | |
1024 | * Will return 0 if we are done | |
1025 | */ | |
1026 | static int reap_wait_aios(struct thr_info *tip) | |
1027 | { | |
1028 | int naios = 0; | |
1029 | ||
1030 | if (!is_reap_done(tip)) { | |
1031 | pthread_mutex_lock(&tip->mutex); | |
1032 | while (tip->naios_out == 0) { | |
1033 | tip->reap_wait = 1; | |
1034 | if (pthread_cond_wait(&tip->cond, &tip->mutex)) { | |
1035 | fatal("pthread_cond_wait", ERR_SYSCALL, | |
1036 | "nfree_current cond wait failed\n"); | |
1037 | /*NOTREACHED*/ | |
1038 | } | |
1039 | } | |
1040 | naios = tip->naios_out; | |
1041 | pthread_mutex_unlock(&tip->mutex); | |
1042 | } | |
1043 | assert(is_reap_done(tip) || naios > 0); | |
1044 | ||
1045 | return is_reap_done(tip) ? 0 : naios; | |
1046 | } | |
1047 | ||
1048 | /** | |
1049 | * reclaim_ios - Reclaim AIOs completed, recycle IOCBs | |
1050 | * @tip: Per-thread information | |
1051 | * @naios_out: Number of AIOs we have outstanding (min) | |
1052 | */ | |
1053 | static void reclaim_ios(struct thr_info *tip, long naios_out) | |
1054 | { | |
1055 | long i, ndone; | |
1056 | struct io_event *evp, events[naios_out]; | |
1057 | ||
1058 | again: | |
1059 | assert(naios > 0); | |
1060 | for (;;) { | |
1061 | ndone = io_getevents(tip->ctx, 1, naios_out, events, NULL); | |
1062 | if (ndone > 0) | |
1063 | break; | |
1064 | ||
1065 | if (errno && errno != EINTR) { | |
1066 | fatal("io_getevents", ERR_SYSCALL, | |
1067 | "io_getevents failed\n"); | |
1068 | /*NOTREACHED*/ | |
1069 | } | |
1070 | } | |
1071 | assert(0 < ndone && ndone <= naios_out); | |
1072 | ||
1073 | pthread_mutex_lock(&tip->mutex); | |
1074 | for (i = 0, evp = events; i < ndone; i++, evp++) { | |
1075 | struct iocb_pkt *iocbp = evp->data; | |
1076 | ||
1077 | if (evp->res != iocbp->iocb.u.c.nbytes) { | |
1078 | fatal(NULL, ERR_SYSCALL, | |
1079 | "Event failure %ld/%ld\t(%ld + %ld)\n", | |
1080 | (long)evp->res, (long)evp->res2, | |
1081 | (long)iocbp->iocb.u.c.offset / nb_sec, | |
1082 | (long)iocbp->iocb.u.c.nbytes / nb_sec); | |
1083 | /*NOTREACHED*/ | |
1084 | } | |
1085 | ||
1086 | list_move_tail(&iocbp->head, &tip->free_iocbs); | |
1087 | } | |
1088 | ||
1089 | tip->naios_free += ndone; | |
1090 | tip->naios_out -= ndone; | |
1091 | naios_out = minl(naios_out, tip->naios_out); | |
1092 | ||
1093 | if (tip->send_wait) { | |
1094 | tip->send_wait = 0; | |
1095 | pthread_cond_signal(&tip->cond); | |
1096 | } | |
1097 | pthread_mutex_unlock(&tip->mutex); | |
1098 | ||
1099 | /* | |
1100 | * Short cut: If we /know/ there are some more AIOs, go handle them | |
1101 | */ | |
1102 | if (naios_out) | |
1103 | goto again; | |
1104 | } | |
1105 | ||
1106 | /** | |
1107 | * replay_rec - Worker thread to reclaim AIOs | |
1108 | * @arg: Pointer to thread information | |
1109 | */ | |
1110 | static void *replay_rec(void *arg) | |
1111 | { | |
1112 | long naios_out; | |
1113 | struct thr_info *tip = arg; | |
1114 | ||
1115 | while ((naios_out = reap_wait_aios(tip)) > 0) | |
1116 | reclaim_ios(tip, naios_out); | |
1117 | ||
1118 | assert(tip->send_done); | |
1119 | tip->reap_done = 1; | |
1120 | set_reclaim_done(); | |
1121 | ||
1122 | return NULL; | |
1123 | } | |
1124 | ||
1125 | /* | |
1126 | * ======================================================================== | |
1127 | * ==== REPLAY ROUTINES =================================================== | |
1128 | * ======================================================================== | |
1129 | */ | |
1130 | ||
1131 | /** | |
1132 | * next_bunch - Retrieve next bunch of AIOs to process | |
1133 | * @tip: Per-thread information | |
1134 | * @bunch: Bunch information | |
1135 | * | |
1136 | * Returns TRUE if we recovered a bunch of IOs, else hit EOF | |
1137 | */ | |
1138 | static int next_bunch(struct thr_info *tip, struct io_bunch *bunch) | |
1139 | { | |
1140 | size_t count, result; | |
1141 | ||
1142 | result = read(tip->ifd, &bunch->hdr, sizeof(bunch->hdr)); | |
1143 | if (result != sizeof(bunch->hdr)) { | |
1144 | if (result == 0) | |
1145 | return 0; | |
1146 | ||
1147 | fatal(tip->file_name, ERR_SYSCALL, "Short hdr(%ld)\n", | |
1148 | (long)result); | |
1149 | /*NOTREACHED*/ | |
1150 | } | |
1151 | assert(bunch->hdr.npkts <= BT_MAX_PKTS); | |
1152 | ||
1153 | count = bunch->hdr.npkts * sizeof(struct io_pkt); | |
1154 | result = read(tip->ifd, &bunch->pkts, count); | |
1155 | if (result != count) { | |
1156 | fatal(tip->file_name, ERR_SYSCALL, "Short pkts(%ld/%ld)\n", | |
1157 | (long)result, (long)count); | |
1158 | /*NOTREACHED*/ | |
1159 | } | |
1160 | ||
1161 | return 1; | |
1162 | } | |
1163 | ||
1164 | /** | |
1165 | * nfree_current - Returns current number of AIOs that are free | |
1166 | * | |
1167 | * Will wait for available ones... | |
1168 | * | |
1169 | * Returns 0 if we have some condition that causes us to exit | |
1170 | */ | |
1171 | static int nfree_current(struct thr_info *tip) | |
1172 | { | |
1173 | int nfree = 0; | |
1174 | ||
1175 | pthread_mutex_lock(&tip->mutex); | |
1176 | while (!is_send_done(tip) && ((nfree = tip->naios_free) == 0)) { | |
1177 | tip->send_wait = 1; | |
1178 | if (pthread_cond_wait(&tip->cond, &tip->mutex)) { | |
1179 | fatal("pthread_cond_wait", ERR_SYSCALL, | |
1180 | "nfree_current cond wait failed\n"); | |
1181 | /*NOTREACHED*/ | |
1182 | } | |
1183 | } | |
1184 | pthread_mutex_unlock(&tip->mutex); | |
1185 | ||
1186 | return nfree; | |
1187 | } | |
1188 | ||
1189 | /** | |
1190 | * stall - Stall for the number of nanoseconds requested | |
1191 | * | |
1192 | * We may be late, in which case we just return. | |
1193 | */ | |
1194 | static void stall(struct thr_info *tip, long long oclock) | |
1195 | { | |
1196 | struct timespec req; | |
1197 | long long dreal, tclock = gettime() - rgenesis; | |
1198 | ||
4a7968cc LU |
1199 | oclock /= acc_factor; |
1200 | ||
d47a3fec AB |
1201 | if (verbose > 1) |
1202 | fprintf(tip->vfp, " stall(%lld.%09lld, %lld.%09lld)\n", | |
1203 | du64_to_sec(oclock), du64_to_nsec(oclock), | |
1204 | du64_to_sec(tclock), du64_to_nsec(tclock)); | |
1205 | ||
1206 | while (!is_send_done(tip) && tclock < oclock) { | |
1207 | dreal = oclock - tclock; | |
1208 | req.tv_sec = dreal / (1000 * 1000 * 1000); | |
1209 | req.tv_nsec = dreal % (1000 * 1000 * 1000); | |
1210 | ||
1211 | if (verbose > 1) { | |
1212 | fprintf(tip->vfp, "++ stall(%lld.%09lld) ++\n", | |
1213 | (long long)req.tv_sec, | |
1214 | (long long)req.tv_nsec); | |
1215 | } | |
1216 | ||
1217 | if (nanosleep(&req, NULL) < 0 && signal_done) | |
1218 | break; | |
1219 | ||
1220 | tclock = gettime() - rgenesis; | |
1221 | } | |
1222 | } | |
1223 | ||
1224 | /** | |
1225 | * iocbs_map - Map a set of AIOs onto a set of IOCBs | |
1226 | * @tip: Per-thread information | |
1227 | * @list: List of AIOs created | |
1228 | * @pkts: AIOs to map | |
1229 | * @ntodo: Number of AIOs to map | |
1230 | */ | |
1231 | static void iocbs_map(struct thr_info *tip, struct iocb **list, | |
1232 | struct io_pkt *pkts, int ntodo) | |
1233 | { | |
1234 | int i; | |
1235 | struct io_pkt *pkt; | |
1236 | ||
1237 | assert(0 < ntodo && ntodo <= naios); | |
1238 | ||
1239 | pthread_mutex_lock(&tip->mutex); | |
1240 | assert(ntodo <= list_len(&tip->free_iocbs)); | |
1241 | for (i = 0, pkt = pkts; i < ntodo; i++, pkt++) { | |
1242 | __u32 rw = pkt->rw; | |
1243 | struct iocb_pkt *iocbp; | |
1244 | ||
1245 | if (!pkt->rw && !write_enabled) | |
1246 | rw = 1; | |
1247 | ||
1248 | if (verbose > 1) | |
1249 | fprintf(tip->vfp, "\t%10llu + %10llu %c%c\n", | |
1250 | (unsigned long long)pkt->sector, | |
1251 | (unsigned long long)pkt->nbytes / nb_sec, | |
1252 | rw ? 'R' : 'W', | |
1253 | (rw == 1 && pkt->rw == 0) ? '!' : ' '); | |
1254 | ||
1255 | iocbp = list_entry(tip->free_iocbs.next, struct iocb_pkt, head); | |
1256 | iocb_setup(iocbp, rw, pkt->nbytes, pkt->sector * nb_sec); | |
1257 | ||
1258 | list_move_tail(&iocbp->head, &tip->used_iocbs); | |
1259 | list[i] = &iocbp->iocb; | |
1260 | } | |
1261 | ||
1262 | tip->naios_free -= ntodo; | |
1263 | assert(tip->naios_free >= 0); | |
1264 | pthread_mutex_unlock(&tip->mutex); | |
1265 | } | |
1266 | ||
1267 | /** | |
1268 | * process_bunch - Process a bunch of requests | |
1269 | * @tip: Per-thread information | |
1270 | * @bunch: Bunch to process | |
1271 | */ | |
1272 | static void process_bunch(struct thr_info *tip, struct io_bunch *bunch) | |
1273 | { | |
1274 | __u64 i = 0; | |
1275 | struct iocb *list[bunch->hdr.npkts]; | |
1276 | ||
1277 | assert(0 < bunch->hdr.npkts && bunch->hdr.npkts <= BT_MAX_PKTS); | |
1278 | while (!is_send_done(tip) && (i < bunch->hdr.npkts)) { | |
1279 | long ndone; | |
1280 | int ntodo = min(nfree_current(tip), bunch->hdr.npkts - i); | |
1281 | ||
1282 | assert(0 < ntodo && ntodo <= naios); | |
1283 | iocbs_map(tip, list, &bunch->pkts[i], ntodo); | |
1284 | if (!no_stalls) | |
1285 | stall(tip, bunch->hdr.time_stamp - genesis); | |
1286 | ||
1287 | if (ntodo) { | |
1288 | if (verbose > 1) | |
1289 | fprintf(tip->vfp, "submit(%d)\n", ntodo); | |
1290 | ndone = io_submit(tip->ctx, ntodo, list); | |
1291 | if (ndone != (long)ntodo) { | |
1292 | fatal("io_submit", ERR_SYSCALL, | |
1293 | "%d: io_submit(%d:%ld) failed (%s)\n", | |
1294 | tip->cpu, ntodo, ndone, | |
1295 | strerror(labs(ndone))); | |
1296 | /*NOTREACHED*/ | |
1297 | } | |
1298 | ||
1299 | pthread_mutex_lock(&tip->mutex); | |
1300 | tip->naios_out += ndone; | |
1301 | assert(tip->naios_out <= naios); | |
1302 | if (tip->reap_wait) { | |
1303 | tip->reap_wait = 0; | |
1304 | pthread_cond_signal(&tip->cond); | |
1305 | } | |
1306 | pthread_mutex_unlock(&tip->mutex); | |
1307 | ||
1308 | i += ndone; | |
1309 | assert(i <= bunch->hdr.npkts); | |
1310 | } | |
1311 | } | |
1312 | } | |
1313 | ||
1314 | /** | |
1315 | * reset_input_file - Reset the input file for the next iteration | |
1316 | * @tip: Thread information | |
1317 | * | |
1318 | * We also do a dummy read of the file header to get us to the first bunch. | |
1319 | */ | |
1320 | static void reset_input_file(struct thr_info *tip) | |
1321 | { | |
1322 | struct io_file_hdr hdr; | |
1323 | ||
1324 | lseek(tip->ifd, 0, 0); | |
1325 | ||
1326 | if (read(tip->ifd, &hdr, sizeof(hdr)) != sizeof(hdr)) { | |
1327 | fatal(tip->file_name, ERR_ARGS, "Header reread failed\n"); | |
1328 | /*NOTREACHED*/ | |
1329 | } | |
1330 | } | |
1331 | ||
1332 | /** | |
1333 | * replay_sub - Worker thread to submit AIOs that are being replayed | |
1334 | */ | |
1335 | static void *replay_sub(void *arg) | |
1336 | { | |
a788dfde | 1337 | unsigned int i; |
e41bf0ff | 1338 | char *mdev; |
d47a3fec AB |
1339 | char path[MAXPATHLEN]; |
1340 | struct io_bunch bunch; | |
1341 | struct thr_info *tip = arg; | |
358504bb | 1342 | int oflags; |
d47a3fec AB |
1343 | |
1344 | pin_to_cpu(tip); | |
1345 | ||
e41bf0ff ES |
1346 | mdev = map_dev(tip->devnm); |
1347 | sprintf(path, "/dev/%s", mdev); | |
1348 | /* | |
1349 | * convert underscores to slashes to | |
1350 | * restore device names that have larger paths | |
1351 | */ | |
1352 | for (i = 0; i < strlen(mdev); i++) | |
1353 | if (path[strlen("/dev/") + i] == '_') | |
1354 | path[strlen("/dev/") + i] = '/'; | |
358504bb JA |
1355 | #ifdef O_NOATIME |
1356 | oflags = O_NOATIME; | |
1357 | #else | |
1358 | oflags = 0; | |
1359 | #endif | |
1360 | tip->ofd = open(path, O_RDWR | O_DIRECT | oflags); | |
d47a3fec AB |
1361 | if (tip->ofd < 0) { |
1362 | fatal(path, ERR_SYSCALL, "Failed device open\n"); | |
1363 | /*NOTREACHED*/ | |
1364 | } | |
1365 | ||
1366 | set_replay_ready(); | |
1367 | while (!is_send_done(tip) && tip->iterations--) { | |
1368 | wait_iter_start(); | |
cbb3e69e | 1369 | if (verbose > 1) |
d47a3fec AB |
1370 | fprintf(tip->vfp, "\n=== %d ===\n", tip->iterations); |
1371 | while (!is_send_done(tip) && next_bunch(tip, &bunch)) | |
1372 | process_bunch(tip, &bunch); | |
1373 | set_iter_done(); | |
1374 | reset_input_file(tip); | |
1375 | } | |
1376 | tip->send_done = 1; | |
1377 | set_replay_done(); | |
1378 | ||
1379 | return NULL; | |
1380 | } | |
1381 | ||
1382 | /* | |
1383 | * ======================================================================== | |
1384 | * ==== COMMAND LINE ARGUMENT HANDLING ==================================== | |
1385 | * ======================================================================== | |
1386 | */ | |
1387 | ||
/*
 * Option summary printed by usage(). Adjacent string literals are
 * concatenated by the compiler, so no line continuations are needed.
 * Keep this in step with S_OPTS and l_opts: -x takes a required argument.
 */
static char usage_str[] =
	"\n"
	"\t[ -c <cpus> : --cpus=<cpus> ] Default: 1\n"
	"\t[ -d <dir> : --input-directory=<dir> ] Default: .\n"
	"\t[ -F : --find-records ] Default: Off\n"
	"\t[ -h : --help ] Default: Off\n"
	"\t[ -i <base> : --input-base=<base> ] Default: replay\n"
	"\t[ -I <iters>: --iterations=<iters> ] Default: 1\n"
	"\t[ -M <file> : --map-devs=<file> ] Default: None\n"
	"\t[ -N : --no-stalls ] Default: Off\n"
	"\t[ -x <factor>: --acc-factor=<factor> ] Default: 1\n"
	"\t[ -v : --verbose ] Default: Off\n"
	"\t[ -V : --version ] Default: Off\n"
	"\t[ -W : --write-enable ] Default: Off\n"
	"\t<dev...> Default: None\n"
	"\n";
1404 | ||
/*
 * Short option string handed to getopt_long(); a trailing ':' marks an
 * option that takes an argument. Every letter here has a matching entry
 * in l_opts and a case in handle_args().
 */
#define S_OPTS	"c:d:Fhi:I:M:Nx:vVW"

/* Long options; each one maps onto the short option letter in .val */
static struct option l_opts[] = {
	{
		.name = "cpus",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'c'
	},
	{
		.name = "input-directory",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'd'
	},
	{
		.name = "find-records",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'F'
	},
	{
		.name = "help",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'h'
	},
	{
		.name = "input-base",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'i'
	},
	{
		.name = "iterations",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'I'
	},
	{
		.name = "map-devs",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'M'
	},
	{
		.name = "no-stalls",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'N'
	},
	{
		.name = "acc-factor",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'x'
	},
	{
		.name = "verbose",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'v'
	},
	{
		.name = "version",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'V'
	},
	{
		.name = "write-enable",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'W'
	},
	{
		/* Terminator required by getopt_long() */
		.name = NULL
	}
};
1483 | ||
1484 | /** | |
1485 | * handle_args: Parse passed in argument list | |
1486 | * @argc: Number of arguments in argv | |
1487 | * @argv: Arguments passed in | |
1488 | * | |
1489 | * Does rudimentary parameter verification as well. | |
1490 | */ | |
1491 | static void handle_args(int argc, char *argv[]) | |
1492 | { | |
1493 | int c; | |
4a7968cc | 1494 | int r; |
d47a3fec AB |
1495 | |
1496 | while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) { | |
1497 | switch (c) { | |
1498 | case 'c': | |
1499 | cpus_to_use = atoi(optarg); | |
1500 | if (cpus_to_use <= 0 || cpus_to_use > ncpus) { | |
1501 | fatal(NULL, ERR_ARGS, | |
1502 | "Invalid number of cpus %d (0<x<%d)\n", | |
1503 | cpus_to_use, ncpus); | |
1504 | /*NOTREACHED*/ | |
1505 | } | |
1506 | break; | |
1507 | ||
1508 | case 'd': | |
1509 | idir = optarg; | |
1510 | if (access(idir, R_OK | X_OK) != 0) { | |
1511 | fatal(idir, ERR_ARGS, | |
1512 | "Invalid input directory specified\n"); | |
1513 | /*NOTREACHED*/ | |
1514 | } | |
1515 | break; | |
1516 | ||
1517 | case 'F': | |
1518 | find_records = 1; | |
1519 | break; | |
1520 | ||
1521 | case 'h': | |
1522 | usage(); | |
1523 | exit(0); | |
1524 | /*NOTREACHED*/ | |
1525 | ||
1526 | case 'i': | |
1527 | ibase = optarg; | |
1528 | break; | |
1529 | ||
1530 | case 'I': | |
1531 | def_iterations = atoi(optarg); | |
1532 | if (def_iterations <= 0) { | |
1533 | fprintf(stderr, | |
1534 | "Invalid number of iterations %d\n", | |
1535 | def_iterations); | |
1536 | exit(ERR_ARGS); | |
1537 | /*NOTREACHED*/ | |
1538 | } | |
1539 | break; | |
1540 | ||
1541 | case 'M': | |
1542 | read_map_devs(optarg); | |
1543 | break; | |
1544 | ||
1545 | case 'N': | |
1546 | no_stalls = 1; | |
1547 | break; | |
1548 | ||
4a7968cc LU |
1549 | case 'x': |
1550 | r = sscanf(optarg,"%u",&acc_factor); | |
1551 | if (r!=1) { | |
1552 | fprintf(stderr, | |
1553 | "Invalid acceleration factor\n"); | |
1554 | exit(ERR_ARGS); | |
1555 | /*NOTREACHED*/ | |
1556 | } | |
1557 | break; | |
1558 | ||
d47a3fec AB |
1559 | case 'V': |
1560 | fprintf(stderr, "btreplay -- version %s\n", | |
1561 | my_btversion); | |
1562 | fprintf(stderr, " Built on %s\n", | |
1563 | build_date); | |
1564 | exit(0); | |
1565 | /*NOTREACHED*/ | |
1566 | ||
1567 | case 'v': | |
1568 | verbose++; | |
1569 | break; | |
1570 | ||
1571 | case 'W': | |
1572 | write_enabled = 1; | |
1573 | break; | |
1574 | ||
1575 | default: | |
1576 | usage(); | |
1577 | fatal(NULL, ERR_ARGS, | |
1578 | "Invalid command line argument %c\n", c); | |
1579 | /*NOTREACHED*/ | |
1580 | } | |
1581 | } | |
1582 | ||
1583 | while (optind < argc) | |
1584 | add_input_dev(argv[optind++]); | |
1585 | ||
1586 | if (find_records) | |
1587 | find_input_devs(idir); | |
1588 | ||
1589 | if (list_len(&input_devs) == 0) { | |
1590 | fatal(NULL, ERR_ARGS, "Missing required input dev name(s)\n"); | |
1591 | /*NOTREACHED*/ | |
1592 | } | |
1593 | ||
1594 | if (cpus_to_use < 0) | |
1595 | cpus_to_use = ncpus; | |
1596 | } | |
1597 | ||
1598 | /* | |
1599 | * ======================================================================== | |
1600 | * ==== MAIN ROUTINE ====================================================== | |
1601 | * ======================================================================== | |
1602 | */ | |
1603 | ||
1604 | /** | |
1605 | * set_signal_done - Signal handler, catches signals & sets signal_done | |
1606 | */ | |
1607 | static void set_signal_done(__attribute__((__unused__))int signum) | |
1608 | { | |
1609 | signal_done = 1; | |
1610 | } | |
1611 | ||
1612 | /** | |
1613 | * main - | |
1614 | * @argc: Number of arguments | |
1615 | * @argv: Array of arguments | |
1616 | */ | |
1617 | int main(int argc, char *argv[]) | |
1618 | { | |
1619 | int i; | |
1620 | struct list_head *p; | |
1621 | ||
1622 | pgsize = getpagesize(); | |
1623 | assert(pgsize > 0); | |
1624 | ||
1625 | setup_signal(SIGINT, set_signal_done); | |
1626 | setup_signal(SIGTERM, set_signal_done); | |
1627 | ||
1628 | get_ncpus(); | |
1629 | handle_args(argc, argv); | |
1630 | find_input_files(); | |
1631 | ||
1632 | nfiles = list_len(&input_files); | |
1633 | __list_for_each(p, &input_files) { | |
1634 | tip_init(list_entry(p, struct thr_info, head)); | |
1635 | } | |
1636 | ||
1637 | wait_replays_ready(); | |
1638 | for (i = 0; i < def_iterations; i++) { | |
1639 | rgenesis = gettime(); | |
1640 | start_iter(); | |
1641 | if (verbose) | |
1642 | fprintf(stderr, "I"); | |
1643 | wait_iters_done(); | |
1644 | } | |
1645 | ||
1646 | wait_replays_done(); | |
1647 | wait_reclaims_done(); | |
1648 | ||
1649 | if (verbose) | |
1650 | fprintf(stderr, "\n"); | |
1651 | ||
1652 | rem_input_files(); | |
1653 | release_map_devs(); | |
1654 | ||
1655 | return 0; | |
1656 | } |