[PATCH] fio: error handling
[disktools.git] / fio.c
1 /*
2  * fio - the flexible io tester
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <unistd.h>
24 #include <fcntl.h>
25 #include <string.h>
26 #include <errno.h>
27 #include <signal.h>
28 #include <time.h>
29 #include <ctype.h>
30 #include <sched.h>
31 #include <libaio.h>
32 #include <math.h>
33 #include <limits.h>
34 #include <assert.h>
35 #include <sys/time.h>
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/wait.h>
39 #include <semaphore.h>
40 #include <sys/ipc.h>
41 #include <sys/shm.h>
42 #include <sys/ioctl.h>
43 #include <asm/unistd.h>
44 #include <asm/types.h>
45
46 #include "arch.h"
47 #include "list.h"
48 #include "md5.h"
49
/*
 * Fallback definition of the BLKGETSIZE64 ioctl request code, used only
 * when the system headers included above did not already provide it.
 */
#ifndef BLKGETSIZE64
#define BLKGETSIZE64    _IOR(0x12,114,size_t)
#endif

/*
 * Compile-time job limit: it is the initial value of max_jobs and sizes
 * the run_str buffer declared further down.
 */
#define MAX_JOBS        (1024)
55
/*
 * Thin wrapper that issues the ioprio_set system call directly through
 * syscall(). The three arguments are passed through unchanged and the
 * value reported by syscall() is handed back to the caller.
 */
static int ioprio_set(int which, int who, int ioprio)
{
        long rc = syscall(__NR_ioprio_set, which, who, ioprio);

        return rc;
}
60
61 /*
62  * we want fadvise64 really, but it's so tangled... later
63  */
64 static int fadvise(int fd, loff_t offset, size_t len, int advice)
65 {
66 #if 0
67         return syscall(__NR_fadvise64, fd, offset, offset >> 32, len, advice);
68 #else
69         return posix_fadvise(fd, (off_t) offset, len, advice);
70 #endif
71 }
72
/*
 * Selector values starting at 1 (process, process group, user).
 * NOTE(review): presumably the 'which' argument of ioprio_set(); no caller
 * is visible in this part of the file - confirm.
 */
enum {
        IOPRIO_WHO_PROCESS = 1,
        IOPRIO_WHO_PGRP,
        IOPRIO_WHO_USER,
};
78
/* NOTE(review): presumably the bit position of the class in an ioprio value - confirm. */
#define IOPRIO_CLASS_SHIFT      13

/* Low-bit mask consumed by ALIGN() below: alignment is MASK + 1 = 4096 bytes. */
#define MASK    (4095)

/*
 * Built-in defaults. In the code shown here only DEF_RAND_REPEAT (initial
 * value of 'repeatable') and DEF_RANDSEED (fixed seed for the random
 * offset generator) are consumed; the rest are presumably applied to the
 * matching thread_data options elsewhere - TODO confirm.
 */
#define DEF_BS          (4096)
#define DEF_TIMEOUT     (0)
#define DEF_RATE_CYCLE  (1000)
#define DEF_ODIRECT     (1)
#define DEF_SEQUENTIAL  (1)
#define DEF_RAND_REPEAT (1)
#define DEF_OVERWRITE   (0)
#define DEF_CREATE      (1)
#define DEF_INVALIDATE  (1)
#define DEF_SYNCIO      (0)
#define DEF_RANDSEED    (0xb1899bedUL)
#define DEF_BWAVGTIME   (500)
#define DEF_CREATE_SER  (1)
#define DEF_CREATE_FSYNC        (1)
#define DEF_LOOPS       (1)
#define DEF_VERIFY      (0)
#define DEF_STONEWALL   (0)
#define DEF_NUMJOBS     (1)

/* Round the address 'buf' up to the next multiple of MASK + 1, as a char *. */
#define ALIGN(buf)      (char *) (((unsigned long) (buf) + MASK) & ~(MASK))
103
/* non-zero: init_random_state() seeds the offset generator with DEF_RANDSEED */
static int repeatable = DEF_RAND_REPEAT;
/* non-zero: check_min_rate() asks all threads to terminate when the minimum rate is missed */
static int rate_quit = 1;
/* NOTE(review): the next three flags are not read in this part of the file */
static int write_lat_log;
static int write_bw_log;
static int exitall_on_terminate;

/* number of entries in 'threads' that sig_handler() walks */
static int thread_number;
static char *ini_file;

static int max_jobs = MAX_JOBS;

/* one character per possible job plus a terminator - presumably a status string, confirm */
static char run_str[MAX_JOBS + 1];

static int shm_id;
118
/* Data direction of a job; compared against thread_data.ddir by td_read()/td_write(). */
enum {
        DDIR_READ = 0,
        DDIR_WRITE,
};

/*
 * thread life cycle
 *
 * Values stored in thread_data.runstate through td_set_runstate().
 */
enum {
        TD_NOT_CREATED = 0,
        TD_CREATED,
        TD_RUNNING,
        TD_VERIFYING,
        TD_EXITED,
        TD_REAPED,
};

/* NOTE(review): presumably the values of thread_data.mem_type (buffer backing) - confirm. */
enum {
        MEM_MALLOC,
        MEM_SHM,
};
140
/*
 * The io unit
 *
 * One transfer buffer together with the bookkeeping needed to issue it,
 * time it and keep it on one of the per-thread lists.
 */
struct io_u {
        struct iocb iocb;               /* libaio control block, prepared by get_io_u() when use_aio is set */
        struct timeval start_time;      /* stamped by get_io_u() right before the unit is handed out */
        struct timeval issue_time;      /* ios_completed() measures completion latency from this */

        char *buf;                      /* data buffer of the transfer */
        unsigned int buflen;            /* transfer length in bytes */
        unsigned long long offset;      /* file offset of the transfer */

        struct list_head list;          /* links the unit on the thread's free or busy list */
};
155
/* Running statistics accumulator, updated by add_stat_sample(). */
struct io_stat {
        unsigned long val;              /* sum of all samples */
        unsigned long val_sq;           /* sum of the squared samples */
        unsigned long max_val;          /* largest sample seen */
        unsigned long min_val;          /* smallest sample seen */
        unsigned long samples;          /* number of samples added */
};
163
/* One logged value, as stored by add_log_sample(). */
struct io_sample {
        unsigned long time;             /* msec since the thread's start time */
        unsigned long val;              /* the sampled value */
};

/* Growable array of samples; add_log_sample() doubles it when it is full. */
struct io_log {
        unsigned long nr_samples;       /* entries currently in use */
        unsigned long max_samples;      /* entries allocated */
        struct io_sample *log;
};
174
/*
 * Record of one completed write, queued on thread_data.io_hist_list by
 * log_io_piece() and consumed by get_next_verify().
 */
struct io_piece {
        struct list_head list;
        unsigned long long offset;      /* file offset that was written */
        unsigned int len;               /* number of bytes written */
};
180
/* Magic value stored in verify_header.fio_magic and checked by verify_io_u(). */
#define FIO_HDR_MAGIC   0xf00baaef

/*
 * Header placed at the start of every buffer by populate_io_u(); the md5
 * digest covers the bytes that follow the header.
 */
struct verify_header {
        unsigned int fio_magic;
        unsigned int len;               /* total buffer length, header included */
        char md5_digest[MD5_HASH_WORDS * 4];
};
188
/* Direction tests on a thread, and whether its writes need an explicit fsync (buffered writes only). */
#define td_read(td)             ((td)->ddir == DDIR_READ)
#define td_write(td)            ((td)->ddir == DDIR_WRITE)
#define should_fsync(td)        (td_write(td) && !(td)->odirect)

/*
 * Random-map addressing: one bit per min_bs-sized block, BLOCKS_PER_MAP
 * bits per long. TO_MAP_BLOCK() turns a file block number into a block
 * number relative to file_offset; RAND_MAP_IDX()/RAND_MAP_BIT() split
 * that into the word index and the bit within the word.
 */
#define BLOCKS_PER_MAP          (8 * sizeof(long))
#define TO_MAP_BLOCK(td, b)     ((b) - ((td)->file_offset / (td)->min_bs))
#define RAND_MAP_IDX(td, b)     (TO_MAP_BLOCK(td, b) / BLOCKS_PER_MAP)
#define RAND_MAP_BIT(td, b)     (TO_MAP_BLOCK(td, b) & (BLOCKS_PER_MAP - 1))
197
/*
 * Per-job state: the job's options, its runtime position and counters,
 * the io_u lists, rate-limiting state and the collected statistics.
 */
struct thread_data {
        char file_name[256];
        char directory[256];
        int thread_number;              /* printed as "Client%d" by check_min_rate() */
        int error;                      /* errno-style code recorded when an operation fails */
        int fd;                         /* descriptor all I/O of this job goes through */
        pid_t pid;
        char *orig_buffer;
        volatile int terminate;         /* set by sig_handler(), polled by the I/O loops */
        volatile int runstate;          /* current TD_* state, see td_set_runstate() */
        volatile int old_runstate;      /* state before the last td_set_runstate() */
        unsigned int ddir;              /* DDIR_READ or DDIR_WRITE */
        unsigned int ioprio;
        unsigned int sequential;        /* non-zero: sequential offsets, zero: random */
        unsigned int bs;
        unsigned int min_bs;            /* smallest transfer size, also the random-map block size */
        unsigned int max_bs;            /* upper end of the transfer size range */
        unsigned int odirect;
        unsigned int thinktime;         /* usec slept between I/Os in do_sync_io() */
        unsigned int fsync_blocks;      /* fsync after every this many blocks (buffered writes) */
        unsigned int start_delay;       /* cleared by sig_handler() */
        unsigned int timeout;           /* seconds; 0 disables the runtime limit */
        unsigned int use_aio;           /* non-zero: get_io_u() prepares the libaio iocb */
        unsigned int create_file;
        unsigned int overwrite;
        unsigned int invalidate_cache;
        unsigned int bw_avg_time;       /* minimum msec between bandwidth samples */
        unsigned int create_serialize;
        unsigned int create_fsync;
        unsigned int loops;
        unsigned long long file_size;   /* offsets past this are rejected, lengths clamped to it */
        unsigned long long file_offset; /* start of this job's region within the file */
        unsigned int sync_io;
        unsigned int mem_type;
        unsigned int verify;            /* non-zero: buffers get a verify header in get_io_u() */
        unsigned int stonewall;
        unsigned int numjobs;
        cpu_set_t cpumask;

        struct drand48_data bsrange_state;      /* generator for random buffer lengths */
        struct drand48_data verify_state;       /* generator for verify payload bytes */

        int shm_id;

        off_t cur_off;                  /* expected file position; lets the sync paths skip lseek() */

        io_context_t aio_ctx;
        unsigned int aio_depth;
        struct io_event *aio_events;    /* completion events filled by io_getevents() */

        unsigned int cur_depth;         /* number of io_us currently taken from the free list */
        struct list_head io_u_freelist;
        struct list_head io_u_busylist;

        unsigned int rate;              /* non-zero enables rate_throttle() */
        unsigned int ratemin;           /* minimum rate enforced by check_min_rate() */
        unsigned int ratecycle;         /* msec between minimum-rate checks */
        unsigned long rate_usec_cycle;  /* usec budget per min_bs block */
        long rate_pending_usleep;       /* accumulated usec still to be slept */
        unsigned long rate_bytes;       /* this_io_bytes at the last rate check */
        struct timeval lastrate;        /* time of the last rate check */

        unsigned long runtime;          /* sec */
        unsigned long long io_size;     /* bytes of I/O to perform per run */

        unsigned long io_blocks;        /* completed transfers */
        unsigned long io_bytes;
        unsigned long this_io_bytes;    /* bytes completed, compared against io_size */
        unsigned long last_bytes;       /* bytes handed out so far; drives sequential offsets */
        sem_t mutex;

        struct drand48_data random_state;       /* generator for random offsets */
        unsigned long *file_map;        /* bitmap of blocks already used by random I/O */
        unsigned int num_maps;          /* number of longs in file_map */

        /*
         * bandwidth and latency stats
         */
        struct io_stat clat_stat;               /* completion latency */
        struct io_stat slat_stat;               /* submission latency */

        struct io_stat bw_stat;                 /* bandwidth stats */
        unsigned long stat_io_bytes;    /* this_io_bytes at the last bandwidth sample */
        struct timeval stat_sample_time;        /* time of the last bandwidth sample */

        struct io_log *lat_log;         /* optional per-sample latency log, may be NULL */
        struct io_log *bw_log;          /* optional per-sample bandwidth log, may be NULL */

        struct timeval start;           /* reference time for runtime and log timestamps */
        struct rusage ru_start;
        struct rusage ru_end;

        struct list_head io_hist_list;  /* io_piece records of completed writes, for verify */
};
292
/* table of jobs; sig_handler() walks its first thread_number entries */
static struct thread_data *threads;
/* NOTE(review): presumably the template holding default option values - confirm */
static struct thread_data def_thread;

static sem_t startup_sem;
297
298 static void sig_handler(int sig)
299 {
300         int i;
301
302         for (i = 0; i < thread_number; i++) {
303                 struct thread_data *td = &threads[i];
304
305                 td->terminate = 1;
306                 td->start_delay = 0;
307         }
308 }
309
310 static int init_random_state(struct thread_data *td)
311 {
312         unsigned long seed;
313         int fd, num_maps, blocks;
314
315         fd = open("/dev/random", O_RDONLY);
316         if (fd == -1) {
317                 td->error = errno;
318                 return 1;
319         }
320
321         if (read(fd, &seed, sizeof(seed)) < (int) sizeof(seed)) {
322                 td->error = EIO;
323                 close(fd);
324                 return 1;
325         }
326
327         close(fd);
328
329         srand48_r(seed, &td->bsrange_state);
330         srand48_r(seed, &td->verify_state);
331
332         if (td->sequential)
333                 return 0;
334
335         if (repeatable)
336                 seed = DEF_RANDSEED;
337
338         blocks = (td->io_size + td->min_bs - 1) / td->min_bs;
339         num_maps = blocks / BLOCKS_PER_MAP;
340         td->file_map = malloc(num_maps * sizeof(long));
341         td->num_maps = num_maps;
342         memset(td->file_map, 0, num_maps * sizeof(long));
343
344         srand48_r(seed, &td->random_state);
345         return 0;
346 }
347
/*
 * Microseconds elapsed between the timestamps *s (start) and *e (end).
 * The arithmetic is done in doubles and converted on return.
 */
static unsigned long utime_since(struct timeval *s, struct timeval *e)
{
        double secs = e->tv_sec - s->tv_sec;
        double usecs = e->tv_usec - s->tv_usec;

        /* borrow a second when the microsecond difference went negative */
        if (usecs < 0 && secs > 0) {
                secs -= 1;
                usecs += 1000000;
        }

        return secs * (double) 1000000 + usecs;
}
363
/*
 * Microseconds elapsed between *s and the current time of day.
 */
static unsigned long utime_since_now(struct timeval *s)
{
        struct timeval now;

        gettimeofday(&now, NULL);

        return utime_since(s, &now);
}
371
/*
 * Milliseconds elapsed between the timestamps *s (start) and *e (end).
 * The arithmetic is done in doubles; the fractional millisecond is
 * dropped by the conversion on return.
 */
static unsigned long mtime_since(struct timeval *s, struct timeval *e)
{
        double secs = e->tv_sec - s->tv_sec;
        double usecs = e->tv_usec - s->tv_usec;

        /* borrow a second when the microsecond difference went negative */
        if (usecs < 0 && secs > 0) {
                secs -= 1;
                usecs += 1000000;
        }

        return secs * (double) 1000 + usecs / (double) 1000;
}
388
/*
 * Milliseconds elapsed between *s and the current time of day.
 */
static unsigned long mtime_since_now(struct timeval *s)
{
        struct timeval now;

        gettimeofday(&now, NULL);

        return mtime_since(s, &now);
}
396
/*
 * Convert the timestamp *s to whole milliseconds (seconds * 1000 plus the
 * microsecond field divided by 1000).
 */
static inline unsigned long msec_now(struct timeval *s)
{
        long millis = s->tv_usec / 1000;

        millis += s->tv_sec * 1000;
        return millis;
}
401
402 static int random_map_free(struct thread_data *td, unsigned long long block)
403 {
404         unsigned int idx = RAND_MAP_IDX(td, block);
405         unsigned int bit = RAND_MAP_BIT(td, block);
406
407         return (td->file_map[idx] & (1UL << bit)) == 0;
408 }
409
410 static int get_next_free_block(struct thread_data *td, unsigned long long *b)
411 {
412         int i;
413
414         *b = 0;
415         i = 0;
416         while ((*b) * td->min_bs < td->io_size) {
417                 if (td->file_map[i] != -1UL) {
418                         *b += ffz(td->file_map[i]);
419                         return 0;
420                 }
421
422                 *b += BLOCKS_PER_MAP;
423                 i++;
424         }
425
426         return 1;
427 }
428
429 static void mark_random_map(struct thread_data *td, struct io_u *io_u)
430 {
431         unsigned long block = io_u->offset / td->min_bs;
432         unsigned int blocks = 0;
433
434         while (blocks < (io_u->buflen / td->min_bs)) {
435                 int idx, bit;
436
437                 if (!random_map_free(td, block))
438                         break;
439
440                 idx = RAND_MAP_IDX(td, block);
441                 bit = RAND_MAP_BIT(td, block);
442
443                 assert(idx < td->num_maps);
444
445                 td->file_map[idx] |= (1UL << bit);
446                 block++;
447                 blocks++;
448         }
449
450         if ((blocks * td->min_bs) < io_u->buflen)
451                 io_u->buflen = blocks * td->min_bs;
452 }
453
454 static int get_next_offset(struct thread_data *td, unsigned long long *offset)
455 {
456         unsigned long long b, rb;
457         long r;
458
459         if (!td->sequential) {
460                 unsigned long max_blocks = td->io_size / td->min_bs;
461                 int loops = 50;
462
463                 do {
464                         lrand48_r(&td->random_state, &r);
465                         b = ((max_blocks - 1) * r / (RAND_MAX+1.0));
466                         rb = b + (td->file_offset / td->min_bs);
467                         loops--;
468                 } while (!random_map_free(td, rb) && loops);
469
470                 if (!loops) {
471                         if (get_next_free_block(td, &b))
472                                 return 1;
473                 }
474         } else
475                 b = td->last_bytes / td->min_bs;
476
477         *offset = (b * td->min_bs) + td->file_offset;
478         if (*offset > td->file_size)
479                 return 1;
480
481         return 0;
482 }
483
484 static unsigned int get_next_buflen(struct thread_data *td)
485 {
486         unsigned int buflen;
487         long r;
488
489         if (td->min_bs == td->max_bs)
490                 buflen = td->min_bs;
491         else {
492                 lrand48_r(&td->bsrange_state, &r);
493                 buflen = (1 + (double) (td->max_bs - 1) * r / (RAND_MAX + 1.0));
494                 buflen = (buflen + td->min_bs - 1) & ~(td->min_bs - 1);
495         }
496
497         if (buflen > td->io_size - td->this_io_bytes)
498                 buflen = td->io_size - td->this_io_bytes;
499
500         return buflen;
501 }
502
503 static inline void add_stat_sample(struct thread_data *td, struct io_stat *is,
504                                    unsigned long val)
505 {
506         if (val > is->max_val)
507                 is->max_val = val;
508         if (val < is->min_val)
509                 is->min_val = val;
510
511         is->val += val;
512         is->val_sq += val * val;
513         is->samples++;
514 }
515
516 static void add_log_sample(struct thread_data *td, struct io_log *log,
517                            unsigned long val)
518 {
519         if (log->nr_samples == log->max_samples) {
520                 int new_size = sizeof(struct io_sample) * log->max_samples * 2;
521
522                 log->log = realloc(log->log, new_size);
523                 log->max_samples <<= 1;
524         }
525
526         log->log[log->nr_samples].val = val;
527         log->log[log->nr_samples].time = mtime_since_now(&td->start);
528         log->nr_samples++;
529 }
530
531 static void add_clat_sample(struct thread_data *td, unsigned long msec)
532 {
533         add_stat_sample(td, &td->clat_stat, msec);
534
535         if (td->lat_log)
536                 add_log_sample(td, td->lat_log, msec);
537 }
538
539 static void add_slat_sample(struct thread_data *td, unsigned long msec)
540 {
541         add_stat_sample(td, &td->slat_stat, msec);
542 }
543
544 static void add_bw_sample(struct thread_data *td)
545 {
546         unsigned long spent = mtime_since_now(&td->stat_sample_time);
547         unsigned long rate;
548
549         if (spent < td->bw_avg_time)
550                 return;
551
552         rate = (td->this_io_bytes - td->stat_io_bytes) / spent;
553         add_stat_sample(td, &td->bw_stat, rate);
554
555         if (td->bw_log)
556                 add_log_sample(td, td->bw_log, rate);
557
558         gettimeofday(&td->stat_sample_time, NULL);
559         td->stat_io_bytes = td->this_io_bytes;
560 }
561
562 /*
563  * busy looping version for the last few usec
564  */
565 static void __usec_sleep(int usec)
566 {
567         struct timeval start;
568
569         gettimeofday(&start, NULL);
570         while (utime_since_now(&start) < usec)
571                 nop;
572 }
573
/*
 * Sleep for 'usec' microseconds.
 *
 * Anything shorter than 5000 usec is busy-waited with __usec_sleep().
 * Longer waits use nanosleep() and are resumed with the remaining time
 * when a signal interrupts them; once less than 5000 usec remain the
 * rest is busy-waited as well.
 */
static void usec_sleep(int usec)
{
        struct timespec req, rem;

        if (usec <= 0)
                return;

        if (usec < 5000) {
                __usec_sleep(usec);
                return;
        }

        /* tv_nsec must stay below one second, whole seconds go in tv_sec */
        req.tv_sec = usec / 1000000;
        req.tv_nsec = (long) (usec % 1000000) * 1000;

        rem.tv_sec = rem.tv_nsec = 0;
        while (nanosleep(&req, &rem) == -1 && errno == EINTR) {
                long left = rem.tv_sec * 1000000L + rem.tv_nsec / 1000;

                if (left < 5000) {
                        __usec_sleep(left);
                        break;
                }

                req = rem;
        }
}
593
594 static void rate_throttle(struct thread_data *td, unsigned long time_spent,
595                           unsigned int bytes)
596 {
597         unsigned long usec_cycle;
598
599         if (!td->rate)
600                 return;
601
602         usec_cycle = td->rate_usec_cycle * (bytes / td->min_bs);
603
604         if (time_spent < usec_cycle) {
605                 unsigned long s = usec_cycle - time_spent;
606
607                 td->rate_pending_usleep += s;
608                 if (td->rate_pending_usleep >= 100000) {
609                         usec_sleep(td->rate_pending_usleep);
610                         td->rate_pending_usleep = 0;
611                 }
612         } else {
613                 long overtime = time_spent - usec_cycle;
614
615                 td->rate_pending_usleep -= overtime;
616         }
617 }
618
619 static int check_min_rate(struct thread_data *td, struct timeval *now)
620 {
621         unsigned long spent;
622         unsigned long rate;
623
624         /*
625          * allow a 2 second settle period in the beginning
626          */
627         if (mtime_since(&td->start, now) < 2000)
628                 return 0;
629
630         /*
631          * if rate blocks is set, sample is running
632          */
633         if (td->rate_bytes) {
634                 spent = mtime_since(&td->lastrate, now);
635                 if (spent < td->ratecycle)
636                         return 0;
637
638                 rate = (td->this_io_bytes - td->rate_bytes) / spent;
639                 if (rate < td->ratemin) {
640                         printf("Client%d: min rate %d not met, got %ldKiB/sec\n", td->thread_number, td->ratemin, rate);
641                         if (rate_quit)
642                                 sig_handler(0);
643                         return 1;
644                 }
645         }
646
647         td->rate_bytes = td->this_io_bytes;
648         memcpy(&td->lastrate, now, sizeof(*now));
649         return 0;
650 }
651
652 static inline int runtime_exceeded(struct thread_data *td, struct timeval *t)
653 {
654         if (!td->timeout)
655                 return 0;
656         if (mtime_since(&td->start, t) >= td->timeout * 1000)
657                 return 1;
658
659         return 0;
660 }
661
662 static void fill_random_bytes(struct thread_data *td,
663                               unsigned char *p, unsigned int len)
664 {
665         unsigned int todo;
666         double r;
667
668         while (len) {
669                 drand48_r(&td->verify_state, &r);
670
671                 /*
672                  * lrand48_r seems to be broken and only fill the bottom
673                  * 32-bits, even on 64-bit archs with 64-bit longs
674                  */
675                 todo = sizeof(r);
676                 if (todo > len)
677                         todo = len;
678
679                 memcpy(p, &r, todo);
680
681                 len -= todo;
682                 p += todo;
683         }
684 }
685
/*
 * Print 'len' bytes of 'buffer' to stdout as two-digit lowercase hex,
 * followed by a newline.
 */
static void hexdump(void *buffer, int len)
{
        const unsigned char *bytes = buffer;
        int pos = 0;

        while (pos < len) {
                printf("%02x", bytes[pos]);
                pos++;
        }

        printf("\n");
}
695
696 static int verify_io_u(struct io_u *io_u)
697 {
698         struct verify_header *hdr = (struct verify_header *) io_u->buf;
699         unsigned char *p = (unsigned char *) io_u->buf;
700         struct md5_ctx md5_ctx;
701         int ret;
702
703         if (hdr->fio_magic != FIO_HDR_MAGIC)
704                 return 1;
705
706         memset(&md5_ctx, 0, sizeof(md5_ctx));
707         p += sizeof(*hdr);
708         md5_update(&md5_ctx, p, hdr->len - sizeof(*hdr));
709
710         ret = memcmp(hdr->md5_digest, md5_ctx.hash, sizeof(md5_ctx.hash));
711         if (ret) {
712                 hexdump(hdr->md5_digest, sizeof(hdr->md5_digest));
713                 hexdump(md5_ctx.hash, sizeof(md5_ctx.hash));
714         }
715
716         return ret;
717 }
718
719 /*
720  * fill body of io_u->buf with random data and add a header with the
721  * (eg) sha1sum of that data.
722  */
723 static void populate_io_u(struct thread_data *td, struct io_u *io_u)
724 {
725         struct md5_ctx md5_ctx;
726         struct verify_header hdr;
727         unsigned char *p = (unsigned char *) io_u->buf;
728
729         hdr.fio_magic = FIO_HDR_MAGIC;
730         hdr.len = io_u->buflen;
731         p += sizeof(hdr);
732         fill_random_bytes(td, p, io_u->buflen - sizeof(hdr));
733
734         memset(&md5_ctx, 0, sizeof(md5_ctx));
735         md5_update(&md5_ctx, p, io_u->buflen - sizeof(hdr));
736         memcpy(hdr.md5_digest, md5_ctx.hash, sizeof(md5_ctx.hash));
737         memcpy(io_u->buf, &hdr, sizeof(hdr));
738 }
739
740 static void put_io_u(struct thread_data *td, struct io_u *io_u)
741 {
742         list_del(&io_u->list);
743         list_add(&io_u->list, &td->io_u_freelist);
744         td->cur_depth--;
745 }
746
747 #define queue_full(td)  (list_empty(&(td)->io_u_freelist))
748
749 static struct io_u *__get_io_u(struct thread_data *td)
750 {
751         struct io_u *io_u;
752
753         if (queue_full(td))
754                 return NULL;
755
756         io_u = list_entry(td->io_u_freelist.next, struct io_u, list);
757         list_del(&io_u->list);
758         list_add(&io_u->list, &td->io_u_busylist);
759         td->cur_depth++;
760         return io_u;
761 }
762
763 static struct io_u *get_io_u(struct thread_data *td)
764 {
765         struct io_u *io_u;
766
767         io_u = __get_io_u(td);
768         if (!io_u)
769                 return NULL;
770
771         if (get_next_offset(td, &io_u->offset)) {
772                 put_io_u(td, io_u);
773                 return NULL;
774         }
775
776         io_u->buflen = get_next_buflen(td);
777         if (!io_u->buflen) {
778                 put_io_u(td, io_u);
779                 return NULL;
780         }
781
782         if (io_u->buflen + io_u->offset > td->file_size)
783                 io_u->buflen = td->file_size - io_u->offset;
784
785         if (!td->sequential)
786                 mark_random_map(td, io_u);
787
788         td->last_bytes += io_u->buflen;
789
790         if (td->verify)
791                 populate_io_u(td, io_u);
792
793         if (td->use_aio) {
794                 if (td_read(td))
795                         io_prep_pread(&io_u->iocb, td->fd, io_u->buf, io_u->buflen, io_u->offset);
796                 else
797                         io_prep_pwrite(&io_u->iocb, td->fd, io_u->buf, io_u->buflen, io_u->offset);
798         }
799
800         gettimeofday(&io_u->start_time, NULL);
801         return io_u;
802 }
803
804 static inline void td_set_runstate(struct thread_data *td, int runstate)
805 {
806         td->old_runstate = td->runstate;
807         td->runstate = runstate;
808 }
809
810 static int get_next_verify(struct thread_data *td,
811                            unsigned long long *offset, unsigned int *len)
812 {
813         struct io_piece *ipo;
814
815         if (list_empty(&td->io_hist_list))
816                 return 1;
817
818         ipo = list_entry(td->io_hist_list.next, struct io_piece, list);
819         list_del(&ipo->list);
820
821         *offset = ipo->offset;
822         *len = ipo->len;
823         free(ipo);
824         return 0;
825 }
826
827 static void prune_io_piece_log(struct thread_data *td)
828 {
829         struct io_piece *ipo;
830
831         while (!list_empty(&td->io_hist_list)) {
832                 ipo = list_entry(td->io_hist_list.next, struct io_piece, list);
833
834                 list_del(&ipo->list);
835                 free(ipo);
836         }
837 }
838
839 /*
840  * log a succesful write, so we can unwind the log for verify
841  */
842 static void log_io_piece(struct thread_data *td, struct io_u *io_u)
843 {
844         struct io_piece *ipo = malloc(sizeof(struct io_piece));
845         struct list_head *entry;
846
847         INIT_LIST_HEAD(&ipo->list);
848         ipo->offset = io_u->offset;
849         ipo->len = io_u->buflen;
850
851         /*
852          * for random io where the writes extend the file, it will typically
853          * be laid out with the block scattered as written. it's faster to
854          * read them in in that order again, so don't sort
855          */
856         if (td->sequential || !td->overwrite) {
857                 list_add_tail(&ipo->list, &td->io_hist_list);
858                 return;
859         }
860
861         /*
862          * for random io, sort the list so verify will run faster
863          */
864         entry = &td->io_hist_list;
865         while ((entry = entry->prev) != &td->io_hist_list) {
866                 struct io_piece *__ipo = list_entry(entry, struct io_piece, list);
867
868                 if (__ipo->offset < ipo->offset)
869                         break;
870         }
871
872         list_add(&ipo->list, entry);
873 }
874
875 static void do_sync_verify(struct thread_data *td)
876 {
877         struct timeval t;
878         struct io_u *io_u = NULL;
879         int ret;
880
881         td_set_runstate(td, TD_VERIFYING);
882
883         io_u = __get_io_u(td);
884
885         if (!td->odirect) {
886                 if (fadvise(td->fd, td->file_offset, td->io_size, POSIX_FADV_DONTNEED) < 0) {
887                         td->error = errno;
888                         goto out;
889                 }
890         }
891
892         do {
893                 if (td->terminate)
894                         break;
895
896                 gettimeofday(&t, NULL);
897                 if (runtime_exceeded(td, &t))
898                         break;
899
900                 if (get_next_verify(td, &io_u->offset, &io_u->buflen))
901                         break;
902
903                 if (td->cur_off != io_u->offset) {
904                         if (lseek(td->fd, io_u->offset, SEEK_SET) == -1) {
905                                 td->error = errno;
906                                 break;
907                         }
908                 }
909
910                 ret = read(td->fd, io_u->buf, io_u->buflen);
911                 if (ret < (int) io_u->buflen) {
912                         if (ret == -1) {
913                                 td->error = errno;
914                                 break;
915                         } else if (!ret)
916                                 break;
917                         else
918                                 io_u->buflen = ret;
919                 }
920
921                 if (verify_io_u(io_u))
922                         break;
923
924                 td->cur_off = io_u->offset + io_u->buflen;
925         } while (1);
926
927 out:
928         td_set_runstate(td, TD_RUNNING);
929         put_io_u(td, io_u);
930 }
931
932 static void do_sync_io(struct thread_data *td)
933 {
934         unsigned long msec, usec;
935         struct io_u *io_u = NULL;
936         struct timeval e;
937
938         while (td->this_io_bytes < td->io_size) {
939                 int ret;
940
941                 if (td->terminate)
942                         break;
943
944                 io_u = get_io_u(td);
945                 if (!io_u)
946                         break;
947
948                 if (td->cur_off != io_u->offset) {
949                         if (lseek(td->fd, io_u->offset, SEEK_SET) == -1) {
950                                 td->error = errno;
951                                 break;
952                         }
953                 }
954
955                 if (td_read(td))
956                         ret = read(td->fd, io_u->buf, io_u->buflen);
957                 else
958                         ret = write(td->fd, io_u->buf, io_u->buflen);
959
960                 if (ret < (int) io_u->buflen) {
961                         if (ret == -1)
962                                 td->error = errno;
963                         break;
964                 }
965
966                 if (td_write(td))
967                         log_io_piece(td, io_u);
968
969                 td->io_blocks++;
970                 td->io_bytes += io_u->buflen;
971                 td->this_io_bytes += io_u->buflen;
972                 td->cur_off = io_u->offset + io_u->buflen;
973
974                 gettimeofday(&e, NULL);
975
976                 usec = utime_since(&io_u->start_time, &e);
977
978                 rate_throttle(td, usec, io_u->buflen);
979
980                 if (check_min_rate(td, &e)) {
981                         td->error = ENODATA;
982                         break;
983                 }
984
985                 msec = usec / 1000;
986                 add_clat_sample(td, msec);
987                 add_bw_sample(td);
988
989                 if (runtime_exceeded(td, &e))
990                         break;
991
992                 put_io_u(td, io_u);
993                 io_u = NULL;
994
995                 if (td->thinktime)
996                         usec_sleep(td->thinktime);
997
998                 if (should_fsync(td) && td->fsync_blocks &&
999                     (td->io_blocks % td->fsync_blocks) == 0)
1000                         fsync(td->fd);
1001         }
1002
1003         if (io_u)
1004                 put_io_u(td, io_u);
1005
1006         if (should_fsync(td))
1007                 fsync(td->fd);
1008 }
1009
1010 static int io_u_getevents(struct thread_data *td, int min, int max,
1011                           struct timespec *t)
1012 {
1013         int r;
1014
1015         do {
1016                 r = io_getevents(td->aio_ctx, min, max, td->aio_events, t);
1017                 if (r != -EAGAIN && r != -EINTR)
1018                         break;
1019         } while (1);
1020
1021         return r;
1022 }
1023
1024 static int io_u_queue(struct thread_data *td, struct io_u *io_u)
1025 {
1026         struct iocb *iocb = &io_u->iocb;
1027         int ret;
1028
1029         do {
1030                 ret = io_submit(td->aio_ctx, 1, &iocb);
1031                 if (ret == 1)
1032                         return 0;
1033                 else if (ret == -EAGAIN)
1034                         usleep(100);
1035                 else if (ret == -EINTR)
1036                         continue;
1037                 else
1038                         break;
1039         } while (1);
1040
1041         return ret;
1042 }
1043
/* The 'data' member of an iocb, read as an unsigned long. */
#define iocb_time(iocb) ((unsigned long) (iocb)->data)
/*
 * The io_u behind a completion event, taken from the event's 'obj'
 * member. The expansion is fully parenthesized so that it can be
 * followed directly by '->'.
 */
#define ev_to_iou(ev)   ((struct io_u *) ((unsigned long) (ev)->obj))
1046
1047 static int ios_completed(struct thread_data *td, int nr)
1048 {
1049         unsigned long msec;
1050         struct io_u *io_u;
1051         struct timeval e;
1052         int i, bytes_done;
1053
1054         gettimeofday(&e, NULL);
1055
1056         for (i = 0, bytes_done = 0; i < nr; i++) {
1057                 io_u = ev_to_iou(td->aio_events + i);
1058
1059                 td->io_blocks++;
1060                 td->io_bytes += io_u->buflen;
1061                 td->this_io_bytes += io_u->buflen;
1062
1063                 msec = mtime_since(&io_u->issue_time, &e);
1064
1065                 add_clat_sample(td, msec);
1066                 add_bw_sample(td);
1067
1068                 if (td_write(td))
1069                         log_io_piece(td, io_u);
1070
1071                 bytes_done += io_u->buflen;
1072                 put_io_u(td, io_u);
1073         }
1074
1075         return bytes_done;
1076 }
1077
1078 static void cleanup_pending_aio(struct thread_data *td)
1079 {
1080         struct timespec ts = { .tv_sec = 0, .tv_nsec = 0};
1081         struct list_head *entry, *n;
1082         struct io_u *io_u;
1083         int r;
1084
1085         /*
1086          * get immediately available events, if any
1087          */
1088         r = io_u_getevents(td, 0, td->cur_depth, &ts);
1089         if (r > 0)
1090                 ios_completed(td, r);
1091
1092         /*
1093          * now cancel remaining active events
1094          */
1095         list_for_each_safe(entry, n, &td->io_u_busylist) {
1096                 io_u = list_entry(entry, struct io_u, list);
1097
1098                 r = io_cancel(td->aio_ctx, &io_u->iocb, td->aio_events);
1099                 if (!r)
1100                         put_io_u(td, io_u);
1101         }
1102
1103         if (td->cur_depth) {
1104                 r = io_u_getevents(td, td->cur_depth, td->cur_depth, NULL);
1105                 if (r > 0)
1106                         ios_completed(td, r);
1107         }
1108 }
1109
/*
 * Verify the io_u that *io_u points at, if there is one, hand it back
 * with put_io_u() and reset *io_u to NULL.
 *
 * Returns the verify_io_u() result, or 0 when nothing was pending.
 */
static int async_do_verify(struct thread_data *td, struct io_u **io_u)
{
	struct io_u *pending = *io_u;
	int ret;

	if (!pending)
		return 0;

	ret = verify_io_u(pending);
	put_io_u(td, pending);
	*io_u = NULL;

	return ret;
}
1123
1124 static void do_async_verify(struct thread_data *td)
1125 {
1126         struct timeval t;
1127         struct io_u *io_u, *v_io_u = NULL;
1128         int ret;
1129
1130         td_set_runstate(td, TD_VERIFYING);
1131
1132         do {
1133                 if (td->terminate)
1134                         break;
1135
1136                 gettimeofday(&t, NULL);
1137                 if (runtime_exceeded(td, &t))
1138                         break;
1139
1140                 io_u = __get_io_u(td);
1141                 if (!io_u)
1142                         break;
1143
1144                 if (get_next_verify(td, &io_u->offset, &io_u->buflen)) {
1145                         put_io_u(td, io_u);
1146                         break;
1147                 }
1148
1149                 io_prep_pread(&io_u->iocb, td->fd, io_u->buf, io_u->buflen, io_u->offset);
1150                 ret = io_u_queue(td, io_u);
1151                 if (ret) {
1152                         put_io_u(td, io_u);
1153                         td->error = ret;
1154                         break;
1155                 }
1156
1157                 /*
1158                  * we have one pending to verify, do that while the next
1159                  * we are doing io on the next one
1160                  */
1161                 if (async_do_verify(td, &v_io_u))
1162                         break;
1163
1164                 ret = io_u_getevents(td, 1, 1, NULL);
1165                 if (ret != 1) {
1166                         if (ret < 0)
1167                                 td->error = ret;
1168                         break;
1169                 }
1170
1171                 v_io_u = ev_to_iou(td->aio_events);
1172
1173                 td->cur_off = v_io_u->offset + v_io_u->buflen;
1174
1175                 /*
1176                  * if we can't submit more io, we need to verify now
1177                  */
1178                 if (queue_full(td) && async_do_verify(td, &v_io_u))
1179                         break;
1180
1181         } while (1);
1182
1183         async_do_verify(td, &v_io_u);
1184
1185         if (td->cur_depth)
1186                 cleanup_pending_aio(td);
1187
1188         td_set_runstate(td, TD_RUNNING);
1189 }
1190
1191 static void do_async_io(struct thread_data *td)
1192 {
1193         struct timeval s, e;
1194         unsigned long usec;
1195
1196         while (td->this_io_bytes < td->io_size) {
1197                 struct timespec ts = { .tv_sec = 0, .tv_nsec = 0};
1198                 struct timespec *timeout;
1199                 int ret, min_evts = 0;
1200                 struct io_u *io_u;
1201                 unsigned int bytes_done;
1202
1203                 if (td->terminate)
1204                         break;
1205
1206                 io_u = get_io_u(td);
1207                 if (!io_u)
1208                         break;
1209
1210                 memcpy(&s, &io_u->start_time, sizeof(s));
1211
1212                 ret = io_u_queue(td, io_u);
1213                 if (ret) {
1214                         put_io_u(td, io_u);
1215                         td->error = ret;
1216                         break;
1217                 }
1218
1219                 gettimeofday(&io_u->issue_time, NULL);
1220                 add_slat_sample(td, mtime_since(&io_u->start_time, &io_u->issue_time));
1221                 if (td->cur_depth < td->aio_depth) {
1222                         timeout = &ts;
1223                         min_evts = 0;
1224                 } else {
1225                         timeout = NULL;
1226                         min_evts = 1;
1227                 }
1228
1229                 ret = io_u_getevents(td, min_evts, td->cur_depth, timeout);
1230                 if (ret < 0) {
1231                         td->error = ret;
1232                         break;
1233                 } else if (!ret)
1234                         continue;
1235
1236                 bytes_done = ios_completed(td, ret);
1237
1238                 /*
1239                  * the rate is batched for now, it should work for batches
1240                  * of completions except the very first one which may look
1241                  * a little bursty
1242                  */
1243                 gettimeofday(&e, NULL);
1244                 usec = utime_since(&s, &e);
1245
1246                 rate_throttle(td, usec, bytes_done);
1247
1248                 if (check_min_rate(td, &e)) {
1249                         td->error = ENODATA;
1250                         break;
1251                 }
1252
1253                 if (runtime_exceeded(td, &e))
1254                         break;
1255
1256                 if (td->thinktime)
1257                         usec_sleep(td->thinktime);
1258
1259                 if (should_fsync(td) && td->fsync_blocks &&
1260                     (td->io_blocks % td->fsync_blocks) == 0)
1261                         fsync(td->fd);
1262         }
1263
1264         if (td->cur_depth)
1265                 cleanup_pending_aio(td);
1266
1267         if (should_fsync(td))
1268                 fsync(td->fd);
1269 }
1270
1271 static void cleanup_aio(struct thread_data *td)
1272 {
1273         io_destroy(td->aio_ctx);
1274
1275         if (td->aio_events)
1276                 free(td->aio_events);
1277 }
1278
1279 static int init_aio(struct thread_data *td)
1280 {
1281         if (io_queue_init(td->aio_depth, &td->aio_ctx)) {
1282                 td->error = errno;
1283                 return 1;
1284         }
1285
1286         td->aio_events = malloc(td->aio_depth * sizeof(struct io_event));
1287         return 0;
1288 }
1289
1290 static void cleanup_io_u(struct thread_data *td)
1291 {
1292         struct list_head *entry, *n;
1293         struct io_u *io_u;
1294
1295         list_for_each_safe(entry, n, &td->io_u_freelist) {
1296                 io_u = list_entry(entry, struct io_u, list);
1297
1298                 list_del(&io_u->list);
1299                 free(io_u);
1300         }
1301
1302         if (td->mem_type == MEM_MALLOC)
1303                 free(td->orig_buffer);
1304         else if (td->mem_type == MEM_SHM) {
1305                 struct shmid_ds sbuf;
1306
1307                 shmdt(td->orig_buffer);
1308                 shmctl(td->shm_id, IPC_RMID, &sbuf);
1309         }
1310 }
1311
1312 static int init_io_u(struct thread_data *td)
1313 {
1314         struct io_u *io_u;
1315         int i, max_units, mem_size;
1316         char *p;
1317
1318         if (!td->use_aio)
1319                 max_units = 1;
1320         else
1321                 max_units = td->aio_depth;
1322
1323         mem_size = td->max_bs * max_units + MASK;
1324
1325         if (td->mem_type == MEM_MALLOC)
1326                 td->orig_buffer = malloc(mem_size);
1327         else if (td->mem_type == MEM_SHM) {
1328                 td->shm_id = shmget(IPC_PRIVATE, mem_size, IPC_CREAT | 0600);
1329                 if (td->shm_id < 0) {
1330                         td->error = errno;
1331                         perror("shmget");
1332                         return 1;
1333                 }
1334
1335                 td->orig_buffer = shmat(td->shm_id, NULL, 0);
1336                 if (td->orig_buffer == (void *) -1) {
1337                         td->error = errno;
1338                         perror("shmat");
1339                         return 1;
1340                 }
1341         }
1342
1343         INIT_LIST_HEAD(&td->io_u_freelist);
1344         INIT_LIST_HEAD(&td->io_u_busylist);
1345         INIT_LIST_HEAD(&td->io_hist_list);
1346
1347         p = ALIGN(td->orig_buffer);
1348         for (i = 0; i < max_units; i++) {
1349                 io_u = malloc(sizeof(*io_u));
1350                 memset(io_u, 0, sizeof(*io_u));
1351                 INIT_LIST_HEAD(&io_u->list);
1352
1353                 io_u->buf = p + td->max_bs * i;
1354                 list_add(&io_u->list, &td->io_u_freelist);
1355         }
1356
1357         return 0;
1358 }
1359
1360 static void setup_log(struct io_log **log)
1361 {
1362         struct io_log *l = malloc(sizeof(*l));
1363
1364         l->nr_samples = 0;
1365         l->max_samples = 1024;
1366         l->log = malloc(l->max_samples * sizeof(struct io_sample));
1367         *log = l;
1368 }
1369
1370 static void finish_log(struct thread_data *td, struct io_log *log, char *name)
1371 {
1372         char file_name[128];
1373         FILE *f;
1374         unsigned int i;
1375
1376         sprintf(file_name, "client%d_%s.log", td->thread_number, name);
1377         f = fopen(file_name, "w");
1378         if (!f) {
1379                 perror("fopen log");
1380                 return;
1381         }
1382
1383         for (i = 0; i < log->nr_samples; i++)
1384                 fprintf(f, "%lu, %lu\n", log->log[i].time, log->log[i].val);
1385
1386         fclose(f);
1387         free(log->log);
1388         free(log);
1389 }
1390
1391 static int create_file(struct thread_data *td)
1392 {
1393         unsigned long long left;
1394         unsigned int bs;
1395         char *b;
1396         int r;
1397
1398         /*
1399          * unless specifically asked for overwrite, let normal io extend it
1400          */
1401         if (td_write(td) && !td->overwrite)
1402                 return 0;
1403
1404         if (!td->file_size) {
1405                 fprintf(stderr, "Need size for create\n");
1406                 td->error = EINVAL;
1407                 return 1;
1408         }
1409
1410         printf("Client%d: Laying out IO file\n", td->thread_number);
1411
1412         td->fd = open(td->file_name, O_WRONLY | O_CREAT | O_TRUNC, 0644);
1413         if (td->fd < 0) {
1414                 td->error = errno;
1415                 return 1;
1416         }
1417
1418         if (ftruncate(td->fd, td->file_size) == -1) {
1419                 td->error = errno;
1420                 return 1;
1421         }
1422
1423         td->io_size = td->file_size;
1424         b = malloc(td->max_bs);
1425         memset(b, 0, td->max_bs);
1426
1427         left = td->file_size;
1428         while (left) {
1429                 bs = td->max_bs;
1430                 if (bs > left)
1431                         bs = left;
1432
1433                 r = write(td->fd, b, bs);
1434
1435                 if (r == (int) bs) {
1436                         left -= bs;
1437                         continue;
1438                 } else {
1439                         if (r < 0)
1440                                 td->error = errno;
1441                         else
1442                                 td->error = EIO;
1443
1444                         break;
1445                 }
1446         }
1447
1448         if (td->create_fsync)
1449                 fsync(td->fd);
1450
1451         close(td->fd);
1452         td->fd = -1;
1453         free(b);
1454         return 0;
1455 }
1456
1457 static int file_exists(struct thread_data *td)
1458 {
1459         struct stat st;
1460
1461         if (stat(td->file_name, &st) != -1)
1462                 return 1;
1463
1464         return errno != ENOENT;
1465 }
1466
1467 static int get_file_size(struct thread_data *td)
1468 {
1469         size_t bytes = 0;
1470         struct stat st;
1471
1472         if (fstat(td->fd, &st) == -1) {
1473                 td->error = errno;
1474                 return 1;
1475         }
1476
1477         /*
1478          * if block device, get size via BLKGETSIZE64 ioctl. try that as well
1479          * if this is a link, fall back to st.st_size if it fails
1480          */
1481         if (S_ISBLK(st.st_mode) || S_ISLNK(st.st_mode)) {
1482                 if (ioctl(td->fd, BLKGETSIZE64, &bytes)) {
1483                         if (S_ISBLK(st.st_mode)) {
1484                                 td->error = errno;
1485                                 return 1;
1486                         } else
1487                                 bytes = st.st_size;
1488                 }
1489         } else
1490                 bytes = st.st_size;
1491
1492         if (td_read(td)) {
1493                 if (td->file_size > bytes)
1494                         bytes = td->file_size;
1495         } else {
1496                 if (!td->file_size)
1497                         td->file_size = 1024 * 1024 * 1024;
1498
1499                 bytes = td->file_size;
1500         }
1501
1502         if (td->file_offset > bytes) {
1503                 fprintf(stderr, "Client%d: offset larger than length\n", td->thread_number);
1504                 return 1;
1505         }
1506
1507         td->io_size = bytes - td->file_offset;
1508         if (td->io_size == 0) {
1509                 fprintf(stderr, "Client%d: no io blocks\n", td->thread_number);
1510                 td->error = EINVAL;
1511                 return 1;
1512         }
1513
1514         return 0;
1515 }
1516
1517 static int setup_file(struct thread_data *td)
1518 {
1519         int flags = 0;
1520
1521         if (!file_exists(td)) {
1522                 if (!td->create_file) {
1523                         td->error = ENOENT;
1524                         return 1;
1525                 }
1526                 if (create_file(td))
1527                         return 1;
1528         }
1529
1530         if (td->odirect)
1531                 flags |= O_DIRECT;
1532
1533         if (td_read(td))
1534                 td->fd = open(td->file_name, flags | O_RDONLY);
1535         else {
1536                 if (!td->overwrite)
1537                         flags |= O_TRUNC;
1538                 if (td->sync_io)
1539                         flags |= O_SYNC;
1540                 if (td->verify)
1541                         flags |= O_RDWR;
1542                 else
1543                         flags |= O_WRONLY;
1544
1545                 td->fd = open(td->file_name, flags | O_CREAT, 0600);
1546         }
1547
1548         if (td->fd == -1) {
1549                 td->error = errno;
1550                 return 1;
1551         }
1552
1553         if (get_file_size(td))
1554                 return 1;
1555
1556         if (td_write(td) && ftruncate(td->fd, td->file_size) == -1) {
1557                 td->error = errno;
1558                 return 1;
1559         }
1560
1561         if (td->invalidate_cache) {
1562                 if (fadvise(td->fd, td->file_offset, td->file_size, POSIX_FADV_DONTNEED) < 0) {
1563                         td->error = errno;
1564                         return 1;
1565                 }
1566         }
1567
1568         return 0;
1569 }
1570
1571 static void clear_io_state(struct thread_data *td)
1572 {
1573         if (!td->use_aio)
1574                 lseek(td->fd, SEEK_SET, 0);
1575
1576         td->cur_off = 0;
1577         td->last_bytes = 0;
1578         td->stat_io_bytes = 0;
1579         td->this_io_bytes = 0;
1580
1581         if (td->file_map)
1582                 memset(td->file_map, 0, td->num_maps * sizeof(long));
1583 }
1584
1585 static void *thread_main(int shm_id, int offset, char *argv[])
1586 {
1587         struct thread_data *td;
1588         int ret = 1;
1589         void *data;
1590
1591         setsid();
1592
1593         data = shmat(shm_id, NULL, 0);
1594         if (data == (void *) -1) {
1595                 perror("shmat");
1596                 return NULL;
1597         }
1598
1599         td = data + offset * sizeof(struct thread_data);
1600         td->pid = getpid();
1601
1602         if (init_io_u(td))
1603                 goto err;
1604
1605         if (sched_setaffinity(td->pid, sizeof(td->cpumask), &td->cpumask) == -1) {
1606                 td->error = errno;
1607                 goto err;
1608         }
1609
1610         sprintf(argv[0], "fio%d", offset);
1611
1612         if (td->use_aio && init_aio(td))
1613                 goto err;
1614
1615         if (td->ioprio) {
1616                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, td->ioprio) == -1) {
1617                         td->error = errno;
1618                         goto err;
1619                 }
1620         }
1621
1622         sem_post(&startup_sem);
1623         sem_wait(&td->mutex);
1624
1625         if (!td->create_serialize && setup_file(td))
1626                 goto err;
1627
1628         if (init_random_state(td))
1629                 goto err;
1630
1631         gettimeofday(&td->start, NULL);
1632
1633         getrusage(RUSAGE_SELF, &td->ru_start);
1634
1635         while (td->loops--) {
1636                 gettimeofday(&td->stat_sample_time, NULL);
1637
1638                 if (td->ratemin)
1639                         memcpy(&td->lastrate, &td->stat_sample_time, sizeof(td->lastrate));
1640
1641                 clear_io_state(td);
1642                 prune_io_piece_log(td);
1643
1644                 if (!td->use_aio)
1645                         do_sync_io(td);
1646                 else
1647                         do_async_io(td);
1648
1649                 if (td->error)
1650                         break;
1651
1652                 if (!td->verify)
1653                         continue;
1654
1655                 clear_io_state(td);
1656
1657                 if (!td->use_aio)
1658                         do_sync_verify(td);
1659                 else
1660                         do_async_verify(td);
1661
1662                 if (td->error)
1663                         break;
1664         }
1665
1666         td->runtime = mtime_since_now(&td->start);
1667         getrusage(RUSAGE_SELF, &td->ru_end);
1668         ret = 0;
1669
1670         if (td->bw_log)
1671                 finish_log(td, td->bw_log, "bw");
1672         if (td->lat_log)
1673                 finish_log(td, td->lat_log, "lat");
1674
1675         if (exitall_on_terminate)
1676                 sig_handler(0);
1677
1678 err:
1679         if (td->fd != -1) {
1680                 close(td->fd);
1681                 td->fd = -1;
1682         }
1683         if (td->use_aio)
1684                 cleanup_aio(td);
1685         cleanup_io_u(td);
1686         if (ret) {
1687                 sem_post(&startup_sem);
1688                 sem_wait(&td->mutex);
1689         }
1690         td_set_runstate(td, TD_EXITED);
1691         shmdt(data);
1692         return NULL;
1693 }
1694
1695 static void free_shm(void)
1696 {
1697         struct shmid_ds sbuf;
1698
1699         if (threads) {
1700                 shmdt(threads);
1701                 threads = NULL;
1702                 shmctl(shm_id, IPC_RMID, &sbuf);
1703         }
1704 }
1705
1706 static int calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max,
1707                     double *mean, double *dev)
1708 {
1709         double n;
1710
1711         if (is->samples == 0)
1712                 return 0;
1713
1714         *min = is->min_val;
1715         *max = is->max_val;
1716
1717         n = (double) is->samples;
1718         *mean = (double) is->val / n;
1719         *dev = sqrt(((double) is->val_sq - (*mean * *mean) / n) / (n - 1));
1720         return 1;
1721 }
1722
1723 static void show_thread_status(struct thread_data *td)
1724 {
1725         int prio, prio_class;
1726         unsigned long min, max, bw = 0, ctx;
1727         double mean, dev, usr_cpu, sys_cpu;
1728
1729         if (!td->io_bytes && !td->error)
1730                 return;
1731
1732         if (td->runtime)
1733                 bw = td->io_bytes / td->runtime;
1734
1735         prio = td->ioprio & 0xff;
1736         prio_class = td->ioprio >> IOPRIO_CLASS_SHIFT;
1737
1738         printf("Client%d: err=%2d, io=%6luMiB, bw=%6luKiB/s, runt=%6lumsec\n", td->thread_number, td->error, td->io_bytes >> 20, bw, td->runtime);
1739
1740         if (calc_lat(&td->slat_stat, &min, &max, &mean, &dev))
1741                 printf("  slat (msec): min=%5lu, max=%5lu, avg=%5.02f, dev=%5.02f\n", min, max, mean, dev);
1742         if (calc_lat(&td->clat_stat, &min, &max, &mean, &dev))
1743                 printf("  clat (msec): min=%5lu, max=%5lu, avg=%5.02f, dev=%5.02f\n", min, max, mean, dev);
1744         if (calc_lat(&td->bw_stat, &min, &max, &mean, &dev))
1745                 printf("  bw (KiB/s) : min=%5lu, max=%5lu, avg=%5.02f, dev=%5.02f\n", min, max, mean, dev);
1746
1747         if (td->runtime) {
1748                 unsigned long t;
1749
1750                 t = mtime_since(&td->ru_start.ru_utime, &td->ru_end.ru_utime);
1751                 usr_cpu = (double) t * 100 / (double) td->runtime;
1752
1753                 t = mtime_since(&td->ru_start.ru_stime, &td->ru_end.ru_stime);
1754                 sys_cpu = (double) t * 100 / (double) td->runtime;
1755         } else {
1756                 usr_cpu = 0;
1757                 sys_cpu = 0;
1758         }
1759
1760         ctx = td->ru_end.ru_nvcsw + td->ru_end.ru_nivcsw - (td->ru_start.ru_nvcsw + td->ru_start.ru_nivcsw);
1761
1762         printf("  cpu        : usr=%3.2f%%, sys=%3.2f%%, ctx=%lu\n", usr_cpu, sys_cpu, ctx);
1763 }
1764
1765 static int setup_rate(struct thread_data *td)
1766 {
1767         int nr_reads_per_sec;
1768
1769         if (!td->rate)
1770                 return 0;
1771
1772         if (td->rate < td->ratemin) {
1773                 fprintf(stderr, "min rate larger than nominal rate\n");
1774                 return -1;
1775         }
1776
1777         nr_reads_per_sec = (td->rate * 1024) / td->min_bs;
1778         td->rate_usec_cycle = 1000000 / nr_reads_per_sec;
1779         td->rate_pending_usleep = 0;
1780         return 0;
1781 }
1782
1783 static struct thread_data *get_new_job(int global, struct thread_data *parent)
1784 {
1785         struct thread_data *td;
1786
1787         if (global)
1788                 return &def_thread;
1789         if (thread_number >= max_jobs)
1790                 return NULL;
1791
1792         td = &threads[thread_number++];
1793         memset(td, 0, sizeof(*td));
1794
1795         sprintf(td->directory, ".");
1796
1797         td->fd = -1;
1798         td->thread_number = thread_number;
1799
1800         td->ddir = parent->ddir;
1801         td->ioprio = parent->ioprio;
1802         td->sequential = parent->sequential;
1803         td->bs = parent->bs;
1804         td->min_bs = parent->min_bs;
1805         td->max_bs = parent->max_bs;
1806         td->odirect = parent->odirect;
1807         td->thinktime = parent->thinktime;
1808         td->fsync_blocks = parent->fsync_blocks;
1809         td->start_delay = parent->start_delay;
1810         td->timeout = parent->timeout;
1811         td->use_aio = parent->use_aio;
1812         td->create_file = parent->create_file;
1813         td->overwrite = parent->overwrite;
1814         td->invalidate_cache = parent->invalidate_cache;
1815         td->file_size = parent->file_size;
1816         td->file_offset = parent->file_offset;
1817         td->rate = parent->rate;
1818         td->ratemin = parent->ratemin;
1819         td->ratecycle = parent->ratecycle;
1820         td->aio_depth = parent->aio_depth;
1821         td->sync_io = parent->sync_io;
1822         td->mem_type = parent->mem_type;
1823         td->bw_avg_time = parent->bw_avg_time;
1824         td->create_serialize = parent->create_serialize;
1825         td->create_fsync = parent->create_fsync;
1826         td->loops = parent->loops;
1827         td->verify = parent->verify;
1828         td->stonewall = parent->stonewall;
1829         td->numjobs = parent->numjobs;
1830         memcpy(&td->cpumask, &parent->cpumask, sizeof(td->cpumask));
1831
1832         return td;
1833 }
1834
1835 static void put_job(struct thread_data *td)
1836 {
1837         memset(&threads[td->thread_number - 1], 0, sizeof(*td));
1838         thread_number--;
1839 }
1840
1841 static int add_job(struct thread_data *td, const char *jobname, int prioclass,
1842                    int prio)
1843 {
1844         int numjobs;
1845
1846         if (td == &def_thread)
1847                 return 0;
1848
1849         sprintf(td->file_name, "%s/%s.%d", td->directory, jobname, td->thread_number);
1850         sem_init(&td->mutex, 1, 0);
1851         td->ioprio = (prioclass << IOPRIO_CLASS_SHIFT) | prio;
1852
1853         td->clat_stat.min_val = ULONG_MAX;
1854         td->slat_stat.min_val = ULONG_MAX;
1855         td->bw_stat.min_val = ULONG_MAX;
1856
1857         run_str[td->thread_number - 1] = 'P';
1858
1859         if (td->use_aio && !td->aio_depth)
1860                 td->aio_depth = 1;
1861
1862         if (td->min_bs == -1U)
1863                 td->min_bs = td->bs;
1864         if (td->max_bs == -1U)
1865                 td->max_bs = td->bs;
1866         if (td_read(td))
1867                 td->verify = 0;
1868
1869         if (setup_rate(td))
1870                 goto err;
1871
1872         if (write_lat_log)
1873                 setup_log(&td->lat_log);
1874         if (write_bw_log)
1875                 setup_log(&td->bw_log);
1876
1877         printf("Client%d: rw=%d, prio=%d/%d, seq=%d, odir=%d, bs=%d-%d, rate=%d, aio=%d, aio_depth=%d\n", td->thread_number, td->ddir, prioclass, prio, td->sequential, td->odirect, td->min_bs, td->max_bs, td->rate, td->use_aio, td->aio_depth);
1878
1879         /*
1880          * recurse add identical jobs, clear numjobs and stonewall options
1881          * as they don't apply to sub-jobs
1882          */
1883         numjobs = td->numjobs;
1884         while (--numjobs) {
1885                 struct thread_data *td_new = get_new_job(0, td);
1886
1887                 if (!td_new)
1888                         goto err;
1889
1890                 td_new->numjobs = 1;
1891                 td_new->stonewall = 0;
1892
1893                 if (add_job(td_new, jobname, prioclass, prio))
1894                         goto err;
1895         }
1896         return 0;
1897 err:
1898         put_job(td);
1899         return -1;
1900 }
1901
/*
 * Clear *cpumask, then set cpu number i for every bit i that is set in
 * the integer bitmask 'cpu'.
 */
static void fill_cpu_mask_ptr(cpu_set_t *cpumask, int cpu)
{
	unsigned int i;

	CPU_ZERO(cpumask);

	for (i = 0; i < sizeof(int) * 8; i++) {
		/* unsigned shift: 1 << 31 on a signed int is undefined */
		if ((1U << i) & (unsigned int) cpu)
			CPU_SET(i, cpumask);
	}
}

/*
 * fill_cpu_mask(mask, cpu) - set 'mask' from the integer bitmask 'cpu'.
 *
 * cpu_set_t is passed by value in C, so a plain function taking it as
 * an argument only ever changes its own copy. The macro keeps the
 * existing call syntax but passes the address of the caller's mask;
 * 'mask' must therefore be an lvalue.
 */
#define fill_cpu_mask(cpumask, cpu)	fill_cpu_mask_ptr(&(cpumask), (cpu))
1913
/*
 * Map a size suffix character to its multiplier: k/K = 1024,
 * m/M = 1024^2, g/G = 1024^3. Any other character yields 1.
 */
unsigned long get_mult(char c)
{
	switch (c) {
		case 'k':
		case 'K':
			return 1024;
		case 'm':
		case 'M':
			return 1024 * 1024;
		case 'g':
		case 'G':
			return 1024 * 1024 * 1024;
		default:
			return 1;
	}
}

/*
 * convert string after '=' into decimal value, noting any size suffix
 *
 * The suffix is the first non-blank character after the digits, so the
 * result does not depend on the line ending in a newline.
 *
 * Returns 0 and stores the scaled value in *val on success. Returns 1
 * if there is no '=', no digits follow it, or the number is out of
 * range.
 */
static int str_cnv(char *p, unsigned long long *val)
{
	char *str, *end;

	str = strstr(p, "=");
	if (!str)
		return 1;

	str++;

	/* errno is only meaningful if it was cleared before the call */
	errno = 0;
	*val = strtoull(str, &end, 10);
	if (end == str)
		return 1;
	if (*val == ULLONG_MAX && errno == ERANGE)
		return 1;

	while (isblank((unsigned char) *end))
		end++;

	*val *= get_mult(*end);
	return 0;
}
1953
/*
 * If 'name' occurs anywhere in the line, convert the value after '='
 * via str_cnv() and return its result; otherwise return 1.
 */
static int check_strcnv(char *p, char *name, unsigned long long *val)
{
	if (strstr(p, name))
		return str_cnv(p, val);

	return 1;
}
1961
/*
 * Returns 0 if 'name' occurs in the line and 'option' occurs somewhere
 * after that occurrence of 'name', 1 otherwise.
 */
static int check_str(char *p, char *name, char *option)
{
	char *hit = strstr(p, name);

	if (hit == NULL)
		return 1;

	/* only look at the text following the option name */
	return strstr(hit + strlen(name), option) == NULL;
}
1975
/*
 * If 'name' occurs in the line, copy everything after the first '=' of
 * the line, minus leading blanks, into dest.
 *
 * Returns 0 when something was stored, 1 when 'name' or '=' is missing.
 * The copy is unbounded: dest must be large enough for the rest of the
 * line.
 */
static int check_strstore(char *p, char *name, char *dest)
{
	char *s = strstr(p, name);

	if (!s)
		return 1;

	s = strstr(p, "=");
	if (!s)
		return 1;

	s++;
	/* ctype functions need an unsigned char value */
	while (isblank((unsigned char) *s))
		s++;

	strcpy(dest, s);
	return 0;
}
1994
/*
 * Parse "name=<low>-<high>" into *s and *e. Both numbers may carry a
 * one character size suffix, which is applied through get_mult(), and
 * the '=' may be written as " = ".
 *
 * Returns 0 if one of the accepted forms matched, 1 otherwise.
 */
static int check_range(char *p, char *name, unsigned long *s, unsigned long *e)
{
	char fmt[128];
	char suf_s, suf_e;
	int spaced;

	/* forms with a suffix character after each number come first */
	for (spaced = 0; spaced < 2; spaced++) {
		if (spaced)
			sprintf(fmt, "%s = %%lu%%c-%%lu%%c", name);
		else
			sprintf(fmt, "%s=%%lu%%c-%%lu%%c", name);

		if (sscanf(p, fmt, s, &suf_s, e, &suf_e) != 4)
			continue;

		*s *= get_mult(suf_s);
		*e *= get_mult(suf_e);
		return 0;
	}

	/* then the plain numeric forms */
	for (spaced = 0; spaced < 2; spaced++) {
		if (spaced)
			sprintf(fmt, "%s = %%lu-%%lu", name);
		else
			sprintf(fmt, "%s=%%lu-%%lu", name);

		if (sscanf(p, fmt, s, e) == 2)
			return 0;
	}

	return 1;
}
2025
/*
 * Parse "name=<number>" or "name = <number>" from the start of the line
 * into *val.
 *
 * Returns 0 if a value was stored, 1 if the line did not match.
 */
static int check_int(char *p, char *name, unsigned int *val)
{
	char str[128];

	/* %u is the conversion that matches an unsigned int destination */
	snprintf(str, sizeof(str), "%s=%%u", name);
	if (sscanf(p, str, val) == 1)
		return 0;

	snprintf(str, sizeof(str), "%s = %%u", name);
	if (sscanf(p, str, val) == 1)
		return 0;

	return 1;
}
2040
/*
 * Returns 1 if the line holds nothing but whitespace and control
 * characters, or if a ';' is reached before any other character;
 * returns 0 as soon as real content is found.
 */
static int is_empty_or_comment(char *line)
{
	const unsigned char *c;

	/* walk to the terminator once instead of calling strlen() per step */
	for (c = (const unsigned char *) line; *c; c++) {
		if (*c == ';')
			return 1;
		if (!isspace(*c) && !iscntrl(*c))
			return 0;
	}

	return 1;
}
2054
2055 static int parse_jobs_ini(char *file)
2056 {
2057         unsigned int prioclass, prio, cpu, global;
2058         unsigned long long ull;
2059         unsigned long ul1, ul2;
2060         struct thread_data *td;
2061         char *string, *name;
2062         fpos_t off;
2063         FILE *f;
2064         char *p;
2065
2066         f = fopen(file, "r");
2067         if (!f) {
2068                 perror("fopen");
2069                 return 1;
2070         }
2071
2072         string = malloc(4096);
2073         name = malloc(256);
2074
2075         while ((p = fgets(string, 4096, f)) != NULL) {
2076                 if (is_empty_or_comment(p))
2077                         continue;
2078                 if (sscanf(p, "[%s]", name) != 1)
2079                         continue;
2080
2081                 global = !strncmp(name, "global", 6);
2082
2083                 name[strlen(name) - 1] = '\0';
2084
2085                 td = get_new_job(global, &def_thread);
2086                 if (!td)
2087                         return 1;
2088
2089                 prioclass = 2;
2090                 prio = 4;
2091
2092                 fgetpos(f, &off);
2093                 while ((p = fgets(string, 4096, f)) != NULL) {
2094                         if (is_empty_or_comment(p))
2095                                 continue;
2096                         if (strstr(p, "["))
2097                                 break;
2098                         if (!check_int(p, "rw", &td->ddir)) {
2099                                 fgetpos(f, &off);
2100                                 continue;
2101                         }
2102                         if (!check_int(p, "prio", &prio)) {
2103                                 fgetpos(f, &off);
2104                                 continue;
2105                         }
2106                         if (!check_int(p, "prioclass", &prioclass)) {
2107                                 fgetpos(f, &off);
2108                                 continue;
2109                         }
2110                         if (!check_int(p, "direct", &td->odirect)) {
2111                                 fgetpos(f, &off);
2112                                 continue;
2113                         }
2114                         if (!check_int(p, "rate", &td->rate)) {
2115                                 fgetpos(f, &off);
2116                                 continue;
2117                         }
2118                         if (!check_int(p, "ratemin", &td->ratemin)) {
2119                                 fgetpos(f, &off);
2120                                 continue;
2121                         }
2122                         if (!check_int(p, "ratecycle", &td->ratecycle)) {
2123                                 fgetpos(f, &off);
2124                                 continue;
2125                         }
2126                         if (!check_int(p, "thinktime", &td->thinktime)) {
2127                                 fgetpos(f, &off);
2128                                 continue;
2129                         }
2130                         if (!check_int(p, "cpumask", &cpu)) {
2131                                 fill_cpu_mask(td->cpumask, cpu);
2132                                 fgetpos(f, &off);
2133                                 continue;
2134                         }
2135                         if (!check_int(p, "fsync", &td->fsync_blocks)) {
2136                                 fgetpos(f, &off);
2137                                 continue;
2138                         }
2139                         if (!check_int(p, "startdelay", &td->start_delay)) {
2140                                 fgetpos(f, &off);
2141                                 continue;
2142                         }
2143                         if (!check_int(p, "timeout", &td->timeout)) {
2144                                 fgetpos(f, &off);
2145                                 continue;
2146                         }
2147                         if (!check_int(p, "invalidate",&td->invalidate_cache)) {
2148                                 fgetpos(f, &off);
2149                                 continue;
2150                         }
2151                         if (!check_int(p, "aio_depth", &td->aio_depth)) {
2152                                 fgetpos(f, &off);
2153                                 continue;
2154                         }
2155                         if (!check_int(p, "sync", &td->sync_io)) {
2156                                 fgetpos(f, &off);
2157                                 continue;
2158                         }
2159                         if (!check_int(p, "bwavgtime", &td->bw_avg_time)) {
2160                                 fgetpos(f, &off);
2161                                 continue;
2162                         }
2163                         if (!check_int(p, "create_serialize", &td->create_serialize)) {
2164                                 fgetpos(f, &off);
2165                                 continue;
2166                         }
2167                         if (!check_int(p, "create_fsync", &td->create_fsync)) {
2168                                 fgetpos(f, &off);
2169                                 continue;
2170                         }
2171                         if (!check_int(p, "loops", &td->loops)) {
2172                                 fgetpos(f, &off);
2173                                 continue;
2174                         }
2175                         if (!check_int(p, "verify", &td->verify)) {
2176                                 fgetpos(f, &off);
2177                                 continue;
2178                         }
2179                         if (!check_int(p, "numjobs", &td->numjobs)) {
2180                                 fgetpos(f, &off);
2181                                 continue;
2182                         }
2183                         if (!check_range(p, "bsrange", &ul1, &ul2)) {
2184                                 if (ul1 & 511)
2185                                         printf("bad min block size, must be a multiple of 512\n");
2186                                 else
2187                                         td->min_bs = ul1;
2188                                 if (ul2 & 511)
2189                                         printf("bad max block size, must be a multiple of 512\n");
2190                                 else
2191                                         td->max_bs = ul2;
2192                                 fgetpos(f, &off);
2193                                 continue;
2194                         }
2195                         if (!check_strcnv(p, "bs", &ull)) {
2196                                 if (ull & 511)
2197                                         printf("bad block size, must be a multiple of 512\n");
2198                                 else
2199                                         td->bs = ull;
2200                                 fgetpos(f, &off);
2201                                 continue;
2202                         }
2203                         if (!check_strcnv(p, "size", &td->file_size)) {
2204                                 fgetpos(f, &off);
2205                                 continue;
2206                         }
2207                         if (!check_strcnv(p, "offset", &td->file_offset)) {
2208                                 fgetpos(f, &off);
2209                                 continue;
2210                         }
2211                         if (!check_strstore(p, "directory", td->directory)) {
2212                                 fgetpos(f, &off);
2213                                 continue;
2214                         }
2215                         if (!check_str(p, "mem", "malloc")) {
2216                                 td->mem_type = MEM_MALLOC;
2217                                 fgetpos(f, &off);
2218                                 continue;
2219                         }
2220                         if (!check_str(p, "mem", "shm")) {
2221                                 td->mem_type = MEM_SHM;
2222                                 fgetpos(f, &off);
2223                                 continue;
2224                         }
2225                         if (!strncmp(p, "sequential", 10)) {
2226                                 td->sequential = 1;
2227                                 fgetpos(f, &off);
2228                                 continue;
2229                         }
2230                         if (!strncmp(p, "random", 6)) {
2231                                 td->sequential = 0;
2232                                 fgetpos(f, &off);
2233                                 continue;
2234                         }
2235                         if (!strncmp(p, "aio", 3)) {
2236                                 td->use_aio = 1;
2237                                 fgetpos(f, &off);
2238                                 continue;
2239                         }
2240                         if (!strncmp(p, "create", 6)) {
2241                                 td->create_file = 1;
2242                                 fgetpos(f, &off);
2243                                 continue;
2244                         }
2245                         if (!strncmp(p, "overwrite", 9)) {
2246                                 td->overwrite = 1;
2247                                 fgetpos(f, &off);
2248                                 continue;
2249                         }
2250                         if (!strncmp(p, "exitall", 7)) {
2251                                 exitall_on_terminate = 1;
2252                                 fgetpos(f, &off);
2253                                 continue;
2254                         }
2255                         if (!strncmp(p, "stonewall", 9)) {
2256                                 td->stonewall = 1;
2257                                 fgetpos(f, &off);
2258                                 continue;
2259                         }
2260                         printf("Client%d: bad option %s\n",td->thread_number,p);
2261                 }
2262                 fsetpos(f, &off);
2263
2264                 if (add_job(td, name, prioclass, prio))
2265                         return 1;
2266         }
2267
2268         free(string);
2269         free(name);
2270         fclose(f);
2271         return 0;
2272 }
2273
2274 static int parse_options(int argc, char *argv[])
2275 {
2276         int i;
2277
2278         for (i = 1; i < argc; i++) {
2279                 char *parm = argv[i];
2280
2281                 if (parm[0] != '-')
2282                         break;
2283
2284                 parm++;
2285                 switch (*parm) {
2286                         case 's':
2287                                 parm++;
2288                                 def_thread.sequential = !!atoi(parm);
2289                                 break;
2290                         case 'b':
2291                                 parm++;
2292                                 def_thread.bs = atoi(parm);
2293                                 def_thread.bs <<= 10;
2294                                 if (!def_thread.bs) {
2295                                         printf("bad block size\n");
2296                                         def_thread.bs = DEF_BS;
2297                                 }
2298                                 break;
2299                         case 't':
2300                                 parm++;
2301                                 def_thread.timeout = atoi(parm);
2302                                 break;
2303                         case 'r':
2304                                 parm++;
2305                                 repeatable = !!atoi(parm);
2306                                 break;
2307                         case 'R':
2308                                 parm++;
2309                                 rate_quit = !!atoi(parm);
2310                                 break;
2311                         case 'o':
2312                                 parm++;
2313                                 def_thread.odirect = !!atoi(parm);
2314                                 break;
2315                         case 'f':
2316                                 if (i + 1 >= argc) {
2317                                         printf("-f needs file as arg\n");
2318                                         break;
2319                                 }
2320                                 ini_file = strdup(argv[i+1]);
2321                                 i++;
2322                                 break;
2323                         case 'l':
2324                                 write_lat_log = 1;
2325                                 break;
2326                         case 'w':
2327                                 write_bw_log = 1;
2328                                 break;
2329                         default:
2330                                 printf("bad option %s\n", argv[i]);
2331                                 break;
2332                 }
2333         }
2334
2335         return i;
2336 }
2337
2338 static void print_thread_status(struct thread_data *td, int nr_running,
2339                                 int t_rate, int m_rate)
2340 {
2341         printf("Threads now running: %d", nr_running);
2342         if (m_rate || t_rate)
2343                 printf(", commitrate %d/%dKiB/sec", t_rate, m_rate);
2344         printf(" : [%s]\r", run_str);
2345         fflush(stdout);
2346 }
2347
2348 static void check_str_update(struct thread_data *td, int n, int t, int m)
2349 {
2350         char c = run_str[td->thread_number - 1];
2351
2352         if (td->runstate == td->old_runstate)
2353                 return;
2354
2355         switch (td->runstate) {
2356                 case TD_REAPED:
2357                         c = '_';
2358                         break;
2359                 case TD_EXITED:
2360                         c = 'E';
2361                         break;
2362                 case TD_RUNNING:
2363                         if (td_read(td)) {
2364                                 if (td->sequential)
2365                                         c = 'R';
2366                                 else
2367                                         c = 'r';
2368                         } else {
2369                                 if (td->sequential)
2370                                         c = 'W';
2371                                 else
2372                                         c = 'w';
2373                         }
2374                         break;
2375                 case TD_VERIFYING:
2376                         c = 'V';
2377                         break;
2378                 case TD_CREATED:
2379                         c = 'C';
2380                         break;
2381                 case TD_NOT_CREATED:
2382                         c = 'P';
2383                         break;
2384                 default:
2385                         printf("state %d\n", td->runstate);
2386         }
2387
2388         run_str[td->thread_number - 1] = c;
2389         print_thread_status(td, n, t, m);
2390         td->old_runstate = td->runstate;
2391 }
2392
2393 static void reap_threads(int *nr_running, int *t_rate, int *m_rate)
2394 {
2395         int i;
2396
2397         /*
2398          * reap exited threads (TD_EXITED -> TD_REAPED)
2399          */
2400         for (i = 0; i < thread_number; i++) {
2401                 struct thread_data *td = &threads[i];
2402
2403                 check_str_update(td, *nr_running, *t_rate, *m_rate);
2404
2405                 if (td->runstate != TD_EXITED)
2406                         continue;
2407
2408                 td_set_runstate(td, TD_REAPED);
2409                 waitpid(td->pid, NULL, 0);
2410                 (*nr_running)--;
2411                 (*m_rate) -= td->ratemin;
2412                 (*t_rate) -= td->rate;
2413                 check_str_update(td, *nr_running, *t_rate, *m_rate);
2414
2415                 if (td->terminate)
2416                         continue;
2417         }
2418 }
2419
2420 static void run_threads(char *argv[])
2421 {
2422         struct timeval genesis;
2423         struct thread_data *td;
2424         unsigned long spent;
2425         int i, todo, nr_running, m_rate, t_rate, nr_started;
2426
2427         printf("Starting %d threads\n", thread_number);
2428         fflush(stdout);
2429
2430         signal(SIGINT, sig_handler);
2431
2432         todo = thread_number;
2433         nr_running = 0;
2434         nr_started = 0;
2435         m_rate = t_rate = 0;
2436
2437         for (i = 0; i < thread_number; i++) {
2438                 td = &threads[i];
2439
2440                 if (!td->create_serialize)
2441                         continue;
2442
2443                 /*
2444                  * do file setup here so it happens sequentially,
2445                  * we don't want X number of threads getting their
2446                  * client data interspersed on disk
2447                  */
2448                 if (setup_file(td)) {
2449                         td_set_runstate(td, TD_REAPED);
2450                         todo--;
2451                 }
2452         }
2453
2454         gettimeofday(&genesis, NULL);
2455
2456         while (todo) {
2457                 /*
2458                  * create threads (TD_NOT_CREATED -> TD_CREATED)
2459                  */
2460                 for (i = 0; i < thread_number; i++) {
2461                         td = &threads[i];
2462
2463                         if (td->runstate != TD_NOT_CREATED)
2464                                 continue;
2465
2466                         /*
2467                          * never got a chance to start, killed by other
2468                          * thread for some reason
2469                          */
2470                         if (td->terminate) {
2471                                 todo--;
2472                                 continue;
2473                         }
2474
2475                         if (td->start_delay) {
2476                                 spent = mtime_since_now(&genesis);
2477
2478                                 if (td->start_delay * 1000 > spent)
2479                                         continue;
2480                         }
2481
2482                         if (td->stonewall && (nr_started || nr_running))
2483                                 break;
2484
2485                         td_set_runstate(td, TD_CREATED);
2486                         check_str_update(td, nr_running, t_rate, m_rate);
2487                         sem_init(&startup_sem, 1, 1);
2488                         todo--;
2489                         nr_started++;
2490
2491                         if (fork())
2492                                 sem_wait(&startup_sem);
2493                         else {
2494                                 thread_main(shm_id, i, argv);
2495                                 exit(0);
2496                         }
2497                 }
2498
2499                 /*
2500                  * start created threads (TD_CREATED -> TD_RUNNING)
2501                  */
2502                 for (i = 0; i < thread_number; i++) {
2503                         struct thread_data *td = &threads[i];
2504
2505                         if (td->runstate != TD_CREATED)
2506                                 continue;
2507
2508                         td_set_runstate(td, TD_RUNNING);
2509                         nr_running++;
2510                         nr_started--;
2511                         m_rate += td->ratemin;
2512                         t_rate += td->rate;
2513                         check_str_update(td, nr_running, t_rate, m_rate);
2514                         sem_post(&td->mutex);
2515                 }
2516
2517                 for (i = 0; i < thread_number; i++) {
2518                         struct thread_data *td = &threads[i];
2519
2520                         if (td->runstate != TD_RUNNING &&
2521                             td->runstate != TD_VERIFYING)
2522                                 continue;
2523
2524                         check_str_update(td, nr_running, t_rate, m_rate);
2525                 }
2526
2527                 reap_threads(&nr_running, &t_rate, &m_rate);
2528
2529                 if (todo)
2530                         usleep(100000);
2531         }
2532
2533         while (nr_running) {
2534                 reap_threads(&nr_running, &t_rate, &m_rate);
2535                 usleep(10000);
2536         }
2537 }
2538
2539 int setup_thread_area(void)
2540 {
2541         /*
2542          * 1024 is too much on some machines, scale max_jobs if
2543          * we get a failure that looks like too large a shm segment
2544          */
2545         do {
2546                 int s = max_jobs * sizeof(struct thread_data);
2547
2548                 shm_id = shmget(0, s, IPC_CREAT | 0600);
2549                 if (shm_id != -1)
2550                         break;
2551                 if (errno != EINVAL) {
2552                         perror("shmget");
2553                         break;
2554                 }
2555
2556                 max_jobs >>= 1;
2557         } while (max_jobs);
2558
2559         if (shm_id == -1)
2560                 return 1;
2561
2562         threads = shmat(shm_id, NULL, 0);
2563         if (threads == (void *) -1) {
2564                 perror("shmat");
2565                 return 1;
2566         }
2567
2568         atexit(free_shm);
2569         return 0;
2570 }
2571
2572 int main(int argc, char *argv[])
2573 {
2574         static unsigned long max_run[2], min_run[2];
2575         static unsigned long max_bw[2], min_bw[2];
2576         static unsigned long io_mb[2], agg[2];
2577         int i;
2578
2579         if (setup_thread_area())
2580                 return 1;
2581
2582         if (sched_getaffinity(getpid(), sizeof(cpu_set_t), &def_thread.cpumask) == -1) {
2583                 perror("sched_getaffinity");
2584                 return 1;
2585         }
2586
2587         /*
2588          * fill globals
2589          */
2590         def_thread.ddir = DDIR_READ;
2591         def_thread.bs = DEF_BS;
2592         def_thread.min_bs = -1;
2593         def_thread.max_bs = -1;
2594         def_thread.odirect = DEF_ODIRECT;
2595         def_thread.ratecycle = DEF_RATE_CYCLE;
2596         def_thread.sequential = DEF_SEQUENTIAL;
2597         def_thread.timeout = DEF_TIMEOUT;
2598         def_thread.create_file = DEF_CREATE;
2599         def_thread.overwrite = DEF_OVERWRITE;
2600         def_thread.invalidate_cache = DEF_INVALIDATE;
2601         def_thread.sync_io = DEF_SYNCIO;
2602         def_thread.mem_type = MEM_MALLOC;
2603         def_thread.bw_avg_time = DEF_BWAVGTIME;
2604         def_thread.create_serialize = DEF_CREATE_SER;
2605         def_thread.create_fsync = DEF_CREATE_FSYNC;
2606         def_thread.loops = DEF_LOOPS;
2607         def_thread.verify = DEF_VERIFY;
2608         def_thread.stonewall = DEF_STONEWALL;
2609         def_thread.numjobs = DEF_NUMJOBS;
2610
2611         i = parse_options(argc, argv);
2612
2613         if (!ini_file) {
2614                 printf("Need job file\n");
2615                 return 1;
2616         }
2617
2618         if (parse_jobs_ini(ini_file))
2619                 return 1;
2620
2621         if (!thread_number) {
2622                 printf("Nothing to do\n");
2623                 return 1;
2624         }
2625
2626         run_threads(argv);
2627
2628         min_bw[0] = min_run[0] = ~0UL;
2629         min_bw[1] = min_run[1] = ~0UL;
2630         io_mb[0] = io_mb[1] = 0;
2631         agg[0] = agg[1] = 0;
2632         for (i = 0; i < thread_number; i++) {
2633                 struct thread_data *td = &threads[i];
2634                 unsigned long bw = 0;
2635
2636                 if (!td->error) {
2637                         if (td->runtime < min_run[td->ddir])
2638                                 min_run[td->ddir] = td->runtime;
2639                         if (td->runtime > max_run[td->ddir])
2640                                 max_run[td->ddir] = td->runtime;
2641
2642                         if (td->runtime)
2643                                 bw = td->io_bytes / td->runtime;
2644                         if (bw < min_bw[td->ddir])
2645                                 min_bw[td->ddir] = bw;
2646                         if (bw > max_bw[td->ddir])
2647                                 max_bw[td->ddir] = bw;
2648
2649                         io_mb[td->ddir] += td->io_bytes >> 20;
2650                 }
2651
2652                 show_thread_status(td);
2653         }
2654         
2655         if (max_run[0])
2656                 agg[0] = (io_mb[0] * 1024 * 1000) / max_run[0];
2657         if (max_run[1])
2658                 agg[1] = (io_mb[1] * 1024 * 1000) / max_run[1];
2659
2660         printf("\nRun status:\n");
2661         if (max_run[DDIR_READ])
2662                 printf("   READ: io=%luMiB, aggrb=%lu, minb=%lu, maxb=%lu, mint=%lumsec, maxt=%lumsec\n", io_mb[0], agg[0], min_bw[0], max_bw[0], min_run[0], max_run[0]);
2663         if (max_run[DDIR_WRITE])
2664                 printf("  WRITE: io=%luMiB, aggrb=%lu, minb=%lu, maxb=%lu, mint=%lumsec, maxt=%lumsec\n", io_mb[1], agg[1], min_bw[1], max_bw[1], min_run[1], max_run[1]);
2665
2666         return 0;
2667 }