[PATCH] fio: Change '+' runstate to show what the job is doing (r/R/w/W)
[disktools.git] / fio.c
1 /*
2  * fio - the flexible io tester
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <unistd.h>
24 #include <fcntl.h>
25 #include <string.h>
26 #include <errno.h>
27 #include <signal.h>
28 #include <time.h>
29 #include <ctype.h>
30 #include <sched.h>
31 #include <libaio.h>
32 #include <math.h>
33 #include <limits.h>
34 #include <assert.h>
35 #include <sys/time.h>
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/wait.h>
39 #include <semaphore.h>
40 #include <sys/ipc.h>
41 #include <sys/shm.h>
42 #include <sys/ioctl.h>
43 #include <asm/unistd.h>
44 #include <asm/types.h>
45 #include <asm/bitops.h>
46
47 #include "list.h"
48 #include "md5.h"
49
/*
 * fallback definition of the BLKGETSIZE64 ioctl request, used only when
 * the included headers did not already define it
 */
#ifndef BLKGETSIZE64
#define BLKGETSIZE64    _IOR(0x12,114,size_t)
#endif

/*
 * job limit: initial value of max_jobs and the size of run_str
 */
#define MAX_JOBS        (1024)
55
/*
 * assume we don't have _get either, if _set isn't defined
 *
 * per-architecture syscall numbers for ioprio_set/ioprio_get, supplied
 * here only when the system headers lack them. the build is stopped on
 * an architecture that is not listed.
 */
#ifndef __NR_ioprio_set
#if defined(__i386__)
#define __NR_ioprio_set         289
#define __NR_ioprio_get         290
#elif defined(__powerpc__) || defined(__powerpc64__)
#define __NR_ioprio_set         273
#define __NR_ioprio_get         274
#elif defined(__x86_64__)
#define __NR_ioprio_set         251
#define __NR_ioprio_get         252
#elif defined(__ia64__)
#define __NR_ioprio_set         1274
#define __NR_ioprio_get         1275
#elif defined(__alpha__)
#define __NR_ioprio_set         442
#define __NR_ioprio_get         443
#elif defined(__s390x__) || defined(__s390__)
#define __NR_ioprio_set         282
#define __NR_ioprio_get         283
#else
#error "Unsupported arch"
#endif
#endif
82
/*
 * per-architecture syscall number for fadvise64, supplied only when the
 * system headers lack it. currently referenced only from the disabled
 * (#if 0) branch of fadvise().
 */
#ifndef __NR_fadvise64
#if defined(__i386__)
#define __NR_fadvise64          250
#elif defined(__powerpc__) || defined(__powerpc64__)
#define __NR_fadvise64          233
#elif defined(__x86_64__)
#define __NR_fadvise64          221
#elif defined(__ia64__)
#define __NR_fadvise64          1234
#elif defined(__alpha__)
#define __NR_fadvise64          413
#elif defined(__s390x__) || defined(__s390__)
#define __NR_fadvise64          253
#else
#error "Unsupported arch"
#endif
#endif
100
/*
 * issue the raw ioprio_set system call and hand back whatever
 * syscall() returned
 */
static int ioprio_set(int which, int who, int ioprio)
{
        long ret = syscall(__NR_ioprio_set, which, who, ioprio);

        return ret;
}
105
/*
 * we want fadvise64 really, but it's so tangled... later
 *
 * wrapper around posix_fadvise(). posix_fadvise() reports failure by
 * returning the error number and leaves errno alone, whereas users of
 * this helper test for a negative return and then read errno. translate
 * to that convention: 0 on success, -1 with errno set on failure.
 */
static int fadvise(int fd, loff_t offset, size_t len, int advice)
{
#if 0
        return syscall(__NR_fadvise64, fd, offset, offset >> 32, len, advice);
#else
        int ret = posix_fadvise(fd, (off_t) offset, len, advice);

        if (ret) {
                errno = ret;
                return -1;
        }

        return 0;
#endif
}
117
/*
 * target selectors for io priority calls; presumably passed as the
 * 'which' argument of ioprio_set() -- the call site is not in this part
 * of the file, TODO confirm
 */
enum {
        IOPRIO_WHO_PROCESS = 1,
        IOPRIO_WHO_PGRP,
        IOPRIO_WHO_USER,
};
123
/* NOTE(review): presumably the bit position of the class in an ioprio value -- confirm at the use site */
#define IOPRIO_CLASS_SHIFT      13

/* low-bit mask used by ALIGN(): buffers are rounded up to MASK + 1 bytes */
#define MASK    (4095)

/*
 * built-in defaults. DEF_RAND_REPEAT initializes 'repeatable' and
 * DEF_RANDSEED is the fixed seed used when it is set; the remaining
 * values are presumably applied to the default job where options are
 * set up (not visible here).
 */
#define DEF_BS          (4096)
#define DEF_TIMEOUT     (0)
#define DEF_RATE_CYCLE  (1000)
#define DEF_ODIRECT     (1)
#define DEF_SEQUENTIAL  (1)
#define DEF_RAND_REPEAT (1)
#define DEF_OVERWRITE   (0)
#define DEF_CREATE      (1)
#define DEF_INVALIDATE  (1)
#define DEF_SYNCIO      (0)
#define DEF_RANDSEED    (0xb1899bedUL)
#define DEF_BWAVGTIME   (500)
#define DEF_CREATE_SER  (1)
#define DEF_CREATE_FSYNC        (1)
#define DEF_LOOPS       (1)
#define DEF_VERIFY      (0)
144
/*
 * round buf up to the next (MASK + 1)-byte boundary and yield it as a
 * char pointer. the whole expansion is parenthesized so the macro can be
 * embedded in larger expressions (indexing, member access) safely.
 */
#define ALIGN(buf)      ((char *) (((unsigned long) (buf) + MASK) & ~(MASK)))
146
/* non-zero: random offsets are seeded with the fixed DEF_RANDSEED */
static int repeatable = DEF_RAND_REPEAT;
/* non-zero: a job missing its minimum rate flags every job to terminate */
static int rate_quit = 1;
/* NOTE(review): presumably enable the latency / bandwidth logs -- set and read elsewhere */
static int write_lat_log;
static int write_bw_log;
static int exitall_on_terminate;

/* number of entries in threads[] that sig_handler() walks */
static int thread_number;
static char *ini_file;

static int max_jobs = MAX_JOBS;

/* one character per job plus terminating nul; filled in elsewhere */
static char run_str[MAX_JOBS + 1];

static int shm_id;
161
/*
 * data direction of a job, stored in thread_data.ddir and tested by
 * td_read()/td_write()
 */
enum {
        DDIR_READ = 0,
        DDIR_WRITE,
};
166
/*
 * thread life cycle
 *
 * values kept in thread_data.runstate (see td_set_runstate()).
 * do_sync_verify() switches to TD_VERIFYING while it reads data back and
 * returns to TD_RUNNING afterwards.
 */
enum {
        TD_NOT_CREATED = 0,
        TD_CREATED,
        TD_RUNNING,
        TD_VERIFYING,
        TD_EXITED,
        TD_REAPED,
};
178
/*
 * NOTE(review): presumably the values of thread_data.mem_type, selecting
 * how the io buffers are allocated -- the user is not in this part of
 * the file
 */
enum {
        MEM_MALLOC,
        MEM_SHM,
};
183
/*
 * The io unit
 *
 * one unit of io: the data buffer, its file offset and length, plus the
 * aio control block used when the job runs with libaio.
 */
struct io_u {
        struct iocb iocb;               /* first member; prepared by get_io_u() for aio jobs */
        struct timeval start_time;      /* taken when get_io_u() hands the unit out */
        struct timeval issue_time;      /* completion latency is measured from here in ios_completed() */

        char *buf;                      /* data buffer */
        unsigned int buflen;            /* bytes to transfer */
        unsigned long long offset;      /* file offset of the transfer */

        struct list_head list;          /* links the unit on the free or busy list */
};
198
/*
 * running statistics over a series of samples, updated by
 * add_stat_sample()
 */
struct io_stat {
        unsigned long val;              /* sum of all samples */
        unsigned long val_sq;           /* sum of the squared samples */
        unsigned long max_val;          /* largest sample seen */
        unsigned long min_val;          /* smallest sample seen */
        unsigned long samples;          /* number of samples added */
};
206
/*
 * one log entry written by add_log_sample()
 */
struct io_sample {
        unsigned long time;             /* msec since the job's start time */
        unsigned long val;              /* the logged value */
};
211
/*
 * growable array of samples; add_log_sample() doubles it when full
 */
struct io_log {
        unsigned long nr_samples;       /* entries in use */
        unsigned long max_samples;      /* entries allocated */
        struct io_sample *log;          /* the sample array */
};
217
/*
 * record of one completed write, queued on thread_data.io_hist_list by
 * log_io_piece() and consumed by get_next_verify()
 */
struct io_piece {
        struct list_head list;
        unsigned long long offset;      /* file offset of the write */
        unsigned int len;               /* length of the write in bytes */
};
223
/* marker stored in verify_header.fio_magic and checked by verify_io_u() */
#define FIO_HDR_MAGIC   0xf00baaef

/*
 * header placed at the start of every buffer written by a verifying job
 * (see populate_io_u())
 */
struct verify_header {
        unsigned int fio_magic;         /* FIO_HDR_MAGIC */
        unsigned int len;               /* buffer length, header included */
        char md5_digest[MD5_HASH_WORDS * 4];    /* md5 of the data after the header */
};
231
/* data direction tests for a job */
#define td_read(td)             ((td)->ddir == DDIR_READ)
#define td_write(td)            ((td)->ddir == DDIR_WRITE)
/* only writers that are not using O_DIRECT get an fsync */
#define should_fsync(td)        (td_write(td) && !(td)->odirect)
235
/*
 * the random map keeps one bit per min_bs sized block, packed into longs:
 * RAND_MAP_IDX() gives the word holding a block, RAND_MAP_BIT() the bit
 * within that word
 */
#define BLOCKS_PER_MAP          (8 * sizeof(long))
#define RAND_MAP_IDX(sector)    ((sector) / BLOCKS_PER_MAP)
#define RAND_MAP_BIT(sector)    ((sector) & (BLOCKS_PER_MAP - 1))
239
/*
 * per-job state: the job's options, its io units, throttling state and
 * the statistics gathered while it runs
 */
struct thread_data {
        char file_name[256];
        int thread_number;              /* printed as "Client%d" in messages */
        int error;                      /* errno-style code of the failure that stopped the job */
        int fd;                         /* descriptor all io is issued against */
        pid_t pid;
        char *orig_buffer;
        volatile int terminate;         /* set by sig_handler(); polled by the io loops */
        volatile int runstate;          /* current TD_* state */
        volatile int old_runstate;      /* state before the last td_set_runstate() */
        unsigned int ddir;              /* DDIR_READ or DDIR_WRITE */
        unsigned int ioprio;
        unsigned int sequential;        /* non-zero: sequential offsets, else random */
        unsigned int bs;
        unsigned int min_bs;            /* smallest io size; also the random map block size */
        unsigned int max_bs;            /* largest io size */
        unsigned int odirect;
        unsigned int thinktime;         /* usec slept between ios in do_sync_io() */
        unsigned int fsync_blocks;      /* fsync after this many blocks, 0 = never */
        unsigned int start_delay;       /* cleared by sig_handler() */
        unsigned int timeout;           /* run time limit in seconds, 0 = none */
        unsigned int use_aio;           /* non-zero: get_io_u() prepares the iocb */
        unsigned int create_file;
        unsigned int overwrite;
        unsigned int invalidate_cache;
        unsigned int bw_avg_time;       /* minimum msec between bandwidth samples */
        unsigned int create_serialize;
        unsigned int create_fsync;
        unsigned int loops;
        unsigned long long file_size;
        unsigned long long file_offset; /* added to every generated io offset */
        unsigned int sync_io;
        unsigned int mem_type;
        unsigned int verify;            /* non-zero: write buffers carry a verify header */
        cpu_set_t cpumask;

        struct drand48_data bsrange_state;      /* drives the random io size */
        struct drand48_data verify_state;       /* drives the random buffer contents */

        int shm_id;

        off_t cur_off;                  /* file position after the last sync io */

        io_context_t aio_ctx;
        unsigned int aio_depth;
        struct io_event *aio_events;    /* completion events filled by io_getevents() */

        unsigned int cur_depth;         /* io units currently on the busy list */
        struct list_head io_u_freelist;
        struct list_head io_u_busylist;

        unsigned int rate;              /* non-zero enables rate_throttle() */
        unsigned int ratemin;           /* minimum rate enforced by check_min_rate() */
        unsigned int ratecycle;         /* msec between minimum rate checks */
        unsigned long rate_usec_cycle;  /* usec budget per min_bs block */
        long rate_pending_usleep;       /* accumulated usec still to be slept */
        unsigned long rate_bytes;       /* this_io_bytes at the last rate check */
        struct timeval lastrate;        /* time of the last rate check */

        unsigned long runtime;          /* sec */
        unsigned long long io_size;     /* bytes the job transfers per pass */

        unsigned long io_blocks;        /* completed ios */
        unsigned long io_bytes;         /* completed bytes */
        unsigned long this_io_bytes;    /* completed bytes compared against io_size */
        unsigned long last_bytes;       /* bytes handed out by get_io_u() */
        sem_t mutex;

        struct drand48_data random_state;       /* drives the random offsets */
        unsigned long *file_map;        /* bitmap of blocks already used by random io */
        unsigned int num_maps;          /* number of longs in file_map */

        /*
         * bandwidth and latency stats
         */
        struct io_stat clat_stat;               /* completion latency */
        struct io_stat slat_stat;               /* submission latency */

        struct io_stat bw_stat;                 /* bandwidth stats */
        unsigned long stat_io_bytes;    /* this_io_bytes at the last bandwidth sample */
        struct timeval stat_sample_time;        /* time of the last bandwidth sample */

        struct io_log *lat_log;         /* optional latency log, may be NULL */
        struct io_log *bw_log;          /* optional bandwidth log, may be NULL */

        struct timeval start;           /* reference point for run time and log stamps */

        struct list_head io_hist_list;  /* io_pieces logged for later verification */
};
329
/* array of job descriptors; thread_number of them are walked by sig_handler() */
static struct thread_data *threads;
/* NOTE(review): presumably the template holding the default job options -- confirm where jobs are added */
static struct thread_data def_thread;

static sem_t startup_sem;
334
335 static void sig_handler(int sig)
336 {
337         int i;
338
339         for (i = 0; i < thread_number; i++) {
340                 struct thread_data *td = &threads[i];
341
342                 td->terminate = 1;
343                 td->start_delay = 0;
344         }
345 }
346
347 static int init_random_state(struct thread_data *td)
348 {
349         unsigned long seed;
350         int fd, num_maps, blocks;
351
352         fd = open("/dev/random", O_RDONLY);
353         if (fd == -1) {
354                 td->error = errno;
355                 return 1;
356         }
357
358         if (read(fd, &seed, sizeof(seed)) < (int) sizeof(seed)) {
359                 td->error = EIO;
360                 close(fd);
361                 return 1;
362         }
363
364         close(fd);
365
366         srand48_r(seed, &td->bsrange_state);
367         srand48_r(seed, &td->verify_state);
368
369         if (td->sequential)
370                 return 0;
371
372         if (repeatable)
373                 seed = DEF_RANDSEED;
374
375         blocks = (td->io_size + td->min_bs - 1) / td->min_bs;
376         num_maps = blocks / BLOCKS_PER_MAP;
377         td->file_map = malloc(num_maps * sizeof(long));
378         td->num_maps = num_maps;
379         memset(td->file_map, 0, num_maps * sizeof(long));
380
381         srand48_r(seed, &td->random_state);
382         return 0;
383 }
384
/*
 * microseconds elapsed from s to e
 */
static unsigned long utime_since(struct timeval *s, struct timeval *e)
{
        double secs = e->tv_sec - s->tv_sec;
        double usecs = e->tv_usec - s->tv_usec;

        /* borrow one second when the microsecond part went negative */
        if (secs > 0 && usecs < 0) {
                secs -= 1;
                usecs += 1000000;
        }

        return secs * (double) 1000000 + usecs;
}
400
/*
 * milliseconds elapsed from s to e, fractions truncated
 */
static unsigned long mtime_since(struct timeval *s, struct timeval *e)
{
        double secs = e->tv_sec - s->tv_sec;
        double usecs = e->tv_usec - s->tv_usec;

        /* borrow one second when the microsecond part went negative */
        if (secs > 0 && usecs < 0) {
                secs -= 1;
                usecs += 1000000;
        }

        return secs * (double) 1000 + usecs / (double) 1000;
}
417
/*
 * milliseconds elapsed between s and the current time of day
 */
static unsigned long mtime_since_now(struct timeval *s)
{
        struct timeval now;
        unsigned long elapsed;

        gettimeofday(&now, NULL);
        elapsed = mtime_since(s, &now);

        return elapsed;
}
425
/*
 * convert a timeval into a millisecond count
 */
static inline unsigned long msec_now(struct timeval *s)
{
        unsigned long from_secs = s->tv_sec * 1000;
        unsigned long from_usecs = s->tv_usec / 1000;

        return from_secs + from_usecs;
}
430
431 static int random_map_free(struct thread_data *td, unsigned long long block)
432 {
433         unsigned int idx = RAND_MAP_IDX(block);
434         unsigned int bit = RAND_MAP_BIT(block);
435
436         return (td->file_map[idx] & (1UL << bit)) == 0;
437 }
438
439 static int get_next_free_block(struct thread_data *td, unsigned long long *b)
440 {
441         int i;
442
443         *b = 0;
444         i = 0;
445         while ((*b) * td->min_bs < td->io_size) {
446                 if (td->file_map[i] != -1UL) {
447                         *b += ffz(td->file_map[i]);
448                         return 0;
449                 }
450
451                 *b += BLOCKS_PER_MAP;
452                 i++;
453         }
454
455         return 1;
456 }
457
458 static void mark_random_map(struct thread_data *td, struct io_u *io_u)
459 {
460         unsigned long block = io_u->offset / td->min_bs;
461         unsigned int blocks = 0;
462
463         while (blocks < (io_u->buflen / td->min_bs)) {
464                 int idx, bit;
465
466                 if (!random_map_free(td, block))
467                         break;
468
469                 idx = RAND_MAP_IDX(block);
470                 bit = RAND_MAP_BIT(block);
471
472                 assert(idx < td->num_maps);
473
474                 td->file_map[idx] |= (1UL << bit);
475                 block++;
476                 blocks++;
477         }
478
479         if ((blocks * td->min_bs) < io_u->buflen)
480                 io_u->buflen = blocks * td->min_bs;
481 }
482
483 static int get_next_offset(struct thread_data *td, unsigned long long *offset)
484 {
485         unsigned long long b;
486         long r;
487
488         if (!td->sequential) {
489                 unsigned long max_blocks = td->io_size / td->min_bs;
490                 int loops = 50;
491
492                 do {
493                         lrand48_r(&td->random_state, &r);
494                         b = ((max_blocks - 1) * r / (RAND_MAX+1.0));
495                         loops--;
496                 } while (!random_map_free(td, b) && loops);
497
498                 if (!loops) {
499                         if (get_next_free_block(td, &b))
500                                 return 1;
501                 }
502         } else
503                 b = td->last_bytes / td->min_bs;
504
505         *offset = (b * td->min_bs) + td->file_offset;
506         return 0;
507 }
508
509 static unsigned int get_next_buflen(struct thread_data *td)
510 {
511         unsigned int buflen;
512         long r;
513
514         if (td->min_bs == td->max_bs)
515                 buflen = td->min_bs;
516         else {
517                 lrand48_r(&td->bsrange_state, &r);
518                 buflen = (1 + (double) (td->max_bs - 1) * r / (RAND_MAX + 1.0));
519                 buflen = (buflen + td->min_bs - 1) & ~(td->min_bs - 1);
520         }
521
522         if (buflen > td->io_size - td->this_io_bytes)
523                 buflen = td->io_size - td->this_io_bytes;
524
525         return buflen;
526 }
527
528 static inline void add_stat_sample(struct thread_data *td, struct io_stat *is,
529                                    unsigned long val)
530 {
531         if (val > is->max_val)
532                 is->max_val = val;
533         if (val < is->min_val)
534                 is->min_val = val;
535
536         is->val += val;
537         is->val_sq += val * val;
538         is->samples++;
539 }
540
541 static void add_log_sample(struct thread_data *td, struct io_log *log,
542                            unsigned long val)
543 {
544         if (log->nr_samples == log->max_samples) {
545                 int new_size = sizeof(struct io_sample) * log->max_samples * 2;
546
547                 log->log = realloc(log->log, new_size);
548                 log->max_samples <<= 1;
549         }
550
551         log->log[log->nr_samples].val = val;
552         log->log[log->nr_samples].time = mtime_since_now(&td->start);
553         log->nr_samples++;
554 }
555
556 static void add_clat_sample(struct thread_data *td, unsigned long msec)
557 {
558         add_stat_sample(td, &td->clat_stat, msec);
559
560         if (td->lat_log)
561                 add_log_sample(td, td->lat_log, msec);
562 }
563
564 static void add_slat_sample(struct thread_data *td, unsigned long msec)
565 {
566         add_stat_sample(td, &td->slat_stat, msec);
567 }
568
569 static void add_bw_sample(struct thread_data *td)
570 {
571         unsigned long spent = mtime_since_now(&td->stat_sample_time);
572         unsigned long rate;
573
574         if (spent < td->bw_avg_time)
575                 return;
576
577         rate = (td->this_io_bytes - td->stat_io_bytes) / spent;
578         add_stat_sample(td, &td->bw_stat, rate);
579
580         if (td->bw_log)
581                 add_log_sample(td, td->bw_log, rate);
582
583         gettimeofday(&td->stat_sample_time, NULL);
584         td->stat_io_bytes = td->this_io_bytes;
585 }
586
/*
 * sleep for usec microseconds, restarting the sleep with the remaining
 * time when it is interrupted by a signal. non-positive values return
 * immediately.
 *
 * the request is split into seconds and nanoseconds: nanosleep() rejects
 * a tv_nsec of one second or more, so a sleep of a second or longer
 * would otherwise not happen at all (and usec * 1000 would overflow an
 * int for large values).
 */
static void usec_sleep(int usec)
{
        struct timespec req, rem;

        if (usec <= 0)
                return;

        req.tv_sec = usec / 1000000;
        req.tv_nsec = (long) (usec % 1000000) * 1000;

        while (nanosleep(&req, &rem) == -1 && errno == EINTR)
                req = rem;
}
601
602 static void rate_throttle(struct thread_data *td, unsigned long time_spent,
603                           unsigned int bytes)
604 {
605         unsigned long usec_cycle;
606
607         if (!td->rate)
608                 return;
609
610         usec_cycle = td->rate_usec_cycle * (bytes / td->min_bs);
611
612         if (time_spent < usec_cycle) {
613                 unsigned long s = usec_cycle - time_spent;
614
615                 td->rate_pending_usleep += s;
616                 if (td->rate_pending_usleep >= 100000) {
617                         usec_sleep(td->rate_pending_usleep);
618                         td->rate_pending_usleep = 0;
619                 }
620         } else {
621                 long overtime = time_spent - usec_cycle;
622
623                 td->rate_pending_usleep -= overtime;
624         }
625 }
626
627 static int check_min_rate(struct thread_data *td, struct timeval *now)
628 {
629         unsigned long spent;
630         unsigned long rate;
631
632         /*
633          * allow a 2 second settle period in the beginning
634          */
635         if (mtime_since(&td->start, now) < 2000)
636                 return 0;
637
638         /*
639          * if rate blocks is set, sample is running
640          */
641         if (td->rate_bytes) {
642                 spent = mtime_since(&td->lastrate, now);
643                 if (spent < td->ratecycle)
644                         return 0;
645
646                 rate = (td->this_io_bytes - td->rate_bytes) / spent;
647                 if (rate < td->ratemin) {
648                         printf("Client%d: min rate %d not met, got %ldKiB/sec\n", td->thread_number, td->ratemin, rate);
649                         if (rate_quit)
650                                 sig_handler(0);
651                         return 1;
652                 }
653         }
654
655         td->rate_bytes = td->this_io_bytes;
656         memcpy(&td->lastrate, now, sizeof(*now));
657         return 0;
658 }
659
660 static inline int runtime_exceeded(struct thread_data *td, struct timeval *t)
661 {
662         if (!td->timeout)
663                 return 0;
664         if (mtime_since(&td->start, t) >= td->timeout * 1000)
665                 return 1;
666
667         return 0;
668 }
669
670 static void fill_random_bytes(struct thread_data *td,
671                               unsigned char *p, unsigned int len)
672 {
673         unsigned int todo;
674         double r;
675
676         while (len) {
677                 drand48_r(&td->verify_state, &r);
678
679                 /*
680                  * lrand48_r seems to be broken and only fill the bottom
681                  * 32-bits, even on 64-bit archs with 64-bit longs
682                  */
683                 todo = sizeof(r);
684                 if (todo > len)
685                         todo = len;
686
687                 memcpy(p, &r, todo);
688
689                 len -= todo;
690                 p += todo;
691         }
692 }
693
/*
 * print len bytes of buffer to stdout as lower case hex digits, followed
 * by a newline
 */
static void hexdump(void *buffer, int len)
{
        unsigned char *byte = buffer;
        int left;

        for (left = len; left > 0; left--)
                printf("%02x", *byte++);

        printf("\n");
}
703
704 static int verify_io_u(struct io_u *io_u)
705 {
706         struct verify_header *hdr = (struct verify_header *) io_u->buf;
707         unsigned char *p = (unsigned char *) io_u->buf;
708         struct md5_ctx md5_ctx;
709         int ret;
710
711         if (hdr->fio_magic != FIO_HDR_MAGIC)
712                 return 1;
713
714         memset(&md5_ctx, 0, sizeof(md5_ctx));
715         p += sizeof(*hdr);
716         md5_update(&md5_ctx, p, hdr->len - sizeof(*hdr));
717
718         ret = memcmp(hdr->md5_digest, md5_ctx.hash, sizeof(md5_ctx.hash));
719         if (ret) {
720                 hexdump(hdr->md5_digest, sizeof(hdr->md5_digest));
721                 hexdump(md5_ctx.hash, sizeof(md5_ctx.hash));
722         }
723
724         return ret;
725 }
726
727 /*
728  * fill body of io_u->buf with random data and add a header with the
729  * (eg) sha1sum of that data.
730  */
731 static void populate_io_u(struct thread_data *td, struct io_u *io_u)
732 {
733         struct md5_ctx md5_ctx;
734         struct verify_header hdr;
735         unsigned char *p = (unsigned char *) io_u->buf;
736
737         hdr.fio_magic = FIO_HDR_MAGIC;
738         hdr.len = io_u->buflen;
739         p += sizeof(hdr);
740         fill_random_bytes(td, p, io_u->buflen - sizeof(hdr));
741
742         memset(&md5_ctx, 0, sizeof(md5_ctx));
743         md5_update(&md5_ctx, p, io_u->buflen - sizeof(hdr));
744         memcpy(hdr.md5_digest, md5_ctx.hash, sizeof(md5_ctx.hash));
745         memcpy(io_u->buf, &hdr, sizeof(hdr));
746 }
747
748 static void put_io_u(struct thread_data *td, struct io_u *io_u)
749 {
750         list_del(&io_u->list);
751         list_add(&io_u->list, &td->io_u_freelist);
752         td->cur_depth--;
753 }
754
/* true when the job has no io unit left on its free list */
#define queue_full(td)  (list_empty(&(td)->io_u_freelist))
756
757 static struct io_u *__get_io_u(struct thread_data *td)
758 {
759         struct io_u *io_u;
760
761         if (queue_full(td))
762                 return NULL;
763
764         io_u = list_entry(td->io_u_freelist.next, struct io_u, list);
765         list_del(&io_u->list);
766         list_add(&io_u->list, &td->io_u_busylist);
767         td->cur_depth++;
768         return io_u;
769 }
770
771 static struct io_u *get_io_u(struct thread_data *td)
772 {
773         struct io_u *io_u;
774
775         io_u = __get_io_u(td);
776         if (!io_u)
777                 return NULL;
778
779         if (get_next_offset(td, &io_u->offset)) {
780                 put_io_u(td, io_u);
781                 return NULL;
782         }
783
784         io_u->buflen = get_next_buflen(td);
785         if (!io_u->buflen) {
786                 put_io_u(td, io_u);
787                 return NULL;
788         }
789
790         if (io_u->buflen + io_u->offset > td->io_size)
791                 io_u->buflen = td->io_size - io_u->offset;
792
793         if (!td->sequential)
794                 mark_random_map(td, io_u);
795
796         td->last_bytes += io_u->buflen;
797
798         if (td->verify)
799                 populate_io_u(td, io_u);
800
801         if (td->use_aio) {
802                 if (td_read(td))
803                         io_prep_pread(&io_u->iocb, td->fd, io_u->buf, io_u->buflen, io_u->offset);
804                 else
805                         io_prep_pwrite(&io_u->iocb, td->fd, io_u->buf, io_u->buflen, io_u->offset);
806         }
807
808         gettimeofday(&io_u->start_time, NULL);
809         return io_u;
810 }
811
812 static inline void td_set_runstate(struct thread_data *td, int runstate)
813 {
814         td->old_runstate = td->runstate;
815         td->runstate = runstate;
816 }
817
818 static int get_next_verify(struct thread_data *td,
819                            unsigned long long *offset, unsigned int *len)
820 {
821         struct io_piece *ipo;
822
823         if (list_empty(&td->io_hist_list))
824                 return 1;
825
826         ipo = list_entry(td->io_hist_list.next, struct io_piece, list);
827         list_del(&ipo->list);
828
829         *offset = ipo->offset;
830         *len = ipo->len;
831         free(ipo);
832         return 0;
833 }
834
835 static void prune_io_piece_log(struct thread_data *td)
836 {
837         struct io_piece *ipo;
838
839         while (!list_empty(&td->io_hist_list)) {
840                 ipo = list_entry(td->io_hist_list.next, struct io_piece, list);
841
842                 list_del(&ipo->list);
843                 free(ipo);
844         }
845 }
846
847 /*
848  * log a succesful write, so we can unwind the log for verify
849  */
850 static void log_io_piece(struct thread_data *td, struct io_u *io_u)
851 {
852         struct io_piece *ipo = malloc(sizeof(struct io_piece));
853         struct list_head *entry;
854
855         INIT_LIST_HEAD(&ipo->list);
856         ipo->offset = io_u->offset;
857         ipo->len = io_u->buflen;
858
859         /*
860          * for random io where the writes extend the file, it will typically
861          * be laid out with the block scattered as written. it's faster to
862          * read them in in that order again, so don't sort
863          */
864         if (td->sequential || !td->overwrite) {
865                 list_add_tail(&ipo->list, &td->io_hist_list);
866                 return;
867         }
868
869         /*
870          * for random io, sort the list so verify will run faster
871          */
872         entry = &td->io_hist_list;
873         while ((entry = entry->prev) != &td->io_hist_list) {
874                 struct io_piece *__ipo = list_entry(entry, struct io_piece, list);
875
876                 if (__ipo->offset < ipo->offset)
877                         break;
878         }
879
880         list_add(&ipo->list, entry);
881 }
882
883 static void do_sync_verify(struct thread_data *td)
884 {
885         struct timeval t;
886         struct io_u *io_u = NULL;
887         int ret;
888
889         td_set_runstate(td, TD_VERIFYING);
890
891         io_u = __get_io_u(td);
892
893         if (!td->odirect) {
894                 if (fadvise(td->fd, td->file_offset, td->io_size, POSIX_FADV_DONTNEED) < 0) {
895                         td->error = errno;
896                         goto out;
897                 }
898         }
899
900         do {
901                 if (td->terminate)
902                         break;
903
904                 gettimeofday(&t, NULL);
905                 if (runtime_exceeded(td, &t))
906                         break;
907
908                 if (get_next_verify(td, &io_u->offset, &io_u->buflen))
909                         break;
910
911                 if (td->cur_off != io_u->offset) {
912                         if (lseek(td->fd, io_u->offset, SEEK_SET) == -1) {
913                                 td->error = errno;
914                                 break;
915                         }
916                 }
917
918                 ret = read(td->fd, io_u->buf, io_u->buflen);
919                 if (ret < (int) io_u->buflen) {
920                         if (ret == -1) {
921                                 td->error = errno;
922                                 break;
923                         } else if (!ret)
924                                 break;
925                         else
926                                 io_u->buflen = ret;
927                 }
928
929                 if (verify_io_u(io_u))
930                         break;
931
932                 td->cur_off = io_u->offset + io_u->buflen;
933         } while (1);
934
935 out:
936         td_set_runstate(td, TD_RUNNING);
937         put_io_u(td, io_u);
938 }
939
/*
 * main loop of a job doing synchronous io: issue one read or write at a
 * time until io_size bytes are done, the job is told to terminate, its
 * run time is up, no io unit can be had or an error occurs. td->error
 * is set when a seek or transfer fails, or when the minimum rate is not
 * met.
 */
static void do_sync_io(struct thread_data *td)
{
        unsigned long msec, usec;
        struct io_u *io_u = NULL;
        struct timeval e;

        while (td->this_io_bytes < td->io_size) {
                int ret;

                if (td->terminate)
                        break;

                io_u = get_io_u(td);
                if (!io_u)
                        break;

                /* only seek if the file position is not already there */
                if (td->cur_off != io_u->offset) {
                        if (lseek(td->fd, io_u->offset, SEEK_SET) == -1) {
                                td->error = errno;
                                break;
                        }
                }

                if (td_read(td))
                        ret = read(td->fd, io_u->buf, io_u->buflen);
                else
                        ret = write(td->fd, io_u->buf, io_u->buflen);

                /* a short transfer ends the loop; only -1 is recorded as an error */
                if (ret < (int) io_u->buflen) {
                        if (ret == -1)
                                td->error = errno;
                        break;
                }

                /* remember writes so they can be verified later */
                if (td_write(td))
                        log_io_piece(td, io_u);

                td->io_blocks++;
                td->io_bytes += io_u->buflen;
                td->this_io_bytes += io_u->buflen;
                td->cur_off = io_u->offset + io_u->buflen;

                gettimeofday(&e, NULL);

                /* time from handing out the io unit to completion */
                usec = utime_since(&io_u->start_time, &e);

                rate_throttle(td, usec, io_u->buflen);

                if (check_min_rate(td, &e)) {
                        td->error = ENODATA;
                        break;
                }

                msec = usec / 1000;
                add_clat_sample(td, msec);
                add_bw_sample(td);

                if (runtime_exceeded(td, &e))
                        break;

                put_io_u(td, io_u);
                io_u = NULL;

                if (td->thinktime)
                        usec_sleep(td->thinktime);

                if (should_fsync(td) && td->fsync_blocks &&
                    (td->io_blocks % td->fsync_blocks) == 0)
                        fsync(td->fd);
        }

        /* a break above leaves the current unit on the busy list */
        if (io_u)
                put_io_u(td, io_u);

        if (should_fsync(td))
                fsync(td->fd);
}
1017
1018 static int io_u_getevents(struct thread_data *td, int min, int max,
1019                           struct timespec *t)
1020 {
1021         int r;
1022
1023         do {
1024                 r = io_getevents(td->aio_ctx, min, max, td->aio_events, t);
1025                 if (r != -EAGAIN && r != -EINTR)
1026                         break;
1027         } while (1);
1028
1029         return r;
1030 }
1031
1032 static int io_u_queue(struct thread_data *td, struct io_u *io_u)
1033 {
1034         struct iocb *iocb = &io_u->iocb;
1035         int ret;
1036
1037         do {
1038                 ret = io_submit(td->aio_ctx, 1, &iocb);
1039                 if (ret == 1)
1040                         return 0;
1041                 else if (ret == -EAGAIN)
1042                         usleep(100);
1043                 else if (ret == -EINTR)
1044                         continue;
1045                 else
1046                         break;
1047         } while (1);
1048
1049         return ret;
1050 }
1051
/* the iocb's data field read as an unsigned long */
#define iocb_time(iocb) ((unsigned long) (iocb)->data)
/*
 * io unit belonging to a completion event. obj is presumably the iocb
 * that was submitted (TODO confirm against libaio); since the iocb is
 * the first member of struct io_u its address doubles as the unit's.
 */
#define ev_to_iou(ev)   (struct io_u *) ((unsigned long) (ev)->obj)
1054
1055 static int ios_completed(struct thread_data *td, int nr)
1056 {
1057         unsigned long msec;
1058         struct io_u *io_u;
1059         struct timeval e;
1060         int i, bytes_done;
1061
1062         gettimeofday(&e, NULL);
1063
1064         for (i = 0, bytes_done = 0; i < nr; i++) {
1065                 io_u = ev_to_iou(td->aio_events + i);
1066
1067                 td->io_blocks++;
1068                 td->io_bytes += io_u->buflen;
1069                 td->this_io_bytes += io_u->buflen;
1070
1071                 msec = mtime_since(&io_u->issue_time, &e);
1072
1073                 add_clat_sample(td, msec);
1074                 add_bw_sample(td);
1075
1076                 if (td_write(td))
1077                         log_io_piece(td, io_u);
1078
1079                 bytes_done += io_u->buflen;
1080                 put_io_u(td, io_u);
1081         }
1082
1083         return bytes_done;
1084 }
1085
1086 static void cleanup_pending_aio(struct thread_data *td)
1087 {
1088         struct timespec ts = { .tv_sec = 0, .tv_nsec = 0};
1089         struct list_head *entry, *n;
1090         struct io_u *io_u;
1091         int r;
1092
1093         /*
1094          * get immediately available events, if any
1095          */
1096         r = io_u_getevents(td, 0, td->cur_depth, &ts);
1097         if (r > 0)
1098                 ios_completed(td, r);
1099
1100         /*
1101          * now cancel remaining active events
1102          */
1103         list_for_each_safe(entry, n, &td->io_u_busylist) {
1104                 io_u = list_entry(entry, struct io_u, list);
1105
1106                 r = io_cancel(td->aio_ctx, &io_u->iocb, td->aio_events);
1107                 if (!r)
1108                         put_io_u(td, io_u);
1109         }
1110
1111         if (td->cur_depth) {
1112                 r = io_u_getevents(td, td->cur_depth, td->cur_depth, NULL);
1113                 if (r > 0)
1114                         ios_completed(td, r);
1115         }
1116 }
1117
/*
 * Verify the io_u that *io_u points at, if there is one, then release
 * it and reset *io_u to NULL.
 *
 * Returns the verify_io_u() result, or 0 when nothing was pending.
 */
static int async_do_verify(struct thread_data *td, struct io_u **io_u)
{
        struct io_u *pending = *io_u;
        int ret;

        if (!pending)
                return 0;

        ret = verify_io_u(pending);
        put_io_u(td, pending);
        *io_u = NULL;

        return ret;
}
1131
1132 static void do_async_verify(struct thread_data *td)
1133 {
1134         struct timeval t;
1135         struct io_u *io_u, *v_io_u = NULL;
1136         int ret;
1137
1138         td_set_runstate(td, TD_VERIFYING);
1139
1140         do {
1141                 if (td->terminate)
1142                         break;
1143
1144                 gettimeofday(&t, NULL);
1145                 if (runtime_exceeded(td, &t))
1146                         break;
1147
1148                 io_u = __get_io_u(td);
1149                 if (!io_u)
1150                         break;
1151
1152                 if (get_next_verify(td, &io_u->offset, &io_u->buflen)) {
1153                         put_io_u(td, io_u);
1154                         break;
1155                 }
1156
1157                 io_prep_pread(&io_u->iocb, td->fd, io_u->buf, io_u->buflen, io_u->offset);
1158                 ret = io_u_queue(td, io_u);
1159                 if (ret) {
1160                         put_io_u(td, io_u);
1161                         td->error = ret;
1162                         break;
1163                 }
1164
1165                 /*
1166                  * we have one pending to verify, do that while the next
1167                  * we are doing io on the next one
1168                  */
1169                 if (async_do_verify(td, &v_io_u))
1170                         break;
1171
1172                 ret = io_u_getevents(td, 1, 1, NULL);
1173                 if (ret != 1) {
1174                         if (ret < 0)
1175                                 td->error = ret;
1176                         break;
1177                 }
1178
1179                 v_io_u = ev_to_iou(td->aio_events);
1180
1181                 td->cur_off = v_io_u->offset + v_io_u->buflen;
1182
1183                 /*
1184                  * if we can't submit more io, we need to verify now
1185                  */
1186                 if (queue_full(td) && async_do_verify(td, &v_io_u))
1187                         break;
1188
1189         } while (1);
1190
1191         async_do_verify(td, &v_io_u);
1192
1193         if (td->cur_depth)
1194                 cleanup_pending_aio(td);
1195
1196         td_set_runstate(td, TD_RUNNING);
1197 }
1198
1199 static void do_async_io(struct thread_data *td)
1200 {
1201         struct timeval s, e;
1202         unsigned long usec;
1203
1204         while (td->this_io_bytes < td->io_size) {
1205                 struct timespec ts = { .tv_sec = 0, .tv_nsec = 0};
1206                 struct timespec *timeout;
1207                 int ret, min_evts = 0;
1208                 struct io_u *io_u;
1209                 unsigned int bytes_done;
1210
1211                 if (td->terminate)
1212                         break;
1213
1214                 io_u = get_io_u(td);
1215                 if (!io_u)
1216                         break;
1217
1218                 memcpy(&s, &io_u->start_time, sizeof(s));
1219
1220                 ret = io_u_queue(td, io_u);
1221                 if (ret) {
1222                         put_io_u(td, io_u);
1223                         td->error = ret;
1224                         break;
1225                 }
1226
1227                 gettimeofday(&io_u->issue_time, NULL);
1228                 add_slat_sample(td, mtime_since(&io_u->start_time, &io_u->issue_time));
1229                 if (td->cur_depth < td->aio_depth) {
1230                         timeout = &ts;
1231                         min_evts = 0;
1232                 } else {
1233                         timeout = NULL;
1234                         min_evts = 1;
1235                 }
1236
1237                 ret = io_u_getevents(td, min_evts, td->cur_depth, timeout);
1238                 if (ret < 0) {
1239                         td->error = ret;
1240                         break;
1241                 } else if (!ret)
1242                         continue;
1243
1244                 bytes_done = ios_completed(td, ret);
1245
1246                 /*
1247                  * the rate is batched for now, it should work for batches
1248                  * of completions except the very first one which may look
1249                  * a little bursty
1250                  */
1251                 gettimeofday(&e, NULL);
1252                 usec = utime_since(&s, &e);
1253
1254                 rate_throttle(td, usec, bytes_done);
1255
1256                 if (check_min_rate(td, &e)) {
1257                         td->error = ENODATA;
1258                         break;
1259                 }
1260
1261                 if (runtime_exceeded(td, &e))
1262                         break;
1263
1264                 if (td->thinktime)
1265                         usec_sleep(td->thinktime);
1266
1267                 if (should_fsync(td) && td->fsync_blocks &&
1268                     (td->io_blocks % td->fsync_blocks) == 0)
1269                         fsync(td->fd);
1270         }
1271
1272         if (td->cur_depth)
1273                 cleanup_pending_aio(td);
1274
1275         if (should_fsync(td))
1276                 fsync(td->fd);
1277 }
1278
1279 static void cleanup_aio(struct thread_data *td)
1280 {
1281         io_destroy(td->aio_ctx);
1282
1283         if (td->aio_events)
1284                 free(td->aio_events);
1285 }
1286
1287 static int init_aio(struct thread_data *td)
1288 {
1289         if (io_queue_init(td->aio_depth, &td->aio_ctx)) {
1290                 td->error = errno;
1291                 return 1;
1292         }
1293
1294         td->aio_events = malloc(td->aio_depth * sizeof(struct io_event));
1295         return 0;
1296 }
1297
1298 static void cleanup_io_u(struct thread_data *td)
1299 {
1300         struct list_head *entry, *n;
1301         struct io_u *io_u;
1302
1303         list_for_each_safe(entry, n, &td->io_u_freelist) {
1304                 io_u = list_entry(entry, struct io_u, list);
1305
1306                 list_del(&io_u->list);
1307                 free(io_u);
1308         }
1309
1310         if (td->mem_type == MEM_MALLOC)
1311                 free(td->orig_buffer);
1312         else if (td->mem_type == MEM_SHM) {
1313                 struct shmid_ds sbuf;
1314
1315                 shmdt(td->orig_buffer);
1316                 shmctl(td->shm_id, IPC_RMID, &sbuf);
1317         }
1318 }
1319
1320 static int init_io_u(struct thread_data *td)
1321 {
1322         struct io_u *io_u;
1323         int i, max_units, mem_size;
1324         char *p;
1325
1326         if (!td->use_aio)
1327                 max_units = 1;
1328         else
1329                 max_units = td->aio_depth;
1330
1331         mem_size = td->max_bs * max_units + MASK;
1332
1333         if (td->mem_type == MEM_MALLOC)
1334                 td->orig_buffer = malloc(mem_size);
1335         else if (td->mem_type == MEM_SHM) {
1336                 td->shm_id = shmget(IPC_PRIVATE, mem_size, IPC_CREAT | 0600);
1337                 if (td->shm_id < 0) {
1338                         td->error = errno;
1339                         perror("shmget");
1340                         return 1;
1341                 }
1342
1343                 td->orig_buffer = shmat(td->shm_id, NULL, 0);
1344                 if (td->orig_buffer == (void *) -1) {
1345                         td->error = errno;
1346                         perror("shmat");
1347                         return 1;
1348                 }
1349         }
1350
1351         INIT_LIST_HEAD(&td->io_u_freelist);
1352         INIT_LIST_HEAD(&td->io_u_busylist);
1353         INIT_LIST_HEAD(&td->io_hist_list);
1354
1355         p = ALIGN(td->orig_buffer);
1356         for (i = 0; i < max_units; i++) {
1357                 io_u = malloc(sizeof(*io_u));
1358                 memset(io_u, 0, sizeof(*io_u));
1359                 INIT_LIST_HEAD(&io_u->list);
1360
1361                 io_u->buf = p + td->max_bs * i;
1362                 list_add(&io_u->list, &td->io_u_freelist);
1363         }
1364
1365         return 0;
1366 }
1367
1368 static void setup_log(struct io_log **log)
1369 {
1370         struct io_log *l = malloc(sizeof(*l));
1371
1372         l->nr_samples = 0;
1373         l->max_samples = 1024;
1374         l->log = malloc(l->max_samples * sizeof(struct io_sample));
1375         *log = l;
1376 }
1377
1378 static void finish_log(struct thread_data *td, struct io_log *log, char *name)
1379 {
1380         char file_name[128];
1381         FILE *f;
1382         unsigned int i;
1383
1384         sprintf(file_name, "client%d_%s.log", td->thread_number, name);
1385         f = fopen(file_name, "w");
1386         if (!f) {
1387                 perror("fopen log");
1388                 return;
1389         }
1390
1391         for (i = 0; i < log->nr_samples; i++)
1392                 fprintf(f, "%lu, %lu\n", log->log[i].time, log->log[i].val);
1393
1394         fclose(f);
1395         free(log->log);
1396         free(log);
1397 }
1398
1399 static int create_file(struct thread_data *td)
1400 {
1401         unsigned long long left;
1402         unsigned int bs;
1403         char *b;
1404         int r;
1405
1406         /*
1407          * unless specifically asked for overwrite, let normal io extend it
1408          */
1409         if (td_write(td) && !td->overwrite)
1410                 return 0;
1411
1412         if (!td->file_size) {
1413                 fprintf(stderr, "Need size for create\n");
1414                 td->error = EINVAL;
1415                 return 1;
1416         }
1417
1418         printf("Client%d: Laying out IO file\n", td->thread_number);
1419
1420         td->fd = open(td->file_name, O_WRONLY | O_CREAT | O_TRUNC, 0644);
1421         if (td->fd < 0) {
1422                 td->error = errno;
1423                 return 1;
1424         }
1425
1426         if (ftruncate(td->fd, td->file_size) == -1) {
1427                 td->error = errno;
1428                 return 1;
1429         }
1430
1431         td->io_size = td->file_size;
1432         b = malloc(td->max_bs);
1433         memset(b, 0, td->max_bs);
1434
1435         left = td->file_size;
1436         while (left) {
1437                 bs = td->max_bs;
1438                 if (bs > left)
1439                         bs = left;
1440
1441                 r = write(td->fd, b, bs);
1442
1443                 if (r == (int) bs) {
1444                         left -= bs;
1445                         continue;
1446                 } else {
1447                         if (r < 0)
1448                                 td->error = errno;
1449                         else
1450                                 td->error = EIO;
1451
1452                         break;
1453                 }
1454         }
1455
1456         if (td->create_fsync)
1457                 fsync(td->fd);
1458
1459         close(td->fd);
1460         td->fd = -1;
1461         free(b);
1462         return 0;
1463 }
1464
1465 static int file_exists(struct thread_data *td)
1466 {
1467         struct stat st;
1468
1469         if (stat(td->file_name, &st) != -1)
1470                 return 1;
1471
1472         return errno != ENOENT;
1473 }
1474
1475 static int get_file_size(struct thread_data *td)
1476 {
1477         size_t bytes = 0;
1478         struct stat st;
1479
1480         if (fstat(td->fd, &st) == -1) {
1481                 td->error = errno;
1482                 return 1;
1483         }
1484
1485         /*
1486          * if block device, get size via BLKGETSIZE64 ioctl. try that as well
1487          * if this is a link, fall back to st.st_size if it fails
1488          */
1489         if (S_ISBLK(st.st_mode) || S_ISLNK(st.st_mode)) {
1490                 if (ioctl(td->fd, BLKGETSIZE64, &bytes)) {
1491                         if (S_ISBLK(st.st_mode)) {
1492                                 td->error = errno;
1493                                 return 1;
1494                         } else
1495                                 bytes = st.st_size;
1496                 }
1497         } else
1498                 bytes = st.st_size;
1499
1500         if (td_read(td)) {
1501                 if (td->file_size > bytes)
1502                         bytes = td->file_size;
1503         } else {
1504                 if (!td->file_size)
1505                         td->file_size = 1024 * 1024 * 1024;
1506
1507                 bytes = td->file_size;
1508         }
1509
1510         if (td->file_offset > bytes) {
1511                 fprintf(stderr, "Client%d: offset larger than length\n", td->thread_number);
1512                 return 1;
1513         }
1514
1515         td->io_size = bytes - td->file_offset;
1516         if (td->io_size == 0) {
1517                 fprintf(stderr, "Client%d: no io blocks\n", td->thread_number);
1518                 td->error = EINVAL;
1519                 return 1;
1520         }
1521
1522         return 0;
1523 }
1524
1525 static int setup_file(struct thread_data *td)
1526 {
1527         int flags = 0;
1528
1529         if (!file_exists(td)) {
1530                 if (!td->create_file) {
1531                         td->error = ENOENT;
1532                         return 1;
1533                 }
1534                 if (create_file(td))
1535                         return 1;
1536         }
1537
1538         if (td->odirect)
1539                 flags |= O_DIRECT;
1540
1541         if (td_read(td))
1542                 td->fd = open(td->file_name, flags | O_RDONLY);
1543         else {
1544                 if (!td->overwrite)
1545                         flags |= O_TRUNC;
1546                 if (td->sync_io)
1547                         flags |= O_SYNC;
1548                 if (td->verify)
1549                         flags |= O_RDWR;
1550                 else
1551                         flags |= O_WRONLY;
1552
1553                 td->fd = open(td->file_name, flags | O_CREAT, 0600);
1554         }
1555
1556         if (td->fd == -1) {
1557                 td->error = errno;
1558                 return 1;
1559         }
1560
1561         if (get_file_size(td))
1562                 return 1;
1563
1564         if (td_write(td) && ftruncate(td->fd, td->file_size) == -1) {
1565                 td->error = errno;
1566                 return 1;
1567         }
1568
1569         if (td->invalidate_cache) {
1570                 if (fadvise(td->fd, td->file_offset, td->file_size, POSIX_FADV_DONTNEED) < 0) {
1571                         td->error = errno;
1572                         return 1;
1573                 }
1574         }
1575
1576         return 0;
1577 }
1578
1579 static void clear_io_state(struct thread_data *td)
1580 {
1581         if (!td->use_aio)
1582                 lseek(td->fd, SEEK_SET, 0);
1583
1584         td->cur_off = 0;
1585         td->last_bytes = 0;
1586         td->stat_io_bytes = 0;
1587         td->this_io_bytes = 0;
1588
1589         if (td->file_map)
1590                 memset(td->file_map, 0, td->num_maps * sizeof(long));
1591 }
1592
/*
 * Entry point of one job: attaches the shared segment 'shm_id', picks
 * the thread_data at index 'offset' in it, performs per-job setup, runs
 * the io (and optional verify) loops td->loops times, and tears
 * everything down again.
 *
 * Always returns NULL; failures are reported through td->error. 'ret'
 * only records whether the job got as far as finishing its loops.
 */
static void *thread_main(int shm_id, int offset, char *argv[])
{
        struct thread_data *td;
        int ret = 1;
        void *data;

        setsid();

        data = shmat(shm_id, NULL, 0);
        if (data == (void *) -1) {
                perror("shmat");
                return NULL;
        }

        /* arithmetic on void * relies on the GNU C extension */
        td = data + offset * sizeof(struct thread_data);
        td->pid = getpid();

        if (init_io_u(td))
                goto err;

        if (sched_setaffinity(td->pid, sizeof(td->cpumask), &td->cpumask) == -1) {
                td->error = errno;
                goto err;
        }

        /* overwrite argv[0] in place with "fio<offset>" */
        sprintf(argv[0], "fio%d", offset);

        if (td->use_aio && init_aio(td))
                goto err;

        if (td->ioprio) {
                if (ioprio_set(IOPRIO_WHO_PROCESS, 0, td->ioprio) == -1) {
                        td->error = errno;
                        goto err;
                }
        }

        /*
         * setup is done: post startup_sem and block on td->mutex
         * (presumably until the launcher releases this job - confirm)
         */
        sem_post(&startup_sem);
        sem_wait(&td->mutex);

        /* with create_serialize set, setup_file() is not called here */
        if (!td->create_serialize && setup_file(td))
                goto err;

        if (init_random_state(td))
                goto err;

        gettimeofday(&td->start, NULL);

        while (td->loops--) {
                gettimeofday(&td->stat_sample_time, NULL);

                if (td->ratemin)
                        memcpy(&td->lastrate, &td->stat_sample_time, sizeof(td->lastrate));

                clear_io_state(td);
                prune_io_piece_log(td);

                if (!td->use_aio)
                        do_sync_io(td);
                else
                        do_async_io(td);

                if (td->error)
                        break;

                if (!td->verify)
                        continue;

                /* verify pass: reset the io state and read back */
                clear_io_state(td);

                if (!td->use_aio)
                        do_sync_verify(td);
                else
                        do_async_verify(td);

                if (td->error)
                        break;
        }

        td->runtime = mtime_since_now(&td->start);
        ret = 0;

        if (td->bw_log)
                finish_log(td, td->bw_log, "bw");
        if (td->lat_log)
                finish_log(td, td->lat_log, "lat");

        if (exitall_on_terminate)
                sig_handler(0);

err:
        if (td->fd != -1) {
                close(td->fd);
                td->fd = -1;
        }
        if (td->use_aio)
                cleanup_aio(td);
        cleanup_io_u(td);
        /* on the failure path (ret still 1), post startup_sem and wait
         * on td->mutex once more before marking the job exited */
        if (ret) {
                sem_post(&startup_sem);
                sem_wait(&td->mutex);
        }
        td_set_runstate(td, TD_EXITED);
        shmdt(data);
        return NULL;
}
1699
1700 static void free_shm(void)
1701 {
1702         struct shmid_ds sbuf;
1703
1704         if (threads) {
1705                 shmdt(threads);
1706                 threads = NULL;
1707                 shmctl(shm_id, IPC_RMID, &sbuf);
1708         }
1709 }
1710
1711 static int calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max,
1712                     double *mean, double *dev)
1713 {
1714         double n;
1715
1716         if (is->samples == 0)
1717                 return 0;
1718
1719         *min = is->min_val;
1720         *max = is->max_val;
1721
1722         n = (double) is->samples;
1723         *mean = (double) is->val / n;
1724         *dev = sqrt(((double) is->val_sq - (*mean * *mean) / n) / (n - 1));
1725         return 1;
1726 }
1727
1728 static void show_thread_status(struct thread_data *td)
1729 {
1730         int prio, prio_class;
1731         unsigned long min, max, bw = 0;
1732         double mean, dev;
1733
1734         if (!td->io_bytes && !td->error)
1735                 return;
1736
1737         if (td->runtime)
1738                 bw = td->io_bytes / td->runtime;
1739
1740         prio = td->ioprio & 0xff;
1741         prio_class = td->ioprio >> IOPRIO_CLASS_SHIFT;
1742
1743         printf("Client%d: err=%2d, io=%6luMiB, bw=%6luKiB/s, runt=%6lumsec\n", td->thread_number, td->error, td->io_bytes >> 20, bw, td->runtime);
1744
1745         if (calc_lat(&td->slat_stat, &min, &max, &mean, &dev))
1746                 printf("  slat (msec): min=%5lu, max=%5lu, avg=%5.02f, dev=%5.02f\n", min, max, mean, dev);
1747         if (calc_lat(&td->clat_stat, &min, &max, &mean, &dev))
1748                 printf("  clat (msec): min=%5lu, max=%5lu, avg=%5.02f, dev=%5.02f\n", min, max, mean, dev);
1749         if (calc_lat(&td->bw_stat, &min, &max, &mean, &dev))
1750                 printf("  bw (KiB/s) : min=%5lu, max=%5lu, avg=%5.02f, dev=%5.02f\n", min, max, mean, dev);
1751 }
1752
1753 static int setup_rate(struct thread_data *td)
1754 {
1755         int nr_reads_per_sec;
1756
1757         if (!td->rate)
1758                 return 0;
1759
1760         if (td->rate < td->ratemin) {
1761                 fprintf(stderr, "min rate larger than nominal rate\n");
1762                 return -1;
1763         }
1764
1765         nr_reads_per_sec = (td->rate * 1024) / td->min_bs;
1766         td->rate_usec_cycle = 1000000 / nr_reads_per_sec;
1767         td->rate_pending_usleep = 0;
1768         return 0;
1769 }
1770
1771 static struct thread_data *get_new_job(int global)
1772 {
1773         struct thread_data *td;
1774
1775         if (global)
1776                 return &def_thread;
1777         if (thread_number >= max_jobs)
1778                 return NULL;
1779
1780         td = &threads[thread_number++];
1781         memset(td, 0, sizeof(*td));
1782
1783         td->fd = -1;
1784         td->thread_number = thread_number;
1785
1786         td->ddir = def_thread.ddir;
1787         td->ioprio = def_thread.ioprio;
1788         td->sequential = def_thread.sequential;
1789         td->bs = def_thread.bs;
1790         td->min_bs = def_thread.min_bs;
1791         td->max_bs = def_thread.max_bs;
1792         td->odirect = def_thread.odirect;
1793         td->thinktime = def_thread.thinktime;
1794         td->fsync_blocks = def_thread.fsync_blocks;
1795         td->start_delay = def_thread.start_delay;
1796         td->timeout = def_thread.timeout;
1797         td->use_aio = def_thread.use_aio;
1798         td->create_file = def_thread.create_file;
1799         td->overwrite = def_thread.overwrite;
1800         td->invalidate_cache = def_thread.invalidate_cache;
1801         td->file_size = def_thread.file_size;
1802         td->file_offset = def_thread.file_offset;
1803         td->rate = def_thread.rate;
1804         td->ratemin = def_thread.ratemin;
1805         td->ratecycle = def_thread.ratecycle;
1806         td->aio_depth = def_thread.aio_depth;
1807         td->sync_io = def_thread.sync_io;
1808         td->mem_type = def_thread.mem_type;
1809         td->bw_avg_time = def_thread.bw_avg_time;
1810         td->create_serialize = def_thread.create_serialize;
1811         td->create_fsync = def_thread.create_fsync;
1812         td->loops = def_thread.loops;
1813         td->verify = def_thread.verify;
1814         memcpy(&td->cpumask, &def_thread.cpumask, sizeof(td->cpumask));
1815
1816         return td;
1817 }
1818
1819 static void put_job(struct thread_data *td)
1820 {
1821         memset(&threads[td->thread_number - 1], 0, sizeof(*td));
1822         thread_number--;
1823 }
1824
1825 static int add_job(struct thread_data *td, const char *filename, int prioclass,
1826                    int prio)
1827 {
1828         if (td == &def_thread)
1829                 return 0;
1830
1831         strcpy(td->file_name, filename);
1832         sem_init(&td->mutex, 1, 0);
1833         td->ioprio = (prioclass << IOPRIO_CLASS_SHIFT) | prio;
1834
1835         td->clat_stat.min_val = ULONG_MAX;
1836         td->slat_stat.min_val = ULONG_MAX;
1837         td->bw_stat.min_val = ULONG_MAX;
1838
1839         run_str[td->thread_number - 1] = 'P';
1840
1841         if (td->use_aio && !td->aio_depth)
1842                 td->aio_depth = 1;
1843
1844         if (td->min_bs == -1U)
1845                 td->min_bs = td->bs;
1846         if (td->max_bs == -1U)
1847                 td->max_bs = td->bs;
1848         if (td_read(td))
1849                 td->verify = 0;
1850
1851         if (setup_rate(td))
1852                 return -1;
1853
1854         if (write_lat_log)
1855                 setup_log(&td->lat_log);
1856         if (write_bw_log)
1857                 setup_log(&td->bw_log);
1858
1859         printf("Client%d: file=%s, rw=%d, prio=%d/%d, seq=%d, odir=%d, bs=%d-%d, rate=%d, aio=%d, aio_depth=%d\n", td->thread_number, filename, td->ddir, prioclass, prio, td->sequential, td->odirect, td->min_bs, td->max_bs, td->rate, td->use_aio, td->aio_depth);
1860         return 0;
1861 }
1862
/*
 * Turn the bit pattern in 'cpu' into a cpu set: bit i set means cpu i is
 * added to the mask.
 *
 * NOTE(review): 'cpumask' is received by value, so the bits set here end
 * up in a local copy and the caller's mask is left unchanged. Fixing
 * that needs a pointer parameter and matching changes at the call sites.
 */
static void fill_cpu_mask(cpu_set_t cpumask, int cpu)
{
        /* test the bits as unsigned: 1 << 31 on a signed int is undefined */
        unsigned int bits = (unsigned int) cpu;
        unsigned int i;

        CPU_ZERO(&cpumask);

        for (i = 0; i < sizeof(bits) * 8; i++) {
                if (bits & (1U << i))
                        CPU_SET(i, &cpumask);
        }
}
1874
/*
 * Map a size suffix to its multiplier: k/K is 1024, m/M is 1024^2 and
 * g/G is 1024^3. Any other character yields 1.
 */
unsigned long get_mult(char c)
{
        if (c == 'k' || c == 'K')
                return 1024;
        if (c == 'm' || c == 'M')
                return 1024 * 1024;
        if (c == 'g' || c == 'G')
                return 1024 * 1024 * 1024;

        return 1;
}
1891
/*
 * convert string after '=' into decimal value, noting any size suffix
 *
 * The suffix is the character right after the parsed digits, so the
 * result does not depend on the line ending in a newline and nothing is
 * read before the start of the value. Returns 0 on success, 1 when the
 * line has no '=' or the number is out of range.
 */
static int str_cnv(char *p, unsigned long long *val)
{
        char *str, *end;

        str = strchr(p, '=');
        if (!str)
                return 1;

        str++;

        /* errno must be cleared for the ERANGE check to be meaningful */
        errno = 0;
        *val = strtoull(str, &end, 10);
        if (*val == ULLONG_MAX && errno == ERANGE)
                return 1;

        *val *= get_mult(*end);
        return 0;
}
1914
/*
 * If 'name' occurs anywhere in line 'p', convert the value after '='
 * into *val via str_cnv() and return its result; otherwise return 1.
 */
static int check_strcnv(char *p, char *name, unsigned long long *val)
{
        return strstr(p, name) ? str_cnv(p, val) : 1;
}
1922
/*
 * Returns 0 when 'name' occurs in line 'p' and 'option' occurs
 * somewhere after that occurrence of 'name'; returns 1 otherwise.
 */
static int check_str(char *p, char *name, char *option)
{
        char *hit = strstr(p, name);

        if (hit && strstr(hit + strlen(name), option))
                return 0;

        return 1;
}
1936
/*
 * Parse "name=<lo>-<hi>" (with or without spaces around '=') from line
 * 'p' into *s and *e. The forms where each number is followed by one
 * extra character are tried first, and those characters are applied as
 * size suffixes via get_mult().
 *
 * Returns 0 when one of the forms matched, 1 otherwise.
 */
static int check_range(char *p, char *name, unsigned long *s, unsigned long *e)
{
        char fmt[128];
        char suffix_lo, suffix_hi;

        /* forms with a trailing character on both bounds */
        sprintf(fmt, "%s=%%lu%%c-%%lu%%c", name);
        if (sscanf(p, fmt, s, &suffix_lo, e, &suffix_hi) == 4)
                goto scale;

        sprintf(fmt, "%s = %%lu%%c-%%lu%%c", name);
        if (sscanf(p, fmt, s, &suffix_lo, e, &suffix_hi) == 4)
                goto scale;

        /* forms with bare numbers */
        sprintf(fmt, "%s=%%lu-%%lu", name);
        if (sscanf(p, fmt, s, e) == 2)
                return 0;

        sprintf(fmt, "%s = %%lu-%%lu", name);
        if (sscanf(p, fmt, s, e) != 2)
                return 1;

        return 0;

scale:
        *s *= get_mult(suffix_lo);
        *e *= get_mult(suffix_hi);
        return 0;
}
1967
/*
 * Parse "name=<int>" or "name = <int>" from the start of line 'p' into
 * *val. Returns 0 when a value was read, 1 otherwise.
 */
static int check_int(char *p, char *name, unsigned int *val)
{
        char fmt[128];

        sprintf(fmt, "%s=%%d", name);
        if (sscanf(p, fmt, val) != 1) {
                /* retry with the spaced spelling */
                sprintf(fmt, "%s = %%d", name);
                if (sscanf(p, fmt, val) != 1)
                        return 1;
        }

        return 0;
}
1982
/*
 * Returns 1 when 'line' holds nothing but whitespace and control
 * characters, or when a ';' comment marker is reached before any other
 * character; returns 0 as soon as real content is found.
 */
static int is_empty_or_comment(char *line)
{
        const unsigned char *c;

        /*
         * Walk to the terminator instead of calling strlen() on every
         * iteration, and read the bytes as unsigned char, as the ctype
         * functions require.
         */
        for (c = (const unsigned char *) line; *c; c++) {
                if (*c == ';')
                        return 1;
                if (!isspace(*c) && !iscntrl(*c))
                        return 0;
        }

        return 1;
}
1996
1997 static int parse_jobs_ini(char *file)
1998 {
1999         unsigned int prioclass, prio, cpu, global;
2000         unsigned long long ull;
2001         unsigned long ul1, ul2;
2002         struct thread_data *td;
2003         char *string, *name;
2004         fpos_t off;
2005         FILE *f;
2006         char *p;
2007
2008         f = fopen(file, "r");
2009         if (!f) {
2010                 perror("fopen");
2011                 return 1;
2012         }
2013
2014         string = malloc(4096);
2015         name = malloc(256);
2016
2017         while ((p = fgets(string, 4096, f)) != NULL) {
2018                 if (is_empty_or_comment(p))
2019                         continue;
2020                 if (sscanf(p, "[%s]", name) != 1)
2021                         continue;
2022
2023                 global = !strncmp(name, "global", 6);
2024
2025                 name[strlen(name) - 1] = '\0';
2026
2027                 td = get_new_job(global);
2028                 if (!td)
2029                         break;
2030
2031                 prioclass = 2;
2032                 prio = 4;
2033
2034                 fgetpos(f, &off);
2035                 while ((p = fgets(string, 4096, f)) != NULL) {
2036                         if (is_empty_or_comment(p))
2037                                 continue;
2038                         if (strstr(p, "["))
2039                                 break;
2040                         if (!check_int(p, "rw", &td->ddir)) {
2041                                 fgetpos(f, &off);
2042                                 continue;
2043                         }
2044                         if (!check_int(p, "prio", &prio)) {
2045                                 fgetpos(f, &off);
2046                                 continue;
2047                         }
2048                         if (!check_int(p, "prioclass", &prioclass)) {
2049                                 fgetpos(f, &off);
2050                                 continue;
2051                         }
2052                         if (!check_int(p, "direct", &td->odirect)) {
2053                                 fgetpos(f, &off);
2054                                 continue;
2055                         }
2056                         if (!check_int(p, "rate", &td->rate)) {
2057                                 fgetpos(f, &off);
2058                                 continue;
2059                         }
2060                         if (!check_int(p, "ratemin", &td->ratemin)) {
2061                                 fgetpos(f, &off);
2062                                 continue;
2063                         }
2064                         if (!check_int(p, "ratecycle", &td->ratecycle)) {
2065                                 fgetpos(f, &off);
2066                                 continue;
2067                         }
2068                         if (!check_int(p, "thinktime", &td->thinktime)) {
2069                                 fgetpos(f, &off);
2070                                 continue;
2071                         }
2072                         if (!check_int(p, "cpumask", &cpu)) {
2073                                 fill_cpu_mask(td->cpumask, cpu);
2074                                 fgetpos(f, &off);
2075                                 continue;
2076                         }
2077                         if (!check_int(p, "fsync", &td->fsync_blocks)) {
2078                                 fgetpos(f, &off);
2079                                 continue;
2080                         }
2081                         if (!check_int(p, "startdelay", &td->start_delay)) {
2082                                 fgetpos(f, &off);
2083                                 continue;
2084                         }
2085                         if (!check_int(p, "timeout", &td->timeout)) {
2086                                 fgetpos(f, &off);
2087                                 continue;
2088                         }
2089                         if (!check_int(p, "invalidate",&td->invalidate_cache)) {
2090                                 fgetpos(f, &off);
2091                                 continue;
2092                         }
2093                         if (!check_int(p, "aio_depth", &td->aio_depth)) {
2094                                 fgetpos(f, &off);
2095                                 continue;
2096                         }
2097                         if (!check_int(p, "sync", &td->sync_io)) {
2098                                 fgetpos(f, &off);
2099                                 continue;
2100                         }
2101                         if (!check_int(p, "bwavgtime", &td->bw_avg_time)) {
2102                                 fgetpos(f, &off);
2103                                 continue;
2104                         }
2105                         if (!check_int(p, "create_serialize", &td->create_serialize)) {
2106                                 fgetpos(f, &off);
2107                                 continue;
2108                         }
2109                         if (!check_int(p, "create_fsync", &td->create_fsync)) {
2110                                 fgetpos(f, &off);
2111                                 continue;
2112                         }
2113                         if (!check_int(p, "loops", &td->loops)) {
2114                                 fgetpos(f, &off);
2115                                 continue;
2116                         }
2117                         if (!check_int(p, "verify", &td->verify)) {
2118                                 fgetpos(f, &off);
2119                                 continue;
2120                         }
2121                         if (!check_range(p, "bsrange", &ul1, &ul2)) {
2122                                 if (ul1 & 511)
2123                                         printf("bad min block size, must be a multiple of 512\n");
2124                                 else
2125                                         td->min_bs = ul1;
2126                                 if (ul2 & 511)
2127                                         printf("bad max block size, must be a multiple of 512\n");
2128                                 else
2129                                         td->max_bs = ul2;
2130                                 fgetpos(f, &off);
2131                                 continue;
2132                         }
2133                         if (!check_strcnv(p, "bs", &ull)) {
2134                                 if (ull & 511)
2135                                         printf("bad block size, must be a multiple of 512\n");
2136                                 else
2137                                         td->bs = ull;
2138                                 fgetpos(f, &off);
2139                                 continue;
2140                         }
2141                         if (!check_strcnv(p, "size", &td->file_size)) {
2142                                 fgetpos(f, &off);
2143                                 continue;
2144                         }
2145                         if (!check_strcnv(p, "offset", &td->file_offset)) {
2146                                 fgetpos(f, &off);
2147                                 continue;
2148                         }
2149                         if (!check_str(p, "mem", "malloc")) {
2150                                 td->mem_type = MEM_MALLOC;
2151                                 fgetpos(f, &off);
2152                                 continue;
2153                         }
2154                         if (!check_str(p, "mem", "shm")) {
2155                                 td->mem_type = MEM_SHM;
2156                                 fgetpos(f, &off);
2157                                 continue;
2158                         }
2159                         if (!strncmp(p, "sequential", 10)) {
2160                                 td->sequential = 1;
2161                                 fgetpos(f, &off);
2162                                 continue;
2163                         }
2164                         if (!strncmp(p, "random", 6)) {
2165                                 td->sequential = 0;
2166                                 fgetpos(f, &off);
2167                                 continue;
2168                         }
2169                         if (!strncmp(p, "aio", 3)) {
2170                                 td->use_aio = 1;
2171                                 fgetpos(f, &off);
2172                                 continue;
2173                         }
2174                         if (!strncmp(p, "create", 6)) {
2175                                 td->create_file = 1;
2176                                 fgetpos(f, &off);
2177                                 continue;
2178                         }
2179                         if (!strncmp(p, "overwrite", 9)) {
2180                                 td->overwrite = 1;
2181                                 fgetpos(f, &off);
2182                                 continue;
2183                         }
2184                         if (!strncmp(p, "exitall", 7)) {
2185                                 exitall_on_terminate = 1;
2186                                 fgetpos(f, &off);
2187                                 continue;
2188                         }
2189                         printf("Client%d: bad option %s\n",td->thread_number,p);
2190                 }
2191                 fsetpos(f, &off);
2192
2193                 if (add_job(td, name, prioclass, prio))
2194                         put_job(td);
2195         }
2196
2197         free(string);
2198         free(name);
2199         fclose(f);
2200         return 0;
2201 }
2202
2203 static int parse_options(int argc, char *argv[])
2204 {
2205         int i;
2206
2207         for (i = 1; i < argc; i++) {
2208                 char *parm = argv[i];
2209
2210                 if (parm[0] != '-')
2211                         break;
2212
2213                 parm++;
2214                 switch (*parm) {
2215                         case 's':
2216                                 parm++;
2217                                 def_thread.sequential = !!atoi(parm);
2218                                 break;
2219                         case 'b':
2220                                 parm++;
2221                                 def_thread.bs = atoi(parm);
2222                                 def_thread.bs <<= 10;
2223                                 if (!def_thread.bs) {
2224                                         printf("bad block size\n");
2225                                         def_thread.bs = DEF_BS;
2226                                 }
2227                                 break;
2228                         case 't':
2229                                 parm++;
2230                                 def_thread.timeout = atoi(parm);
2231                                 break;
2232                         case 'r':
2233                                 parm++;
2234                                 repeatable = !!atoi(parm);
2235                                 break;
2236                         case 'R':
2237                                 parm++;
2238                                 rate_quit = !!atoi(parm);
2239                                 break;
2240                         case 'o':
2241                                 parm++;
2242                                 def_thread.odirect = !!atoi(parm);
2243                                 break;
2244                         case 'f':
2245                                 if (i + 1 >= argc) {
2246                                         printf("-f needs file as arg\n");
2247                                         break;
2248                                 }
2249                                 ini_file = strdup(argv[i+1]);
2250                                 i++;
2251                                 break;
2252                         case 'l':
2253                                 write_lat_log = 1;
2254                                 break;
2255                         case 'w':
2256                                 write_bw_log = 1;
2257                                 break;
2258                         default:
2259                                 printf("bad option %s\n", argv[i]);
2260                                 break;
2261                 }
2262         }
2263
2264         return i;
2265 }
2266
2267 static void print_thread_status(struct thread_data *td, int nr_running,
2268                                 int t_rate, int m_rate)
2269 {
2270         printf("Threads now running: %d", nr_running);
2271         if (m_rate || t_rate)
2272                 printf(", commitrate %d/%dKiB/sec", t_rate, m_rate);
2273         printf(" : [%s]\r", run_str);
2274         fflush(stdout);
2275 }
2276
2277 static void check_str_update(struct thread_data *td, int n, int t, int m)
2278 {
2279         char c = run_str[td->thread_number - 1];
2280
2281         if (td->runstate == td->old_runstate)
2282                 return;
2283
2284         switch (td->runstate) {
2285                 case TD_REAPED:
2286                         c = '_';
2287                         break;
2288                 case TD_EXITED:
2289                         c = 'E';
2290                         break;
2291                 case TD_RUNNING:
2292                         if (td_read(td)) {
2293                                 if (td->sequential)
2294                                         c = 'R';
2295                                 else
2296                                         c = 'r';
2297                         } else {
2298                                 if (td->sequential)
2299                                         c = 'W';
2300                                 else
2301                                         c = 'w';
2302                         }
2303                         break;
2304                 case TD_VERIFYING:
2305                         c = 'V';
2306                         break;
2307                 case TD_CREATED:
2308                         c = 'C';
2309                         break;
2310                 case TD_NOT_CREATED:
2311                         c = 'P';
2312                         break;
2313                 default:
2314                         printf("state %d\n", td->runstate);
2315         }
2316
2317         run_str[td->thread_number - 1] = c;
2318         print_thread_status(td, n, t, m);
2319         td->old_runstate = td->runstate;
2320 }
2321
2322 static void reap_threads(int *nr_running, int *t_rate, int *m_rate)
2323 {
2324         int i;
2325
2326         /*
2327          * reap exited threads (TD_EXITED -> TD_REAPED)
2328          */
2329         for (i = 0; i < thread_number; i++) {
2330                 struct thread_data *td = &threads[i];
2331
2332                 check_str_update(td, *nr_running, *t_rate, *m_rate);
2333
2334                 if (td->runstate != TD_EXITED)
2335                         continue;
2336
2337                 td_set_runstate(td, TD_REAPED);
2338                 waitpid(td->pid, NULL, 0);
2339                 (*nr_running)--;
2340                 (*m_rate) -= td->ratemin;
2341                 (*t_rate) -= td->rate;
2342                 check_str_update(td, *nr_running, *t_rate, *m_rate);
2343
2344                 if (td->terminate)
2345                         continue;
2346         }
2347 }
2348
2349 static void run_threads(char *argv[])
2350 {
2351         struct timeval genesis;
2352         struct thread_data *td;
2353         unsigned long spent;
2354         int i, todo, nr_running, m_rate, t_rate;
2355
2356         printf("Starting %d threads\n", thread_number);
2357         fflush(stdout);
2358
2359         signal(SIGINT, sig_handler);
2360
2361         todo = thread_number;
2362         nr_running = 0;
2363         m_rate = t_rate = 0;
2364
2365         for (i = 0; i < thread_number; i++) {
2366                 td = &threads[i];
2367
2368                 if (!td->create_serialize)
2369                         continue;
2370
2371                 /*
2372                  * do file setup here so it happens sequentially,
2373                  * we don't want X number of threads getting their
2374                  * client data interspersed on disk
2375                  */
2376                 if (setup_file(td)) {
2377                         td_set_runstate(td, TD_REAPED);
2378                         todo--;
2379                 }
2380         }
2381
2382         gettimeofday(&genesis, NULL);
2383
2384         while (todo) {
2385                 /*
2386                  * create threads (TD_NOT_CREATED -> TD_CREATED)
2387                  */
2388                 for (i = 0; i < thread_number; i++) {
2389                         td = &threads[i];
2390
2391                         if (td->runstate != TD_NOT_CREATED)
2392                                 continue;
2393
2394                         /*
2395                          * never got a chance to start, killed by other
2396                          * thread for some reason
2397                          */
2398                         if (td->terminate) {
2399                                 todo--;
2400                                 continue;
2401                         }
2402
2403                         if (td->start_delay) {
2404                                 spent = mtime_since_now(&genesis);
2405
2406                                 if (td->start_delay * 1000 > spent)
2407                                         continue;
2408                         }
2409
2410                         td_set_runstate(td, TD_CREATED);
2411                         check_str_update(td, nr_running, t_rate, m_rate);
2412                         sem_init(&startup_sem, 1, 1);
2413                         todo--;
2414
2415                         if (fork())
2416                                 sem_wait(&startup_sem);
2417                         else {
2418                                 thread_main(shm_id, i, argv);
2419                                 exit(0);
2420                         }
2421                 }
2422
2423                 /*
2424                  * start created threads (TD_CREATED -> TD_RUNNING)
2425                  */
2426                 for (i = 0; i < thread_number; i++) {
2427                         struct thread_data *td = &threads[i];
2428
2429                         if (td->runstate != TD_CREATED)
2430                                 continue;
2431
2432                         td_set_runstate(td, TD_RUNNING);
2433                         nr_running++;
2434                         m_rate += td->ratemin;
2435                         t_rate += td->rate;
2436                         check_str_update(td, nr_running, t_rate, m_rate);
2437                         sem_post(&td->mutex);
2438                 }
2439
2440                 for (i = 0; i < thread_number; i++) {
2441                         struct thread_data *td = &threads[i];
2442
2443                         if (td->runstate == TD_RUNNING)
2444                                 run_str[td->thread_number - 1] = '+';
2445                         else if (td->runstate == TD_VERIFYING)
2446                                 run_str[td->thread_number - 1] = 'V';
2447                         else
2448                                 continue;
2449
2450                         check_str_update(td, nr_running, t_rate, m_rate);
2451                 }
2452
2453                 reap_threads(&nr_running, &t_rate, &m_rate);
2454
2455                 if (todo)
2456                         usleep(100000);
2457         }
2458
2459         while (nr_running) {
2460                 reap_threads(&nr_running, &t_rate, &m_rate);
2461                 usleep(10000);
2462         }
2463 }
2464
2465 int setup_thread_area(void)
2466 {
2467         /*
2468          * 1024 is too much on some machines, scale max_jobs if
2469          * we get a failure that looks like too large a shm segment
2470          */
2471         do {
2472                 int s = max_jobs * sizeof(struct thread_data);
2473
2474                 shm_id = shmget(0, s, IPC_CREAT | 0600);
2475                 if (shm_id != -1)
2476                         break;
2477                 if (errno != EINVAL) {
2478                         perror("shmget");
2479                         break;
2480                 }
2481
2482                 max_jobs >>= 1;
2483         } while (max_jobs);
2484
2485         if (shm_id == -1)
2486                 return 1;
2487
2488         threads = shmat(shm_id, NULL, 0);
2489         if (threads == (void *) -1) {
2490                 perror("shmat");
2491                 return 1;
2492         }
2493
2494         atexit(free_shm);
2495         return 0;
2496 }
2497
2498 int main(int argc, char *argv[])
2499 {
2500         static unsigned long max_run[2], min_run[2];
2501         static unsigned long max_bw[2], min_bw[2];
2502         static unsigned long io_mb[2], agg[2];
2503         int i;
2504
2505         if (setup_thread_area())
2506                 return 1;
2507
2508         if (sched_getaffinity(getpid(), sizeof(cpu_set_t), &def_thread.cpumask) == -1) {
2509                 perror("sched_getaffinity");
2510                 return 1;
2511         }
2512
2513         /*
2514          * fill globals
2515          */
2516         def_thread.ddir = DDIR_READ;
2517         def_thread.bs = DEF_BS;
2518         def_thread.min_bs = -1;
2519         def_thread.max_bs = -1;
2520         def_thread.odirect = DEF_ODIRECT;
2521         def_thread.ratecycle = DEF_RATE_CYCLE;
2522         def_thread.sequential = DEF_SEQUENTIAL;
2523         def_thread.timeout = DEF_TIMEOUT;
2524         def_thread.create_file = DEF_CREATE;
2525         def_thread.overwrite = DEF_OVERWRITE;
2526         def_thread.invalidate_cache = DEF_INVALIDATE;
2527         def_thread.sync_io = DEF_SYNCIO;
2528         def_thread.mem_type = MEM_MALLOC;
2529         def_thread.bw_avg_time = DEF_BWAVGTIME;
2530         def_thread.create_serialize = DEF_CREATE_SER;
2531         def_thread.create_fsync = DEF_CREATE_FSYNC;
2532         def_thread.loops = DEF_LOOPS;
2533         def_thread.verify = DEF_VERIFY;
2534
2535         i = parse_options(argc, argv);
2536
2537         if (!ini_file) {
2538                 printf("Need job file\n");
2539                 return 1;
2540         }
2541
2542         if (parse_jobs_ini(ini_file))
2543                 return 1;
2544
2545         if (!thread_number) {
2546                 printf("Nothing to do\n");
2547                 return 1;
2548         }
2549
2550         run_threads(argv);
2551
2552         min_bw[0] = min_run[0] = ~0UL;
2553         min_bw[1] = min_run[1] = ~0UL;
2554         io_mb[0] = io_mb[1] = 0;
2555         agg[0] = agg[1] = 0;
2556         for (i = 0; i < thread_number; i++) {
2557                 struct thread_data *td = &threads[i];
2558                 unsigned long bw = 0;
2559
2560                 if (!td->error) {
2561                         if (td->runtime < min_run[td->ddir])
2562                                 min_run[td->ddir] = td->runtime;
2563                         if (td->runtime > max_run[td->ddir])
2564                                 max_run[td->ddir] = td->runtime;
2565
2566                         if (td->runtime)
2567                                 bw = td->io_bytes / td->runtime;
2568                         if (bw < min_bw[td->ddir])
2569                                 min_bw[td->ddir] = bw;
2570                         if (bw > max_bw[td->ddir])
2571                                 max_bw[td->ddir] = bw;
2572
2573                         io_mb[td->ddir] += td->io_bytes >> 20;
2574                 }
2575
2576                 show_thread_status(td);
2577         }
2578         
2579         if (max_run[0])
2580                 agg[0] = (io_mb[0] * 1024 * 1000) / max_run[0];
2581         if (max_run[1])
2582                 agg[1] = (io_mb[1] * 1024 * 1000) / max_run[1];
2583
2584         printf("\nRun status:\n");
2585         if (max_run[DDIR_READ])
2586                 printf("   READ: io=%luMiB, aggrb=%lu, minb=%lu, maxb=%lu, mint=%lumsec, maxt=%lumsec\n", io_mb[0], agg[0], min_bw[0], max_bw[0], min_run[0], max_run[0]);
2587         if (max_run[DDIR_WRITE])
2588                 printf("  WRITE: io=%luMiB, aggrb=%lu, minb=%lu, maxb=%lu, mint=%lumsec, maxt=%lumsec\n", io_mb[1], agg[1], min_bw[1], max_bw[1], min_run[1], max_run[1]);
2589
2590         return 0;
2591 }