[PATCH] fio: lots of missing global -> job inheritance
[disktools.git] / fio.c
1 /*
2  * fio - the flexible io tester
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <unistd.h>
24 #include <fcntl.h>
25 #include <string.h>
26 #include <errno.h>
27 #include <signal.h>
28 #include <time.h>
29 #include <ctype.h>
30 #include <sched.h>
31 #include <libaio.h>
32 #include <math.h>
33 #include <limits.h>
34 #include <sys/time.h>
35 #include <sys/types.h>
36 #include <sys/stat.h>
37 #include <sys/wait.h>
38 #include <semaphore.h>
39 #include <sys/ipc.h>
40 #include <sys/shm.h>
41 #include <asm/unistd.h>
42
43 #include "list.h"
44
/* Upper bound on the number of jobs; also sizes run_str and seeds max_jobs. */
#define MAX_JOBS        (1024)
46
/*
 * Fallback system call numbers for ioprio_set/ioprio_get, used only when
 * the system headers do not provide __NR_ioprio_set.
 *
 * assume we don't have _get either, if _set isn't defined
 *
 * Architectures that are not listed below fail the build.
 */
#ifndef __NR_ioprio_set
#if defined(__i386__)
#define __NR_ioprio_set         289
#define __NR_ioprio_get         290
#elif defined(__powerpc__) || defined(__powerpc64__)
#define __NR_ioprio_set         273
#define __NR_ioprio_get         274
#elif defined(__x86_64__)
#define __NR_ioprio_set         251
#define __NR_ioprio_get         252
#elif defined(__ia64__)
#define __NR_ioprio_set         1274
#define __NR_ioprio_get         1275
#elif defined(__alpha__)
#define __NR_ioprio_set         442
#define __NR_ioprio_get         443
#elif defined(__s390x__) || defined(__s390__)
#define __NR_ioprio_set         282
#define __NR_ioprio_get         283
#else
#error "Unsupported arch"
#endif
#endif
73
/*
 * Fallback system call number for fadvise64, used only when the system
 * headers do not provide it. Architectures that are not listed below fail
 * the build.
 */
#ifndef __NR_fadvise64
#if defined(__i386__)
#define __NR_fadvise64          250
#elif defined(__powerpc__) || defined(__powerpc64__)
#define __NR_fadvise64          233
#elif defined(__x86_64__)
#define __NR_fadvise64          221
#elif defined(__ia64__)
#define __NR_fadvise64          1234
#elif defined(__alpha__)
#define __NR_fadvise64          413
#elif defined(__s390x__) || defined(__s390__)
#define __NR_fadvise64          253
#else
#error "Unsupported arch"
#endif
#endif
91
/*
 * Invoke the raw ioprio_set system call with the given arguments and
 * return whatever syscall() returns.
 */
static int ioprio_set(int which, int who, int ioprio)
{
        return syscall(__NR_ioprio_set, which, who, ioprio);
}
96
/*
 * Pass an access-pattern hint for the given file range to the kernel.
 *
 * we want fadvise64 really, but it's so tangled... later
 *
 * Returns 0 on success, or -1 with errno set on failure. posix_fadvise()
 * reports failure by returning the error number and does not touch errno,
 * so the result is translated here to the -1/errno convention that callers
 * in this file (and the disabled raw syscall variant) rely on.
 */
static int fadvise(int fd, loff_t offset, size_t len, int advice)
{
#if 0
        return syscall(__NR_fadvise64, fd, offset, offset >> 32, len, advice);
#else
        int err = posix_fadvise(fd, (off_t) offset, len, advice);

        if (err) {
                errno = err;
                return -1;
        }

        return 0;
#endif
}
108
/*
 * Values for the "which" argument of ioprio_set(); thread_main() uses
 * IOPRIO_WHO_PROCESS.
 */
enum {
        IOPRIO_WHO_PROCESS = 1,
        IOPRIO_WHO_PGRP,
        IOPRIO_WHO_USER,
};
114
/* The priority class is stored above this bit position in an ioprio value. */
#define IOPRIO_CLASS_SHIFT      13

/* Alignment mask: io buffers are aligned to MASK + 1 (4096) bytes. */
#define MASK    (4095)

/*
 * Default job settings.
 * NOTE(review): presumably applied while parsing job options, which is
 * not visible in this part of the file - confirm.
 */
#define DEF_BS          (4096)
#define DEF_TIMEOUT     (30)
#define DEF_RATE_CYCLE  (1000)
#define DEF_ODIRECT     (1)
#define DEF_SEQUENTIAL  (1)
#define DEF_RAND_REPEAT (1)
#define DEF_OVERWRITE   (0)
#define DEF_CREATE      (1)
#define DEF_INVALIDATE  (1)

/*
 * Round buf up to the next MASK + 1 byte boundary. The whole expansion is
 * parenthesized so that the result can be used inside a larger expression
 * (e.g. ALIGN(p)[0]) without the cast binding to the wrong operand.
 */
#define ALIGN(buf)      ((char *) (((unsigned long) (buf) + MASK) & ~(MASK)))
130
/* nonzero: random jobs use the fixed seed instead of reading /dev/random */
static int repeatable = DEF_RAND_REPEAT;
/* nonzero: a job missing its minimum rate signals all jobs to terminate */
static int rate_quit = 1;

/* number of job slots handed out so far by get_new_job() */
static int thread_number;
static char *ini_file;

/* limit on thread_number enforced by get_new_job() */
static int max_jobs = MAX_JOBS;

/* one status character per job; add_job() marks a new job with 'P' */
static char run_str[MAX_JOBS + 1];

/* NOTE(review): presumably the SysV shm segment holding 'threads' - confirm */
static int shm_id;
142
/* Data direction of a job, stored in thread_data.ddir (see td_read()). */
enum {
        DDIR_READ = 0,
        DDIR_WRITE,
};
147
/*
 * thread life cycle
 *
 * Stored in thread_data.runstate; thread_main() sets TD_EXITED just before
 * it returns.
 */
enum {
        TD_NOT_CREATED = 0,
        TD_CREATED,
        TD_STARTED,
        TD_EXITED,
        TD_REAPED,
};
158
/*
 * The io unit
 *
 * One buffer plus the bookkeeping for a single io. The iocb must stay the
 * first member: ev_to_iou() converts a completed event's iocb pointer
 * straight back into a struct io_u pointer.
 */
struct io_u {
        struct iocb iocb;               /* libaio control block for this unit */
        struct timeval start_time;      /* taken when get_io_u() hands it out */
        struct timeval issue_time;      /* taken after a successful aio submit */

        void *mem;                      /* raw allocation, bs + MASK bytes */
        char *buf;                      /* mem rounded up with ALIGN() */
        unsigned int buflen;            /* io size, set to td->bs */
        off_t offset;                   /* file offset for this io */

        struct list_head list;          /* link on the free or busy list */
};
174
/*
 * Running statistics over a series of samples, filled by add_stat_sample()
 * and summarized by calc_lat().
 */
struct io_stat {
        unsigned long val;              /* sum of all samples */
        unsigned long val_sq;           /* sum of the squares of all samples */
        unsigned long max_val;          /* largest sample seen */
        unsigned long min_val;          /* smallest sample seen */
        unsigned long samples;          /* number of samples */
};
182
/* true when the job reads; false means it writes */
#define td_read(td)             ((td)->ddir == DDIR_READ)
/* only writers that do not use O_DIRECT are fsync'ed */
#define should_fsync(td)        (!td_read(td) && !(td)->odirect)
185
/*
 * Per-job state. The jobs live in the 'threads' array, which thread_main()
 * reaches through a shared memory attachment.
 */
struct thread_data {
        char file_name[256];            /* file the job does io against */
        int thread_number;              /* 1-based job number */
        int error;                      /* error code recorded on failure */
        int fd;                         /* open file, -1 when closed */
        pid_t pid;                      /* set by thread_main() via getpid() */
        volatile int terminate;         /* set by sig_handler() to stop the job */
        volatile int runstate;          /* one of the TD_* life cycle values */
        unsigned int ddir;              /* DDIR_READ or DDIR_WRITE */
        unsigned int ioprio;            /* (class << IOPRIO_CLASS_SHIFT) | prio */
        unsigned int sequential;        /* nonzero: sequential, zero: random */
        unsigned int bs;                /* block size in bytes */
        unsigned int odirect;           /* nonzero: open with O_DIRECT */
        unsigned int delay_sleep;       /* usec slept before each io, if set */
        unsigned int fsync_blocks;      /* sync io: fsync every this many blocks */
        unsigned int start_delay;
        unsigned int timeout;           /* run time limit in seconds */
        unsigned int use_aio;           /* nonzero: libaio, zero: read()/write() */
        unsigned int create_file;       /* nonzero: create the file if missing */
        unsigned int overwrite;         /* nonzero: writers keep/prefill the file */
        unsigned int invalidate_cache;  /* nonzero: fadvise DONTNEED before io */
        unsigned long long file_size;
        unsigned long long file_offset; /* added to every io offset */
        cpu_set_t cpumask;              /* passed to sched_setaffinity() */

        off_t cur_off;                  /* sync io: position after the last io */

        io_context_t aio_ctx;
        unsigned int aio_depth;         /* number of io units / queue depth */
        struct io_event *aio_events;    /* aio_depth completion slots */

        unsigned int cur_depth;         /* io units currently handed out */
        struct list_head io_u_freelist;
        struct list_head io_u_busylist;

        unsigned int rate;              /* target rate, KiB/sec */
        unsigned int ratemin;           /* minimum acceptable rate, KiB/sec */
        unsigned int ratecycle;         /* msec between minimum rate checks */
        unsigned long rate_usec_cycle;  /* usec budget per io at 'rate' */
        long rate_pending_usleep;       /* accumulated usec still to sleep */
        unsigned long rate_blocks;      /* io_blocks at the last rate check */
        struct timeval lastrate;        /* time of the last rate check */

        unsigned long runtime;          /* msec, as stored by thread_main() */
        unsigned long blocks;           /* number of bs-sized blocks to do */
        unsigned long io_blocks;        /* blocks completed so far */
        unsigned long last_block;       /* next block for sequential io */
        sem_t mutex;                    /* thread_main() waits on this to start */
        struct drand48_data random_state;

        /*
         * bandwidth and latency stats
         */
        struct io_stat clat_stat;               /* completion latency */
        struct io_stat slat_stat;               /* submission latency */

        struct io_stat bw_stat;                 /* bandwidth stats */
        unsigned long stat_io_blocks;   /* io_blocks at the last bw sample */
        struct timeval stat_sample_time;        /* time of the last bw sample */

        struct timeval start;           /* when the job began doing io */
};
248
/* array of job slots; get_new_job() hands out entries in order */
static struct thread_data *threads;
/* the "global" job: new jobs inherit their settings from it */
static struct thread_data def_thread;

/* posted by each job in thread_main() once its setup succeeded or failed */
static sem_t startup_sem;
253
254 static void sig_handler(int sig)
255 {
256         int i;
257
258         for (i = 0; i < thread_number; i++) {
259                 struct thread_data *td = &threads[i];
260
261                 td->terminate = 1;
262                 td->start_delay = 0;
263         }
264 }
265
266 static int init_random_state(struct thread_data *td)
267 {
268         unsigned long seed = 123;
269
270         if (td->sequential)
271                 return 0;
272
273         if (!repeatable) {
274                 int fd = open("/dev/random", O_RDONLY);
275
276                 if (fd == -1) {
277                         td->error = errno;
278                         return 1;
279                 }
280
281                 if (read(fd, &seed, sizeof(seed)) < (int) sizeof(seed)) {
282                         td->error = EIO;
283                         close(fd);
284                         return 1;
285                 }
286
287                 close(fd);
288         }
289
290         srand48_r(seed, &td->random_state);
291         return 0;
292 }
293
/*
 * Microseconds elapsed from *s to *e. The arithmetic is done in double and
 * truncated on return; e is expected not to lie before s.
 */
static unsigned long utime_since(struct timeval *s, struct timeval *e)
{
        double secs = e->tv_sec - s->tv_sec;
        double usecs = e->tv_usec - s->tv_usec;

        /* borrow one second when the microsecond part went negative */
        if (secs > 0 && usecs < 0) {
                secs -= 1;
                usecs += 1000000;
        }

        return secs * (double) 1000000 + usecs;
}
309
/*
 * Milliseconds elapsed from *s to *e. The arithmetic is done in double and
 * truncated on return, so fractions of a millisecond are dropped; e is
 * expected not to lie before s.
 */
static unsigned long mtime_since(struct timeval *s, struct timeval *e)
{
        double secs = e->tv_sec - s->tv_sec;
        double usecs = e->tv_usec - s->tv_usec;

        /* borrow one second when the microsecond part went negative */
        if (secs > 0 && usecs < 0) {
                secs -= 1;
                usecs += 1000000;
        }

        return secs * (double) 1000 + usecs / (double) 1000;
}
326
/*
 * Milliseconds elapsed from *s to the current time of day.
 */
static unsigned long mtime_since_now(struct timeval *s)
{
        struct timeval t;

        gettimeofday(&t, NULL);
        return mtime_since(s, &t);
}
334
/*
 * Convert the timestamp *s into a count of whole milliseconds.
 */
static inline unsigned long msec_now(struct timeval *s)
{
        unsigned long from_sec = s->tv_sec * 1000;
        unsigned long from_usec = s->tv_usec / 1000;

        return from_sec + from_usec;
}
339
340 static unsigned long get_next_offset(struct thread_data *td)
341 {
342         unsigned long b;
343         long r;
344
345         if (!td->sequential) {
346                 lrand48_r(&td->random_state, &r);
347                 b = (1+(double) (td->blocks-1) * r / (RAND_MAX+1.0));
348         } else {
349                 b = td->last_block;
350                 td->last_block++;
351         }
352
353         return b * td->bs + td->file_offset;
354 }
355
356 static inline void add_stat_sample(struct thread_data *td, struct io_stat *is,
357                                    unsigned long val)
358 {
359         if (val > is->max_val)
360                 is->max_val = val;
361         if (val < is->min_val)
362                 is->min_val = val;
363
364         is->val += val;
365         is->val_sq += val * val;
366         is->samples++;
367 }
368
/*
 * Record one completion latency sample, in milliseconds.
 */
static void add_clat_sample(struct thread_data *td, unsigned long msec)
{
        add_stat_sample(td, &td->clat_stat, msec);
}
373
/*
 * Record one submission latency sample, in milliseconds.
 */
static void add_slat_sample(struct thread_data *td, unsigned long msec)
{
        add_stat_sample(td, &td->slat_stat, msec);
}
378
379 static void add_bw_sample(struct thread_data *td, unsigned long msec)
380 {
381         unsigned long spent = mtime_since_now(&td->stat_sample_time);
382         unsigned long rate;
383
384         if (spent < 500)
385                 return;
386
387         rate = ((td->io_blocks - td->stat_io_blocks) * td->bs) / spent;
388         add_stat_sample(td, &td->bw_stat, rate);
389
390         gettimeofday(&td->stat_sample_time, NULL);
391         td->stat_io_blocks = td->io_blocks;
392 }
393
/*
 * Sleep for usec microseconds, resuming after signal interruptions.
 *
 * The request is split into whole seconds and nanoseconds: nanosleep()
 * rejects a tv_nsec of one second or more with EINVAL, so putting the
 * whole amount into tv_nsec silently skipped any sleep of >= 1 second
 * (rate_throttle() can ask for that) and could overflow int for large
 * values. Zero or negative requests return immediately.
 */
static void usec_sleep(int usec)
{
        struct timespec req, rem;

        if (usec <= 0)
                return;

        req.tv_sec = usec / 1000000;
        req.tv_nsec = (long) (usec % 1000000) * 1000;

        /* on EINTR nanosleep() stores the time still left in rem */
        while (nanosleep(&req, &rem) == -1 && errno == EINTR)
                req = rem;
}
408
409 static void rate_throttle(struct thread_data *td, unsigned long time_spent)
410 {
411         if (!td->rate)
412                 return;
413
414         if (time_spent < td->rate_usec_cycle) {
415                 unsigned long s = td->rate_usec_cycle - time_spent;
416
417                 td->rate_pending_usleep += s;
418                 if (td->rate_pending_usleep >= 100000) {
419                         usec_sleep(td->rate_pending_usleep);
420                         td->rate_pending_usleep = 0;
421                 }
422         } else {
423                 long overtime = time_spent - td->rate_usec_cycle;
424
425                 td->rate_pending_usleep -= overtime;
426         }
427 }
428
429 static int check_min_rate(struct thread_data *td, struct timeval *now)
430 {
431         unsigned long spent;
432         unsigned long rate;
433
434         /*
435          * allow a 2 second settle period in the beginning
436          */
437         if (mtime_since(&td->start, now) < 2000)
438                 return 0;
439
440         /*
441          * if rate blocks is set, sample is running
442          */
443         if (td->rate_blocks) {
444                 spent = mtime_since(&td->lastrate, now);
445                 if (spent < td->ratecycle)
446                         return 0;
447
448                 rate = ((td->io_blocks - td->rate_blocks) * td->bs) / spent;
449                 if (rate < td->ratemin) {
450                         printf("Client%d: min rate %d not met, got %ldKiB/sec\n", td->thread_number, td->ratemin, rate);
451                         if (rate_quit)
452                                 sig_handler(0);
453                         return 1;
454                 }
455         }
456
457         td->rate_blocks = td->io_blocks;
458         memcpy(&td->lastrate, now, sizeof(*now));
459         return 0;
460 }
461
462 static inline int runtime_exceeded(struct thread_data *td, struct timeval *t)
463 {
464         if (mtime_since(&td->start, t) >= td->timeout * 1000)
465                 return 1;
466
467         return 0;
468 }
469
/*
 * Give an io unit back: unlink it from the list it is on, put it on the
 * job's free list and lower the count of units in use.
 */
static void put_io_u(struct thread_data *td, struct io_u *io_u)
{
        list_del(&io_u->list);
        list_add(&io_u->list, &td->io_u_freelist);
        td->cur_depth--;
}
476
477 static struct io_u *get_io_u(struct thread_data *td)
478 {
479         struct io_u *io_u;
480
481         if (list_empty(&td->io_u_freelist))
482                 return NULL;
483
484         io_u = list_entry(td->io_u_freelist.next, struct io_u, list);
485         list_del(&io_u->list);
486         list_add(&io_u->list, &td->io_u_busylist);
487
488         io_u->offset = get_next_offset(td);
489
490         if (td->use_aio) {
491                 if (td_read(td))
492                         io_prep_pread(&io_u->iocb, td->fd, io_u->buf, io_u->buflen, io_u->offset);
493                 else
494                         io_prep_pwrite(&io_u->iocb, td->fd, io_u->buf, io_u->buflen, io_u->offset);
495         }
496
497         gettimeofday(&io_u->start_time, NULL);
498         td->cur_depth++;
499         return io_u;
500 }
501
502 static void do_sync_io(struct thread_data *td)
503 {
504         unsigned long blocks, msec, usec;
505         struct timeval e;
506
507         td->cur_off = 0;
508
509         for (blocks = 0; blocks < td->blocks; blocks++) {
510                 struct io_u *io_u;
511                 int ret;
512
513                 if (td->terminate)
514                         break;
515
516                 io_u = get_io_u(td);
517
518                 if (td->cur_off != io_u->offset) {
519                         if (lseek(td->fd, io_u->offset, SEEK_SET) == -1) {
520                                 td->error = errno;
521                                 break;
522                         }
523                 }
524
525                 if (td->delay_sleep)
526                         usec_sleep(td->delay_sleep);
527
528                 if (td_read(td))
529                         ret = read(td->fd, io_u->buf, io_u->buflen);
530                 else
531                         ret = write(td->fd, io_u->buf, io_u->buflen);
532
533                 if (ret < (int) io_u->buflen) {
534                         if (ret == -1)
535                                 td->error = errno;
536                         break;
537                 }
538
539                 td->io_blocks++;
540                 td->cur_off = io_u->offset + io_u->buflen;
541
542                 if (should_fsync(td) && td->fsync_blocks &&
543                     (td->io_blocks % td->fsync_blocks) == 0)
544                         fsync(td->fd);
545
546                 gettimeofday(&e, NULL);
547
548                 usec = utime_since(&io_u->start_time, &e);
549
550                 rate_throttle(td, usec);
551
552                 if (check_min_rate(td, &e)) {
553                         td->error = ENODATA;
554                         break;
555                 }
556
557                 msec = usec / 1000;
558                 add_clat_sample(td, msec);
559                 add_bw_sample(td, msec);
560
561                 if (runtime_exceeded(td, &e))
562                         break;
563
564                 put_io_u(td, io_u);
565         }
566
567         if (should_fsync(td))
568                 fsync(td->fd);
569 }
570
571 static int io_u_queue(struct thread_data *td, struct io_u *io_u)
572 {
573         struct iocb *iocb = &io_u->iocb;
574         int ret;
575
576         do {
577                 ret = io_submit(td->aio_ctx, 1, &iocb);
578                 if (ret == 1)
579                         return 0;
580                 else if (ret == EAGAIN)
581                         usleep(100);
582                 else if (ret == EINTR)
583                         continue;
584                 else
585                         break;
586         } while (1);
587
588         return ret;
589 }
590
#define iocb_time(iocb) ((unsigned long) (iocb)->data)
/*
 * Map a completion event back to its io unit. This relies on the iocb
 * being the first member of struct io_u. The expansion is fully
 * parenthesized so it can be used inside larger expressions.
 */
#define ev_to_iou(ev)   ((struct io_u *) ((unsigned long) (ev)->obj))
593
594 static void ios_completed(struct thread_data *td, int nr)
595 {
596         unsigned long msec;
597         struct io_u *io_u;
598         struct timeval e;
599         int i;
600
601         gettimeofday(&e, NULL);
602
603         for (i = 0; i < nr; i++) {
604                 td->io_blocks++;
605
606                 io_u = ev_to_iou(td->aio_events + i);
607
608                 msec = mtime_since(&io_u->issue_time, &e);
609
610                 add_clat_sample(td, msec);
611                 add_bw_sample(td, msec);
612
613                 put_io_u(td, io_u);
614         }
615 }
616
/*
 * Drain the io units that are still in flight when an aio run ends: reap
 * what has already completed, try to cancel the rest, then wait for
 * whatever could not be cancelled.
 */
static void cleanup_pending_aio(struct thread_data *td)
{
        struct timespec ts = { .tv_sec = 0, .tv_nsec = 0};
        struct list_head *entry, *n;
        struct io_u *io_u;
        int r;

        /*
         * get immediately available events, if any
         */
        r = io_getevents(td->aio_ctx, 0, td->cur_depth, td->aio_events, &ts);
        if (r > 0)
                ios_completed(td, r);

        /*
         * now cancel remaining active events
         */
        list_for_each_safe(entry, n, &td->io_u_busylist) {
                io_u = list_entry(entry, struct io_u, list);

                /* a unit is only released here when the cancel succeeded */
                r = io_cancel(td->aio_ctx, &io_u->iocb, td->aio_events);
                if (!r)
                        put_io_u(td, io_u);
        }

        /* block (no timeout) until every unit still in use has completed */
        if (td->cur_depth) {
                r = io_getevents(td->aio_ctx, td->cur_depth, td->cur_depth, td->aio_events, NULL);
                if (r > 0)
                        ios_completed(td, r);
        }
}
648
649 static void do_async_io(struct thread_data *td)
650 {
651         struct timeval s, e;
652         unsigned long blocks, usec;
653
654         for (blocks = 0; blocks < td->blocks; blocks++) {
655                 struct timespec ts = { .tv_sec = 0, .tv_nsec = 0};
656                 struct timespec *timeout;
657                 int ret, min_evts = 0;
658                 struct io_u *io_u;
659
660                 if (td->terminate)
661                         break;
662
663                 if (td->delay_sleep)
664                         usec_sleep(td->delay_sleep);
665
666                 io_u = get_io_u(td);
667
668                 memcpy(&s, &io_u->start_time, sizeof(s));
669
670                 ret = io_u_queue(td, io_u);
671                 if (ret) {
672                         put_io_u(td, io_u);
673                         td->error = ret;
674                         break;
675                 }
676
677                 gettimeofday(&io_u->issue_time, NULL);
678                 add_slat_sample(td, mtime_since(&io_u->start_time, &io_u->issue_time));
679
680                 if (td->cur_depth < td->aio_depth) {
681                         timeout = &ts;
682                         min_evts = 0;
683                 } else {
684                         timeout = NULL;
685                         min_evts = 1;
686                 }
687
688                 ret = io_getevents(td->aio_ctx, min_evts, td->cur_depth, td->aio_events, timeout);
689                 if (ret < 0) {
690                         td->error = errno;
691                         break;
692                 } else if (!ret)
693                         continue;
694
695                 ios_completed(td, ret);
696
697                 /*
698                  * the rate is batched for now, it should work for batches
699                  * of completions except the very first one which may look
700                  * a little bursty
701                  */
702                 gettimeofday(&e, NULL);
703                 usec = utime_since(&s, &e);
704
705                 rate_throttle(td, usec);
706
707                 if (check_min_rate(td, &e)) {
708                         td->error = ENODATA;
709                         break;
710                 }
711
712                 if (runtime_exceeded(td, &e))
713                         break;
714         }
715
716         if (td->cur_depth)
717                 cleanup_pending_aio(td);
718 }
719
720 static void cleanup_aio(struct thread_data *td)
721 {
722         io_destroy(td->aio_ctx);
723
724         if (td->aio_events)
725                 free(td->aio_events);
726 }
727
728 static int init_aio(struct thread_data *td)
729 {
730         if (io_queue_init(td->aio_depth, &td->aio_ctx)) {
731                 td->error = errno;
732                 return 1;
733         }
734
735         td->aio_events = malloc(td->aio_depth * sizeof(struct io_event));
736         return 0;
737 }
738
739 static void cleanup_io_u(struct thread_data *td)
740 {
741         struct list_head *entry, *n;
742         struct io_u *io_u;
743
744         list_for_each_safe(entry, n, &td->io_u_freelist) {
745                 io_u = list_entry(entry, struct io_u, list);
746
747                 list_del(&io_u->list);
748                 free(io_u->mem);
749                 free(io_u);
750         }
751 }
752
753 static void init_io_u(struct thread_data *td)
754 {
755         struct io_u *io_u;
756         int i, max_units;
757
758         if (!td->use_aio)
759                 max_units = 1;
760         else
761                 max_units = td->aio_depth;
762
763         INIT_LIST_HEAD(&td->io_u_freelist);
764         INIT_LIST_HEAD(&td->io_u_busylist);
765
766         for (i = 0; i < max_units; i++) {
767                 io_u = malloc(sizeof(*io_u));
768                 memset(io_u, 0, sizeof(*io_u));
769                 INIT_LIST_HEAD(&io_u->list);
770
771                 io_u->mem = malloc(td->bs + MASK);
772                 io_u->buf = ALIGN(io_u->mem);
773                 io_u->buflen = td->bs;
774
775                 list_add(&io_u->list, &td->io_u_freelist);
776         }
777 }
778
779 static int create_file(struct thread_data *td)
780 {
781         unsigned int i;
782         char *b;
783
784         /*
785          * unless specifically asked for overwrite, let normal io extend it
786          */
787         if (!td_read(td) && !td->overwrite)
788                 return 0;
789
790         if (!td->file_size) {
791                 fprintf(stderr, "Need size for create\n");
792                 td->error = EINVAL;
793                 return 1;
794         }
795
796         td->fd = open(td->file_name, O_WRONLY | O_CREAT | O_TRUNC, 0644);
797         if (td->fd < 0) {
798                 td->error = errno;
799                 return 1;
800         }
801
802         td->blocks = td->file_size / td->bs;
803         b = malloc(td->bs);
804         memset(b, 0, td->bs);
805
806         for (i = 0; i < td->blocks; i++) {
807                 int r = write(td->fd, b, td->bs);
808
809                 if (r == td->bs)
810                         continue;
811                 else {
812                         if (r < 0)
813                                 td->error = errno;
814                         else
815                                 td->error = EIO;
816
817                         break;
818                 }
819         }
820
821         fsync(td->fd);
822         close(td->fd);
823         td->fd = -1;
824         free(b);
825         return 0;
826 }
827
828 static int file_exists(struct thread_data *td)
829 {
830         struct stat st;
831
832         if (stat(td->file_name, &st) != -1)
833                 return 1;
834
835         return errno != ENOENT;
836 }
837
838 static int setup_file(struct thread_data *td)
839 {
840         struct stat st;
841         int flags = 0;
842
843         if (!file_exists(td)) {
844                 if (!td->create_file) {
845                         td->error = ENOENT;
846                         return 1;
847                 }
848                 if (create_file(td))
849                         return 1;
850         }
851
852         if (td->odirect)
853                 flags |= O_DIRECT;
854
855         if (td_read(td))
856                 td->fd = open(td->file_name, flags | O_RDONLY);
857         else {
858                 if (!td->overwrite)
859                         flags |= O_TRUNC;
860
861                 td->fd = open(td->file_name, flags | O_WRONLY | O_CREAT, 0600);
862         }
863
864         if (td->fd == -1) {
865                 td->error = errno;
866                 return 1;
867         }
868
869         if (td_read(td)) {
870                 if (fstat(td->fd, &st) == -1) {
871                         td->error = errno;
872                         return 1;
873                 }
874
875                 if (td->file_size > st.st_size)
876                         st.st_size = td->file_size;
877         } else {
878                 if (!td->file_size)
879                         td->file_size = 1024 * 1024 * 1024;
880
881                 st.st_size = td->file_size;
882         }
883
884         td->blocks = (st.st_size - td->file_offset) / td->bs;
885         if (!td->blocks) {
886                 fprintf(stderr, "Client%d: no io blocks\n", td->thread_number);
887                 td->error = EINVAL;
888                 return 1;
889         }
890
891         if (td->invalidate_cache) {
892                 if (fadvise(td->fd, 0, st.st_size, POSIX_FADV_DONTNEED) < 0) {
893                         td->error = errno;
894                         return 1;
895                 }
896         }
897
898         return 0;
899 }
900
/*
 * Body of one job. Attaches the shared segment shm_id, picks the job at
 * index 'offset' in it, performs the per-job setup, reports on
 * startup_sem, waits on the job's mutex and then runs the io loop.
 *
 * Whether setup succeeds or fails, startup_sem is posted exactly once and
 * the job waits on td->mutex once before finishing. Errors are recorded in
 * td->error. Always returns NULL.
 */
static void *thread_main(int shm_id, int offset, char *argv[])
{
        struct thread_data *td;
        int ret = 1;
        void *data;

        setsid();

        data = shmat(shm_id, NULL, 0);
        td = data + offset * sizeof(struct thread_data);
        td->pid = getpid();

        init_io_u(td);

        if (sched_setaffinity(td->pid, sizeof(td->cpumask), &td->cpumask) == -1) {
                td->error = errno;
                goto err;
        }

        /* overwrite argv[0] with a per-job name */
        sprintf(argv[0], "fio%d", offset);

        if (td->use_aio && init_aio(td))
                goto err;

        if (init_random_state(td))
                goto err;

        if (td->ioprio) {
                if (ioprio_set(IOPRIO_WHO_PROCESS, 0, td->ioprio) == -1) {
                        td->error = errno;
                        goto err;
                }
        }

        if (setup_file(td))
                goto err;

        /* setup done: report in, then wait to be released */
        sem_post(&startup_sem);
        sem_wait(&td->mutex);

        gettimeofday(&td->start, NULL);

        if (td->ratemin)
                memcpy(&td->lastrate, &td->start, sizeof(td->start));

        memcpy(&td->stat_sample_time, &td->start, sizeof(td->start));

        if (!td->use_aio)
                do_sync_io(td);
        else
                do_async_io(td);

        /* run time is kept in milliseconds */
        td->runtime = mtime_since_now(&td->start);
        ret = 0;
err:
        /* reached on success too; ret tells the two cases apart */
        if (td->use_aio)
                cleanup_aio(td);
        if (td->fd != -1) {
                close(td->fd);
                td->fd = -1;
        }
        cleanup_io_u(td);
        if (ret) {
                /* failed setup: do the same post/wait handshake as above */
                sem_post(&startup_sem);
                sem_wait(&td->mutex);
        }
        td->runstate = TD_EXITED;
        shmdt(data);
        return NULL;
}
971
/*
 * Detach the shared memory segment that holds the 'threads' array.
 */
static void free_shm(void)
{
        shmdt(threads);
}
976
977 static int calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max,
978                     double *mean, double *dev)
979 {
980         double n;
981
982         if (is->samples == 0)
983                 return 0;
984
985         *min = is->min_val;
986         *max = is->max_val;
987
988         n = (double) is->samples;
989         *mean = (double) is->val / n;
990         *dev = sqrt(((double) is->val_sq - (*mean * *mean) / n) / (n - 1));
991         return 1;
992 }
993
994 static void show_thread_status(struct thread_data *td)
995 {
996         int prio, prio_class;
997         unsigned long min, max, bw = 0;
998         double mean, dev;
999
1000         if (!td->io_blocks && !td->error)
1001                 return;
1002
1003         if (td->runtime)
1004                 bw = (td->io_blocks * td->bs) / td->runtime;
1005
1006         prio = td->ioprio & 0xff;
1007         prio_class = td->ioprio >> IOPRIO_CLASS_SHIFT;
1008
1009         printf("Client%d: err=%2d, io=%6luMiB, bw=%6luKiB/s\n", td->thread_number, td->error, td->io_blocks * td->bs >> 20, bw);
1010
1011         if (calc_lat(&td->slat_stat, &min, &max, &mean, &dev))
1012                 printf("  slat (msec): min=%5lu, max=%5lu, avg=%5.02f, dev=%5.02f\n", min, max, mean, dev);
1013         if (calc_lat(&td->clat_stat, &min, &max, &mean, &dev))
1014                 printf("  clat (msec): min=%5lu, max=%5lu, avg=%5.02f, dev=%5.02f\n", min, max, mean, dev);
1015         if (calc_lat(&td->bw_stat, &min, &max, &mean, &dev))
1016                 printf("  bw (KiB/s) : min=%5lu, max=%5lu, avg=%5.02f, dev=%5.02f\n", min, max, mean, dev);
1017 }
1018
1019 static int setup_rate(struct thread_data *td)
1020 {
1021         int nr_reads_per_sec;
1022
1023         if (!td->rate)
1024                 return 0;
1025
1026         if (td->rate < td->ratemin) {
1027                 fprintf(stderr, "min rate larger than nominal rate\n");
1028                 return -1;
1029         }
1030
1031         nr_reads_per_sec = td->rate * 1024 / td->bs;
1032         td->rate_usec_cycle = 1000000 / nr_reads_per_sec;
1033         td->rate_pending_usleep = 0;
1034         return 0;
1035 }
1036
1037 static struct thread_data *get_new_job(int global)
1038 {
1039         struct thread_data *td;
1040
1041         if (global)
1042                 return &def_thread;
1043         if (thread_number >= max_jobs)
1044                 return NULL;
1045
1046         td = &threads[thread_number++];
1047         memset(td, 0, sizeof(*td));
1048
1049         td->fd = -1;
1050         td->thread_number = thread_number;
1051
1052         td->ddir = def_thread.ddir;
1053         td->ioprio = def_thread.ioprio;
1054         td->sequential = def_thread.sequential;
1055         td->bs = def_thread.bs;
1056         td->odirect = def_thread.odirect;
1057         td->delay_sleep = def_thread.delay_sleep;
1058         td->fsync_blocks = def_thread.fsync_blocks;
1059         td->start_delay = def_thread.start_delay;
1060         td->timeout = def_thread.timeout;
1061         td->use_aio = def_thread.use_aio;
1062         td->create_file = def_thread.create_file;
1063         td->overwrite = def_thread.overwrite;
1064         td->invalidate_cache = def_thread.invalidate_cache;
1065         td->file_size = def_thread.file_size;
1066         td->file_size = def_thread.file_offset;
1067         td->rate = def_thread.rate;
1068         td->ratemin = def_thread.ratemin;
1069         td->ratecycle = def_thread.ratecycle;
1070         td->aio_depth = def_thread.aio_depth;
1071         memcpy(&td->cpumask, &def_thread.cpumask, sizeof(td->cpumask));
1072
1073         return td;
1074 }
1075
1076 static void put_job(struct thread_data *td)
1077 {
1078         memset(&threads[td->thread_number - 1], 0, sizeof(*td));
1079         thread_number--;
1080 }
1081
1082 static int add_job(struct thread_data *td, const char *filename, int prioclass,
1083                    int prio)
1084 {
1085         if (td == &def_thread)
1086                 return 0;
1087
1088         strcpy(td->file_name, filename);
1089         sem_init(&td->mutex, 1, 0);
1090         td->ioprio = (prioclass << IOPRIO_CLASS_SHIFT) | prio;
1091
1092         td->clat_stat.min_val = ULONG_MAX;
1093         td->slat_stat.min_val = ULONG_MAX;
1094         td->bw_stat.min_val = ULONG_MAX;
1095
1096         run_str[td->thread_number - 1] = 'P';
1097
1098         if (td->use_aio) {
1099                 if (!td->aio_depth)
1100                         td->aio_depth = 1;
1101
1102                 /*
1103                  * no buffered aio support for now
1104                  */
1105                 if (!td->odirect)
1106                         td->odirect = 1;
1107         }
1108
1109         if (setup_rate(td))
1110                 return -1;
1111
1112         printf("Client%d: file=%s, rw=%d, prio=%d/%d, seq=%d, odir=%d, bs=%d, rate=%d, aio=%d, aio_depth=%d\n", td->thread_number, filename, td->ddir, prioclass, prio, td->sequential, td->odirect, td->bs, td->rate, td->use_aio, td->aio_depth);
1113         return 0;
1114 }
1115
/*
 * Build a cpu set from the bits of 'cpu': bit N set selects cpu N.
 *
 * NOTE(review): 'cpumask' is received by value. Where cpu_set_t is a
 * structure type (as in glibc), the set filled in here is a private copy
 * and the caller's mask is left unchanged. Fixing that means passing a
 * pointer, which needs the callers to be changed at the same time.
 */
static void fill_cpu_mask(cpu_set_t cpumask, int cpu)
{
	unsigned int bits = cpu;
	unsigned int i;

	CPU_ZERO(&cpumask);

	for (i = 0; i < sizeof(int) * 8; i++) {
		/* unsigned shift, 1 << 31 on a signed int is undefined */
		if (bits & (1U << i))
			CPU_SET(i, &cpumask);
	}
}
1127
/*
 * Copy one option value from 'input' to 'output'. The value ends at the
 * first ',' or '}' or at the end of the string; the terminator itself is
 * not copied and 'output' is always nul terminated.
 *
 * No length is passed in, 'output' must be large enough for the value.
 */
static void fill_option(const char *input, char *output)
{
	const char *src = input;
	char *dst = output;

	for (; *src != '\0'; src++) {
		if (*src == ',' || *src == '}')
			break;
		*dst++ = *src;
	}

	*dst = '\0';
}
1140
/*
 * convert string after '=' into decimal value, noting any size suffix
 *
 * The number may be followed (optionally after blanks) by k/K, m/M or g/G
 * to scale it by 2^10, 2^20 or 2^30. Anything else after the number is
 * ignored.
 *
 * Returns 0 and stores the result in *val on success. Returns 1, leaving
 * *val untouched, if there is no '=', no digits after it, or the value
 * does not fit in an unsigned long long.
 */
static int str_cnv(char *p, unsigned long long *val)
{
	unsigned long long mult, num;
	char *str, *end;

	str = strstr(p, "=");
	if (!str)
		return 1;

	str++;

	/* errno must be clean for the range check to mean anything */
	errno = 0;
	num = strtoull(str, &end, 10);
	if (end == str)
		return 1;
	if (num == ULLONG_MAX && errno == ERANGE)
		return 1;

	/*
	 * the suffix is whatever directly follows the digits, so it is
	 * found no matter how (or if) the string is terminated
	 */
	while (*end == ' ' || *end == '\t')
		end++;

	switch (*end) {
		case 'k':
		case 'K':
			mult = 1024ULL;
			break;
		case 'm':
		case 'M':
			mult = 1024ULL * 1024;
			break;
		case 'g':
		case 'G':
			mult = 1024ULL * 1024 * 1024;
			break;
		default:
			mult = 1;
			break;
	}

	if (num > ULLONG_MAX / mult)
		return 1;

	*val = num * mult;
	return 0;
}
1181
1182 /*
1183  * job key words:
1184  *
1185  * file=
1186  * bs=
1187  * rw=
1188  * direct=
1189  */
1190 static void parse_jobs_cmd(int argc, char *argv[], int index)
1191 {
1192         struct thread_data *td;
1193         unsigned int prio, prioclass, cpu;
1194         char *string, *filename, *p, *c;
1195         int i;
1196
1197         string = malloc(256);
1198         filename = malloc(256);
1199
1200         for (i = index; i < argc; i++) {
1201                 p = argv[i];
1202
1203                 c = strpbrk(p, "{");
1204                 if (!c)
1205                         break;
1206
1207                 filename[0] = 0;
1208
1209                 td = get_new_job(0);
1210                 if (!td)
1211                         break;
1212
1213                 prioclass = 2;
1214                 prio = 4;
1215
1216                 c = strstr(p, "rw=");
1217                 if (c) {
1218                         c += 3;
1219                         if (*c == '0')
1220                                 td->ddir = DDIR_READ;
1221                         else
1222                                 td->ddir = DDIR_WRITE;
1223                 }
1224
1225                 c = strstr(p, "prio=");
1226                 if (c) {
1227                         c += 5;
1228                         prio = *c - '0';
1229                 }
1230
1231                 c = strstr(p, "prioclass=");
1232                 if (c) {
1233                         c += 10;
1234                         prioclass = *c - '0';
1235                 }
1236
1237                 c = strstr(p, "file=");
1238                 if (c) {
1239                         c += 5;
1240                         fill_option(c, filename);
1241                 }
1242
1243                 c = strstr(p, "bs=");
1244                 if (c) {
1245                         c += 3;
1246                         fill_option(c, string);
1247                         td->bs = strtoul(string, NULL, 10);
1248                         td->bs <<= 10;
1249                 }
1250
1251                 c = strstr(p, "direct=");
1252                 if (c) {
1253                         c += 7;
1254                         if (*c != '0')
1255                                 td->odirect = 1;
1256                         else
1257                                 td->odirect = 0;
1258                 }
1259
1260                 c = strstr(p, "delay=");
1261                 if (c) {
1262                         c += 6;
1263                         fill_option(c, string);
1264                         td->delay_sleep = strtoul(string, NULL, 10);
1265                 }
1266
1267                 c = strstr(p, "rate=");
1268                 if (c) {
1269                         c += 5;
1270                         fill_option(c, string);
1271                         td->rate = strtoul(string, NULL, 10);
1272                 }
1273
1274                 c = strstr(p, "ratemin=");
1275                 if (c) {
1276                         c += 8;
1277                         fill_option(c, string);
1278                         td->ratemin = strtoul(string, NULL, 10);
1279                 }
1280
1281                 c = strstr(p, "ratecycle=");
1282                 if (c) {
1283                         c += 10;
1284                         fill_option(c, string);
1285                         td->ratecycle = strtoul(string, NULL, 10);
1286                 }
1287
1288                 c = strstr(p, "cpumask=");
1289                 if (c) {
1290                         c += 8;
1291                         fill_option(c, string);
1292                         cpu = strtoul(string, NULL, 10);
1293                         fill_cpu_mask(td->cpumask, cpu);
1294                 }
1295
1296                 c = strstr(p, "fsync=");
1297                 if (c) {
1298                         c += 6;
1299                         fill_option(c, string);
1300                         td->fsync_blocks = strtoul(string, NULL, 10);
1301                 }
1302
1303                 c = strstr(p, "startdelay=");
1304                 if (c) {
1305                         c += 11;
1306                         fill_option(c, string);
1307                         td->start_delay = strtoul(string, NULL, 10);
1308                 }
1309
1310                 c = strstr(p, "timeout=");
1311                 if (c) {
1312                         c += 8;
1313                         fill_option(c, string);
1314                         td->timeout = strtoul(string, NULL, 10);
1315                 }
1316
1317                 c = strstr(p, "invalidate=");
1318                 if (c) {
1319                         c += 11;
1320                         if (*c != '0')
1321                                 td->invalidate_cache = 1;
1322                         else
1323                                 td->invalidate_cache = 0;
1324                 }
1325
1326                 c = strstr(p, "size=");
1327                 if (c) {
1328                         c += 5;
1329                         str_cnv(c, &td->file_size);
1330                 }
1331
1332                 c = strstr(p, "offset=");
1333                 if (c) {
1334                         c += 7;
1335                         str_cnv(c, &td->file_offset);
1336                 }
1337
1338                 c = strstr(p, "aio_depth=");
1339                 if (c) {
1340                         c += 10;
1341                         fill_option(c, string);
1342                         td->aio_depth = strtoul(string, NULL, 10);
1343                 }
1344
1345                 c = strstr(p, "aio");
1346                 if (c)
1347                         td->use_aio = 1;
1348
1349                 c = strstr(p, "create");
1350                 if (c)
1351                         td->create_file = 1;
1352
1353                 c = strstr(p, "overwrite");
1354                 if (c)
1355                         td->overwrite = 1;
1356
1357                 c = strstr(p, "random");
1358                 if (c)
1359                         td->sequential = 0;
1360                 c = strstr(p, "sequential");
1361                 if (c)
1362                         td->sequential = 1;
1363
1364                 if (add_job(td, filename, prioclass, prio))
1365                         put_job(td);
1366         }
1367
1368         free(string);
1369         free(filename);
1370 }
1371
/*
 * If 'name' occurs anywhere in line 'p', convert the value after the '='
 * with str_cnv() and pass on its result. Returns 1 if 'name' is not found.
 */
static int check_strcnv(char *p, char *name, unsigned long long *val)
{
	return strstr(p, name) ? str_cnv(p, val) : 1;
}
1379
/*
 * Parse a "name=value" line holding an integer.
 *
 * The line must start with 'name'; blanks around the '=' are optional.
 * Returns 0 and stores the number in *val on a match, 1 otherwise (in
 * which case *val is left alone).
 */
static int check_int(char *p, char *name, unsigned int *val)
{
	char fmt[128];
	int len;

	/*
	 * A blank in a scanf format matches any amount of white space,
	 * including none, so this one format accepts both "name=1" and
	 * "name = 1". %u is the conversion that goes with unsigned int.
	 */
	len = snprintf(fmt, sizeof(fmt), "%s = %%u", name);
	if (len < 0 || (size_t) len >= sizeof(fmt))
		return 1;

	if (sscanf(p, fmt, val) == 1)
		return 0;

	return 1;
}
1394
/*
 * Returns 1 if 'line' holds nothing but white space / control characters,
 * or if the first other character is the comment marker ';'. Returns 0 as
 * soon as any other character is seen first.
 */
static int is_empty_or_comment(char *line)
{
	const unsigned char *c;

	/*
	 * walk the string once; unsigned char keeps the ctype calls
	 * defined for characters with the high bit set
	 */
	for (c = (const unsigned char *) line; *c != '\0'; c++) {
		if (*c == ';')
			return 1;
		if (!isspace(*c) && !iscntrl(*c))
			return 0;
	}

	return 1;
}
1408
1409 static int parse_jobs_ini(char *file)
1410 {
1411         unsigned int prioclass, prio, cpu, global;
1412         struct thread_data *td;
1413         char *string, *name;
1414         fpos_t off;
1415         FILE *f;
1416         char *p;
1417
1418         f = fopen(file, "r");
1419         if (!f) {
1420                 perror("fopen");
1421                 return 1;
1422         }
1423
1424         string = malloc(4096);
1425         name = malloc(256);
1426
1427         while ((p = fgets(string, 4096, f)) != NULL) {
1428                 if (is_empty_or_comment(p))
1429                         continue;
1430                 if (sscanf(p, "[%s]", name) != 1)
1431                         continue;
1432
1433                 global = !strncmp(name, "global", 6);
1434
1435                 name[strlen(name) - 1] = '\0';
1436
1437                 td = get_new_job(global);
1438                 if (!td)
1439                         break;
1440
1441                 prioclass = 2;
1442                 prio = 4;
1443
1444                 fgetpos(f, &off);
1445                 while ((p = fgets(string, 4096, f)) != NULL) {
1446                         if (is_empty_or_comment(p))
1447                                 continue;
1448                         if (strstr(p, "["))
1449                                 break;
1450                         if (!check_int(p, "bs", &td->bs)) {
1451                                 td->bs <<= 10;
1452                                 fgetpos(f, &off);
1453                                 continue;
1454                         }
1455                         if (!check_int(p, "rw", &td->ddir)) {
1456                                 fgetpos(f, &off);
1457                                 continue;
1458                         }
1459                         if (!check_int(p, "prio", &prio)) {
1460                                 fgetpos(f, &off);
1461                                 continue;
1462                         }
1463                         if (!check_int(p, "prioclass", &prioclass)) {
1464                                 fgetpos(f, &off);
1465                                 continue;
1466                         }
1467                         if (!check_int(p, "direct", &td->odirect)) {
1468                                 fgetpos(f, &off);
1469                                 continue;
1470                         }
1471                         if (!check_int(p, "rate", &td->rate)) {
1472                                 fgetpos(f, &off);
1473                                 continue;
1474                         }
1475                         if (!check_int(p, "ratemin", &td->ratemin)) {
1476                                 fgetpos(f, &off);
1477                                 continue;
1478                         }
1479                         if (!check_int(p, "ratecycle", &td->ratecycle)) {
1480                                 fgetpos(f, &off);
1481                                 continue;
1482                         }
1483                         if (!check_int(p, "delay", &td->delay_sleep)) {
1484                                 fgetpos(f, &off);
1485                                 continue;
1486                         }
1487                         if (!check_int(p, "cpumask", &cpu)) {
1488                                 fill_cpu_mask(td->cpumask, cpu);
1489                                 fgetpos(f, &off);
1490                                 continue;
1491                         }
1492                         if (!check_int(p, "fsync", &td->fsync_blocks)) {
1493                                 fgetpos(f, &off);
1494                                 continue;
1495                         }
1496                         if (!check_int(p, "startdelay", &td->start_delay)) {
1497                                 fgetpos(f, &off);
1498                                 continue;
1499                         }
1500                         if (!check_int(p, "timeout", &td->timeout)) {
1501                                 fgetpos(f, &off);
1502                                 continue;
1503                         }
1504                         if (!check_int(p, "invalidate",&td->invalidate_cache)) {
1505                                 fgetpos(f, &off);
1506                                 continue;
1507                         }
1508                         if (!check_int(p, "aio_depth", &td->aio_depth)) {
1509                                 fgetpos(f, &off);
1510                                 continue;
1511                         }
1512                         if (!check_strcnv(p, "size", &td->file_size)) {
1513                                 fgetpos(f, &off);
1514                                 continue;
1515                         }
1516                         if (!check_strcnv(p, "offset", &td->file_offset)) {
1517                                 fgetpos(f, &off);
1518                                 continue;
1519                         }
1520                         if (!strncmp(p, "sequential", 10)) {
1521                                 td->sequential = 1;
1522                                 fgetpos(f, &off);
1523                                 continue;
1524                         }
1525                         if (!strncmp(p, "random", 6)) {
1526                                 td->sequential = 0;
1527                                 fgetpos(f, &off);
1528                                 continue;
1529                         }
1530                         if (!strncmp(p, "aio", 3)) {
1531                                 td->use_aio = 1;
1532                                 fgetpos(f, &off);
1533                                 continue;
1534                         }
1535                         if (!strncmp(p, "create", 6)) {
1536                                 td->create_file = 1;
1537                                 fgetpos(f, &off);
1538                                 continue;
1539                         }
1540                         if (!strncmp(p, "overwrite", 9)) {
1541                                 td->overwrite = 1;
1542                                 fgetpos(f, &off);
1543                                 continue;
1544                         }
1545                         printf("Client%d: bad option %s\n",td->thread_number,p);
1546                 }
1547                 fsetpos(f, &off);
1548
1549                 if (add_job(td, name, prioclass, prio))
1550                         put_job(td);
1551         }
1552
1553         free(string);
1554         free(name);
1555         fclose(f);
1556         return 0;
1557 }
1558
1559 static int parse_options(int argc, char *argv[])
1560 {
1561         int i;
1562
1563         for (i = 1; i < argc; i++) {
1564                 char *parm = argv[i];
1565
1566                 if (parm[0] != '-')
1567                         break;
1568
1569                 parm++;
1570                 switch (*parm) {
1571                         case 's':
1572                                 parm++;
1573                                 def_thread.sequential = !!atoi(parm);
1574                                 break;
1575                         case 'b':
1576                                 parm++;
1577                                 def_thread.bs = atoi(parm);
1578                                 def_thread.bs <<= 10;
1579                                 if (!def_thread.bs) {
1580                                         printf("bad block size\n");
1581                                         def_thread.bs = DEF_BS;
1582                                 }
1583                                 break;
1584                         case 't':
1585                                 parm++;
1586                                 def_thread.timeout = atoi(parm);
1587                                 break;
1588                         case 'r':
1589                                 parm++;
1590                                 repeatable = !!atoi(parm);
1591                                 break;
1592                         case 'R':
1593                                 parm++;
1594                                 rate_quit = !!atoi(parm);
1595                                 break;
1596                         case 'o':
1597                                 parm++;
1598                                 def_thread.odirect = !!atoi(parm);
1599                                 break;
1600                         case 'f':
1601                                 if (i + 1 >= argc) {
1602                                         printf("-f needs file as arg\n");
1603                                         break;
1604                                 }
1605                                 ini_file = strdup(argv[i+1]);
1606                                 i++;
1607                                 break;
1608                         default:
1609                                 printf("bad option %s\n", argv[i]);
1610                                 break;
1611                 }
1612         }
1613
1614         return i;
1615 }
1616
1617 static void print_thread_status(struct thread_data *td, int nr_running,
1618                                 int t_rate, int m_rate)
1619 {
1620         printf("Threads now running: %d", nr_running);
1621         if (m_rate || t_rate)
1622                 printf(", commitrate %d/%dKiB/sec", t_rate, m_rate);
1623         printf(" : [%s]\r", run_str);
1624         fflush(stdout);
1625 }
1626
1627 static void reap_threads(int *nr_running, int *t_rate, int *m_rate)
1628 {
1629         int i;
1630
1631         /*
1632          * reap exited threads (TD_EXITED -> TD_REAPED)
1633          */
1634         for (i = 0; i < thread_number; i++) {
1635                 struct thread_data *td = &threads[i];
1636
1637                 if (td->runstate != TD_EXITED)
1638                         continue;
1639
1640                 td->runstate = TD_REAPED;
1641                 run_str[td->thread_number - 1] = '_';
1642                 waitpid(td->pid, NULL, 0);
1643                 (*nr_running)--;
1644                 (*m_rate) -= td->ratemin;
1645                 (*t_rate) -= td->rate;
1646
1647                 if (td->terminate)
1648                         continue;
1649
1650                 print_thread_status(td, *nr_running, *t_rate, *m_rate);
1651         }
1652 }
1653
1654 static void run_threads(char *argv[])
1655 {
1656         struct timeval genesis;
1657         struct thread_data *td;
1658         unsigned long spent;
1659         int i, todo, nr_running, m_rate, t_rate;
1660
1661         gettimeofday(&genesis, NULL);
1662
1663         printf("Starting %d threads\n", thread_number);
1664         fflush(stdout);
1665
1666         signal(SIGINT, sig_handler);
1667
1668         todo = thread_number;
1669         nr_running = 0;
1670         m_rate = t_rate = 0;
1671
1672         while (todo) {
1673                 /*
1674                  * create threads (TD_NOT_CREATED -> TD_CREATED)
1675                  */
1676                 for (i = 0; i < thread_number; i++) {
1677                         td = &threads[i];
1678
1679                         if (td->runstate != TD_NOT_CREATED)
1680                                 continue;
1681
1682                         /*
1683                          * never got a chance to start, killed by other
1684                          * thread for some reason
1685                          */
1686                         if (td->terminate) {
1687                                 todo--;
1688                                 continue;
1689                         }
1690
1691                         if (td->start_delay) {
1692                                 spent = mtime_since_now(&genesis);
1693
1694                                 if (td->start_delay * 1000 > spent)
1695                                         continue;
1696                         }
1697
1698                         td->runstate = TD_CREATED;
1699                         run_str[td->thread_number - 1] = 'C';
1700                         sem_init(&startup_sem, 1, 1);
1701                         todo--;
1702
1703                         if (fork())
1704                                 sem_wait(&startup_sem);
1705                         else {
1706                                 thread_main(shm_id, i, argv);
1707                                 exit(0);
1708                         }
1709                 }
1710
1711                 /*
1712                  * start created threads (TD_CREATED -> TD_STARTED)
1713                  */
1714                 for (i = 0; i < thread_number; i++) {
1715                         struct thread_data *td = &threads[i];
1716
1717                         if (td->runstate != TD_CREATED)
1718                                 continue;
1719
1720                         td->runstate = TD_STARTED;
1721                         run_str[td->thread_number - 1] = '+';
1722                         nr_running++;
1723                         m_rate += td->ratemin;
1724                         t_rate += td->rate;
1725                         sem_post(&td->mutex);
1726
1727                         print_thread_status(td, nr_running, t_rate, m_rate);
1728                 }
1729
1730                 reap_threads(&nr_running, &t_rate, &m_rate);
1731
1732                 if (todo)
1733                         usleep(100000);
1734         }
1735
1736         while (nr_running) {
1737                 reap_threads(&nr_running, &t_rate, &m_rate);
1738                 usleep(10000);
1739         }
1740 }
1741
1742 int setup_thread_area(void)
1743 {
1744         /*
1745          * 1024 is too much on some machines, scale max_jobs if
1746          * we get a failure that looks like too large a shm segment
1747          */
1748         do {
1749                 int s = max_jobs * sizeof(struct thread_data);
1750
1751                 shm_id = shmget(0, s, IPC_CREAT | 0600);
1752                 if (shm_id != -1)
1753                         break;
1754                 if (errno != EINVAL) {
1755                         perror("shmget");
1756                         break;
1757                 }
1758
1759                 max_jobs >>= 1;
1760         } while (max_jobs);
1761
1762         if (shm_id == -1)
1763                 return 1;
1764
1765         threads = shmat(shm_id, NULL, 0);
1766         if (threads == (void *) -1) {
1767                 perror("shmat");
1768                 return 1;
1769         }
1770
1771         atexit(free_shm);
1772         return 0;
1773 }
1774
1775 int main(int argc, char *argv[])
1776 {
1777         static unsigned long max_run[2], min_run[2], total_blocks[2];
1778         static unsigned long max_bw[2], min_bw[2];
1779         static unsigned long read_mb, write_mb, read_agg, write_agg;
1780         int i;
1781
1782         if (setup_thread_area())
1783                 return 1;
1784
1785         if (sched_getaffinity(getpid(), sizeof(cpu_set_t), &def_thread.cpumask) == -1) {
1786                 perror("sched_getaffinity");
1787                 return 1;
1788         }
1789
1790         /*
1791          * fill globals
1792          */
1793         def_thread.ddir = DDIR_READ;
1794         def_thread.bs = DEF_BS;
1795         def_thread.odirect = DEF_ODIRECT;
1796         def_thread.ratecycle = DEF_RATE_CYCLE;
1797         def_thread.sequential = DEF_SEQUENTIAL;
1798         def_thread.timeout = DEF_TIMEOUT;
1799         def_thread.create_file = DEF_CREATE;
1800         def_thread.overwrite = DEF_OVERWRITE;
1801         def_thread.invalidate_cache = DEF_INVALIDATE;
1802
1803         i = parse_options(argc, argv);
1804
1805         if (ini_file) {
1806                 if (parse_jobs_ini(ini_file))
1807                         return 1;
1808         } else
1809                 parse_jobs_cmd(argc, argv, i);
1810
1811         if (!thread_number) {
1812                 printf("Nothing to do\n");
1813                 return 1;
1814         }
1815
1816         run_threads(argv);
1817
1818         min_bw[0] = min_run[0] = ~0UL;
1819         min_bw[1] = min_run[1] = ~0UL;
1820         for (i = 0; i < thread_number; i++) {
1821                 struct thread_data *td = &threads[i];
1822                 unsigned long bw = 0;
1823
1824                 if (td->error)
1825                         goto show_stat;
1826
1827                 if (td->runtime < min_run[td->ddir])
1828                         min_run[td->ddir] = td->runtime;
1829                 if (td->runtime > max_run[td->ddir])
1830                         max_run[td->ddir] = td->runtime;
1831
1832                 if (td->runtime)
1833                         bw = (td->io_blocks * td->bs) / td->runtime;
1834                 if (bw < min_bw[td->ddir])
1835                         min_bw[td->ddir] = bw;
1836                 if (bw > max_bw[td->ddir])
1837                         max_bw[td->ddir] = bw;
1838
1839                 total_blocks[td->ddir] += td->io_blocks;
1840
1841                 if (td_read(td)) {
1842                         read_mb += (td->bs * td->io_blocks) >> 20;
1843                         if (td->runtime)
1844                                 read_agg += (td->io_blocks * td->bs) / td->runtime;
1845                 } else {
1846                         write_mb += (td->bs * td->io_blocks) >> 20;
1847                         if (td->runtime)
1848                                 write_agg += (td->io_blocks * td->bs) / td->runtime;
1849                 }
1850
1851 show_stat:
1852                 show_thread_status(td);
1853         }
1854
1855         printf("\nRun status:\n");
1856         if (max_run[DDIR_READ])
1857                 printf("   READ: io=%luMiB, aggrb=%lu, minb=%lu, maxb=%lu, mint=%lumsec, maxt=%lumsec\n", read_mb, read_agg, min_bw[0], max_bw[0], min_run[0], max_run[0]);
1858         if (max_run[DDIR_WRITE])
1859                 printf("  WRITE: io=%luMiB, aggrb=%lu, minb=%lu, maxb=%lu, mint=%lumsec, maxt=%lumsec\n", write_mb, write_agg, min_bw[1], max_bw[1], min_run[1], max_run[1]);
1860
1861         return 0;
1862 }