[PATCH] fio: add cpu usage stats
[disktools.git] / fio.c
1 /*
2  * fio - the flexible io tester
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <unistd.h>
24 #include <fcntl.h>
25 #include <string.h>
26 #include <errno.h>
27 #include <signal.h>
28 #include <time.h>
29 #include <ctype.h>
30 #include <sched.h>
31 #include <libaio.h>
32 #include <math.h>
33 #include <limits.h>
34 #include <assert.h>
35 #include <sys/time.h>
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/wait.h>
39 #include <semaphore.h>
40 #include <sys/ipc.h>
41 #include <sys/shm.h>
42 #include <sys/ioctl.h>
43 #include <asm/unistd.h>
44 #include <asm/types.h>
45 #include <asm/bitops.h>
46
47 #include "list.h"
48 #include "md5.h"
49
/*
 * Fallback definition of the BLKGETSIZE64 ioctl request, used only when the
 * system headers did not already define it.
 */
50 #ifndef BLKGETSIZE64
51 #define BLKGETSIZE64    _IOR(0x12,114,size_t)
52 #endif
53
/* initial value of max_jobs; also sizes the run_str buffer (MAX_JOBS + 1) */
54 #define MAX_JOBS        (1024)
55
56 /*
57  * assume we don't have _get either, if _set isn't defined
58  *
   * Per-architecture syscall numbers for ioprio_set/ioprio_get, supplied
   * only when the system headers do not define __NR_ioprio_set. The build
   * is stopped with #error on an architecture that is not listed.
   */
59 #ifndef __NR_ioprio_set
60 #if defined(__i386__)
61 #define __NR_ioprio_set         289
62 #define __NR_ioprio_get         290
63 #elif defined(__powerpc__) || defined(__powerpc64__)
64 #define __NR_ioprio_set         273
65 #define __NR_ioprio_get         274
66 #elif defined(__x86_64__)
67 #define __NR_ioprio_set         251
68 #define __NR_ioprio_get         252
69 #elif defined(__ia64__)
70 #define __NR_ioprio_set         1274
71 #define __NR_ioprio_get         1275
72 #elif defined(__alpha__)
73 #define __NR_ioprio_set         442
74 #define __NR_ioprio_get         443
75 #elif defined(__s390x__) || defined(__s390__)
76 #define __NR_ioprio_set         282
77 #define __NR_ioprio_get         283
78 #else
79 #error "Unsupported arch"
80 #endif
81 #endif
82
/*
 * Per-architecture syscall number for fadvise64, supplied only when the
 * system headers do not define it. Only the disabled (#if 0) branch of
 * fadvise() refers to it.
 */
83 #ifndef __NR_fadvise64
84 #if defined(__i386__)
85 #define __NR_fadvise64          250
86 #elif defined(__powerpc__) || defined(__powerpc64__)
87 #define __NR_fadvise64          233
88 #elif defined(__x86_64__)
89 #define __NR_fadvise64          221
90 #elif defined(__ia64__)
91 #define __NR_fadvise64          1234
92 #elif defined(__alpha__)
93 #define __NR_fadvise64          413
94 #elif defined(__s390x__) || defined(__s390__)
95 #define __NR_fadvise64          253
96 #else
97 #error "Unsupported arch"
98 #endif
99 #endif
100
/*
 * Invoke the ioprio_set system call directly through syscall() and hand
 * its result back to the caller unchanged.
 */
static int ioprio_set(int which, int who, int ioprio)
{
        long rc = syscall(__NR_ioprio_set, which, who, ioprio);

        return rc;
}
105
106 /*
107  * we want fadvise64 really, but it's so tangled... later
108  */
109 static int fadvise(int fd, loff_t offset, size_t len, int advice)
110 {
111 #if 0
112         return syscall(__NR_fadvise64, fd, offset, offset >> 32, len, advice);
113 #else
114         return posix_fadvise(fd, (off_t) offset, len, advice);
115 #endif
116 }
117
/*
 * IOPRIO_WHO_* selectors; presumably the target-kind argument handed to
 * ioprio_set() - no use is visible in this part of the file.
 */
118 enum {
119         IOPRIO_WHO_PROCESS = 1,
120         IOPRIO_WHO_PGRP,
121         IOPRIO_WHO_USER,
122 };
123
/* presumably the bit position of the class inside an ioprio value - TODO confirm at the use site */
124 #define IOPRIO_CLASS_SHIFT      13
125
/* low-bit mask (4096 - 1) used by ALIGN() to round addresses up */
126 #define MASK    (4095)
127
/*
 * Default option values. DEF_RAND_REPEAT initializes 'repeatable' and
 * DEF_RANDSEED is the fixed seed init_random_state() uses when repeatable
 * is set; the others presumably seed the thread_data fields of matching
 * name (not visible in this part of the file).
 */
128 #define DEF_BS          (4096)
129 #define DEF_TIMEOUT     (0)
130 #define DEF_RATE_CYCLE  (1000)
131 #define DEF_ODIRECT     (1)
132 #define DEF_SEQUENTIAL  (1)
133 #define DEF_RAND_REPEAT (1)
134 #define DEF_OVERWRITE   (0)
135 #define DEF_CREATE      (1)
136 #define DEF_INVALIDATE  (1)
137 #define DEF_SYNCIO      (0)
138 #define DEF_RANDSEED    (0xb1899bedUL)
139 #define DEF_BWAVGTIME   (500)
140 #define DEF_CREATE_SER  (1)
141 #define DEF_CREATE_FSYNC        (1)
142 #define DEF_LOOPS       (1)
143 #define DEF_VERIFY      (0)
144
/* round the address 'buf' up to the next multiple of MASK + 1 and yield it as a char pointer */
145 #define ALIGN(buf)      (char *) (((unsigned long) (buf) + MASK) & ~(MASK))
146
/* when set, init_random_state() seeds the random offset generator with DEF_RANDSEED */
147 static int repeatable = DEF_RAND_REPEAT;
/* when set, check_min_rate() asks every thread to terminate once a minimum rate is missed */
148 static int rate_quit = 1;
149 static int write_lat_log;
150 static int write_bw_log;
151 static int exitall_on_terminate;
152
/* number of entries in 'threads' that sig_handler() walks */
153 static int thread_number;
154 static char *ini_file;
155
156 static int max_jobs = MAX_JOBS;
157
158 static char run_str[MAX_JOBS + 1];
159
160 static int shm_id;
161
/* data direction of a job; compared against thread_data.ddir by td_read()/td_write() */
162 enum {
163         DDIR_READ = 0,
164         DDIR_WRITE,
165 };
166
167 /*
168  * thread life cycle
   *
   * Values stored in thread_data.runstate through td_set_runstate().
169  */
170 enum {
171         TD_NOT_CREATED = 0,
172         TD_CREATED,
173         TD_RUNNING,
174         TD_VERIFYING,
175         TD_EXITED,
176         TD_REAPED,
177 };
178
/* buffer memory backing kinds; presumably the values of thread_data.mem_type - TODO confirm */
179 enum {
180         MEM_MALLOC,
181         MEM_SHM,
182 };
183
184 /*
185  * The io unit
   *
   * One I/O request together with its data buffer. Units move between a
   * thread's io_u_freelist and io_u_busylist through 'list'.
186  */
187 struct io_u {
188         struct iocb iocb;       /* libaio control block, prepared in get_io_u() for aio jobs */
189         struct timeval start_time;      /* stamped at the end of get_io_u() */
190         struct timeval issue_time;      /* completion latency is measured from here in ios_completed() */
191
192         char *buf;
193         unsigned int buflen;
194         unsigned long long offset;
195
196         struct list_head list;
197 };
198
/* running statistics over a series of samples, updated by add_stat_sample() */
199 struct io_stat {
200         unsigned long val;      /* sum of all samples */
201         unsigned long val_sq;   /* sum of the squares of all samples */
202         unsigned long max_val;
203         unsigned long min_val;
204         unsigned long samples;  /* number of samples added */
205 };
206
/* one logged sample, filled in by add_log_sample() */
207 struct io_sample {
208         unsigned long time;     /* msec since the thread's start time */
209         unsigned long val;
210 };
211
/* growable array of samples; add_log_sample() doubles it when it is full */
212 struct io_log {
213         unsigned long nr_samples;       /* entries in use */
214         unsigned long max_samples;      /* entries allocated */
215         struct io_sample *log;
216 };
217
/* record of one completed write, queued on thread_data.io_hist_list for later verification */
218 struct io_piece {
219         struct list_head list;
220         unsigned long long offset;
221         unsigned int len;
222 };
223
/* magic value stored in verify_header.fio_magic and checked by verify_io_u() */
224 #define FIO_HDR_MAGIC   0xf00baaef
225
/*
 * Header written at the start of each buffer by populate_io_u() when
 * verification is enabled.
 */
226 struct verify_header {
227         unsigned int fio_magic;
228         unsigned int len;       /* buffer length, header included */
229         char md5_digest[MD5_HASH_WORDS * 4];    /* md5 of the bytes following the header */
230 };
231
/* direction tests on a thread's job */
232 #define td_read(td)             ((td)->ddir == DDIR_READ)
233 #define td_write(td)            ((td)->ddir == DDIR_WRITE)
/* true for writers that do not use O_DIRECT */
234 #define should_fsync(td)        (td_write(td) && !(td)->odirect)
235
/*
 * Random map geometry: one bit per min_bs-sized block, packed into longs.
 * RAND_MAP_IDX gives the word holding a block's bit, RAND_MAP_BIT the bit
 * position inside that word.
 */
236 #define BLOCKS_PER_MAP          (8 * sizeof(long))
237 #define RAND_MAP_IDX(sector)    ((sector) / BLOCKS_PER_MAP)
238 #define RAND_MAP_BIT(sector)    ((sector) & (BLOCKS_PER_MAP - 1))
239
/*
 * Per-job state: the job's settings, its I/O position and io_u lists,
 * rate control bookkeeping and collected statistics. Field notes below
 * describe only what the functions in this part of the file do with them.
 */
240 struct thread_data {
241         char file_name[256];
242         int thread_number;
243         int error;              /* errno-style code recorded when an operation fails */
244         int fd;
245         pid_t pid;
246         char *orig_buffer;
247         volatile int terminate; /* set by sig_handler(); polled by the I/O loops */
248         volatile int runstate;
249         volatile int old_runstate;      /* previous runstate, saved by td_set_runstate() */
250         unsigned int ddir;
251         unsigned int ioprio;
252         unsigned int sequential;
253         unsigned int bs;
254         unsigned int min_bs;    /* also the block granularity of the random map */
255         unsigned int max_bs;
256         unsigned int odirect;
257         unsigned int thinktime; /* passed to usec_sleep() after each sync I/O */
258         unsigned int fsync_blocks;
259         unsigned int start_delay;
260         unsigned int timeout;   /* compared in seconds by runtime_exceeded(); 0 disables */
261         unsigned int use_aio;
262         unsigned int create_file;
263         unsigned int overwrite;
264         unsigned int invalidate_cache;
265         unsigned int bw_avg_time;       /* minimum msec between bandwidth samples */
266         unsigned int create_serialize;
267         unsigned int create_fsync;
268         unsigned int loops;
269         unsigned long long file_size;
270         unsigned long long file_offset; /* added to every offset from get_next_offset() */
271         unsigned int sync_io;
272         unsigned int mem_type;
273         unsigned int verify;
274         cpu_set_t cpumask;
275
276         struct drand48_data bsrange_state;      /* random state for get_next_buflen() */
277         struct drand48_data verify_state;       /* random state for fill_random_bytes() */
278
279         int shm_id;
280
281         off_t cur_off;          /* offset just past the last sync I/O; avoids redundant lseek() */
282
283         io_context_t aio_ctx;
284         unsigned int aio_depth;
285         struct io_event *aio_events;
286
287         unsigned int cur_depth; /* io_u's currently taken from the free list */
288         struct list_head io_u_freelist;
289         struct list_head io_u_busylist;
290
291         unsigned int rate;
292         unsigned int ratemin;
293         unsigned int ratecycle;
294         unsigned long rate_usec_cycle;
295         long rate_pending_usleep;       /* sleep owed (or overtime credit) tracked by rate_throttle() */
296         unsigned long rate_bytes;
297         struct timeval lastrate;
298
299         unsigned long runtime;          /* sec */
300         unsigned long long io_size;
301
302         unsigned long io_blocks;
303         unsigned long io_bytes;
304         unsigned long this_io_bytes;
305         unsigned long last_bytes;
306         sem_t mutex;
307
308         struct drand48_data random_state;       /* random state for get_next_offset() */
309         unsigned long *file_map;        /* bitmap of blocks already used by random I/O */
310         unsigned int num_maps;          /* number of longs in file_map */
311
312         /*
313          * bandwidth and latency stats
314          */
315         struct io_stat clat_stat;               /* completion latency */
316         struct io_stat slat_stat;               /* submission latency */
317
318         struct io_stat bw_stat;                 /* bandwidth stats */
319         unsigned long stat_io_bytes;
320         struct timeval stat_sample_time;
321
322         struct io_log *lat_log;
323         struct io_log *bw_log;
324
325         struct timeval start;
326         struct rusage ru_start;
327         struct rusage ru_end;
328
329         struct list_head io_hist_list;  /* io_piece records of writes, consumed by verify */
330 };
331
/* array of per-job state; sig_handler() indexes it from 0 to thread_number - 1 */
332 static struct thread_data *threads;
333 static struct thread_data def_thread;
334
335 static sem_t startup_sem;
336
337 static void sig_handler(int sig)
338 {
339         int i;
340
341         for (i = 0; i < thread_number; i++) {
342                 struct thread_data *td = &threads[i];
343
344                 td->terminate = 1;
345                 td->start_delay = 0;
346         }
347 }
348
349 static int init_random_state(struct thread_data *td)
350 {
351         unsigned long seed;
352         int fd, num_maps, blocks;
353
354         fd = open("/dev/random", O_RDONLY);
355         if (fd == -1) {
356                 td->error = errno;
357                 return 1;
358         }
359
360         if (read(fd, &seed, sizeof(seed)) < (int) sizeof(seed)) {
361                 td->error = EIO;
362                 close(fd);
363                 return 1;
364         }
365
366         close(fd);
367
368         srand48_r(seed, &td->bsrange_state);
369         srand48_r(seed, &td->verify_state);
370
371         if (td->sequential)
372                 return 0;
373
374         if (repeatable)
375                 seed = DEF_RANDSEED;
376
377         blocks = (td->io_size + td->min_bs - 1) / td->min_bs;
378         num_maps = blocks / BLOCKS_PER_MAP;
379         td->file_map = malloc(num_maps * sizeof(long));
380         td->num_maps = num_maps;
381         memset(td->file_map, 0, num_maps * sizeof(long));
382
383         srand48_r(seed, &td->random_state);
384         return 0;
385 }
386
/*
 * Microseconds elapsed from *s to *e.
 */
static unsigned long utime_since(struct timeval *s, struct timeval *e)
{
        double secs = e->tv_sec - s->tv_sec;
        double usecs = e->tv_usec - s->tv_usec;

        /* borrow one second when the microsecond part went negative */
        if (secs > 0 && usecs < 0) {
                secs -= 1;
                usecs += 1000000;
        }

        return secs * (double) 1000000 + usecs;
}
402
/*
 * Milliseconds elapsed from *s to *e; fractions of a millisecond are
 * dropped.
 */
static unsigned long mtime_since(struct timeval *s, struct timeval *e)
{
        double secs = e->tv_sec - s->tv_sec;
        double usecs = e->tv_usec - s->tv_usec;

        /* borrow one second when the microsecond part went negative */
        if (secs > 0 && usecs < 0) {
                secs -= 1;
                usecs += 1000000;
        }

        return secs * (double) 1000 + usecs / (double) 1000;
}
419
/*
 * Milliseconds elapsed between *s and the current time of day.
 */
static unsigned long mtime_since_now(struct timeval *s)
{
        struct timeval now;

        gettimeofday(&now, NULL);

        return mtime_since(s, &now);
}
427
/*
 * Convert the timestamp *s to milliseconds.
 */
static inline unsigned long msec_now(struct timeval *s)
{
        return (s->tv_usec / 1000) + (s->tv_sec * 1000);
}
432
433 static int random_map_free(struct thread_data *td, unsigned long long block)
434 {
435         unsigned int idx = RAND_MAP_IDX(block);
436         unsigned int bit = RAND_MAP_BIT(block);
437
438         return (td->file_map[idx] & (1UL << bit)) == 0;
439 }
440
441 static int get_next_free_block(struct thread_data *td, unsigned long long *b)
442 {
443         int i;
444
445         *b = 0;
446         i = 0;
447         while ((*b) * td->min_bs < td->io_size) {
448                 if (td->file_map[i] != -1UL) {
449                         *b += ffz(td->file_map[i]);
450                         return 0;
451                 }
452
453                 *b += BLOCKS_PER_MAP;
454                 i++;
455         }
456
457         return 1;
458 }
459
460 static void mark_random_map(struct thread_data *td, struct io_u *io_u)
461 {
462         unsigned long block = io_u->offset / td->min_bs;
463         unsigned int blocks = 0;
464
465         while (blocks < (io_u->buflen / td->min_bs)) {
466                 int idx, bit;
467
468                 if (!random_map_free(td, block))
469                         break;
470
471                 idx = RAND_MAP_IDX(block);
472                 bit = RAND_MAP_BIT(block);
473
474                 assert(idx < td->num_maps);
475
476                 td->file_map[idx] |= (1UL << bit);
477                 block++;
478                 blocks++;
479         }
480
481         if ((blocks * td->min_bs) < io_u->buflen)
482                 io_u->buflen = blocks * td->min_bs;
483 }
484
485 static int get_next_offset(struct thread_data *td, unsigned long long *offset)
486 {
487         unsigned long long b;
488         long r;
489
490         if (!td->sequential) {
491                 unsigned long max_blocks = td->io_size / td->min_bs;
492                 int loops = 50;
493
494                 do {
495                         lrand48_r(&td->random_state, &r);
496                         b = ((max_blocks - 1) * r / (RAND_MAX+1.0));
497                         loops--;
498                 } while (!random_map_free(td, b) && loops);
499
500                 if (!loops) {
501                         if (get_next_free_block(td, &b))
502                                 return 1;
503                 }
504         } else
505                 b = td->last_bytes / td->min_bs;
506
507         *offset = (b * td->min_bs) + td->file_offset;
508         return 0;
509 }
510
511 static unsigned int get_next_buflen(struct thread_data *td)
512 {
513         unsigned int buflen;
514         long r;
515
516         if (td->min_bs == td->max_bs)
517                 buflen = td->min_bs;
518         else {
519                 lrand48_r(&td->bsrange_state, &r);
520                 buflen = (1 + (double) (td->max_bs - 1) * r / (RAND_MAX + 1.0));
521                 buflen = (buflen + td->min_bs - 1) & ~(td->min_bs - 1);
522         }
523
524         if (buflen > td->io_size - td->this_io_bytes)
525                 buflen = td->io_size - td->this_io_bytes;
526
527         return buflen;
528 }
529
530 static inline void add_stat_sample(struct thread_data *td, struct io_stat *is,
531                                    unsigned long val)
532 {
533         if (val > is->max_val)
534                 is->max_val = val;
535         if (val < is->min_val)
536                 is->min_val = val;
537
538         is->val += val;
539         is->val_sq += val * val;
540         is->samples++;
541 }
542
543 static void add_log_sample(struct thread_data *td, struct io_log *log,
544                            unsigned long val)
545 {
546         if (log->nr_samples == log->max_samples) {
547                 int new_size = sizeof(struct io_sample) * log->max_samples * 2;
548
549                 log->log = realloc(log->log, new_size);
550                 log->max_samples <<= 1;
551         }
552
553         log->log[log->nr_samples].val = val;
554         log->log[log->nr_samples].time = mtime_since_now(&td->start);
555         log->nr_samples++;
556 }
557
558 static void add_clat_sample(struct thread_data *td, unsigned long msec)
559 {
560         add_stat_sample(td, &td->clat_stat, msec);
561
562         if (td->lat_log)
563                 add_log_sample(td, td->lat_log, msec);
564 }
565
566 static void add_slat_sample(struct thread_data *td, unsigned long msec)
567 {
568         add_stat_sample(td, &td->slat_stat, msec);
569 }
570
571 static void add_bw_sample(struct thread_data *td)
572 {
573         unsigned long spent = mtime_since_now(&td->stat_sample_time);
574         unsigned long rate;
575
576         if (spent < td->bw_avg_time)
577                 return;
578
579         rate = (td->this_io_bytes - td->stat_io_bytes) / spent;
580         add_stat_sample(td, &td->bw_stat, rate);
581
582         if (td->bw_log)
583                 add_log_sample(td, td->bw_log, rate);
584
585         gettimeofday(&td->stat_sample_time, NULL);
586         td->stat_io_bytes = td->this_io_bytes;
587 }
588
/*
 * Sleep for 'usec' microseconds, resuming after signal interruptions.
 * Values of zero or less return immediately.
 */
static void usec_sleep(int usec)
{
        struct timespec req, rem;

        if (usec <= 0)
                return;

        /*
         * split into seconds and nanoseconds: nanosleep() rejects a
         * tv_nsec of one second or more, and usec * 1000 can overflow int
         */
        req.tv_sec = usec / 1000000;
        req.tv_nsec = (long) (usec % 1000000) * 1000;

        /* on EINTR 'rem' holds the full remaining time, seconds included */
        while (nanosleep(&req, &rem) == -1 && errno == EINTR)
                req = rem;
}
603
604 static void rate_throttle(struct thread_data *td, unsigned long time_spent,
605                           unsigned int bytes)
606 {
607         unsigned long usec_cycle;
608
609         if (!td->rate)
610                 return;
611
612         usec_cycle = td->rate_usec_cycle * (bytes / td->min_bs);
613
614         if (time_spent < usec_cycle) {
615                 unsigned long s = usec_cycle - time_spent;
616
617                 td->rate_pending_usleep += s;
618                 if (td->rate_pending_usleep >= 100000) {
619                         usec_sleep(td->rate_pending_usleep);
620                         td->rate_pending_usleep = 0;
621                 }
622         } else {
623                 long overtime = time_spent - usec_cycle;
624
625                 td->rate_pending_usleep -= overtime;
626         }
627 }
628
629 static int check_min_rate(struct thread_data *td, struct timeval *now)
630 {
631         unsigned long spent;
632         unsigned long rate;
633
634         /*
635          * allow a 2 second settle period in the beginning
636          */
637         if (mtime_since(&td->start, now) < 2000)
638                 return 0;
639
640         /*
641          * if rate blocks is set, sample is running
642          */
643         if (td->rate_bytes) {
644                 spent = mtime_since(&td->lastrate, now);
645                 if (spent < td->ratecycle)
646                         return 0;
647
648                 rate = (td->this_io_bytes - td->rate_bytes) / spent;
649                 if (rate < td->ratemin) {
650                         printf("Client%d: min rate %d not met, got %ldKiB/sec\n", td->thread_number, td->ratemin, rate);
651                         if (rate_quit)
652                                 sig_handler(0);
653                         return 1;
654                 }
655         }
656
657         td->rate_bytes = td->this_io_bytes;
658         memcpy(&td->lastrate, now, sizeof(*now));
659         return 0;
660 }
661
662 static inline int runtime_exceeded(struct thread_data *td, struct timeval *t)
663 {
664         if (!td->timeout)
665                 return 0;
666         if (mtime_since(&td->start, t) >= td->timeout * 1000)
667                 return 1;
668
669         return 0;
670 }
671
672 static void fill_random_bytes(struct thread_data *td,
673                               unsigned char *p, unsigned int len)
674 {
675         unsigned int todo;
676         double r;
677
678         while (len) {
679                 drand48_r(&td->verify_state, &r);
680
681                 /*
682                  * lrand48_r seems to be broken and only fill the bottom
683                  * 32-bits, even on 64-bit archs with 64-bit longs
684                  */
685                 todo = sizeof(r);
686                 if (todo > len)
687                         todo = len;
688
689                 memcpy(p, &r, todo);
690
691                 len -= todo;
692                 p += todo;
693         }
694 }
695
/*
 * Print 'len' bytes of 'buffer' to stdout as two-digit lowercase hex,
 * followed by a newline.
 */
static void hexdump(void *buffer, int len)
{
        unsigned char *bytes = buffer;
        int pos = 0;

        while (pos < len) {
                printf("%02x", bytes[pos]);
                pos++;
        }

        printf("\n");
}
705
706 static int verify_io_u(struct io_u *io_u)
707 {
708         struct verify_header *hdr = (struct verify_header *) io_u->buf;
709         unsigned char *p = (unsigned char *) io_u->buf;
710         struct md5_ctx md5_ctx;
711         int ret;
712
713         if (hdr->fio_magic != FIO_HDR_MAGIC)
714                 return 1;
715
716         memset(&md5_ctx, 0, sizeof(md5_ctx));
717         p += sizeof(*hdr);
718         md5_update(&md5_ctx, p, hdr->len - sizeof(*hdr));
719
720         ret = memcmp(hdr->md5_digest, md5_ctx.hash, sizeof(md5_ctx.hash));
721         if (ret) {
722                 hexdump(hdr->md5_digest, sizeof(hdr->md5_digest));
723                 hexdump(md5_ctx.hash, sizeof(md5_ctx.hash));
724         }
725
726         return ret;
727 }
728
729 /*
730  * fill body of io_u->buf with random data and add a header with the
731  * (eg) sha1sum of that data.
732  */
733 static void populate_io_u(struct thread_data *td, struct io_u *io_u)
734 {
735         struct md5_ctx md5_ctx;
736         struct verify_header hdr;
737         unsigned char *p = (unsigned char *) io_u->buf;
738
739         hdr.fio_magic = FIO_HDR_MAGIC;
740         hdr.len = io_u->buflen;
741         p += sizeof(hdr);
742         fill_random_bytes(td, p, io_u->buflen - sizeof(hdr));
743
744         memset(&md5_ctx, 0, sizeof(md5_ctx));
745         md5_update(&md5_ctx, p, io_u->buflen - sizeof(hdr));
746         memcpy(hdr.md5_digest, md5_ctx.hash, sizeof(md5_ctx.hash));
747         memcpy(io_u->buf, &hdr, sizeof(hdr));
748 }
749
750 static void put_io_u(struct thread_data *td, struct io_u *io_u)
751 {
752         list_del(&io_u->list);
753         list_add(&io_u->list, &td->io_u_freelist);
754         td->cur_depth--;
755 }
756
/* true when the thread has no free io_u left to hand out */
757 #define queue_full(td)  (list_empty(&(td)->io_u_freelist))
758
759 static struct io_u *__get_io_u(struct thread_data *td)
760 {
761         struct io_u *io_u;
762
763         if (queue_full(td))
764                 return NULL;
765
766         io_u = list_entry(td->io_u_freelist.next, struct io_u, list);
767         list_del(&io_u->list);
768         list_add(&io_u->list, &td->io_u_busylist);
769         td->cur_depth++;
770         return io_u;
771 }
772
773 static struct io_u *get_io_u(struct thread_data *td)
774 {
775         struct io_u *io_u;
776
777         io_u = __get_io_u(td);
778         if (!io_u)
779                 return NULL;
780
781         if (get_next_offset(td, &io_u->offset)) {
782                 put_io_u(td, io_u);
783                 return NULL;
784         }
785
786         io_u->buflen = get_next_buflen(td);
787         if (!io_u->buflen) {
788                 put_io_u(td, io_u);
789                 return NULL;
790         }
791
792         if (io_u->buflen + io_u->offset > td->io_size)
793                 io_u->buflen = td->io_size - io_u->offset;
794
795         if (!td->sequential)
796                 mark_random_map(td, io_u);
797
798         td->last_bytes += io_u->buflen;
799
800         if (td->verify)
801                 populate_io_u(td, io_u);
802
803         if (td->use_aio) {
804                 if (td_read(td))
805                         io_prep_pread(&io_u->iocb, td->fd, io_u->buf, io_u->buflen, io_u->offset);
806                 else
807                         io_prep_pwrite(&io_u->iocb, td->fd, io_u->buf, io_u->buflen, io_u->offset);
808         }
809
810         gettimeofday(&io_u->start_time, NULL);
811         return io_u;
812 }
813
814 static inline void td_set_runstate(struct thread_data *td, int runstate)
815 {
816         td->old_runstate = td->runstate;
817         td->runstate = runstate;
818 }
819
820 static int get_next_verify(struct thread_data *td,
821                            unsigned long long *offset, unsigned int *len)
822 {
823         struct io_piece *ipo;
824
825         if (list_empty(&td->io_hist_list))
826                 return 1;
827
828         ipo = list_entry(td->io_hist_list.next, struct io_piece, list);
829         list_del(&ipo->list);
830
831         *offset = ipo->offset;
832         *len = ipo->len;
833         free(ipo);
834         return 0;
835 }
836
837 static void prune_io_piece_log(struct thread_data *td)
838 {
839         struct io_piece *ipo;
840
841         while (!list_empty(&td->io_hist_list)) {
842                 ipo = list_entry(td->io_hist_list.next, struct io_piece, list);
843
844                 list_del(&ipo->list);
845                 free(ipo);
846         }
847 }
848
849 /*
850  * log a succesful write, so we can unwind the log for verify
851  */
852 static void log_io_piece(struct thread_data *td, struct io_u *io_u)
853 {
854         struct io_piece *ipo = malloc(sizeof(struct io_piece));
855         struct list_head *entry;
856
857         INIT_LIST_HEAD(&ipo->list);
858         ipo->offset = io_u->offset;
859         ipo->len = io_u->buflen;
860
861         /*
862          * for random io where the writes extend the file, it will typically
863          * be laid out with the block scattered as written. it's faster to
864          * read them in in that order again, so don't sort
865          */
866         if (td->sequential || !td->overwrite) {
867                 list_add_tail(&ipo->list, &td->io_hist_list);
868                 return;
869         }
870
871         /*
872          * for random io, sort the list so verify will run faster
873          */
874         entry = &td->io_hist_list;
875         while ((entry = entry->prev) != &td->io_hist_list) {
876                 struct io_piece *__ipo = list_entry(entry, struct io_piece, list);
877
878                 if (__ipo->offset < ipo->offset)
879                         break;
880         }
881
882         list_add(&ipo->list, entry);
883 }
884
885 static void do_sync_verify(struct thread_data *td)
886 {
887         struct timeval t;
888         struct io_u *io_u = NULL;
889         int ret;
890
891         td_set_runstate(td, TD_VERIFYING);
892
893         io_u = __get_io_u(td);
894
895         if (!td->odirect) {
896                 if (fadvise(td->fd, td->file_offset, td->io_size, POSIX_FADV_DONTNEED) < 0) {
897                         td->error = errno;
898                         goto out;
899                 }
900         }
901
902         do {
903                 if (td->terminate)
904                         break;
905
906                 gettimeofday(&t, NULL);
907                 if (runtime_exceeded(td, &t))
908                         break;
909
910                 if (get_next_verify(td, &io_u->offset, &io_u->buflen))
911                         break;
912
913                 if (td->cur_off != io_u->offset) {
914                         if (lseek(td->fd, io_u->offset, SEEK_SET) == -1) {
915                                 td->error = errno;
916                                 break;
917                         }
918                 }
919
920                 ret = read(td->fd, io_u->buf, io_u->buflen);
921                 if (ret < (int) io_u->buflen) {
922                         if (ret == -1) {
923                                 td->error = errno;
924                                 break;
925                         } else if (!ret)
926                                 break;
927                         else
928                                 io_u->buflen = ret;
929                 }
930
931                 if (verify_io_u(io_u))
932                         break;
933
934                 td->cur_off = io_u->offset + io_u->buflen;
935         } while (1);
936
937 out:
938         td_set_runstate(td, TD_RUNNING);
939         put_io_u(td, io_u);
940 }
941
942 static void do_sync_io(struct thread_data *td)
943 {
944         unsigned long msec, usec;
945         struct io_u *io_u = NULL;
946         struct timeval e;
947
948         while (td->this_io_bytes < td->io_size) {
949                 int ret;
950
951                 if (td->terminate)
952                         break;
953
954                 io_u = get_io_u(td);
955                 if (!io_u)
956                         break;
957
958                 if (td->cur_off != io_u->offset) {
959                         if (lseek(td->fd, io_u->offset, SEEK_SET) == -1) {
960                                 td->error = errno;
961                                 break;
962                         }
963                 }
964
965                 if (td_read(td))
966                         ret = read(td->fd, io_u->buf, io_u->buflen);
967                 else
968                         ret = write(td->fd, io_u->buf, io_u->buflen);
969
970                 if (ret < (int) io_u->buflen) {
971                         if (ret == -1)
972                                 td->error = errno;
973                         break;
974                 }
975
976                 if (td_write(td))
977                         log_io_piece(td, io_u);
978
979                 td->io_blocks++;
980                 td->io_bytes += io_u->buflen;
981                 td->this_io_bytes += io_u->buflen;
982                 td->cur_off = io_u->offset + io_u->buflen;
983
984                 gettimeofday(&e, NULL);
985
986                 usec = utime_since(&io_u->start_time, &e);
987
988                 rate_throttle(td, usec, io_u->buflen);
989
990                 if (check_min_rate(td, &e)) {
991                         td->error = ENODATA;
992                         break;
993                 }
994
995                 msec = usec / 1000;
996                 add_clat_sample(td, msec);
997                 add_bw_sample(td);
998
999                 if (runtime_exceeded(td, &e))
1000                         break;
1001
1002                 put_io_u(td, io_u);
1003                 io_u = NULL;
1004
1005                 if (td->thinktime)
1006                         usec_sleep(td->thinktime);
1007
1008                 if (should_fsync(td) && td->fsync_blocks &&
1009                     (td->io_blocks % td->fsync_blocks) == 0)
1010                         fsync(td->fd);
1011         }
1012
1013         if (io_u)
1014                 put_io_u(td, io_u);
1015
1016         if (should_fsync(td))
1017                 fsync(td->fd);
1018 }
1019
1020 static int io_u_getevents(struct thread_data *td, int min, int max,
1021                           struct timespec *t)
1022 {
1023         int r;
1024
1025         do {
1026                 r = io_getevents(td->aio_ctx, min, max, td->aio_events, t);
1027                 if (r != -EAGAIN && r != -EINTR)
1028                         break;
1029         } while (1);
1030
1031         return r;
1032 }
1033
1034 static int io_u_queue(struct thread_data *td, struct io_u *io_u)
1035 {
1036         struct iocb *iocb = &io_u->iocb;
1037         int ret;
1038
1039         do {
1040                 ret = io_submit(td->aio_ctx, 1, &iocb);
1041                 if (ret == 1)
1042                         return 0;
1043                 else if (ret == -EAGAIN)
1044                         usleep(100);
1045                 else if (ret == -EINTR)
1046                         continue;
1047                 else
1048                         break;
1049         } while (1);
1050
1051         return ret;
1052 }
1053
/* the iocb's data pointer read as an unsigned long; not used in this part of the file */
1054 #define iocb_time(iocb) ((unsigned long) (iocb)->data)
/* map an aio completion event to an io_u by casting the event's obj pointer */
1055 #define ev_to_iou(ev)   (struct io_u *) ((unsigned long) (ev)->obj)
1056
1057 static int ios_completed(struct thread_data *td, int nr)
1058 {
1059         unsigned long msec;
1060         struct io_u *io_u;
1061         struct timeval e;
1062         int i, bytes_done;
1063
1064         gettimeofday(&e, NULL);
1065
1066         for (i = 0, bytes_done = 0; i < nr; i++) {
1067                 io_u = ev_to_iou(td->aio_events + i);
1068
1069                 td->io_blocks++;
1070                 td->io_bytes += io_u->buflen;
1071                 td->this_io_bytes += io_u->buflen;
1072
1073                 msec = mtime_since(&io_u->issue_time, &e);
1074
1075                 add_clat_sample(td, msec);
1076                 add_bw_sample(td);
1077
1078                 if (td_write(td))
1079                         log_io_piece(td, io_u);
1080
1081                 bytes_done += io_u->buflen;
1082                 put_io_u(td, io_u);
1083         }
1084
1085         return bytes_done;
1086 }
1087
1088 static void cleanup_pending_aio(struct thread_data *td)
1089 {
1090         struct timespec ts = { .tv_sec = 0, .tv_nsec = 0};
1091         struct list_head *entry, *n;
1092         struct io_u *io_u;
1093         int r;
1094
1095         /*
1096          * get immediately available events, if any
1097          */
1098         r = io_u_getevents(td, 0, td->cur_depth, &ts);
1099         if (r > 0)
1100                 ios_completed(td, r);
1101
1102         /*
1103          * now cancel remaining active events
1104          */
1105         list_for_each_safe(entry, n, &td->io_u_busylist) {
1106                 io_u = list_entry(entry, struct io_u, list);
1107
1108                 r = io_cancel(td->aio_ctx, &io_u->iocb, td->aio_events);
1109                 if (!r)
1110                         put_io_u(td, io_u);
1111         }
1112
1113         if (td->cur_depth) {
1114                 r = io_u_getevents(td, td->cur_depth, td->cur_depth, NULL);
1115                 if (r > 0)
1116                         ios_completed(td, r);
1117         }
1118 }
1119
/*
 * Verify the io_u that *io_u points at, if there is one, then release it
 * and clear the caller's pointer. Returns the verify_io_u() result, or 0
 * when nothing was pending.
 */
static int async_do_verify(struct thread_data *td, struct io_u **io_u)
{
	struct io_u *pending = *io_u;
	int verify_ret;

	if (!pending)
		return 0;

	verify_ret = verify_io_u(pending);
	put_io_u(td, pending);
	*io_u = NULL;

	return verify_ret;
}
1133
1134 static void do_async_verify(struct thread_data *td)
1135 {
1136         struct timeval t;
1137         struct io_u *io_u, *v_io_u = NULL;
1138         int ret;
1139
1140         td_set_runstate(td, TD_VERIFYING);
1141
1142         do {
1143                 if (td->terminate)
1144                         break;
1145
1146                 gettimeofday(&t, NULL);
1147                 if (runtime_exceeded(td, &t))
1148                         break;
1149
1150                 io_u = __get_io_u(td);
1151                 if (!io_u)
1152                         break;
1153
1154                 if (get_next_verify(td, &io_u->offset, &io_u->buflen)) {
1155                         put_io_u(td, io_u);
1156                         break;
1157                 }
1158
1159                 io_prep_pread(&io_u->iocb, td->fd, io_u->buf, io_u->buflen, io_u->offset);
1160                 ret = io_u_queue(td, io_u);
1161                 if (ret) {
1162                         put_io_u(td, io_u);
1163                         td->error = ret;
1164                         break;
1165                 }
1166
1167                 /*
1168                  * we have one pending to verify, do that while the next
1169                  * we are doing io on the next one
1170                  */
1171                 if (async_do_verify(td, &v_io_u))
1172                         break;
1173
1174                 ret = io_u_getevents(td, 1, 1, NULL);
1175                 if (ret != 1) {
1176                         if (ret < 0)
1177                                 td->error = ret;
1178                         break;
1179                 }
1180
1181                 v_io_u = ev_to_iou(td->aio_events);
1182
1183                 td->cur_off = v_io_u->offset + v_io_u->buflen;
1184
1185                 /*
1186                  * if we can't submit more io, we need to verify now
1187                  */
1188                 if (queue_full(td) && async_do_verify(td, &v_io_u))
1189                         break;
1190
1191         } while (1);
1192
1193         async_do_verify(td, &v_io_u);
1194
1195         if (td->cur_depth)
1196                 cleanup_pending_aio(td);
1197
1198         td_set_runstate(td, TD_RUNNING);
1199 }
1200
1201 static void do_async_io(struct thread_data *td)
1202 {
1203         struct timeval s, e;
1204         unsigned long usec;
1205
1206         while (td->this_io_bytes < td->io_size) {
1207                 struct timespec ts = { .tv_sec = 0, .tv_nsec = 0};
1208                 struct timespec *timeout;
1209                 int ret, min_evts = 0;
1210                 struct io_u *io_u;
1211                 unsigned int bytes_done;
1212
1213                 if (td->terminate)
1214                         break;
1215
1216                 io_u = get_io_u(td);
1217                 if (!io_u)
1218                         break;
1219
1220                 memcpy(&s, &io_u->start_time, sizeof(s));
1221
1222                 ret = io_u_queue(td, io_u);
1223                 if (ret) {
1224                         put_io_u(td, io_u);
1225                         td->error = ret;
1226                         break;
1227                 }
1228
1229                 gettimeofday(&io_u->issue_time, NULL);
1230                 add_slat_sample(td, mtime_since(&io_u->start_time, &io_u->issue_time));
1231                 if (td->cur_depth < td->aio_depth) {
1232                         timeout = &ts;
1233                         min_evts = 0;
1234                 } else {
1235                         timeout = NULL;
1236                         min_evts = 1;
1237                 }
1238
1239                 ret = io_u_getevents(td, min_evts, td->cur_depth, timeout);
1240                 if (ret < 0) {
1241                         td->error = ret;
1242                         break;
1243                 } else if (!ret)
1244                         continue;
1245
1246                 bytes_done = ios_completed(td, ret);
1247
1248                 /*
1249                  * the rate is batched for now, it should work for batches
1250                  * of completions except the very first one which may look
1251                  * a little bursty
1252                  */
1253                 gettimeofday(&e, NULL);
1254                 usec = utime_since(&s, &e);
1255
1256                 rate_throttle(td, usec, bytes_done);
1257
1258                 if (check_min_rate(td, &e)) {
1259                         td->error = ENODATA;
1260                         break;
1261                 }
1262
1263                 if (runtime_exceeded(td, &e))
1264                         break;
1265
1266                 if (td->thinktime)
1267                         usec_sleep(td->thinktime);
1268
1269                 if (should_fsync(td) && td->fsync_blocks &&
1270                     (td->io_blocks % td->fsync_blocks) == 0)
1271                         fsync(td->fd);
1272         }
1273
1274         if (td->cur_depth)
1275                 cleanup_pending_aio(td);
1276
1277         if (should_fsync(td))
1278                 fsync(td->fd);
1279 }
1280
1281 static void cleanup_aio(struct thread_data *td)
1282 {
1283         io_destroy(td->aio_ctx);
1284
1285         if (td->aio_events)
1286                 free(td->aio_events);
1287 }
1288
1289 static int init_aio(struct thread_data *td)
1290 {
1291         if (io_queue_init(td->aio_depth, &td->aio_ctx)) {
1292                 td->error = errno;
1293                 return 1;
1294         }
1295
1296         td->aio_events = malloc(td->aio_depth * sizeof(struct io_event));
1297         return 0;
1298 }
1299
1300 static void cleanup_io_u(struct thread_data *td)
1301 {
1302         struct list_head *entry, *n;
1303         struct io_u *io_u;
1304
1305         list_for_each_safe(entry, n, &td->io_u_freelist) {
1306                 io_u = list_entry(entry, struct io_u, list);
1307
1308                 list_del(&io_u->list);
1309                 free(io_u);
1310         }
1311
1312         if (td->mem_type == MEM_MALLOC)
1313                 free(td->orig_buffer);
1314         else if (td->mem_type == MEM_SHM) {
1315                 struct shmid_ds sbuf;
1316
1317                 shmdt(td->orig_buffer);
1318                 shmctl(td->shm_id, IPC_RMID, &sbuf);
1319         }
1320 }
1321
1322 static int init_io_u(struct thread_data *td)
1323 {
1324         struct io_u *io_u;
1325         int i, max_units, mem_size;
1326         char *p;
1327
1328         if (!td->use_aio)
1329                 max_units = 1;
1330         else
1331                 max_units = td->aio_depth;
1332
1333         mem_size = td->max_bs * max_units + MASK;
1334
1335         if (td->mem_type == MEM_MALLOC)
1336                 td->orig_buffer = malloc(mem_size);
1337         else if (td->mem_type == MEM_SHM) {
1338                 td->shm_id = shmget(IPC_PRIVATE, mem_size, IPC_CREAT | 0600);
1339                 if (td->shm_id < 0) {
1340                         td->error = errno;
1341                         perror("shmget");
1342                         return 1;
1343                 }
1344
1345                 td->orig_buffer = shmat(td->shm_id, NULL, 0);
1346                 if (td->orig_buffer == (void *) -1) {
1347                         td->error = errno;
1348                         perror("shmat");
1349                         return 1;
1350                 }
1351         }
1352
1353         INIT_LIST_HEAD(&td->io_u_freelist);
1354         INIT_LIST_HEAD(&td->io_u_busylist);
1355         INIT_LIST_HEAD(&td->io_hist_list);
1356
1357         p = ALIGN(td->orig_buffer);
1358         for (i = 0; i < max_units; i++) {
1359                 io_u = malloc(sizeof(*io_u));
1360                 memset(io_u, 0, sizeof(*io_u));
1361                 INIT_LIST_HEAD(&io_u->list);
1362
1363                 io_u->buf = p + td->max_bs * i;
1364                 list_add(&io_u->list, &td->io_u_freelist);
1365         }
1366
1367         return 0;
1368 }
1369
1370 static void setup_log(struct io_log **log)
1371 {
1372         struct io_log *l = malloc(sizeof(*l));
1373
1374         l->nr_samples = 0;
1375         l->max_samples = 1024;
1376         l->log = malloc(l->max_samples * sizeof(struct io_sample));
1377         *log = l;
1378 }
1379
1380 static void finish_log(struct thread_data *td, struct io_log *log, char *name)
1381 {
1382         char file_name[128];
1383         FILE *f;
1384         unsigned int i;
1385
1386         sprintf(file_name, "client%d_%s.log", td->thread_number, name);
1387         f = fopen(file_name, "w");
1388         if (!f) {
1389                 perror("fopen log");
1390                 return;
1391         }
1392
1393         for (i = 0; i < log->nr_samples; i++)
1394                 fprintf(f, "%lu, %lu\n", log->log[i].time, log->log[i].val);
1395
1396         fclose(f);
1397         free(log->log);
1398         free(log);
1399 }
1400
1401 static int create_file(struct thread_data *td)
1402 {
1403         unsigned long long left;
1404         unsigned int bs;
1405         char *b;
1406         int r;
1407
1408         /*
1409          * unless specifically asked for overwrite, let normal io extend it
1410          */
1411         if (td_write(td) && !td->overwrite)
1412                 return 0;
1413
1414         if (!td->file_size) {
1415                 fprintf(stderr, "Need size for create\n");
1416                 td->error = EINVAL;
1417                 return 1;
1418         }
1419
1420         printf("Client%d: Laying out IO file\n", td->thread_number);
1421
1422         td->fd = open(td->file_name, O_WRONLY | O_CREAT | O_TRUNC, 0644);
1423         if (td->fd < 0) {
1424                 td->error = errno;
1425                 return 1;
1426         }
1427
1428         if (ftruncate(td->fd, td->file_size) == -1) {
1429                 td->error = errno;
1430                 return 1;
1431         }
1432
1433         td->io_size = td->file_size;
1434         b = malloc(td->max_bs);
1435         memset(b, 0, td->max_bs);
1436
1437         left = td->file_size;
1438         while (left) {
1439                 bs = td->max_bs;
1440                 if (bs > left)
1441                         bs = left;
1442
1443                 r = write(td->fd, b, bs);
1444
1445                 if (r == (int) bs) {
1446                         left -= bs;
1447                         continue;
1448                 } else {
1449                         if (r < 0)
1450                                 td->error = errno;
1451                         else
1452                                 td->error = EIO;
1453
1454                         break;
1455                 }
1456         }
1457
1458         if (td->create_fsync)
1459                 fsync(td->fd);
1460
1461         close(td->fd);
1462         td->fd = -1;
1463         free(b);
1464         return 0;
1465 }
1466
1467 static int file_exists(struct thread_data *td)
1468 {
1469         struct stat st;
1470
1471         if (stat(td->file_name, &st) != -1)
1472                 return 1;
1473
1474         return errno != ENOENT;
1475 }
1476
1477 static int get_file_size(struct thread_data *td)
1478 {
1479         size_t bytes = 0;
1480         struct stat st;
1481
1482         if (fstat(td->fd, &st) == -1) {
1483                 td->error = errno;
1484                 return 1;
1485         }
1486
1487         /*
1488          * if block device, get size via BLKGETSIZE64 ioctl. try that as well
1489          * if this is a link, fall back to st.st_size if it fails
1490          */
1491         if (S_ISBLK(st.st_mode) || S_ISLNK(st.st_mode)) {
1492                 if (ioctl(td->fd, BLKGETSIZE64, &bytes)) {
1493                         if (S_ISBLK(st.st_mode)) {
1494                                 td->error = errno;
1495                                 return 1;
1496                         } else
1497                                 bytes = st.st_size;
1498                 }
1499         } else
1500                 bytes = st.st_size;
1501
1502         if (td_read(td)) {
1503                 if (td->file_size > bytes)
1504                         bytes = td->file_size;
1505         } else {
1506                 if (!td->file_size)
1507                         td->file_size = 1024 * 1024 * 1024;
1508
1509                 bytes = td->file_size;
1510         }
1511
1512         if (td->file_offset > bytes) {
1513                 fprintf(stderr, "Client%d: offset larger than length\n", td->thread_number);
1514                 return 1;
1515         }
1516
1517         td->io_size = bytes - td->file_offset;
1518         if (td->io_size == 0) {
1519                 fprintf(stderr, "Client%d: no io blocks\n", td->thread_number);
1520                 td->error = EINVAL;
1521                 return 1;
1522         }
1523
1524         return 0;
1525 }
1526
1527 static int setup_file(struct thread_data *td)
1528 {
1529         int flags = 0;
1530
1531         if (!file_exists(td)) {
1532                 if (!td->create_file) {
1533                         td->error = ENOENT;
1534                         return 1;
1535                 }
1536                 if (create_file(td))
1537                         return 1;
1538         }
1539
1540         if (td->odirect)
1541                 flags |= O_DIRECT;
1542
1543         if (td_read(td))
1544                 td->fd = open(td->file_name, flags | O_RDONLY);
1545         else {
1546                 if (!td->overwrite)
1547                         flags |= O_TRUNC;
1548                 if (td->sync_io)
1549                         flags |= O_SYNC;
1550                 if (td->verify)
1551                         flags |= O_RDWR;
1552                 else
1553                         flags |= O_WRONLY;
1554
1555                 td->fd = open(td->file_name, flags | O_CREAT, 0600);
1556         }
1557
1558         if (td->fd == -1) {
1559                 td->error = errno;
1560                 return 1;
1561         }
1562
1563         if (get_file_size(td))
1564                 return 1;
1565
1566         if (td_write(td) && ftruncate(td->fd, td->file_size) == -1) {
1567                 td->error = errno;
1568                 return 1;
1569         }
1570
1571         if (td->invalidate_cache) {
1572                 if (fadvise(td->fd, td->file_offset, td->file_size, POSIX_FADV_DONTNEED) < 0) {
1573                         td->error = errno;
1574                         return 1;
1575                 }
1576         }
1577
1578         return 0;
1579 }
1580
1581 static void clear_io_state(struct thread_data *td)
1582 {
1583         if (!td->use_aio)
1584                 lseek(td->fd, SEEK_SET, 0);
1585
1586         td->cur_off = 0;
1587         td->last_bytes = 0;
1588         td->stat_io_bytes = 0;
1589         td->this_io_bytes = 0;
1590
1591         if (td->file_map)
1592                 memset(td->file_map, 0, td->num_maps * sizeof(long));
1593 }
1594
1595 static void *thread_main(int shm_id, int offset, char *argv[])
1596 {
1597         struct thread_data *td;
1598         int ret = 1;
1599         void *data;
1600
1601         setsid();
1602
1603         data = shmat(shm_id, NULL, 0);
1604         if (data == (void *) -1) {
1605                 perror("shmat");
1606                 return NULL;
1607         }
1608
1609         td = data + offset * sizeof(struct thread_data);
1610         td->pid = getpid();
1611
1612         if (init_io_u(td))
1613                 goto err;
1614
1615         if (sched_setaffinity(td->pid, sizeof(td->cpumask), &td->cpumask) == -1) {
1616                 td->error = errno;
1617                 goto err;
1618         }
1619
1620         sprintf(argv[0], "fio%d", offset);
1621
1622         if (td->use_aio && init_aio(td))
1623                 goto err;
1624
1625         if (td->ioprio) {
1626                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, td->ioprio) == -1) {
1627                         td->error = errno;
1628                         goto err;
1629                 }
1630         }
1631
1632         sem_post(&startup_sem);
1633         sem_wait(&td->mutex);
1634
1635         if (!td->create_serialize && setup_file(td))
1636                 goto err;
1637
1638         if (init_random_state(td))
1639                 goto err;
1640
1641         gettimeofday(&td->start, NULL);
1642
1643         getrusage(RUSAGE_SELF, &td->ru_start);
1644
1645         while (td->loops--) {
1646                 gettimeofday(&td->stat_sample_time, NULL);
1647
1648                 if (td->ratemin)
1649                         memcpy(&td->lastrate, &td->stat_sample_time, sizeof(td->lastrate));
1650
1651                 clear_io_state(td);
1652                 prune_io_piece_log(td);
1653
1654                 if (!td->use_aio)
1655                         do_sync_io(td);
1656                 else
1657                         do_async_io(td);
1658
1659                 if (td->error)
1660                         break;
1661
1662                 if (!td->verify)
1663                         continue;
1664
1665                 clear_io_state(td);
1666
1667                 if (!td->use_aio)
1668                         do_sync_verify(td);
1669                 else
1670                         do_async_verify(td);
1671
1672                 if (td->error)
1673                         break;
1674         }
1675
1676         td->runtime = mtime_since_now(&td->start);
1677         getrusage(RUSAGE_SELF, &td->ru_end);
1678         ret = 0;
1679
1680         if (td->bw_log)
1681                 finish_log(td, td->bw_log, "bw");
1682         if (td->lat_log)
1683                 finish_log(td, td->lat_log, "lat");
1684
1685         if (exitall_on_terminate)
1686                 sig_handler(0);
1687
1688 err:
1689         if (td->fd != -1) {
1690                 close(td->fd);
1691                 td->fd = -1;
1692         }
1693         if (td->use_aio)
1694                 cleanup_aio(td);
1695         cleanup_io_u(td);
1696         if (ret) {
1697                 sem_post(&startup_sem);
1698                 sem_wait(&td->mutex);
1699         }
1700         td_set_runstate(td, TD_EXITED);
1701         shmdt(data);
1702         return NULL;
1703 }
1704
1705 static void free_shm(void)
1706 {
1707         struct shmid_ds sbuf;
1708
1709         if (threads) {
1710                 shmdt(threads);
1711                 threads = NULL;
1712                 shmctl(shm_id, IPC_RMID, &sbuf);
1713         }
1714 }
1715
1716 static int calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max,
1717                     double *mean, double *dev)
1718 {
1719         double n;
1720
1721         if (is->samples == 0)
1722                 return 0;
1723
1724         *min = is->min_val;
1725         *max = is->max_val;
1726
1727         n = (double) is->samples;
1728         *mean = (double) is->val / n;
1729         *dev = sqrt(((double) is->val_sq - (*mean * *mean) / n) / (n - 1));
1730         return 1;
1731 }
1732
1733 static void show_thread_status(struct thread_data *td)
1734 {
1735         int prio, prio_class;
1736         unsigned long min, max, bw = 0, ctx;
1737         double mean, dev, usr_cpu, sys_cpu;
1738
1739         if (!td->io_bytes && !td->error)
1740                 return;
1741
1742         if (td->runtime)
1743                 bw = td->io_bytes / td->runtime;
1744
1745         prio = td->ioprio & 0xff;
1746         prio_class = td->ioprio >> IOPRIO_CLASS_SHIFT;
1747
1748         printf("Client%d: err=%2d, io=%6luMiB, bw=%6luKiB/s, runt=%6lumsec\n", td->thread_number, td->error, td->io_bytes >> 20, bw, td->runtime);
1749
1750         if (calc_lat(&td->slat_stat, &min, &max, &mean, &dev))
1751                 printf("  slat (msec): min=%5lu, max=%5lu, avg=%5.02f, dev=%5.02f\n", min, max, mean, dev);
1752         if (calc_lat(&td->clat_stat, &min, &max, &mean, &dev))
1753                 printf("  clat (msec): min=%5lu, max=%5lu, avg=%5.02f, dev=%5.02f\n", min, max, mean, dev);
1754         if (calc_lat(&td->bw_stat, &min, &max, &mean, &dev))
1755                 printf("  bw (KiB/s) : min=%5lu, max=%5lu, avg=%5.02f, dev=%5.02f\n", min, max, mean, dev);
1756
1757         if (td->runtime) {
1758                 unsigned long t;
1759
1760                 t = mtime_since(&td->ru_start.ru_utime, &td->ru_end.ru_utime);
1761                 usr_cpu = (double) t * 100 / (double) td->runtime;
1762
1763                 t = mtime_since(&td->ru_start.ru_stime, &td->ru_end.ru_stime);
1764                 sys_cpu = (double) t * 100 / (double) td->runtime;
1765         } else {
1766                 usr_cpu = 0;
1767                 sys_cpu = 0;
1768         }
1769
1770         ctx = td->ru_end.ru_nvcsw + td->ru_end.ru_nivcsw - (td->ru_start.ru_nvcsw + td->ru_start.ru_nivcsw);
1771
1772         printf("  cpu        : usr=%3.2f%%, sys=%3.2f%%, ctx=%lu\n", usr_cpu, sys_cpu, ctx);
1773 }
1774
1775 static int setup_rate(struct thread_data *td)
1776 {
1777         int nr_reads_per_sec;
1778
1779         if (!td->rate)
1780                 return 0;
1781
1782         if (td->rate < td->ratemin) {
1783                 fprintf(stderr, "min rate larger than nominal rate\n");
1784                 return -1;
1785         }
1786
1787         nr_reads_per_sec = (td->rate * 1024) / td->min_bs;
1788         td->rate_usec_cycle = 1000000 / nr_reads_per_sec;
1789         td->rate_pending_usleep = 0;
1790         return 0;
1791 }
1792
1793 static struct thread_data *get_new_job(int global)
1794 {
1795         struct thread_data *td;
1796
1797         if (global)
1798                 return &def_thread;
1799         if (thread_number >= max_jobs)
1800                 return NULL;
1801
1802         td = &threads[thread_number++];
1803         memset(td, 0, sizeof(*td));
1804
1805         td->fd = -1;
1806         td->thread_number = thread_number;
1807
1808         td->ddir = def_thread.ddir;
1809         td->ioprio = def_thread.ioprio;
1810         td->sequential = def_thread.sequential;
1811         td->bs = def_thread.bs;
1812         td->min_bs = def_thread.min_bs;
1813         td->max_bs = def_thread.max_bs;
1814         td->odirect = def_thread.odirect;
1815         td->thinktime = def_thread.thinktime;
1816         td->fsync_blocks = def_thread.fsync_blocks;
1817         td->start_delay = def_thread.start_delay;
1818         td->timeout = def_thread.timeout;
1819         td->use_aio = def_thread.use_aio;
1820         td->create_file = def_thread.create_file;
1821         td->overwrite = def_thread.overwrite;
1822         td->invalidate_cache = def_thread.invalidate_cache;
1823         td->file_size = def_thread.file_size;
1824         td->file_offset = def_thread.file_offset;
1825         td->rate = def_thread.rate;
1826         td->ratemin = def_thread.ratemin;
1827         td->ratecycle = def_thread.ratecycle;
1828         td->aio_depth = def_thread.aio_depth;
1829         td->sync_io = def_thread.sync_io;
1830         td->mem_type = def_thread.mem_type;
1831         td->bw_avg_time = def_thread.bw_avg_time;
1832         td->create_serialize = def_thread.create_serialize;
1833         td->create_fsync = def_thread.create_fsync;
1834         td->loops = def_thread.loops;
1835         td->verify = def_thread.verify;
1836         memcpy(&td->cpumask, &def_thread.cpumask, sizeof(td->cpumask));
1837
1838         return td;
1839 }
1840
1841 static void put_job(struct thread_data *td)
1842 {
1843         memset(&threads[td->thread_number - 1], 0, sizeof(*td));
1844         thread_number--;
1845 }
1846
1847 static int add_job(struct thread_data *td, const char *filename, int prioclass,
1848                    int prio)
1849 {
1850         if (td == &def_thread)
1851                 return 0;
1852
1853         strcpy(td->file_name, filename);
1854         sem_init(&td->mutex, 1, 0);
1855         td->ioprio = (prioclass << IOPRIO_CLASS_SHIFT) | prio;
1856
1857         td->clat_stat.min_val = ULONG_MAX;
1858         td->slat_stat.min_val = ULONG_MAX;
1859         td->bw_stat.min_val = ULONG_MAX;
1860
1861         run_str[td->thread_number - 1] = 'P';
1862
1863         if (td->use_aio && !td->aio_depth)
1864                 td->aio_depth = 1;
1865
1866         if (td->min_bs == -1U)
1867                 td->min_bs = td->bs;
1868         if (td->max_bs == -1U)
1869                 td->max_bs = td->bs;
1870         if (td_read(td))
1871                 td->verify = 0;
1872
1873         if (setup_rate(td))
1874                 return -1;
1875
1876         if (write_lat_log)
1877                 setup_log(&td->lat_log);
1878         if (write_bw_log)
1879                 setup_log(&td->bw_log);
1880
1881         printf("Client%d: file=%s, rw=%d, prio=%d/%d, seq=%d, odir=%d, bs=%d-%d, rate=%d, aio=%d, aio_depth=%d\n", td->thread_number, filename, td->ddir, prioclass, prio, td->sequential, td->odirect, td->min_bs, td->max_bs, td->rate, td->use_aio, td->aio_depth);
1882         return 0;
1883 }
1884
/*
 * Clear *cpumask, then set cpu N in it for every bit N that is set in the
 * integer bitmask 'cpu'.
 */
static void fill_cpu_mask_ptr(cpu_set_t *cpumask, unsigned int cpu)
{
	unsigned int i;

	CPU_ZERO(cpumask);

	for (i = 0; i < sizeof(cpu) * 8; i++) {
		/* unsigned constant: shifting a signed 1 into bit 31 is undefined */
		if ((1U << i) & cpu)
			CPU_SET(i, cpumask);
	}
}

/*
 * fill_cpu_mask(mask, cpu) used to be a function taking the mask by
 * value, so it only ever changed its own copy and the caller's mask was
 * left untouched. Existing calls pass the mask object itself; this
 * wrapper keeps that call syntax and forwards the object's address.
 */
#define fill_cpu_mask(cpumask, cpu)	fill_cpu_mask_ptr(&(cpumask), (cpu))
1896
/*
 * Map a size suffix to its multiplier: k/K = 2^10, m/M = 2^20,
 * g/G = 2^30. Any other character yields 1.
 */
unsigned long get_mult(char c)
{
	unsigned long mult = 1;

	if (c == 'k' || c == 'K')
		mult = 1024;
	else if (c == 'm' || c == 'M')
		mult = 1024 * 1024;
	else if (c == 'g' || c == 'G')
		mult = 1024 * 1024 * 1024;

	return mult;
}
1913
/*
 * convert string after '=' into decimal value, noting any size suffix
 *
 * The suffix is the first non-blank character following the digits, so
 * the result does not depend on the line ending in a newline. Returns 0
 * on success, 1 if there is no '=' or the number is out of range.
 */
static int str_cnv(char *p, unsigned long long *val)
{
	char *str, *end;

	str = strstr(p, "=");
	if (!str)
		return 1;

	str++;

	/* errno must be clean for the ERANGE test below to mean anything */
	errno = 0;
	*val = strtoull(str, &end, 10);
	if (*val == ULLONG_MAX && errno == ERANGE)
		return 1;

	while (*end == ' ' || *end == '\t')
		end++;

	*val *= get_mult(*end);
	return 0;
}
1936
/*
 * If 'name' occurs anywhere in line 'p', convert the value after '=' with
 * str_cnv() and return its result; otherwise return 1.
 */
static int check_strcnv(char *p, char *name, unsigned long long *val)
{
	return strstr(p, name) ? str_cnv(p, val) : 1;
}
1944
/*
 * Returns 0 if 'name' occurs in line 'p' and 'option' occurs somewhere
 * after that occurrence of 'name', 1 otherwise.
 */
static int check_str(char *p, char *name, char *option)
{
	char *hit = strstr(p, name);

	if (hit && strstr(hit + strlen(name), option))
		return 0;

	return 1;
}
1958
/*
 * Parse "name=LOW-HIGH" (optionally written with " = ") from line 'p'
 * into *s and *e. Forms with a one-character suffix after each number are
 * tried first, and the suffix multipliers from get_mult() are applied;
 * the bare numeric forms are tried after that.
 *
 * Returns 0 when one of the forms matched, 1 otherwise.
 */
static int check_range(char *p, char *name, unsigned long *s, unsigned long *e)
{
	static const char *const suffixed[] = {
		"%s=%%lu%%c-%%lu%%c",
		"%s = %%lu%%c-%%lu%%c",
	};
	static const char *const plain[] = {
		"%s=%%lu-%%lu",
		"%s = %%lu-%%lu",
	};
	char fmt[128];
	char lo_sfx, hi_sfx;
	unsigned int i;

	for (i = 0; i < 2; i++) {
		sprintf(fmt, suffixed[i], name);
		if (sscanf(p, fmt, s, &lo_sfx, e, &hi_sfx) != 4)
			continue;

		*s *= get_mult(lo_sfx);
		*e *= get_mult(hi_sfx);
		return 0;
	}

	for (i = 0; i < 2; i++) {
		sprintf(fmt, plain[i], name);
		if (sscanf(p, fmt, s, e) == 2)
			return 0;
	}

	return 1;
}
1989
/*
 * Parse "name=N" or "name = N" from the start of line 'p' into *val.
 * Returns 0 if the value was read, 1 otherwise.
 */
static int check_int(char *p, char *name, unsigned int *val)
{
	char str[128];

	/* %u, not %d: the destination is an unsigned int */
	snprintf(str, sizeof(str), "%s=%%u", name);
	if (sscanf(p, str, val) == 1)
		return 0;

	snprintf(str, sizeof(str), "%s = %%u", name);
	if (sscanf(p, str, val) == 1)
		return 0;

	return 1;
}
2004
/*
 * Returns 1 if the line holds nothing but whitespace/control characters,
 * or if the first other character is ';' (a comment). Returns 0 as soon
 * as any other character is seen.
 */
static int is_empty_or_comment(char *line)
{
	const unsigned char *c;

	/*
	 * walk to the terminator instead of calling strlen() on every pass;
	 * unsigned char keeps the ctype calls defined for bytes >= 0x80
	 */
	for (c = (const unsigned char *) line; *c; c++) {
		if (*c == ';')
			return 1;
		if (!isspace(*c) && !iscntrl(*c))
			return 0;
	}

	return 1;
}
2018
2019 static int parse_jobs_ini(char *file)
2020 {
2021         unsigned int prioclass, prio, cpu, global;
2022         unsigned long long ull;
2023         unsigned long ul1, ul2;
2024         struct thread_data *td;
2025         char *string, *name;
2026         fpos_t off;
2027         FILE *f;
2028         char *p;
2029
2030         f = fopen(file, "r");
2031         if (!f) {
2032                 perror("fopen");
2033                 return 1;
2034         }
2035
2036         string = malloc(4096);
2037         name = malloc(256);
2038
2039         while ((p = fgets(string, 4096, f)) != NULL) {
2040                 if (is_empty_or_comment(p))
2041                         continue;
2042                 if (sscanf(p, "[%s]", name) != 1)
2043                         continue;
2044
2045                 global = !strncmp(name, "global", 6);
2046
2047                 name[strlen(name) - 1] = '\0';
2048
2049                 td = get_new_job(global);
2050                 if (!td)
2051                         break;
2052
2053                 prioclass = 2;
2054                 prio = 4;
2055
2056                 fgetpos(f, &off);
2057                 while ((p = fgets(string, 4096, f)) != NULL) {
2058                         if (is_empty_or_comment(p))
2059                                 continue;
2060                         if (strstr(p, "["))
2061                                 break;
2062                         if (!check_int(p, "rw", &td->ddir)) {
2063                                 fgetpos(f, &off);
2064                                 continue;
2065                         }
2066                         if (!check_int(p, "prio", &prio)) {
2067                                 fgetpos(f, &off);
2068                                 continue;
2069                         }
2070                         if (!check_int(p, "prioclass", &prioclass)) {
2071                                 fgetpos(f, &off);
2072                                 continue;
2073                         }
2074                         if (!check_int(p, "direct", &td->odirect)) {
2075                                 fgetpos(f, &off);
2076                                 continue;
2077                         }
2078                         if (!check_int(p, "rate", &td->rate)) {
2079                                 fgetpos(f, &off);
2080                                 continue;
2081                         }
2082                         if (!check_int(p, "ratemin", &td->ratemin)) {
2083                                 fgetpos(f, &off);
2084                                 continue;
2085                         }
2086                         if (!check_int(p, "ratecycle", &td->ratecycle)) {
2087                                 fgetpos(f, &off);
2088                                 continue;
2089                         }
2090                         if (!check_int(p, "thinktime", &td->thinktime)) {
2091                                 fgetpos(f, &off);
2092                                 continue;
2093                         }
2094                         if (!check_int(p, "cpumask", &cpu)) {
2095                                 fill_cpu_mask(td->cpumask, cpu);
2096                                 fgetpos(f, &off);
2097                                 continue;
2098                         }
2099                         if (!check_int(p, "fsync", &td->fsync_blocks)) {
2100                                 fgetpos(f, &off);
2101                                 continue;
2102                         }
2103                         if (!check_int(p, "startdelay", &td->start_delay)) {
2104                                 fgetpos(f, &off);
2105                                 continue;
2106                         }
2107                         if (!check_int(p, "timeout", &td->timeout)) {
2108                                 fgetpos(f, &off);
2109                                 continue;
2110                         }
2111                         if (!check_int(p, "invalidate",&td->invalidate_cache)) {
2112                                 fgetpos(f, &off);
2113                                 continue;
2114                         }
2115                         if (!check_int(p, "aio_depth", &td->aio_depth)) {
2116                                 fgetpos(f, &off);
2117                                 continue;
2118                         }
2119                         if (!check_int(p, "sync", &td->sync_io)) {
2120                                 fgetpos(f, &off);
2121                                 continue;
2122                         }
2123                         if (!check_int(p, "bwavgtime", &td->bw_avg_time)) {
2124                                 fgetpos(f, &off);
2125                                 continue;
2126                         }
2127                         if (!check_int(p, "create_serialize", &td->create_serialize)) {
2128                                 fgetpos(f, &off);
2129                                 continue;
2130                         }
2131                         if (!check_int(p, "create_fsync", &td->create_fsync)) {
2132                                 fgetpos(f, &off);
2133                                 continue;
2134                         }
2135                         if (!check_int(p, "loops", &td->loops)) {
2136                                 fgetpos(f, &off);
2137                                 continue;
2138                         }
2139                         if (!check_int(p, "verify", &td->verify)) {
2140                                 fgetpos(f, &off);
2141                                 continue;
2142                         }
2143                         if (!check_range(p, "bsrange", &ul1, &ul2)) {
2144                                 if (ul1 & 511)
2145                                         printf("bad min block size, must be a multiple of 512\n");
2146                                 else
2147                                         td->min_bs = ul1;
2148                                 if (ul2 & 511)
2149                                         printf("bad max block size, must be a multiple of 512\n");
2150                                 else
2151                                         td->max_bs = ul2;
2152                                 fgetpos(f, &off);
2153                                 continue;
2154                         }
2155                         if (!check_strcnv(p, "bs", &ull)) {
2156                                 if (ull & 511)
2157                                         printf("bad block size, must be a multiple of 512\n");
2158                                 else
2159                                         td->bs = ull;
2160                                 fgetpos(f, &off);
2161                                 continue;
2162                         }
2163                         if (!check_strcnv(p, "size", &td->file_size)) {
2164                                 fgetpos(f, &off);
2165                                 continue;
2166                         }
2167                         if (!check_strcnv(p, "offset", &td->file_offset)) {
2168                                 fgetpos(f, &off);
2169                                 continue;
2170                         }
2171                         if (!check_str(p, "mem", "malloc")) {
2172                                 td->mem_type = MEM_MALLOC;
2173                                 fgetpos(f, &off);
2174                                 continue;
2175                         }
2176                         if (!check_str(p, "mem", "shm")) {
2177                                 td->mem_type = MEM_SHM;
2178                                 fgetpos(f, &off);
2179                                 continue;
2180                         }
2181                         if (!strncmp(p, "sequential", 10)) {
2182                                 td->sequential = 1;
2183                                 fgetpos(f, &off);
2184                                 continue;
2185                         }
2186                         if (!strncmp(p, "random", 6)) {
2187                                 td->sequential = 0;
2188                                 fgetpos(f, &off);
2189                                 continue;
2190                         }
2191                         if (!strncmp(p, "aio", 3)) {
2192                                 td->use_aio = 1;
2193                                 fgetpos(f, &off);
2194                                 continue;
2195                         }
2196                         if (!strncmp(p, "create", 6)) {
2197                                 td->create_file = 1;
2198                                 fgetpos(f, &off);
2199                                 continue;
2200                         }
2201                         if (!strncmp(p, "overwrite", 9)) {
2202                                 td->overwrite = 1;
2203                                 fgetpos(f, &off);
2204                                 continue;
2205                         }
2206                         if (!strncmp(p, "exitall", 7)) {
2207                                 exitall_on_terminate = 1;
2208                                 fgetpos(f, &off);
2209                                 continue;
2210                         }
2211                         printf("Client%d: bad option %s\n",td->thread_number,p);
2212                 }
2213                 fsetpos(f, &off);
2214
2215                 if (add_job(td, name, prioclass, prio))
2216                         put_job(td);
2217         }
2218
2219         free(string);
2220         free(name);
2221         fclose(f);
2222         return 0;
2223 }
2224
2225 static int parse_options(int argc, char *argv[])
2226 {
2227         int i;
2228
2229         for (i = 1; i < argc; i++) {
2230                 char *parm = argv[i];
2231
2232                 if (parm[0] != '-')
2233                         break;
2234
2235                 parm++;
2236                 switch (*parm) {
2237                         case 's':
2238                                 parm++;
2239                                 def_thread.sequential = !!atoi(parm);
2240                                 break;
2241                         case 'b':
2242                                 parm++;
2243                                 def_thread.bs = atoi(parm);
2244                                 def_thread.bs <<= 10;
2245                                 if (!def_thread.bs) {
2246                                         printf("bad block size\n");
2247                                         def_thread.bs = DEF_BS;
2248                                 }
2249                                 break;
2250                         case 't':
2251                                 parm++;
2252                                 def_thread.timeout = atoi(parm);
2253                                 break;
2254                         case 'r':
2255                                 parm++;
2256                                 repeatable = !!atoi(parm);
2257                                 break;
2258                         case 'R':
2259                                 parm++;
2260                                 rate_quit = !!atoi(parm);
2261                                 break;
2262                         case 'o':
2263                                 parm++;
2264                                 def_thread.odirect = !!atoi(parm);
2265                                 break;
2266                         case 'f':
2267                                 if (i + 1 >= argc) {
2268                                         printf("-f needs file as arg\n");
2269                                         break;
2270                                 }
2271                                 ini_file = strdup(argv[i+1]);
2272                                 i++;
2273                                 break;
2274                         case 'l':
2275                                 write_lat_log = 1;
2276                                 break;
2277                         case 'w':
2278                                 write_bw_log = 1;
2279                                 break;
2280                         default:
2281                                 printf("bad option %s\n", argv[i]);
2282                                 break;
2283                 }
2284         }
2285
2286         return i;
2287 }
2288
2289 static void print_thread_status(struct thread_data *td, int nr_running,
2290                                 int t_rate, int m_rate)
2291 {
2292         printf("Threads now running: %d", nr_running);
2293         if (m_rate || t_rate)
2294                 printf(", commitrate %d/%dKiB/sec", t_rate, m_rate);
2295         printf(" : [%s]\r", run_str);
2296         fflush(stdout);
2297 }
2298
2299 static void check_str_update(struct thread_data *td, int n, int t, int m)
2300 {
2301         char c = run_str[td->thread_number - 1];
2302
2303         if (td->runstate == td->old_runstate)
2304                 return;
2305
2306         switch (td->runstate) {
2307                 case TD_REAPED:
2308                         c = '_';
2309                         break;
2310                 case TD_EXITED:
2311                         c = 'E';
2312                         break;
2313                 case TD_RUNNING:
2314                         if (td_read(td)) {
2315                                 if (td->sequential)
2316                                         c = 'R';
2317                                 else
2318                                         c = 'r';
2319                         } else {
2320                                 if (td->sequential)
2321                                         c = 'W';
2322                                 else
2323                                         c = 'w';
2324                         }
2325                         break;
2326                 case TD_VERIFYING:
2327                         c = 'V';
2328                         break;
2329                 case TD_CREATED:
2330                         c = 'C';
2331                         break;
2332                 case TD_NOT_CREATED:
2333                         c = 'P';
2334                         break;
2335                 default:
2336                         printf("state %d\n", td->runstate);
2337         }
2338
2339         run_str[td->thread_number - 1] = c;
2340         print_thread_status(td, n, t, m);
2341         td->old_runstate = td->runstate;
2342 }
2343
2344 static void reap_threads(int *nr_running, int *t_rate, int *m_rate)
2345 {
2346         int i;
2347
2348         /*
2349          * reap exited threads (TD_EXITED -> TD_REAPED)
2350          */
2351         for (i = 0; i < thread_number; i++) {
2352                 struct thread_data *td = &threads[i];
2353
2354                 check_str_update(td, *nr_running, *t_rate, *m_rate);
2355
2356                 if (td->runstate != TD_EXITED)
2357                         continue;
2358
2359                 td_set_runstate(td, TD_REAPED);
2360                 waitpid(td->pid, NULL, 0);
2361                 (*nr_running)--;
2362                 (*m_rate) -= td->ratemin;
2363                 (*t_rate) -= td->rate;
2364                 check_str_update(td, *nr_running, *t_rate, *m_rate);
2365
2366                 if (td->terminate)
2367                         continue;
2368         }
2369 }
2370
2371 static void run_threads(char *argv[])
2372 {
2373         struct timeval genesis;
2374         struct thread_data *td;
2375         unsigned long spent;
2376         int i, todo, nr_running, m_rate, t_rate;
2377
2378         printf("Starting %d threads\n", thread_number);
2379         fflush(stdout);
2380
2381         signal(SIGINT, sig_handler);
2382
2383         todo = thread_number;
2384         nr_running = 0;
2385         m_rate = t_rate = 0;
2386
2387         for (i = 0; i < thread_number; i++) {
2388                 td = &threads[i];
2389
2390                 if (!td->create_serialize)
2391                         continue;
2392
2393                 /*
2394                  * do file setup here so it happens sequentially,
2395                  * we don't want X number of threads getting their
2396                  * client data interspersed on disk
2397                  */
2398                 if (setup_file(td)) {
2399                         td_set_runstate(td, TD_REAPED);
2400                         todo--;
2401                 }
2402         }
2403
2404         gettimeofday(&genesis, NULL);
2405
2406         while (todo) {
2407                 /*
2408                  * create threads (TD_NOT_CREATED -> TD_CREATED)
2409                  */
2410                 for (i = 0; i < thread_number; i++) {
2411                         td = &threads[i];
2412
2413                         if (td->runstate != TD_NOT_CREATED)
2414                                 continue;
2415
2416                         /*
2417                          * never got a chance to start, killed by other
2418                          * thread for some reason
2419                          */
2420                         if (td->terminate) {
2421                                 todo--;
2422                                 continue;
2423                         }
2424
2425                         if (td->start_delay) {
2426                                 spent = mtime_since_now(&genesis);
2427
2428                                 if (td->start_delay * 1000 > spent)
2429                                         continue;
2430                         }
2431
2432                         td_set_runstate(td, TD_CREATED);
2433                         check_str_update(td, nr_running, t_rate, m_rate);
2434                         sem_init(&startup_sem, 1, 1);
2435                         todo--;
2436
2437                         if (fork())
2438                                 sem_wait(&startup_sem);
2439                         else {
2440                                 thread_main(shm_id, i, argv);
2441                                 exit(0);
2442                         }
2443                 }
2444
2445                 /*
2446                  * start created threads (TD_CREATED -> TD_RUNNING)
2447                  */
2448                 for (i = 0; i < thread_number; i++) {
2449                         struct thread_data *td = &threads[i];
2450
2451                         if (td->runstate != TD_CREATED)
2452                                 continue;
2453
2454                         td_set_runstate(td, TD_RUNNING);
2455                         nr_running++;
2456                         m_rate += td->ratemin;
2457                         t_rate += td->rate;
2458                         check_str_update(td, nr_running, t_rate, m_rate);
2459                         sem_post(&td->mutex);
2460                 }
2461
2462                 for (i = 0; i < thread_number; i++) {
2463                         struct thread_data *td = &threads[i];
2464
2465                         if (td->runstate == TD_RUNNING)
2466                                 run_str[td->thread_number - 1] = '+';
2467                         else if (td->runstate == TD_VERIFYING)
2468                                 run_str[td->thread_number - 1] = 'V';
2469                         else
2470                                 continue;
2471
2472                         check_str_update(td, nr_running, t_rate, m_rate);
2473                 }
2474
2475                 reap_threads(&nr_running, &t_rate, &m_rate);
2476
2477                 if (todo)
2478                         usleep(100000);
2479         }
2480
2481         while (nr_running) {
2482                 reap_threads(&nr_running, &t_rate, &m_rate);
2483                 usleep(10000);
2484         }
2485 }
2486
2487 int setup_thread_area(void)
2488 {
2489         /*
2490          * 1024 is too much on some machines, scale max_jobs if
2491          * we get a failure that looks like too large a shm segment
2492          */
2493         do {
2494                 int s = max_jobs * sizeof(struct thread_data);
2495
2496                 shm_id = shmget(0, s, IPC_CREAT | 0600);
2497                 if (shm_id != -1)
2498                         break;
2499                 if (errno != EINVAL) {
2500                         perror("shmget");
2501                         break;
2502                 }
2503
2504                 max_jobs >>= 1;
2505         } while (max_jobs);
2506
2507         if (shm_id == -1)
2508                 return 1;
2509
2510         threads = shmat(shm_id, NULL, 0);
2511         if (threads == (void *) -1) {
2512                 perror("shmat");
2513                 return 1;
2514         }
2515
2516         atexit(free_shm);
2517         return 0;
2518 }
2519
2520 int main(int argc, char *argv[])
2521 {
2522         static unsigned long max_run[2], min_run[2];
2523         static unsigned long max_bw[2], min_bw[2];
2524         static unsigned long io_mb[2], agg[2];
2525         int i;
2526
2527         if (setup_thread_area())
2528                 return 1;
2529
2530         if (sched_getaffinity(getpid(), sizeof(cpu_set_t), &def_thread.cpumask) == -1) {
2531                 perror("sched_getaffinity");
2532                 return 1;
2533         }
2534
2535         /*
2536          * fill globals
2537          */
2538         def_thread.ddir = DDIR_READ;
2539         def_thread.bs = DEF_BS;
2540         def_thread.min_bs = -1;
2541         def_thread.max_bs = -1;
2542         def_thread.odirect = DEF_ODIRECT;
2543         def_thread.ratecycle = DEF_RATE_CYCLE;
2544         def_thread.sequential = DEF_SEQUENTIAL;
2545         def_thread.timeout = DEF_TIMEOUT;
2546         def_thread.create_file = DEF_CREATE;
2547         def_thread.overwrite = DEF_OVERWRITE;
2548         def_thread.invalidate_cache = DEF_INVALIDATE;
2549         def_thread.sync_io = DEF_SYNCIO;
2550         def_thread.mem_type = MEM_MALLOC;
2551         def_thread.bw_avg_time = DEF_BWAVGTIME;
2552         def_thread.create_serialize = DEF_CREATE_SER;
2553         def_thread.create_fsync = DEF_CREATE_FSYNC;
2554         def_thread.loops = DEF_LOOPS;
2555         def_thread.verify = DEF_VERIFY;
2556
2557         i = parse_options(argc, argv);
2558
2559         if (!ini_file) {
2560                 printf("Need job file\n");
2561                 return 1;
2562         }
2563
2564         if (parse_jobs_ini(ini_file))
2565                 return 1;
2566
2567         if (!thread_number) {
2568                 printf("Nothing to do\n");
2569                 return 1;
2570         }
2571
2572         run_threads(argv);
2573
2574         min_bw[0] = min_run[0] = ~0UL;
2575         min_bw[1] = min_run[1] = ~0UL;
2576         io_mb[0] = io_mb[1] = 0;
2577         agg[0] = agg[1] = 0;
2578         for (i = 0; i < thread_number; i++) {
2579                 struct thread_data *td = &threads[i];
2580                 unsigned long bw = 0;
2581
2582                 if (!td->error) {
2583                         if (td->runtime < min_run[td->ddir])
2584                                 min_run[td->ddir] = td->runtime;
2585                         if (td->runtime > max_run[td->ddir])
2586                                 max_run[td->ddir] = td->runtime;
2587
2588                         if (td->runtime)
2589                                 bw = td->io_bytes / td->runtime;
2590                         if (bw < min_bw[td->ddir])
2591                                 min_bw[td->ddir] = bw;
2592                         if (bw > max_bw[td->ddir])
2593                                 max_bw[td->ddir] = bw;
2594
2595                         io_mb[td->ddir] += td->io_bytes >> 20;
2596                 }
2597
2598                 show_thread_status(td);
2599         }
2600         
2601         if (max_run[0])
2602                 agg[0] = (io_mb[0] * 1024 * 1000) / max_run[0];
2603         if (max_run[1])
2604                 agg[1] = (io_mb[1] * 1024 * 1000) / max_run[1];
2605
2606         printf("\nRun status:\n");
2607         if (max_run[DDIR_READ])
2608                 printf("   READ: io=%luMiB, aggrb=%lu, minb=%lu, maxb=%lu, mint=%lumsec, maxt=%lumsec\n", io_mb[0], agg[0], min_bw[0], max_bw[0], min_run[0], max_run[0]);
2609         if (max_run[DDIR_WRITE])
2610                 printf("  WRITE: io=%luMiB, aggrb=%lu, minb=%lu, maxb=%lu, mint=%lumsec, maxt=%lumsec\n", io_mb[1], agg[1], min_bw[1], max_bw[1], min_run[1], max_run[1]);
2611
2612         return 0;
2613 }