[PATCH] fio: fix parameters after -f and fix recent variance prints
[disktools.git] / fio.c
1 /*
2  * fio - the flexible io tester
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <unistd.h>
24 #include <fcntl.h>
25 #include <string.h>
26 #include <errno.h>
27 #include <signal.h>
28 #include <time.h>
29 #include <ctype.h>
30 #include <sched.h>
31 #include <libaio.h>
32 #include <math.h>
33 #include <sys/time.h>
34 #include <sys/types.h>
35 #include <sys/stat.h>
36 #include <sys/wait.h>
37 #include <semaphore.h>
38 #include <sys/ipc.h>
39 #include <sys/shm.h>
40 #include <asm/unistd.h>
41
/*
 * upper bound on the number of job slots get_new_job() will hand out
 */
#define MAX_JOBS        (1024)

/*
 * assume we don't have _get either, if _set isn't defined
 */
#ifndef __NR_ioprio_set

/*
 * fallback syscall numbers for ioprio_set/ioprio_get, one pair per
 * architecture; any other architecture fails the build
 */
#if defined(__i386__)
#define __NR_ioprio_set         289
#define __NR_ioprio_get         290
#elif defined(__powerpc__) || defined(__powerpc64__)
#define __NR_ioprio_set         273
#define __NR_ioprio_get         274
#elif defined(__x86_64__)
#define __NR_ioprio_set         251
#define __NR_ioprio_get         252
#elif defined(__ia64__)
#define __NR_ioprio_set         1274
#define __NR_ioprio_get         1275
#elif defined(__alpha__)
#define __NR_ioprio_set         442
#define __NR_ioprio_get         443
#elif defined(__s390x__) || defined(__s390__)
#define __NR_ioprio_set         282
#define __NR_ioprio_get         283
#else
#error "Unsupported arch"
#endif

#endif
72
/*
 * Invoke the ioprio_set system call through syscall(), passing the
 * three arguments along unchanged, and hand back its result.
 */
static int ioprio_set(int which, int who, int ioprio)
{
        long rc = syscall(__NR_ioprio_set, which, who, ioprio);

        return rc;
}
77
/*
 * values for the 'which' argument of ioprio_set()
 */
enum {
        IOPRIO_WHO_PROCESS = 1,
        IOPRIO_WHO_PGRP,
        IOPRIO_WHO_USER,
};

/*
 * the priority class is stored above this bit position in an ioprio
 * value, see add_job()
 */
#define IOPRIO_CLASS_SHIFT      13

/*
 * alignment mask used by ALIGN() below, i.e. buffers get aligned on a
 * 4096 byte boundary
 */
#define MASK    (4095)

/*
 * default settings
 */
#define DEF_BS          (4096)
#define DEF_TIMEOUT     (30)
#define DEF_RATE_CYCLE  (1000)
#define DEF_ODIRECT     (1)
#define DEF_SEQUENTIAL  (1)
#define DEF_WRITESTAT   (0)
#define DEF_RAND_REPEAT (1)

/*
 * round a pointer up to the next (MASK + 1) byte boundary
 */
#define ALIGN(buf)      (char *) (((unsigned long) (buf) + MASK) & ~(MASK))

/*
 * if set, random jobs are seeded with a fixed value instead of
 * /dev/random, see init_random_state()
 */
static int repeatable = DEF_RAND_REPEAT;

/*
 * if set, a job missing its minimum rate stops all jobs, see
 * check_min_rate()
 */
static int rate_quit = 1;

/*
 * number of job slots currently in use in threads[]
 */
static int thread_number;
static char *ini_file;

static int shm_id;

/*
 * io direction of a job
 */
enum {
        DDIR_READ = 0,
        DDIR_WRITE,
};

/*
 * thread life cycle
 */
enum {
        TD_NOT_CREATED = 0,
        TD_CREATED,
        TD_STARTED,
        TD_EXITED,
        TD_REAPED,
};
121
/*
 * Per-job state. One instance per job lives in the threads[] array, and
 * def_thread holds the values new jobs inherit in get_new_job().
 */
struct thread_data {
        char file_name[256];            /* file opened by thread_main() */
        int thread_number;              /* 1-based job number */
        int error;                      /* errno style error code, 0 if none */
        int fd;
        pid_t pid;
        char *buf;                      /* aligned io buffer */
        volatile int terminate;         /* set by sig_handler() to stop the job */
        volatile int runstate;          /* TD_* life cycle value */
        unsigned int ddir;              /* DDIR_READ or DDIR_WRITE */
        unsigned int ioprio;            /* (class << IOPRIO_CLASS_SHIFT) | prio */
        unsigned int sequential;        /* 0 means random offsets */
        unsigned int bs;                /* block size in bytes */
        unsigned int odirect;           /* open the file with O_DIRECT */
        unsigned int delay_sleep;       /* usec to sleep before each io */
        unsigned int fsync_blocks;      /* fsync every this many blocks */
        unsigned int start_delay;
        unsigned int timeout;           /* sec */
        unsigned int use_aio;
        cpu_set_t cpumask;

        /*
         * libaio state, set up by init_aio()
         */
        io_context_t *aio_ctx;
        struct iocb *aio_iocbs;         /* aio_depth entries */
        unsigned int aio_depth;
        unsigned int aio_cur_depth;     /* iocbs currently in flight */
        struct io_event *aio_events;
        char *aio_iocbs_status;         /* per iocb slot: 1 in use, 0 free */

        /*
         * rate throttling and minimum rate checking
         */
        unsigned int rate;
        unsigned int ratemin;
        unsigned int ratecycle;         /* msec */
        unsigned long rate_usec_cycle;  /* usec budget per io */
        long rate_pending_usleep;       /* usec of sleep owed, may go negative */
        unsigned long rate_blocks;      /* io_blocks at the last rate sample */
        struct timeval lastrate;

        unsigned long max_latency;      /* msec */
        unsigned long min_latency;      /* msec */
        unsigned long runtime;          /* msec, from mtime_since_now() */
        unsigned long blocks;           /* number of blocks to do io on */
        unsigned long io_blocks;        /* blocks completed so far */
        unsigned long last_block;       /* next block for sequential io */
        sem_t mutex;                    /* job waits on this before starting io */
        struct drand48_data random_state;

        /*
         * bandwidth and latency stats
         */
        unsigned long stat_time;
        unsigned long stat_time_sq;
        unsigned long stat_time_samples;
        unsigned long stat_io_blocks;
        unsigned long stat_bw;
        unsigned long stat_bw_sq;
        unsigned long stat_bw_samples;
        struct timeval stat_sample_time;

        struct timeval start;
};

/*
 * threads is the array of job slots, def_thread carries the defaults
 * copied into every new job
 */
static struct thread_data *threads;
static struct thread_data def_thread;

static sem_t startup_sem;
186
187 static void sig_handler(int sig)
188 {
189         int i;
190
191         for (i = 0; i < thread_number; i++) {
192                 struct thread_data *td = &threads[i];
193
194                 td->terminate = 1;
195                 td->start_delay = 0;
196         }
197 }
198
199 static int init_random_state(struct thread_data *td)
200 {
201         unsigned long seed = 123;
202
203         if (td->sequential)
204                 return 0;
205
206         if (!repeatable) {
207                 int fd = open("/dev/random", O_RDONLY);
208
209                 if (fd == -1) {
210                         td->error = errno;
211                         return 1;
212                 }
213
214                 if (read(fd, &seed, sizeof(seed)) < (int) sizeof(seed)) {
215                         td->error = EIO;
216                         close(fd);
217                         return 1;
218                 }
219
220                 close(fd);
221         }
222
223         srand48_r(seed, &td->random_state);
224         return 0;
225 }
226
/*
 * Return the number of microseconds from 's' to 'e'.
 *
 * The difference is computed in integer arithmetic. If 'e' lies before
 * 's' (the clock stepped backwards), 0 is returned rather than
 * converting a negative value to an unsigned type.
 */
static unsigned long utime_since(struct timeval *s, struct timeval *e)
{
        long long usecs;

        usecs = (long long) (e->tv_sec - s->tv_sec) * 1000000LL;
        usecs += e->tv_usec - s->tv_usec;

        if (usecs < 0)
                return 0;

        return usecs;
}
242
/*
 * Return the number of whole milliseconds from 's' to 'e'.
 *
 * The difference is computed in integer arithmetic and truncated to
 * milliseconds. If 'e' lies before 's' (the clock stepped backwards),
 * 0 is returned rather than converting a negative value to an unsigned
 * type.
 */
static unsigned long mtime_since(struct timeval *s, struct timeval *e)
{
        long long usecs;

        usecs = (long long) (e->tv_sec - s->tv_sec) * 1000000LL;
        usecs += e->tv_usec - s->tv_usec;

        if (usecs < 0)
                return 0;

        return usecs / 1000;
}
259
/*
 * Milliseconds elapsed between 's' and the current time of day.
 */
static unsigned long mtime_since_now(struct timeval *s)
{
        struct timeval now;

        gettimeofday(&now, NULL);

        return mtime_since(s, &now);
}
267
/*
 * Convert the time stamp in 's' to milliseconds.
 */
static inline unsigned long msec_now(struct timeval *s)
{
        return s->tv_usec / 1000 + s->tv_sec * 1000;
}
272
273 static unsigned long get_next_offset(struct thread_data *td)
274 {
275         unsigned long b;
276         long r;
277
278         if (!td->sequential) {
279                 lrand48_r(&td->random_state, &r);
280                 b = (1+(double) (td->blocks-1) * r / (RAND_MAX+1.0));
281         } else {
282                 b = td->last_block;
283                 td->last_block++;
284         }
285
286         return b * td->bs;
287 }
288
289 static void add_stat_sample(struct thread_data *td, unsigned long msec)
290 {
291         unsigned long spent;
292
293         td->stat_time += msec;
294         td->stat_time_sq += msec * msec;
295         td->stat_time_samples++;
296
297         spent = mtime_since_now(&td->stat_sample_time);
298         if (spent >= 500) {
299                 unsigned long rate = ((td->io_blocks - td->stat_io_blocks) * td->bs) / spent;
300
301                 td->stat_bw += rate;
302                 td->stat_bw_sq += rate * rate;
303                 gettimeofday(&td->stat_sample_time, NULL);
304                 td->stat_io_blocks = td->io_blocks;
305                 td->stat_bw_samples++;
306         }
307 }
308
/*
 * Sleep for 'usec' microseconds, restarting the sleep with the
 * remaining time whenever a signal interrupts it. Zero or negative
 * values return at once.
 *
 * The request is split into seconds and nanoseconds, since nanosleep()
 * rejects a tv_nsec of one second or more.
 */
static void usec_sleep(int usec)
{
        struct timespec req, rem;

        if (usec <= 0)
                return;

        req.tv_sec = usec / 1000000;
        req.tv_nsec = (long) (usec % 1000000) * 1000;

        while (nanosleep(&req, &rem) == -1 && errno == EINTR)
                req = rem;
}
323
324 static void rate_throttle(struct thread_data *td, unsigned long time_spent)
325 {
326         if (!td->rate)
327                 return;
328
329         if (time_spent < td->rate_usec_cycle) {
330                 unsigned long s = td->rate_usec_cycle - time_spent;
331
332                 td->rate_pending_usleep += s;
333                 if (td->rate_pending_usleep >= 100000) {
334                         usec_sleep(td->rate_pending_usleep);
335                         td->rate_pending_usleep = 0;
336                 }
337         } else {
338                 long overtime = time_spent - td->rate_usec_cycle;
339
340                 td->rate_pending_usleep -= overtime;
341         }
342 }
343
344 static int check_min_rate(struct thread_data *td, struct timeval *now)
345 {
346         unsigned long spent;
347         unsigned long rate;
348
349         /*
350          * allow a 2 second settle period in the beginning
351          */
352         if (mtime_since(&td->start, now) < 2000)
353                 return 0;
354
355         /*
356          * if rate blocks is set, sample is running
357          */
358         if (td->rate_blocks) {
359                 spent = mtime_since(&td->lastrate, now);
360                 if (spent < td->ratecycle)
361                         return 0;
362
363                 rate = ((td->io_blocks - td->rate_blocks) * td->bs) / spent;
364                 if (rate < td->ratemin) {
365                         printf("Client%d: min rate %d not met, got %ldKiB/sec\n", td->thread_number, td->ratemin, rate);
366                         if (rate_quit)
367                                 sig_handler(0);
368                         return 1;
369                 }
370         }
371
372         td->rate_blocks = td->io_blocks;
373         memcpy(&td->lastrate, now, sizeof(*now));
374         return 0;
375 }
376
377 static inline int runtime_exceeded(struct thread_data *td, struct timeval *t)
378 {
379         if (mtime_since(&td->start, t) >= td->timeout * 1000)
380                 return 1;
381
382         return 0;
383 }
384
385 #define should_fsync(td)        ((td)->ddir == DDIR_WRITE && !(td)->odirect)
386
387 static void do_sync_io(struct thread_data *td)
388 {
389         struct timeval s, e;
390         unsigned long blocks, msec, usec;
391
392         for (blocks = 0; blocks < td->blocks; blocks++) {
393                 off_t offset = get_next_offset(td);
394                 int ret;
395
396                 if (td->terminate)
397                         break;
398
399                 if (lseek(td->fd, offset, SEEK_SET) == -1) {
400                         td->error = errno;
401                         break;
402                 }
403
404                 if (td->delay_sleep)
405                         usec_sleep(td->delay_sleep);
406
407                 gettimeofday(&s, NULL);
408
409                 if (td->ddir == DDIR_READ)
410                         ret = read(td->fd, td->buf, td->bs);
411                 else
412                         ret = write(td->fd, td->buf, td->bs);
413
414                 if (ret < (int) td->bs) {
415                         if (ret == -1)
416                                 td->error = errno;
417                         break;
418                 }
419
420                 td->io_blocks++;
421
422                 if (should_fsync(td) && td->fsync_blocks &&
423                     (td->io_blocks % td->fsync_blocks) == 0)
424                         fsync(td->fd);
425
426                 gettimeofday(&e, NULL);
427
428                 usec = utime_since(&s, &e);
429
430                 rate_throttle(td, usec);
431
432                 if (check_min_rate(td, &e)) {
433                         td->error = ENODATA;
434                         break;
435                 }
436
437                 msec = usec / 1000;
438                 add_stat_sample(td, msec);
439
440                 if (msec < td->min_latency)
441                         td->min_latency = msec;
442                 if (msec > td->max_latency)
443                         td->max_latency = msec;
444
445                 if (runtime_exceeded(td, &e))
446                         break;
447         }
448
449         if (should_fsync(td))
450                 fsync(td->fd);
451 }
452
453 static void aio_put_iocb(struct thread_data *td, struct iocb *iocb)
454 {
455         long offset = ((long) iocb - (long) td->aio_iocbs)/ sizeof(struct iocb);
456
457         td->aio_iocbs_status[offset] = 0;
458         td->aio_cur_depth--;
459 }
460
461 static struct iocb *aio_get_iocb(struct thread_data *td, struct timeval *t)
462 {
463         struct iocb *iocb = NULL;
464         unsigned int i;
465
466         for (i = 0; i < td->aio_depth; i++) {
467                 if (td->aio_iocbs_status[i] == 0) {
468                         td->aio_iocbs_status[i] = 1;
469                         iocb = &td->aio_iocbs[i];
470                         break;
471                 }
472         }
473
474         if (iocb) {
475                 off_t off = get_next_offset(td);
476                 char *p = td->buf + i * td->bs;
477
478                 if (td->ddir == DDIR_READ)
479                         io_prep_pread(iocb, td->fd, p, td->bs, off);
480                 else
481                         io_prep_pwrite(iocb, td->fd, p, td->bs, off);
482
483                 io_set_callback(iocb, (io_callback_t) msec_now(t));
484         }
485
486         return iocb;
487 }
488
489 static int aio_submit(struct thread_data *td, struct iocb *iocb)
490 {
491         int ret;
492
493         do {
494                 ret = io_submit(*td->aio_ctx, 1, &iocb);
495                 if (ret == 1)
496                         return 0;
497
498                 if (errno == EINTR)
499                         continue;
500                 else if (errno == EAGAIN)
501                         usleep(100);
502                 else
503                         break;
504         } while (1);
505
506         return 1;
507 }
508
509 #define iocb_time(iocb) ((unsigned long) (iocb)->data)
510
511 static void do_async_io(struct thread_data *td)
512 {
513         struct timeval s, e;
514         unsigned long blocks, msec, usec;
515
516         for (blocks = 0; blocks < td->blocks; blocks++) {
517                 struct timespec ts = { .tv_sec = 0, .tv_nsec = 0};
518                 struct timespec *timeout;
519                 int ret, i, min_evts = 0;
520                 struct iocb *iocb;
521
522                 if (td->terminate)
523                         break;
524
525                 if (td->delay_sleep)
526                         usec_sleep(td->delay_sleep);
527
528                 gettimeofday(&s, NULL);
529
530                 iocb = aio_get_iocb(td, &s);
531
532                 ret = aio_submit(td, iocb);
533                 if (ret) {
534                         td->error = errno;
535                         break;
536                 }
537
538                 td->aio_cur_depth++;
539
540                 if (td->aio_cur_depth < td->aio_depth) {
541                         timeout = &ts;
542                         min_evts = 0;
543                 } else {
544                         timeout = NULL;
545                         min_evts = 1;
546                 }
547
548                 ret = io_getevents(*td->aio_ctx, min_evts, td->aio_cur_depth, td->aio_events, timeout);
549                 if (ret < 0) {
550                         td->error = errno;
551                         break;
552                 } else if (!ret)
553                         continue;
554
555                 gettimeofday(&e, NULL);
556
557                 for (i = 0; i < ret; i++) {
558                         struct io_event *ev = td->aio_events + i;
559
560                         td->io_blocks++;
561
562                         iocb = ev->obj;
563
564                         msec = msec_now(&e) - iocb_time(iocb);
565                         add_stat_sample(td, msec);
566
567                         if (msec < td->min_latency)
568                                 td->min_latency = msec;
569                         if (msec > td->max_latency)
570                                 td->max_latency = msec;
571
572                         aio_put_iocb(td, iocb);
573                 }
574
575                 /*
576                  * the rate is batched for now, it should work for batches
577                  * of completions except the very first one which may look
578                  * a little bursty
579                  */
580                 usec = utime_since(&s, &e);
581
582                 rate_throttle(td, usec);
583
584                 if (check_min_rate(td, &e)) {
585                         td->error = ENODATA;
586                         break;
587                 }
588
589                 if (runtime_exceeded(td, &e))
590                         break;
591         }
592 }
593
594 static void cleanup_pending_aio(struct thread_data *td)
595 {
596         struct timespec ts = { .tv_sec = 0, .tv_nsec = 0};
597         unsigned int i;
598         int r;
599
600         /*
601          * get immediately available events, if any
602          */
603         r = io_getevents(*td->aio_ctx, 0, td->aio_cur_depth, td->aio_events, &ts);
604         if (r > 0) {
605                 for (i = 0; i < r; i++)
606                         aio_put_iocb(td, &td->aio_iocbs[i]);
607         }
608
609         /*
610          * now cancel remaining active events
611          */
612         for (i = 0; i < td->aio_depth; i++) {
613                 if (td->aio_iocbs_status[i] == 0)
614                         continue;
615
616                 r = io_cancel(*td->aio_ctx, &td->aio_iocbs[i], td->aio_events);
617                 if (!r)
618                         aio_put_iocb(td, &td->aio_iocbs[i]);
619         }
620
621         if (td->aio_cur_depth)
622                 io_getevents(*td->aio_ctx, td->aio_cur_depth, td->aio_cur_depth, td->aio_events, NULL);
623 }
624
625 static void cleanup_aio(struct thread_data *td)
626 {
627         if (td->aio_cur_depth)
628                 cleanup_pending_aio(td);
629
630         if (td->aio_ctx) {
631                 io_destroy(*td->aio_ctx);
632                 free(td->aio_ctx);
633         }
634         if (td->aio_iocbs)
635                 free(td->aio_iocbs);
636         if (td->aio_events)
637                 free(td->aio_events);
638         if (td->aio_iocbs_status)
639                 free(td->aio_iocbs_status);
640 }
641
642 static int init_aio(struct thread_data *td)
643 {
644         td->aio_ctx = malloc(sizeof(*td->aio_ctx));
645
646         if (io_queue_init(td->aio_depth, td->aio_ctx)) {
647                 td->error = errno;
648                 return 1;
649         }
650
651         td->aio_iocbs = malloc(td->aio_depth * sizeof(struct iocb));
652         td->aio_events = malloc(td->aio_depth * sizeof(struct io_event));
653         td->aio_iocbs_status = malloc(td->aio_depth * sizeof(char));
654         return 0;
655 }
656
657 static void *thread_main(int shm_id, int offset, char *argv[])
658 {
659         struct thread_data *td;
660         void *data, *ptr = NULL;
661         struct stat st;
662         int ret = 1, flags;
663
664         setsid();
665
666         data = shmat(shm_id, NULL, 0);
667         td = data + offset * sizeof(struct thread_data);
668         td->pid = getpid();
669
670         td->fd = -1;
671
672         if (sched_setaffinity(td->pid, sizeof(td->cpumask), &td->cpumask) == -1) {
673                 td->error = errno;
674                 goto err;
675         }
676
677         printf("Client%d (pid=%u) started\n", td->thread_number, td->pid);
678
679         sprintf(argv[0], "fio%d", offset);
680
681         flags = 0;
682         if (td->odirect)
683                 flags |= O_DIRECT;
684
685         if (td->ddir == DDIR_READ)
686                 td->fd = open(td->file_name, flags | O_RDONLY);
687         else
688                 td->fd = open(td->file_name, flags | O_WRONLY | O_CREAT | O_TRUNC, 0644);
689
690         if (td->fd == -1) {
691                 td->error = errno;
692                 goto err;
693         }
694
695         if (td->use_aio && init_aio(td))
696                 goto err;
697
698         if (init_random_state(td))
699                 goto err;
700
701         if (td->ddir == DDIR_READ) {
702                 if (fstat(td->fd, &st) == -1) {
703                         td->error = errno;
704                         goto err;
705                 }
706
707                 td->blocks = st.st_size / td->bs;
708                 if (!td->blocks) {
709                         td->error = EINVAL;
710                         goto err;
711                 }
712         } else
713                 td->blocks = 1024 * 1024 * 1024 / td->bs;
714
715         if (td->ioprio) {
716                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, td->ioprio) == -1) {
717                         td->error = errno;
718                         goto err;
719                 }
720         }
721
722         sem_post(&startup_sem);
723         sem_wait(&td->mutex);
724
725         gettimeofday(&td->start, NULL);
726
727         if (td->ratemin)
728                 memcpy(&td->lastrate, &td->start, sizeof(td->start));
729
730         memcpy(&td->stat_sample_time, &td->start, sizeof(td->start));
731
732         if (!td->use_aio) {
733                 ptr = malloc(td->bs + MASK);
734                 td->buf = ALIGN(ptr);
735                 do_sync_io(td);
736         } else {
737                 ptr = malloc(td->bs * td->aio_depth + MASK);
738                 td->buf = ALIGN(ptr);
739                 do_async_io(td);
740         }
741
742         td->runtime = mtime_since_now(&td->start);
743         ret = 0;
744 err:
745         if (td->use_aio)
746                 cleanup_aio(td);
747         if (td->fd != -1) {
748                 close(td->fd);
749                 td->fd = -1;
750         }
751         if (ret) {
752                 sem_post(&startup_sem);
753                 sem_wait(&td->mutex);
754         }
755         if (ptr)
756                 free(ptr);
757         td->runstate = TD_EXITED;
758         shmdt(data);
759         return NULL;
760 }
761
762 static void free_shm(void)
763 {
764         shmdt(threads);
765 }
766
767 static void show_thread_status(struct thread_data *td)
768 {
769         int prio, prio_class;
770         unsigned long bw = 0;
771         double n_lat, n_bw, m_lat, m_bw, dev_lat, dev_bw;
772
773         if (!td->io_blocks && !td->error)
774                 return;
775
776         if (td->runtime)
777                 bw = (td->io_blocks * td->bs) / td->runtime;
778
779         prio = td->ioprio & 0xff;
780         prio_class = td->ioprio >> IOPRIO_CLASS_SHIFT;
781
782         n_lat = (double) td->stat_time_samples;
783         n_bw = (double) td->stat_bw_samples;
784
785         m_lat = (double) td->stat_time / n_lat;
786         dev_lat = sqrt(((double) td->stat_time_sq - (m_lat * m_lat) / n_lat) / (n_lat - 1));
787         m_bw = (double) td->stat_bw / n_bw;
788         dev_bw = sqrt(((double) td->stat_bw_sq - (m_bw * m_bw) / n_bw) / (n_bw - 1));
789
790         printf("Client%d: err=%2d, io=%6luMiB, bw=%6luKiB/sec, latmax=%5lumsec, latavg=%5.02fmsec, latdev=%5.02fmsec, bwavg=%5.02fKiB/sec, bwdev=%5.02fKiB/sec\n", td->thread_number, td->error, td->io_blocks * td->bs >> 20, bw, td->max_latency, m_lat, dev_lat, m_bw, dev_bw);
791 }
792
793 static int setup_rate(struct thread_data *td)
794 {
795         int nr_reads_per_sec;
796
797         if (!td->rate)
798                 return 0;
799
800         if (td->rate < td->ratemin) {
801                 fprintf(stderr, "min rate larger than nominal rate\n");
802                 return -1;
803         }
804
805         nr_reads_per_sec = td->rate * 1024 / td->bs;
806         td->rate_usec_cycle = 1000000 / nr_reads_per_sec;
807         td->rate_pending_usleep = 0;
808         return 0;
809 }
810
811 static struct thread_data *get_new_job(int global)
812 {
813         struct thread_data *td;
814
815         if (global)
816                 return &def_thread;
817         if (thread_number >= MAX_JOBS)
818                 return NULL;
819
820         td = &threads[thread_number++];
821         memset(td, 0, sizeof(*td));
822
823         td->thread_number = thread_number;
824         td->ddir = def_thread.ddir;
825         td->bs = def_thread.bs;
826         td->odirect = def_thread.odirect;
827         td->ratecycle = def_thread.ratecycle;
828         td->sequential = def_thread.sequential;
829         td->timeout = def_thread.timeout;
830         memcpy(&td->cpumask, &def_thread.cpumask, sizeof(td->cpumask));
831
832         return td;
833 }
834
835 static void put_job(struct thread_data *td)
836 {
837         memset(&threads[td->thread_number - 1], 0, sizeof(*td));
838         thread_number--;
839 }
840
841 static int add_job(struct thread_data *td, const char *filename, int prioclass,
842                    int prio)
843 {
844         if (td == &def_thread)
845                 return 0;
846
847         strcpy(td->file_name, filename);
848         sem_init(&td->mutex, 1, 0);
849         td->min_latency = 10000000;
850         td->ioprio = (prioclass << IOPRIO_CLASS_SHIFT) | prio;
851
852         if (td->use_aio && !td->aio_depth)
853                 td->aio_depth = 1;
854
855         if (setup_rate(td))
856                 return -1;
857
858         printf("Client%d: file=%s, rw=%d, prio=%d/%d, seq=%d, odir=%d, bs=%d, rate=%d, aio=%d, aio_depth=%d\n", td->thread_number, filename, td->ddir, prioclass, prio, td->sequential, td->odirect, td->bs, td->rate, td->use_aio, td->aio_depth);
859         return 0;
860 }
861
/*
 * Build a cpu set from an integer bit mask: bit N set in 'cpu' selects
 * cpu N. Any previous content of the set is cleared first.
 */
static void fill_cpu_mask_ptr(cpu_set_t *cpumask, unsigned int cpu)
{
        unsigned int i;

        CPU_ZERO(cpumask);

        for (i = 0; i < sizeof(cpu) * 8; i++) {
                if ((1U << i) & cpu)
                        CPU_SET(i, cpumask);
        }
}

/*
 * Callers pass the cpu_set_t itself. Taking its address here makes the
 * result land in the caller's set instead of in a by-value copy that
 * is thrown away on return.
 */
#define fill_cpu_mask(cpumask, cpu)     fill_cpu_mask_ptr(&(cpumask), (cpu))
873
/*
 * Copy the value at the start of 'input' to 'output', stopping at the
 * first ',' or '}' or at the end of the string, and terminate it. The
 * caller must supply an output buffer large enough for the value.
 */
static void fill_option(const char *input, char *output)
{
        size_t len = strcspn(input, ",}");

        memcpy(output, input, len);
        output[len] = '\0';
}
886
887 /*
888  * job key words:
889  *
890  * file=
891  * bs=
892  * rw=
893  * direct=
894  */
895 static void parse_jobs_cmd(int argc, char *argv[], int index)
896 {
897         struct thread_data *td;
898         unsigned int prio, prioclass, cpu;
899         char *string, *filename, *p, *c;
900         int i;
901
902         string = malloc(256);
903         filename = malloc(256);
904
905         for (i = index; i < argc; i++) {
906                 p = argv[i];
907
908                 c = strpbrk(p, "{");
909                 if (!c)
910                         break;
911
912                 filename[0] = 0;
913
914                 td = get_new_job(0);
915                 if (!td)
916                         break;
917
918                 prioclass = 2;
919                 prio = 4;
920
921                 c = strstr(p, "rw=");
922                 if (c) {
923                         c += 3;
924                         if (*c == '0')
925                                 td->ddir = DDIR_READ;
926                         else
927                                 td->ddir = DDIR_WRITE;
928                 }
929
930                 c = strstr(p, "prio=");
931                 if (c) {
932                         c += 5;
933                         prio = *c - '0';
934                 }
935
936                 c = strstr(p, "prioclass=");
937                 if (c) {
938                         c += 10;
939                         prioclass = *c - '0';
940                 }
941
942                 c = strstr(p, "file=");
943                 if (c) {
944                         c += 5;
945                         fill_option(c, filename);
946                 }
947
948                 c = strstr(p, "bs=");
949                 if (c) {
950                         c += 3;
951                         fill_option(c, string);
952                         td->bs = strtoul(string, NULL, 10);
953                         td->bs <<= 10;
954                 }
955
956                 c = strstr(p, "direct=");
957                 if (c) {
958                         c += 7;
959                         if (*c != '0')
960                                 td->odirect = 1;
961                         else
962                                 td->odirect = 0;
963                 }
964
965                 c = strstr(p, "delay=");
966                 if (c) {
967                         c += 6;
968                         fill_option(c, string);
969                         td->delay_sleep = strtoul(string, NULL, 10);
970                 }
971
972                 c = strstr(p, "rate=");
973                 if (c) {
974                         c += 5;
975                         fill_option(c, string);
976                         td->rate = strtoul(string, NULL, 10);
977                 }
978
979                 c = strstr(p, "ratemin=");
980                 if (c) {
981                         c += 8;
982                         fill_option(c, string);
983                         td->ratemin = strtoul(string, NULL, 10);
984                 }
985
986                 c = strstr(p, "ratecycle=");
987                 if (c) {
988                         c += 10;
989                         fill_option(c, string);
990                         td->ratecycle = strtoul(string, NULL, 10);
991                 }
992
993                 c = strstr(p, "cpumask=");
994                 if (c) {
995                         c += 8;
996                         fill_option(c, string);
997                         cpu = strtoul(string, NULL, 10);
998                         fill_cpu_mask(td->cpumask, cpu);
999                 }
1000
1001                 c = strstr(p, "fsync=");
1002                 if (c) {
1003                         c += 6;
1004                         fill_option(c, string);
1005                         td->fsync_blocks = strtoul(string, NULL, 10);
1006                 }
1007
1008                 c = strstr(p, "startdelay=");
1009                 if (c) {
1010                         c += 11;
1011                         fill_option(c, string);
1012                         td->start_delay = strtoul(string, NULL, 10);
1013                 }
1014
1015                 c = strstr(p, "timeout=");
1016                 if (c) {
1017                         c += 8;
1018                         fill_option(c, string);
1019                         td->timeout = strtoul(string, NULL, 10);
1020                 }
1021
1022                 c = strstr(p, "aio_depth=");
1023                 if (c) {
1024                         c += 10;
1025                         fill_option(c, string);
1026                         td->aio_depth = strtoul(string, NULL, 10);
1027                 }
1028
1029                 c = strstr(p, "aio");
1030                 if (c)
1031                         td->use_aio = 1;
1032
1033                 c = strstr(p, "random");
1034                 if (c)
1035                         td->sequential = 0;
1036                 c = strstr(p, "sequential");
1037                 if (c)
1038                         td->sequential = 1;
1039
1040                 if (add_job(td, filename, prioclass, prio))
1041                         put_job(td);
1042         }
1043
1044         free(string);
1045         free(filename);
1046 }
1047
/*
 * Try to parse the line 'p' as "<name>=<value>" (optionally with blanks
 * around the '='), where <value> is a decimal integer.
 *
 * p:    line to parse; the option name must start at the first character
 * name: option name to look for
 * val:  receives the parsed value on a match, untouched otherwise
 *
 * Returns 0 if the line matched and *val was stored, 1 if not.
 */
static int check_int(char *p, char *name, unsigned int *val)
{
	char fmt[128];
	int len;

	/*
	 * White space in a scanf format matches any amount of input white
	 * space, including none, so one format covers both "name=5" and
	 * "name = 5". %u is the conversion that matches an unsigned int
	 * destination.
	 */
	len = snprintf(fmt, sizeof(fmt), "%s = %%u", name);
	if (len < 0 || (size_t) len >= sizeof(fmt))
		return 1;

	if (sscanf(p, fmt, val) == 1)
		return 0;

	return 1;
}
1062
/*
 * Decide whether a line carries anything worth parsing.
 *
 * Returns 1 if the line consists only of white space / control characters,
 * or if the first other character is ';' (a comment). Returns 0 as soon as
 * any other character is seen first.
 */
static int is_empty_or_comment(char *line)
{
	const char *c;

	for (c = line; *c != '\0'; c++) {
		/*
		 * The <ctype.h> classifiers are only defined for values
		 * representable as unsigned char (or EOF), so convert before
		 * calling them; plain char may be negative.
		 */
		unsigned char ch = (unsigned char) *c;

		if (ch == ';')
			return 1;
		if (!isspace(ch) && !iscntrl(ch))
			return 0;
	}

	return 1;
}
1076
1077 static int parse_jobs_ini(char *file)
1078 {
1079         unsigned int prioclass, prio, cpu, global;
1080         struct thread_data *td;
1081         char *string, *name;
1082         fpos_t off;
1083         FILE *f;
1084         char *p;
1085
1086         f = fopen(file, "r");
1087         if (!f) {
1088                 perror("fopen");
1089                 return 1;
1090         }
1091
1092         string = malloc(4096);
1093         name = malloc(256);
1094
1095         while ((p = fgets(string, 4096, f)) != NULL) {
1096                 if (is_empty_or_comment(p))
1097                         continue;
1098                 if (sscanf(p, "[%s]", name) != 1)
1099                         continue;
1100
1101                 global = !strncmp(name, "global", 6);
1102
1103                 name[strlen(name) - 1] = '\0';
1104
1105                 td = get_new_job(global);
1106                 if (!td)
1107                         break;
1108
1109                 prioclass = 2;
1110                 prio = 4;
1111
1112                 fgetpos(f, &off);
1113                 while ((p = fgets(string, 4096, f)) != NULL) {
1114                         if (is_empty_or_comment(p))
1115                                 continue;
1116                         if (strstr(p, "["))
1117                                 break;
1118                         if (!check_int(p, "bs", &td->bs)) {
1119                                 td->bs <<= 10;
1120                                 fgetpos(f, &off);
1121                                 continue;
1122                         }
1123                         if (!check_int(p, "rw", &td->ddir)) {
1124                                 fgetpos(f, &off);
1125                                 continue;
1126                         }
1127                         if (!check_int(p, "prio", &prio)) {
1128                                 fgetpos(f, &off);
1129                                 continue;
1130                         }
1131                         if (!check_int(p, "prioclass", &prioclass)) {
1132                                 fgetpos(f, &off);
1133                                 continue;
1134                         }
1135                         if (!check_int(p, "direct", &td->odirect)) {
1136                                 fgetpos(f, &off);
1137                                 continue;
1138                         }
1139                         if (!check_int(p, "rate", &td->rate)) {
1140                                 fgetpos(f, &off);
1141                                 continue;
1142                         }
1143                         if (!check_int(p, "ratemin", &td->ratemin)) {
1144                                 fgetpos(f, &off);
1145                                 continue;
1146                         }
1147                         if (!check_int(p, "ratecycle", &td->ratecycle)) {
1148                                 fgetpos(f, &off);
1149                                 continue;
1150                         }
1151                         if (!check_int(p, "delay", &td->delay_sleep)) {
1152                                 fgetpos(f, &off);
1153                                 continue;
1154                         }
1155                         if (!check_int(p, "cpumask", &cpu)) {
1156                                 fill_cpu_mask(td->cpumask, cpu);
1157                                 fgetpos(f, &off);
1158                                 continue;
1159                         }
1160                         if (!check_int(p, "fsync", &td->fsync_blocks)) {
1161                                 fgetpos(f, &off);
1162                                 continue;
1163                         }
1164                         if (!check_int(p, "startdelay", &td->start_delay)) {
1165                                 fgetpos(f, &off);
1166                                 continue;
1167                         }
1168                         if (!check_int(p, "timeout", &td->timeout)) {
1169                                 fgetpos(f, &off);
1170                                 continue;
1171                         }
1172                         if (!check_int(p, "aio_depth", &td->aio_depth)) {
1173                                 fgetpos(f, &off);
1174                                 continue;
1175                         }
1176                         if (!strncmp(p, "sequential", 10)) {
1177                                 td->sequential = 1;
1178                                 fgetpos(f, &off);
1179                                 continue;
1180                         }
1181                         if (!strncmp(p, "random", 6)) {
1182                                 td->sequential = 0;
1183                                 fgetpos(f, &off);
1184                                 continue;
1185                         }
1186                         if (!strncmp(p, "aio", 3)) {
1187                                 td->use_aio = 1;
1188                                 fgetpos(f, &off);
1189                                 continue;
1190                         }
1191
1192                         printf("Client%d: bad option %s\n",td->thread_number,p);
1193                 }
1194                 fsetpos(f, &off);
1195
1196                 if (add_job(td, name, prioclass, prio))
1197                         put_job(td);
1198         }
1199
1200         free(string);
1201         free(name);
1202         fclose(f);
1203         return 0;
1204 }
1205
1206 static int parse_options(int argc, char *argv[])
1207 {
1208         int i;
1209
1210         for (i = 1; i < argc; i++) {
1211                 char *parm = argv[i];
1212
1213                 if (parm[0] != '-')
1214                         break;
1215
1216                 parm++;
1217                 switch (*parm) {
1218                         case 's':
1219                                 parm++;
1220                                 def_thread.sequential = !!atoi(parm);
1221                                 break;
1222                         case 'b':
1223                                 parm++;
1224                                 def_thread.bs = atoi(parm);
1225                                 def_thread.bs <<= 10;
1226                                 if (!def_thread.bs) {
1227                                         printf("bad block size\n");
1228                                         def_thread.bs = DEF_BS;
1229                                 }
1230                                 break;
1231                         case 't':
1232                                 parm++;
1233                                 def_thread.timeout = atoi(parm);
1234                                 break;
1235                         case 'r':
1236                                 parm++;
1237                                 repeatable = !!atoi(parm);
1238                                 break;
1239                         case 'R':
1240                                 parm++;
1241                                 rate_quit = !!atoi(parm);
1242                                 break;
1243                         case 'o':
1244                                 parm++;
1245                                 def_thread.odirect = !!atoi(parm);
1246                                 break;
1247                         case 'f':
1248                                 if (i + 1 >= argc) {
1249                                         printf("-f needs file as arg\n");
1250                                         break;
1251                                 }
1252                                 ini_file = strdup(argv[i+1]);
1253                                 i++;
1254                                 break;
1255                         default:
1256                                 printf("bad option %s\n", argv[i]);
1257                                 break;
1258                 }
1259         }
1260
1261         return i;
1262 }
1263
1264 static void reap_threads(int *nr_running, int *t_rate, int *m_rate)
1265 {
1266         int i;
1267
1268         for (i = 0; i < thread_number; i++) {
1269                 struct thread_data *td = &threads[i];
1270
1271                 if (td->runstate != TD_EXITED)
1272                         continue;
1273
1274                 td->runstate = TD_REAPED;
1275                 waitpid(td->pid, NULL, 0);
1276                 (*nr_running)--;
1277                 (*m_rate) -= td->ratemin;
1278                 (*t_rate) -= td->rate;
1279
1280                 if (td->terminate)
1281                         continue;
1282
1283                 printf("Threads now running: %d", *nr_running);
1284                 if (*m_rate || *t_rate)
1285                         printf(", rate %d/%dKiB/sec", *t_rate, *m_rate);
1286                 printf("\n");
1287         }
1288 }
1289
1290 static void run_threads(char *argv[])
1291 {
1292         struct timeval genesis;
1293         struct thread_data *td;
1294         unsigned long spent;
1295         int i, todo, nr_running, m_rate, t_rate;
1296
1297         gettimeofday(&genesis, NULL);
1298
1299         printf("Starting %d threads\n", thread_number);
1300         fflush(stdout);
1301
1302         signal(SIGINT, sig_handler);
1303
1304         todo = thread_number;
1305         nr_running = 0;
1306         m_rate = t_rate = 0;
1307
1308         while (todo) {
1309                 for (i = 0; i < thread_number; i++) {
1310                         td = &threads[i];
1311
1312                         if (td->runstate != TD_NOT_CREATED)
1313                                 continue;
1314
1315                         /*
1316                          * never got a chance to start, killed by other
1317                          * thread for some reason
1318                          */
1319                         if (td->terminate) {
1320                                 todo--;
1321                                 continue;
1322                         }
1323
1324                         if (td->start_delay) {
1325                                 spent = mtime_since_now(&genesis);
1326
1327                                 if (td->start_delay * 1000 > spent)
1328                                         continue;
1329                         }
1330
1331                         td->runstate = TD_CREATED;
1332                         sem_init(&startup_sem, 1, 1);
1333                         todo--;
1334
1335                         if (fork())
1336                                 sem_wait(&startup_sem);
1337                         else {
1338                                 thread_main(shm_id, i, argv);
1339                                 exit(0);
1340                         }
1341                 }
1342
1343                 for (i = 0; i < thread_number; i++) {
1344                         struct thread_data *td = &threads[i];
1345
1346                         if (td->runstate == TD_CREATED) {
1347                                 td->runstate = TD_STARTED;
1348                                 nr_running++;
1349                                 m_rate += td->ratemin;
1350                                 t_rate += td->rate;
1351                                 sem_post(&td->mutex);
1352
1353                                 printf("Threads now running: %d", nr_running);
1354                                 if (m_rate || t_rate)
1355                                         printf(", rate %d/%dKiB/sec", t_rate, m_rate);
1356                                 printf("\n");
1357                         }
1358                 }
1359
1360                 reap_threads(&nr_running, &t_rate, &m_rate);
1361
1362                 if (todo)
1363                         usleep(100000);
1364         }
1365
1366         while (nr_running) {
1367                 reap_threads(&nr_running, &t_rate, &m_rate);
1368                 usleep(10000);
1369         }
1370 }
1371
1372 int main(int argc, char *argv[])
1373 {
1374         static unsigned long max_run[2], min_run[2], total_blocks[2];
1375         static unsigned long max_bw[2], min_bw[2], maxl[2], minl[2];
1376         static unsigned long read_mb, write_mb, read_agg, write_agg;
1377         int i;
1378
1379         shm_id = shmget(0, MAX_JOBS * sizeof(struct thread_data), IPC_CREAT | 0600);
1380         if (shm_id == -1) {
1381                 perror("shmget");
1382                 return 1;
1383         }
1384
1385         threads = shmat(shm_id, NULL, 0);
1386         if (threads == (void *) -1 ) {
1387                 perror("shmat");
1388                 return 1;
1389         }
1390
1391         atexit(free_shm);
1392
1393         if (sched_getaffinity(getpid(), sizeof(cpu_set_t), &def_thread.cpumask) == -1) {
1394                 perror("sched_getaffinity");
1395                 return 1;
1396         }
1397
1398         /*
1399          * fill globals
1400          */
1401         def_thread.ddir = DDIR_READ;
1402         def_thread.bs = DEF_BS;
1403         def_thread.odirect = 1;
1404         def_thread.ratecycle = DEF_RATE_CYCLE;
1405         def_thread.sequential = 1;
1406         def_thread.timeout = DEF_TIMEOUT;
1407
1408         i = parse_options(argc, argv);
1409
1410         if (ini_file) {
1411                 if (parse_jobs_ini(ini_file))
1412                         return 1;
1413         } else
1414                 parse_jobs_cmd(argc, argv, i);
1415
1416         if (!thread_number) {
1417                 printf("Nothing to do\n");
1418                 return 1;
1419         }
1420
1421         run_threads(argv);
1422
1423         min_bw[0] = min_run[0] = ~0UL;
1424         min_bw[1] = min_run[1] = ~0UL;
1425         minl[0] = minl[1] = ~0UL;
1426         for (i = 0; i < thread_number; i++) {
1427                 struct thread_data *td = &threads[i];
1428                 unsigned long bw = 0;
1429
1430                 if (td->error)
1431                         goto show_stat;
1432
1433                 if (td->runtime < min_run[td->ddir])
1434                         min_run[td->ddir] = td->runtime;
1435                 if (td->runtime > max_run[td->ddir])
1436                         max_run[td->ddir] = td->runtime;
1437
1438                 if (td->runtime)
1439                         bw = (td->io_blocks * td->bs) / td->runtime;
1440                 if (bw < min_bw[td->ddir])
1441                         min_bw[td->ddir] = bw;
1442                 if (bw > max_bw[td->ddir])
1443                         max_bw[td->ddir] = bw;
1444                 if (td->max_latency < minl[td->ddir])
1445                         minl[td->ddir] = td->max_latency;
1446                 if (td->max_latency > maxl[td->ddir])
1447                         maxl[td->ddir] = td->max_latency;
1448
1449                 total_blocks[td->ddir] += td->io_blocks;
1450
1451                 if (td->ddir == DDIR_READ) {
1452                         read_mb += (td->bs * td->io_blocks) >> 20;
1453                         if (td->runtime)
1454                                 read_agg += (td->io_blocks * td->bs) / td->runtime;
1455                 }
1456                 if (td->ddir == DDIR_WRITE) {
1457                         write_mb += (td->bs * td->io_blocks) >> 20;
1458                         if (td->runtime)
1459                                 write_agg += (td->io_blocks * td->bs) / td->runtime;
1460                 }
1461
1462 show_stat:
1463                 show_thread_status(td);
1464         }
1465
1466         printf("Run status:\n");
1467         if (max_run[DDIR_READ])
1468                 printf("   READ: io=%luMiB, aggrb=%lu, minl=%lu, maxl=%lu, minb=%lu, maxb=%lu, mint=%lumsec, maxt=%lumsec\n", read_mb, read_agg, minl[0], maxl[0], min_bw[0], max_bw[0], min_run[0], max_run[0]);
1469         if (max_run[DDIR_WRITE])
1470                 printf("  WRITE: io=%luMiB, aggrb=%lu, minl=%lu, maxl=%lu, minb=%lu, maxb=%lu, mint=%lumsec, maxt=%lumsec\n", write_mb, write_agg, minl[1], maxl[1], min_bw[1], max_bw[1], min_run[1], max_run[1]);
1471
1472         return 0;
1473 }