Switch to using our internal Tausworthe based random generator for offsets
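The random offset path in get_next_rand_offset() now branches on the
use_os_rand option: when it is off, the raw value comes from __rand() on
td->__random_state (the internal Tausworthe state, hence the lib/rand.h
include) and is scaled by FRAND_MAX rather than OS_RAND_MAX. A minimal
sketch of that scaling, assuming FRAND_MAX is the maximum of a 32-bit
generator (the names below are illustrative, not fio API):

    #include <stdint.h>

    /* assumed maximum value of a 32-bit Tausworthe generator */
    #define SKETCH_FRAND_MAX 0xffffffffULL

    /* scale a raw random value r onto a block index below lastb,
     * mirroring the *b calculation in get_next_rand_offset() */
    static uint64_t sketch_rand_block(uint64_t r, uint64_t lastb)
    {
            return (uint64_t) ((lastb - 1) * (r / (SKETCH_FRAND_MAX + 1.0)));
    }
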
[fio.git] / io_u.c
1#include <unistd.h>
2#include <fcntl.h>
3#include <string.h>
4#include <signal.h>
5#include <time.h>
6#include <assert.h>
7
8#include "fio.h"
9#include "hash.h"
10#include "verify.h"
11#include "trim.h"
12#include "lib/rand.h"
13
14struct io_completion_data {
15 int nr; /* input */
16
17 int error; /* output */
18 unsigned long bytes_done[2]; /* output */
19 struct timeval time; /* output */
20};
21
22/*
23 * The ->file_map[] contains a map of blocks we have or have not done io
24 * to yet. Used to make sure we cover the entire range in a fair fashion.
25 */
26static int random_map_free(struct fio_file *f, const unsigned long long block)
27{
28 unsigned int idx = RAND_MAP_IDX(f, block);
29 unsigned int bit = RAND_MAP_BIT(f, block);
30
31 dprint(FD_RANDOM, "free: b=%llu, idx=%u, bit=%u\n", block, idx, bit);
32
33 return (f->file_map[idx] & (1UL << bit)) == 0;
34}
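
/*
 * Layout note: each unsigned long in ->file_map[] appears to cover
 * BLOCKS_PER_MAP blocks of rw_min_bs bytes; RAND_MAP_IDX() selects the
 * word and RAND_MAP_BIT() the bit within it, so a set bit means the
 * block has already seen io.
 */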
35
36/*
37 * Mark a given offset as used in the map.
38 */
39static void mark_random_map(struct thread_data *td, struct io_u *io_u)
40{
41 unsigned int min_bs = td->o.rw_min_bs;
42 struct fio_file *f = io_u->file;
43 unsigned long long block;
44 unsigned int blocks, nr_blocks;
45 int busy_check;
46
47 block = (io_u->offset - f->file_offset) / (unsigned long long) min_bs;
48 nr_blocks = (io_u->buflen + min_bs - 1) / min_bs;
49 blocks = 0;
50 busy_check = !(io_u->flags & IO_U_F_BUSY_OK);
51
52 while (nr_blocks) {
53 unsigned int idx, bit;
54 unsigned long mask, this_blocks;
55
56 /*
57 * If we have a mixed random workload, we may
58 * encounter blocks we already did IO to.
59 */
60 if (!busy_check) {
61 blocks = nr_blocks;
62 break;
63 }
64 if ((td->o.ddir_seq_nr == 1) && !random_map_free(f, block))
65 break;
66
67 idx = RAND_MAP_IDX(f, block);
68 bit = RAND_MAP_BIT(f, block);
69
70 fio_assert(td, idx < f->num_maps);
71
72 this_blocks = nr_blocks;
73 if (this_blocks + bit > BLOCKS_PER_MAP)
74 this_blocks = BLOCKS_PER_MAP - bit;
75
76 do {
77 if (this_blocks == BLOCKS_PER_MAP)
78 mask = -1UL;
79 else
80 mask = ((1UL << this_blocks) - 1) << bit;
81
82 if (!(f->file_map[idx] & mask))
83 break;
84
85 this_blocks--;
86 } while (this_blocks);
87
88 if (!this_blocks)
89 break;
90
91 f->file_map[idx] |= mask;
92 nr_blocks -= this_blocks;
93 blocks += this_blocks;
94 block += this_blocks;
95 }
96
97 if ((blocks * min_bs) < io_u->buflen)
98 io_u->buflen = blocks * min_bs;
99}
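
/*
 * Worked example for the masking above: with bit=2 and this_blocks=3,
 * ((1UL << 3) - 1) << 2 == 0b11100, so blocks 2..4 of that map word are
 * marked in one go; the inner do/while shrinks this_blocks until the
 * mask no longer overlaps blocks that are already set.
 */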
100
101static unsigned long long last_block(struct thread_data *td, struct fio_file *f,
102 enum fio_ddir ddir)
103{
104 unsigned long long max_blocks;
105 unsigned long long max_size;
106
107 assert(ddir_rw(ddir));
108
109 /*
110 * Hmm, should we make sure that ->io_size <= ->real_file_size?
111 */
112 max_size = f->io_size;
113 if (max_size > f->real_file_size)
114 max_size = f->real_file_size;
115
116 max_blocks = max_size / (unsigned long long) td->o.ba[ddir];
117 if (!max_blocks)
118 return 0;
119
120 return max_blocks;
121}
122
123/*
124 * Return the next free block in the map.
125 */
126static int get_next_free_block(struct thread_data *td, struct fio_file *f,
127 enum fio_ddir ddir, unsigned long long *b)
128{
129 unsigned long long block, min_bs = td->o.rw_min_bs, lastb;
130 int i;
131
132 lastb = last_block(td, f, ddir);
133 if (!lastb)
134 return 1;
135
136 i = f->last_free_lookup;
137 block = i * BLOCKS_PER_MAP;
138 while (block * min_bs < f->real_file_size &&
139 block * min_bs < f->io_size) {
140 if (f->file_map[i] != -1UL) {
141 block += ffz(f->file_map[i]);
142 if (block > lastb)
143 break;
144 f->last_free_lookup = i;
145 *b = block;
146 return 0;
147 }
148
149 block += BLOCKS_PER_MAP;
150 i++;
151 }
152
153 dprint(FD_IO, "failed finding a free block\n");
154 return 1;
155}
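
/*
 * ffz() above presumably returns the index of the first zero bit in a
 * map word, i.e. the first block in that word that has not yet seen io;
 * ->last_free_lookup caches the word index so the next scan does not
 * restart from zero.
 */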
156
157static int get_next_rand_offset(struct thread_data *td, struct fio_file *f,
158 enum fio_ddir ddir, unsigned long long *b)
159{
160 unsigned long long r, lastb;
161 int loops = 5;
162
163 lastb = last_block(td, f, ddir);
164 if (!lastb)
165 return 1;
166
167 if (f->failed_rands >= 200)
168 goto ffz;
169
170 do {
171 if (td->o.use_os_rand) {
172 r = os_random_long(&td->random_state);
173 *b = (lastb - 1) * (r / ((unsigned long long) OS_RAND_MAX + 1.0));
174 } else {
175 r = __rand(&td->__random_state);
176 *b = (lastb - 1) * (r / ((unsigned long long) FRAND_MAX + 1.0));
177 }
178
179 dprint(FD_RANDOM, "off rand %llu\n", r);
180
181
182 /*
183 * if we are not maintaining a random map, we are done.
184 */
185 if (!file_randommap(td, f))
186 goto ret_good;
187
188 /*
189 * calculate map offset and check if it's free
190 */
191 if (random_map_free(f, *b))
192 goto ret_good;
193
194 dprint(FD_RANDOM, "get_next_rand_offset: offset %llu busy\n",
195 *b);
196 } while (--loops);
197
198 if (!f->failed_rands++)
199 f->last_free_lookup = 0;
200
201 /*
202 * we get here if we didn't succeed in looking up a block. generate
203 * a random start offset into the filemap, and find the first free
204 * block from there.
205 */
206 loops = 10;
207 do {
208 f->last_free_lookup = (f->num_maps - 1) *
209 (r / (OS_RAND_MAX + 1.0));
210 if (!get_next_free_block(td, f, ddir, b))
211 goto ret;
212
213 r = os_random_long(&td->random_state);
214 } while (--loops);
215
216 /*
217 * that didn't work either, try exhaustive search from the start
218 */
219 f->last_free_lookup = 0;
220ffz:
221 if (!get_next_free_block(td, f, ddir, b))
222 return 0;
223 f->last_free_lookup = 0;
224 return get_next_free_block(td, f, ddir, b);
225ret_good:
226 f->failed_rands = 0;
227ret:
228 return 0;
229}
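
/*
 * Summary of the strategy above: try up to 5 purely random offsets,
 * then seed ->last_free_lookup from a random value and scan for a free
 * block (up to 10 tries), and finally fall back to an exhaustive scan
 * from the start of the map. Once ->failed_rands reaches 200, the
 * random phase is skipped entirely. Note that the fallback seeding
 * still uses os_random_long()/OS_RAND_MAX even when use_os_rand is not
 * set, which looks like a leftover from before the generator switch.
 */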
230
231static int get_next_rand_block(struct thread_data *td, struct fio_file *f,
232 enum fio_ddir ddir, unsigned long long *b)
233{
234 if (get_next_rand_offset(td, f, ddir, b)) {
235 dprint(FD_IO, "%s: rand offset failed, last=%llu, size=%llu\n",
236 f->file_name, f->last_pos, f->real_file_size);
237 return 1;
238 }
239
240 return 0;
241}
242
243static int get_next_seq_block(struct thread_data *td, struct fio_file *f,
244 enum fio_ddir ddir, unsigned long long *b)
245{
246 assert(ddir_rw(ddir));
247
248 if (f->last_pos < f->real_file_size) {
249 *b = (f->last_pos - f->file_offset) / td->o.min_bs[ddir];
250 return 0;
251 }
252
253 return 1;
254}
255
256static int get_next_block(struct thread_data *td, struct io_u *io_u,
257 enum fio_ddir ddir, int rw_seq, unsigned long long *b)
258{
259 struct fio_file *f = io_u->file;
260 int ret;
261
262 assert(ddir_rw(ddir));
263
264 if (rw_seq) {
265 if (td_random(td))
266 ret = get_next_rand_block(td, f, ddir, b);
267 else
268 ret = get_next_seq_block(td, f, ddir, b);
269 } else {
270 io_u->flags |= IO_U_F_BUSY_OK;
271
272 if (td->o.rw_seq == RW_SEQ_SEQ) {
273 ret = get_next_seq_block(td, f, ddir, b);
274 if (ret)
275 ret = get_next_rand_block(td, f, ddir, b);
276 } else if (td->o.rw_seq == RW_SEQ_IDENT) {
277 if (f->last_start != -1ULL)
278 *b = (f->last_start - f->file_offset)
279 / td->o.min_bs[ddir];
280 else
281 *b = 0;
282 ret = 0;
283 } else {
284 log_err("fio: unknown rw_seq=%d\n", td->o.rw_seq);
285 ret = 1;
286 }
287 }
288
289 return ret;
290}
291
292/*
293 * For random io, generate a random new block and see if it's used. Repeat
294 * until we find a free one. For sequential io, just return the end of
295 * the last io issued.
296 */
297static int __get_next_offset(struct thread_data *td, struct io_u *io_u)
298{
299 struct fio_file *f = io_u->file;
300 unsigned long long b;
301 enum fio_ddir ddir = io_u->ddir;
302 int rw_seq_hit = 0;
303
304 assert(ddir_rw(ddir));
305
306 if (td->o.ddir_seq_nr && !--td->ddir_seq_nr) {
307 rw_seq_hit = 1;
308 td->ddir_seq_nr = td->o.ddir_seq_nr;
309 }
310
311 if (get_next_block(td, io_u, ddir, rw_seq_hit, &b))
312 return 1;
313
314 io_u->offset = b * td->o.ba[ddir];
315 if (io_u->offset >= f->io_size) {
316 dprint(FD_IO, "get_next_offset: offset %llu >= io_size %llu\n",
317 io_u->offset, f->io_size);
318 return 1;
319 }
320
321 io_u->offset += f->file_offset;
322 if (io_u->offset >= f->real_file_size) {
323 dprint(FD_IO, "get_next_offset: offset %llu >= size %llu\n",
324 io_u->offset, f->real_file_size);
325 return 1;
326 }
327
328 return 0;
329}
330
331static int get_next_offset(struct thread_data *td, struct io_u *io_u)
332{
333 struct prof_io_ops *ops = &td->prof_io_ops;
334
335 if (ops->fill_io_u_off)
336 return ops->fill_io_u_off(td, io_u);
337
338 return __get_next_offset(td, io_u);
339}
340
341static unsigned int __get_next_buflen(struct thread_data *td, struct io_u *io_u)
342{
343 const int ddir = io_u->ddir;
344 unsigned int uninitialized_var(buflen);
345 unsigned int minbs, maxbs;
346 long r;
347
348 assert(ddir_rw(ddir));
349
350 minbs = td->o.min_bs[ddir];
351 maxbs = td->o.max_bs[ddir];
352
353 if (minbs == maxbs)
354 buflen = minbs;
355 else {
356 r = os_random_long(&td->bsrange_state);
357 if (!td->o.bssplit_nr[ddir]) {
358 buflen = 1 + (unsigned int) ((double) maxbs *
359 (r / (OS_RAND_MAX + 1.0)));
360 if (buflen < minbs)
361 buflen = minbs;
362 } else {
363 long perc = 0;
364 unsigned int i;
365
366 for (i = 0; i < td->o.bssplit_nr[ddir]; i++) {
367 struct bssplit *bsp = &td->o.bssplit[ddir][i];
368
369 buflen = bsp->bs;
370 perc += bsp->perc;
371 if (r <= ((OS_RAND_MAX / 100L) * perc))
372 break;
373 }
374 }
375 if (!td->o.bs_unaligned && is_power_of_2(minbs))
376 buflen = (buflen + minbs - 1) & ~(minbs - 1);
377 }
378
379 if (io_u->offset + buflen > io_u->file->real_file_size) {
380 dprint(FD_IO, "lower buflen %u -> %u (ddir=%d)\n", buflen,
381 minbs, ddir);
382 buflen = minbs;
383 }
384
385 return buflen;
386}
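
/*
 * Rounding example for the path above: with minbs=4096 (a power of two)
 * and a raw pick of 5000, (5000 + 4095) & ~4095 == 8192, i.e. the
 * length is rounded up to the next minbs multiple unless bs_unaligned
 * is set.
 */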
387
388static unsigned int get_next_buflen(struct thread_data *td, struct io_u *io_u)
389{
390 struct prof_io_ops *ops = &td->prof_io_ops;
391
392 if (ops->fill_io_u_size)
393 return ops->fill_io_u_size(td, io_u);
394
395 return __get_next_buflen(td, io_u);
396}
397
398static void set_rwmix_bytes(struct thread_data *td)
399{
400 unsigned int diff;
401
402 /*
403 * we do a time or byte based switch. this is needed because
404 * buffered writes may issue a lot quicker than they complete,
405 * whereas reads do not.
406 */
407 diff = td->o.rwmix[td->rwmix_ddir ^ 1];
408 td->rwmix_issues = (td->io_issues[td->rwmix_ddir] * diff) / 100;
409}
410
411static inline enum fio_ddir get_rand_ddir(struct thread_data *td)
412{
413 unsigned int v;
414 long r;
415
416 r = os_random_long(&td->rwmix_state);
417 v = 1 + (int) (100.0 * (r / (OS_RAND_MAX + 1.0)));
418 if (v <= td->o.rwmix[DDIR_READ])
419 return DDIR_READ;
420
421 return DDIR_WRITE;
422}
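
/*
 * v above lands in [1, 100], so comparing it against o.rwmix[DDIR_READ]
 * picks reads with (roughly) the configured read percentage and writes
 * otherwise.
 */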
423
424static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir)
425{
426 enum fio_ddir odir = ddir ^ 1;
427 struct timeval t;
428 long usec;
429
430 assert(ddir_rw(ddir));
431
432 if (td->rate_pending_usleep[ddir] <= 0)
433 return ddir;
434
435 /*
436 * We have too much pending sleep in this direction. See if we
437 * should switch.
438 */
439 if (td_rw(td)) {
440 /*
441 * Other direction does not have too much pending, switch
442 */
443 if (td->rate_pending_usleep[odir] < 100000)
444 return odir;
445
446 /*
447 * Both directions have pending sleep. Sleep the minimum time
448 * and deduct from both.
449 */
450 if (td->rate_pending_usleep[ddir] <=
451 td->rate_pending_usleep[odir]) {
452 usec = td->rate_pending_usleep[ddir];
453 } else {
454 usec = td->rate_pending_usleep[odir];
455 ddir = odir;
456 }
457 } else
458 usec = td->rate_pending_usleep[ddir];
459
460 fio_gettime(&t, NULL);
461 usec_sleep(td, usec);
462 usec = utime_since_now(&t);
463
464 td->rate_pending_usleep[ddir] -= usec;
465
466 odir = ddir ^ 1;
467 if (td_rw(td) && __should_check_rate(td, odir))
468 td->rate_pending_usleep[odir] -= usec;
469
470 return ddir;
471}
472
473/*
474 * Return the data direction for the next io_u. If the job is a
475 * mixed read/write workload, check the rwmix cycle and switch if
476 * necessary.
477 */
478static enum fio_ddir get_rw_ddir(struct thread_data *td)
479{
480 enum fio_ddir ddir;
481
482 /*
483 * see if it's time to fsync
484 */
485 if (td->o.fsync_blocks &&
486 !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks) &&
487 td->io_issues[DDIR_WRITE] && should_fsync(td))
488 return DDIR_SYNC;
489
490 /*
491 * see if it's time to fdatasync
492 */
493 if (td->o.fdatasync_blocks &&
494 !(td->io_issues[DDIR_WRITE] % td->o.fdatasync_blocks) &&
495 td->io_issues[DDIR_WRITE] && should_fsync(td))
496 return DDIR_DATASYNC;
497
498 /*
499 * see if it's time to sync_file_range
500 */
501 if (td->sync_file_range_nr &&
502 !(td->io_issues[DDIR_WRITE] % td->sync_file_range_nr) &&
503 td->io_issues[DDIR_WRITE] && should_fsync(td))
504 return DDIR_SYNC_FILE_RANGE;
505
506 if (td_rw(td)) {
507 /*
508 * Check if it's time to seed a new data direction.
509 */
510 if (td->io_issues[td->rwmix_ddir] >= td->rwmix_issues) {
511 /*
512 * Put a top limit on how many bytes we do for
513 * one data direction, to avoid overflowing the
514 * ranges too much
515 */
516 ddir = get_rand_ddir(td);
517
518 if (ddir != td->rwmix_ddir)
519 set_rwmix_bytes(td);
520
521 td->rwmix_ddir = ddir;
522 }
523 ddir = td->rwmix_ddir;
524 } else if (td_read(td))
525 ddir = DDIR_READ;
526 else
527 ddir = DDIR_WRITE;
528
529 td->rwmix_ddir = rate_ddir(td, ddir);
530 return td->rwmix_ddir;
531}
532
533static void set_rw_ddir(struct thread_data *td, struct io_u *io_u)
534{
535 io_u->ddir = get_rw_ddir(td);
536
537 if (io_u->ddir == DDIR_WRITE && (td->io_ops->flags & FIO_BARRIER) &&
538 td->o.barrier_blocks &&
539 !(td->io_issues[DDIR_WRITE] % td->o.barrier_blocks) &&
540 td->io_issues[DDIR_WRITE])
541 io_u->flags |= IO_U_F_BARRIER;
542}
543
544void put_file_log(struct thread_data *td, struct fio_file *f)
545{
546 int ret = put_file(td, f);
547
548 if (ret)
549 td_verror(td, ret, "file close");
550}
551
552void put_io_u(struct thread_data *td, struct io_u *io_u)
553{
554 td_io_u_lock(td);
555
556 io_u->flags |= IO_U_F_FREE;
557 io_u->flags &= ~IO_U_F_FREE_DEF;
558
559 if (io_u->file)
560 put_file_log(td, io_u->file);
561
562 io_u->file = NULL;
563 if (io_u->flags & IO_U_F_IN_CUR_DEPTH)
564 td->cur_depth--;
565 flist_del_init(&io_u->list);
566 flist_add(&io_u->list, &td->io_u_freelist);
567 td_io_u_unlock(td);
568 td_io_u_free_notify(td);
569}
570
571void clear_io_u(struct thread_data *td, struct io_u *io_u)
572{
573 io_u->flags &= ~IO_U_F_FLIGHT;
574 put_io_u(td, io_u);
575}
576
577void requeue_io_u(struct thread_data *td, struct io_u **io_u)
578{
579 struct io_u *__io_u = *io_u;
580
581 dprint(FD_IO, "requeue %p\n", __io_u);
582
583 td_io_u_lock(td);
584
585 __io_u->flags |= IO_U_F_FREE;
586 if ((__io_u->flags & IO_U_F_FLIGHT) && ddir_rw(__io_u->ddir))
587 td->io_issues[__io_u->ddir]--;
588
589 __io_u->flags &= ~IO_U_F_FLIGHT;
590 if (__io_u->flags & IO_U_F_IN_CUR_DEPTH)
591 td->cur_depth--;
592 flist_del(&__io_u->list);
593 flist_add_tail(&__io_u->list, &td->io_u_requeues);
594 td_io_u_unlock(td);
595 *io_u = NULL;
596}
597
598static int fill_io_u(struct thread_data *td, struct io_u *io_u)
599{
600 if (td->io_ops->flags & FIO_NOIO)
601 goto out;
602
603 set_rw_ddir(td, io_u);
604
605 /*
606 * fsync() or fdatasync() or trim etc., we are done
607 */
608 if (!ddir_rw(io_u->ddir))
609 goto out;
610
611 /*
612 * See if it's time to switch to a new zone
613 */
614 if (td->zone_bytes >= td->o.zone_size) {
615 td->zone_bytes = 0;
616 io_u->file->last_pos += td->o.zone_skip;
617 td->io_skip_bytes += td->o.zone_skip;
618 }
619
620 /*
621 * No log, let the seq/rand engine retrieve the next buflen and
622 * position.
623 */
624 if (get_next_offset(td, io_u)) {
625 dprint(FD_IO, "io_u %p, failed getting offset\n", io_u);
626 return 1;
627 }
628
629 io_u->buflen = get_next_buflen(td, io_u);
630 if (!io_u->buflen) {
631 dprint(FD_IO, "io_u %p, failed getting buflen\n", io_u);
632 return 1;
633 }
634
635 if (io_u->offset + io_u->buflen > io_u->file->real_file_size) {
636 dprint(FD_IO, "io_u %p, offset too large\n", io_u);
637 dprint(FD_IO, " off=%llu/%lu > %llu\n", io_u->offset,
638 io_u->buflen, io_u->file->real_file_size);
639 return 1;
640 }
641
642 /*
643 * mark entry before potentially trimming io_u
644 */
645 if (td_random(td) && file_randommap(td, io_u->file))
646 mark_random_map(td, io_u);
647
648 /*
649 * If using a write iolog, store this entry.
650 */
651out:
652 dprint_io_u(io_u, "fill_io_u");
653 td->zone_bytes += io_u->buflen;
654 log_io_u(td, io_u);
655 return 0;
656}
657
658static void __io_u_mark_map(unsigned int *map, unsigned int nr)
659{
660 int idx = 0;
661
662 switch (nr) {
663 default:
664 idx = 6;
665 break;
666 case 33 ... 64:
667 idx = 5;
668 break;
669 case 17 ... 32:
670 idx = 4;
671 break;
672 case 9 ... 16:
673 idx = 3;
674 break;
675 case 5 ... 8:
676 idx = 2;
677 break;
678 case 1 ... 4:
679 idx = 1;
680 case 0:
681 break;
682 }
683
684 map[idx]++;
685}
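
/*
 * The missing break after "case 1 ... 4" above (and after the
 * corresponding small-value cases in io_u_mark_depth() and the latency
 * helpers below) appears intentional: the fall-through target only
 * breaks, so the idx value that was just assigned is kept.
 */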
686
687void io_u_mark_submit(struct thread_data *td, unsigned int nr)
688{
689 __io_u_mark_map(td->ts.io_u_submit, nr);
690 td->ts.total_submit++;
691}
692
693void io_u_mark_complete(struct thread_data *td, unsigned int nr)
694{
695 __io_u_mark_map(td->ts.io_u_complete, nr);
696 td->ts.total_complete++;
697}
698
699void io_u_mark_depth(struct thread_data *td, unsigned int nr)
700{
701 int idx = 0;
702
703 switch (td->cur_depth) {
704 default:
705 idx = 6;
706 break;
707 case 32 ... 63:
708 idx = 5;
709 break;
710 case 16 ... 31:
711 idx = 4;
712 break;
713 case 8 ... 15:
714 idx = 3;
715 break;
716 case 4 ... 7:
717 idx = 2;
718 break;
719 case 2 ... 3:
720 idx = 1;
721 case 1:
722 break;
723 }
724
725 td->ts.io_u_map[idx] += nr;
726}
727
728static void io_u_mark_lat_usec(struct thread_data *td, unsigned long usec)
729{
730 int idx = 0;
731
732 assert(usec < 1000);
733
734 switch (usec) {
735 case 750 ... 999:
736 idx = 9;
737 break;
738 case 500 ... 749:
739 idx = 8;
740 break;
741 case 250 ... 499:
742 idx = 7;
743 break;
744 case 100 ... 249:
745 idx = 6;
746 break;
747 case 50 ... 99:
748 idx = 5;
749 break;
750 case 20 ... 49:
751 idx = 4;
752 break;
753 case 10 ... 19:
754 idx = 3;
755 break;
756 case 4 ... 9:
757 idx = 2;
758 break;
759 case 2 ... 3:
760 idx = 1;
761 case 0 ... 1:
762 break;
763 }
764
765 assert(idx < FIO_IO_U_LAT_U_NR);
766 td->ts.io_u_lat_u[idx]++;
767}
768
769static void io_u_mark_lat_msec(struct thread_data *td, unsigned long msec)
770{
771 int idx = 0;
772
773 switch (msec) {
774 default:
775 idx = 11;
776 break;
777 case 1000 ... 1999:
778 idx = 10;
779 break;
780 case 750 ... 999:
781 idx = 9;
782 break;
783 case 500 ... 749:
784 idx = 8;
785 break;
786 case 250 ... 499:
787 idx = 7;
788 break;
789 case 100 ... 249:
790 idx = 6;
791 break;
792 case 50 ... 99:
793 idx = 5;
794 break;
795 case 20 ... 49:
796 idx = 4;
797 break;
798 case 10 ... 19:
799 idx = 3;
800 break;
801 case 4 ... 9:
802 idx = 2;
803 break;
804 case 2 ... 3:
805 idx = 1;
806 case 0 ... 1:
807 break;
808 }
809
810 assert(idx < FIO_IO_U_LAT_M_NR);
811 td->ts.io_u_lat_m[idx]++;
812}
813
814static void io_u_mark_latency(struct thread_data *td, unsigned long usec)
815{
816 if (usec < 1000)
817 io_u_mark_lat_usec(td, usec);
818 else
819 io_u_mark_lat_msec(td, usec / 1000);
820}
821
822/*
823 * Get next file to service by choosing one at random
824 */
825static struct fio_file *get_next_file_rand(struct thread_data *td,
826 enum fio_file_flags goodf,
827 enum fio_file_flags badf)
828{
829 struct fio_file *f;
830 int fno;
831
832 do {
833 long r = os_random_long(&td->next_file_state);
834 int opened = 0;
835
836 fno = (unsigned int) ((double) td->o.nr_files
837 * (r / (OS_RAND_MAX + 1.0)));
838 f = td->files[fno];
839 if (fio_file_done(f))
840 continue;
841
842 if (!fio_file_open(f)) {
843 int err;
844
845 err = td_io_open_file(td, f);
846 if (err)
847 continue;
848 opened = 1;
849 }
850
851 if ((!goodf || (f->flags & goodf)) && !(f->flags & badf)) {
852 dprint(FD_FILE, "get_next_file_rand: %p\n", f);
853 return f;
854 }
855 if (opened)
856 td_io_close_file(td, f);
857 } while (1);
858}
859
860/*
861 * Get next file to service by doing round robin between all available ones
862 */
863static struct fio_file *get_next_file_rr(struct thread_data *td, int goodf,
864 int badf)
865{
866 unsigned int old_next_file = td->next_file;
867 struct fio_file *f;
868
869 do {
870 int opened = 0;
871
872 f = td->files[td->next_file];
873
874 td->next_file++;
875 if (td->next_file >= td->o.nr_files)
876 td->next_file = 0;
877
878 dprint(FD_FILE, "trying file %s %x\n", f->file_name, f->flags);
879 if (fio_file_done(f)) {
880 f = NULL;
881 continue;
882 }
883
884 if (!fio_file_open(f)) {
885 int err;
886
887 err = td_io_open_file(td, f);
888 if (err) {
889 dprint(FD_FILE, "error %d on open of %s\n",
890 err, f->file_name);
891 f = NULL;
892 continue;
893 }
894 opened = 1;
895 }
896
897 dprint(FD_FILE, "goodf=%x, badf=%x, ff=%x\n", goodf, badf,
898 f->flags);
899 if ((!goodf || (f->flags & goodf)) && !(f->flags & badf))
900 break;
901
902 if (opened)
903 td_io_close_file(td, f);
904
905 f = NULL;
906 } while (td->next_file != old_next_file);
907
908 dprint(FD_FILE, "get_next_file_rr: %p\n", f);
909 return f;
910}
911
912static struct fio_file *__get_next_file(struct thread_data *td)
913{
914 struct fio_file *f;
915
916 assert(td->o.nr_files <= td->files_index);
917
918 if (td->nr_done_files >= td->o.nr_files) {
919 dprint(FD_FILE, "get_next_file: nr_open=%d, nr_done=%d,"
920 " nr_files=%d\n", td->nr_open_files,
921 td->nr_done_files,
922 td->o.nr_files);
923 return NULL;
924 }
925
926 f = td->file_service_file;
927 if (f && fio_file_open(f) && !fio_file_closing(f)) {
928 if (td->o.file_service_type == FIO_FSERVICE_SEQ)
929 goto out;
930 if (td->file_service_left--)
931 goto out;
932 }
933
934 if (td->o.file_service_type == FIO_FSERVICE_RR ||
935 td->o.file_service_type == FIO_FSERVICE_SEQ)
936 f = get_next_file_rr(td, FIO_FILE_open, FIO_FILE_closing);
937 else
938 f = get_next_file_rand(td, FIO_FILE_open, FIO_FILE_closing);
939
940 td->file_service_file = f;
941 td->file_service_left = td->file_service_nr - 1;
942out:
943 dprint(FD_FILE, "get_next_file: %p [%s]\n", f, f->file_name);
944 return f;
945}
946
947static struct fio_file *get_next_file(struct thread_data *td)
948{
949 struct prof_io_ops *ops = &td->prof_io_ops;
950
951 if (ops->get_next_file)
952 return ops->get_next_file(td);
953
954 return __get_next_file(td);
955}
956
957static int set_io_u_file(struct thread_data *td, struct io_u *io_u)
958{
959 struct fio_file *f;
960
961 do {
962 f = get_next_file(td);
963 if (!f)
964 return 1;
965
966 io_u->file = f;
967 get_file(f);
968
969 if (!fill_io_u(td, io_u))
970 break;
971
972 put_file_log(td, f);
973 td_io_close_file(td, f);
974 io_u->file = NULL;
975 fio_file_set_done(f);
976 td->nr_done_files++;
977 dprint(FD_FILE, "%s: is done (%d of %d)\n", f->file_name,
978 td->nr_done_files, td->o.nr_files);
979 } while (1);
980
981 return 0;
982}
983
984
985struct io_u *__get_io_u(struct thread_data *td)
986{
987 struct io_u *io_u = NULL;
988
989 td_io_u_lock(td);
990
991again:
992 if (!flist_empty(&td->io_u_requeues))
993 io_u = flist_entry(td->io_u_requeues.next, struct io_u, list);
994 else if (!queue_full(td)) {
995 io_u = flist_entry(td->io_u_freelist.next, struct io_u, list);
996
997 io_u->buflen = 0;
998 io_u->resid = 0;
999 io_u->file = NULL;
1000 io_u->end_io = NULL;
1001 }
1002
1003 if (io_u) {
1004 assert(io_u->flags & IO_U_F_FREE);
1005 io_u->flags &= ~(IO_U_F_FREE | IO_U_F_FREE_DEF);
1006 io_u->flags &= ~(IO_U_F_TRIMMED | IO_U_F_BARRIER);
1007
1008 io_u->error = 0;
1009 flist_del(&io_u->list);
1010 flist_add(&io_u->list, &td->io_u_busylist);
1011 td->cur_depth++;
1012 io_u->flags |= IO_U_F_IN_CUR_DEPTH;
1013 } else if (td->o.verify_async) {
1014 /*
1015 * We ran out, wait for async verify threads to finish and
1016 * return one
1017 */
1018 pthread_cond_wait(&td->free_cond, &td->io_u_lock);
1019 goto again;
1020 }
1021
1022 td_io_u_unlock(td);
1023 return io_u;
1024}
1025
1026static int check_get_trim(struct thread_data *td, struct io_u *io_u)
1027{
1028 if (td->o.trim_backlog && td->trim_entries) {
1029 int get_trim = 0;
1030
1031 if (td->trim_batch) {
1032 td->trim_batch--;
1033 get_trim = 1;
1034 } else if (!(td->io_hist_len % td->o.trim_backlog) &&
1035 td->last_ddir != DDIR_READ) {
1036 td->trim_batch = td->o.trim_batch;
1037 if (!td->trim_batch)
1038 td->trim_batch = td->o.trim_backlog;
1039 get_trim = 1;
1040 }
1041
1042 if (get_trim && !get_next_trim(td, io_u))
1043 return 1;
1044 }
1045
1046 return 0;
1047}
1048
1049static int check_get_verify(struct thread_data *td, struct io_u *io_u)
1050{
1051 if (td->o.verify_backlog && td->io_hist_len) {
1052 int get_verify = 0;
1053
1054 if (td->verify_batch) {
1055 td->verify_batch--;
1056 get_verify = 1;
1057 } else if (!(td->io_hist_len % td->o.verify_backlog) &&
1058 td->last_ddir != DDIR_READ) {
1059 td->verify_batch = td->o.verify_batch;
1060 if (!td->verify_batch)
1061 td->verify_batch = td->o.verify_backlog;
1062 get_verify = 1;
1063 }
1064
1065 if (get_verify && !get_next_verify(td, io_u))
1066 return 1;
1067 }
1068
1069 return 0;
1070}
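
/*
 * Both backlog helpers above follow the same pattern: once io_hist_len
 * hits a multiple of the configured backlog, a batch counter is armed
 * (defaulting to the backlog size) and subsequent io_us are redirected
 * to trim or verify until that batch is drained.
 */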
1071
1072/*
1073 * Return an io_u to be processed. Gets a buflen and offset, sets direction,
1074 * etc. The returned io_u is fully ready to be prepped and submitted.
1075 */
1076struct io_u *get_io_u(struct thread_data *td)
1077{
1078 struct fio_file *f;
1079 struct io_u *io_u;
1080
1081 io_u = __get_io_u(td);
1082 if (!io_u) {
1083 dprint(FD_IO, "__get_io_u failed\n");
1084 return NULL;
1085 }
1086
1087 if (check_get_verify(td, io_u))
1088 goto out;
1089 if (check_get_trim(td, io_u))
1090 goto out;
1091
1092 /*
1093 * from a requeue, io_u is already set up
1094 */
1095 if (io_u->file)
1096 goto out;
1097
1098 /*
1099 * If using an iolog, grab next piece if any available.
1100 */
1101 if (td->o.read_iolog_file) {
1102 if (read_iolog_get(td, io_u))
1103 goto err_put;
1104 } else if (set_io_u_file(td, io_u)) {
1105 dprint(FD_IO, "io_u %p, setting file failed\n", io_u);
1106 goto err_put;
1107 }
1108
1109 f = io_u->file;
1110 assert(fio_file_open(f));
1111
1112 if (ddir_rw(io_u->ddir)) {
1113 if (!io_u->buflen && !(td->io_ops->flags & FIO_NOIO)) {
1114 dprint(FD_IO, "get_io_u: zero buflen on %p\n", io_u);
1115 goto err_put;
1116 }
1117
1118 f->last_start = io_u->offset;
1119 f->last_pos = io_u->offset + io_u->buflen;
1120
1121 if (td->o.verify != VERIFY_NONE && io_u->ddir == DDIR_WRITE)
1122 populate_verify_io_u(td, io_u);
1123 else if (td->o.refill_buffers && io_u->ddir == DDIR_WRITE)
1124 io_u_fill_buffer(td, io_u, io_u->xfer_buflen);
1125 else if (io_u->ddir == DDIR_READ) {
1126 /*
1127 * Reset the buf_filled parameters so that next time, if the
1128 * buffer is used for writes, it is refilled.
1129 */
1130 io_u->buf_filled_len = 0;
1131 }
1132 }
1133
1134 /*
1135 * Set io data pointers.
1136 */
1137 io_u->xfer_buf = io_u->buf;
1138 io_u->xfer_buflen = io_u->buflen;
1139
1140out:
1141 assert(io_u->file);
1142 if (!td_io_prep(td, io_u)) {
1143 if (!td->o.disable_slat)
1144 fio_gettime(&io_u->start_time, NULL);
1145 return io_u;
1146 }
1147err_put:
1148 dprint(FD_IO, "get_io_u failed\n");
1149 put_io_u(td, io_u);
1150 return NULL;
1151}
1152
1153void io_u_log_error(struct thread_data *td, struct io_u *io_u)
1154{
1155 const char *msg[] = { "read", "write", "sync", "datasync",
1156 "sync_file_range", "wait", "trim" };
1157
1158
1159
1160 log_err("fio: io_u error");
1161
1162 if (io_u->file)
1163 log_err(" on file %s", io_u->file->file_name);
1164
1165 log_err(": %s\n", strerror(io_u->error));
1166
1167 log_err(" %s offset=%llu, buflen=%lu\n", msg[io_u->ddir],
1168 io_u->offset, io_u->xfer_buflen);
1169
1170 if (!td->error)
1171 td_verror(td, io_u->error, "io_u error");
1172}
1173
1174static void io_completed(struct thread_data *td, struct io_u *io_u,
1175 struct io_completion_data *icd)
1176{
1177 /*
1178 * Older gccs are too dumb to realize that usec is always
1179 * initialized when used, silence that warning.
1180 */
1181 unsigned long uninitialized_var(usec);
1182 struct fio_file *f;
1183
1184 dprint_io_u(io_u, "io complete");
1185
1186 td_io_u_lock(td);
1187 assert(io_u->flags & IO_U_F_FLIGHT);
1188 io_u->flags &= ~(IO_U_F_FLIGHT | IO_U_F_BUSY_OK);
1189 td_io_u_unlock(td);
1190
1191 if (ddir_sync(io_u->ddir)) {
1192 td->last_was_sync = 1;
1193 f = io_u->file;
1194 if (f) {
1195 f->first_write = -1ULL;
1196 f->last_write = -1ULL;
1197 }
1198 return;
1199 }
1200
1201 td->last_was_sync = 0;
1202 td->last_ddir = io_u->ddir;
1203
1204 if (!io_u->error && ddir_rw(io_u->ddir)) {
1205 unsigned int bytes = io_u->buflen - io_u->resid;
1206 const enum fio_ddir idx = io_u->ddir;
1207 const enum fio_ddir odx = io_u->ddir ^ 1;
1208 int ret;
1209
1210 td->io_blocks[idx]++;
1211 td->io_bytes[idx] += bytes;
1212 td->this_io_bytes[idx] += bytes;
1213
1214 if (idx == DDIR_WRITE) {
1215 f = io_u->file;
1216 if (f) {
1217 if (f->first_write == -1ULL ||
1218 io_u->offset < f->first_write)
1219 f->first_write = io_u->offset;
1220 if (f->last_write == -1ULL ||
1221 ((io_u->offset + bytes) > f->last_write))
1222 f->last_write = io_u->offset + bytes;
1223 }
1224 }
1225
1226 if (ramp_time_over(td)) {
1227 unsigned long uninitialized_var(lusec);
1228
1229 if (!td->o.disable_clat || !td->o.disable_bw)
1230 lusec = utime_since(&io_u->issue_time,
1231 &icd->time);
1232 if (!td->o.disable_lat) {
1233 unsigned long tusec;
1234
1235 tusec = utime_since(&io_u->start_time,
1236 &icd->time);
1237 add_lat_sample(td, idx, tusec, bytes);
1238 }
1239 if (!td->o.disable_clat) {
1240 add_clat_sample(td, idx, lusec, bytes);
1241 io_u_mark_latency(td, lusec);
1242 }
1243 if (!td->o.disable_bw)
1244 add_bw_sample(td, idx, bytes, &icd->time);
1245 if (__should_check_rate(td, idx)) {
1246 td->rate_pending_usleep[idx] =
1247 ((td->this_io_bytes[idx] *
1248 td->rate_nsec_cycle[idx]) / 1000 -
1249 utime_since_now(&td->start));
1250 }
1251 if (__should_check_rate(td, idx ^ 1))
1252 td->rate_pending_usleep[odx] =
1253 ((td->this_io_bytes[odx] *
1254 td->rate_nsec_cycle[odx]) / 1000 -
1255 utime_since_now(&td->start));
1256 }
1257
1258 if (td_write(td) && idx == DDIR_WRITE &&
1259 td->o.do_verify &&
1260 td->o.verify != VERIFY_NONE)
1261 log_io_piece(td, io_u);
1262
1263 icd->bytes_done[idx] += bytes;
1264
1265 if (io_u->end_io) {
1266 ret = io_u->end_io(td, io_u);
1267 if (ret && !icd->error)
1268 icd->error = ret;
1269 }
1270 } else if (io_u->error) {
1271 icd->error = io_u->error;
1272 io_u_log_error(td, io_u);
1273 }
1274 if (td->o.continue_on_error && icd->error &&
1275 td_non_fatal_error(icd->error)) {
1276 /*
1277 * If there is a non_fatal error, then add to the error count
1278 * and clear all the errors.
1279 */
1280 update_error_count(td, icd->error);
1281 td_clear_error(td);
1282 icd->error = 0;
1283 io_u->error = 0;
1284 }
1285}
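
/*
 * Rate bookkeeping sketch for the block above: this_io_bytes scaled by
 * rate_nsec_cycle (divided by 1000 for usecs) appears to be the time
 * the io done so far should have taken at the target rate; subtracting
 * the elapsed runtime leaves the sleep that rate_ddir() pays off before
 * issuing more io in that direction.
 */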
1286
1287static void init_icd(struct thread_data *td, struct io_completion_data *icd,
1288 int nr)
1289{
1290 if (!td->o.disable_clat || !td->o.disable_bw)
1291 fio_gettime(&icd->time, NULL);
1292
1293 icd->nr = nr;
1294
1295 icd->error = 0;
1296 icd->bytes_done[0] = icd->bytes_done[1] = 0;
1297}
1298
1299static void ios_completed(struct thread_data *td,
1300 struct io_completion_data *icd)
1301{
1302 struct io_u *io_u;
1303 int i;
1304
1305 for (i = 0; i < icd->nr; i++) {
1306 io_u = td->io_ops->event(td, i);
1307
1308 io_completed(td, io_u, icd);
1309
1310 if (!(io_u->flags & IO_U_F_FREE_DEF))
1311 put_io_u(td, io_u);
1312 }
1313}
1314
1315/*
1316 * Complete a single io_u for the sync engines.
1317 */
1318int io_u_sync_complete(struct thread_data *td, struct io_u *io_u,
1319 unsigned long *bytes)
1320{
1321 struct io_completion_data icd;
1322
1323 init_icd(td, &icd, 1);
1324 io_completed(td, io_u, &icd);
1325
1326 if (!(io_u->flags & IO_U_F_FREE_DEF))
1327 put_io_u(td, io_u);
1328
1329 if (icd.error) {
1330 td_verror(td, icd.error, "io_u_sync_complete");
1331 return -1;
1332 }
1333
1334 if (bytes) {
1335 bytes[0] += icd.bytes_done[0];
1336 bytes[1] += icd.bytes_done[1];
1337 }
1338
1339 return 0;
1340}
1341
1342/*
1343 * Called to complete min_events number of io for the async engines.
1344 */
1345int io_u_queued_complete(struct thread_data *td, int min_evts,
1346 unsigned long *bytes)
1347{
1348 struct io_completion_data icd;
1349 struct timespec *tvp = NULL;
1350 int ret;
1351 struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, };
1352
1353 dprint(FD_IO, "io_u_queued_complete: min=%d\n", min_evts);
1354
1355 if (!min_evts)
1356 tvp = &ts;
1357
1358 ret = td_io_getevents(td, min_evts, td->o.iodepth_batch_complete, tvp);
1359 if (ret < 0) {
1360 td_verror(td, -ret, "td_io_getevents");
1361 return ret;
1362 } else if (!ret)
1363 return ret;
1364
1365 init_icd(td, &icd, ret);
1366 ios_completed(td, &icd);
1367 if (icd.error) {
1368 td_verror(td, icd.error, "io_u_queued_complete");
1369 return -1;
1370 }
1371
1372 if (bytes) {
1373 bytes[0] += icd.bytes_done[0];
1374 bytes[1] += icd.bytes_done[1];
1375 }
1376
1377 return 0;
1378}
1379
1380/*
1381 * Call when io_u is really queued, to update the submission latency.
1382 */
1383void io_u_queued(struct thread_data *td, struct io_u *io_u)
1384{
1385 if (!td->o.disable_slat) {
1386 unsigned long slat_time;
1387
1388 slat_time = utime_since(&io_u->start_time, &io_u->issue_time);
1389 add_slat_sample(td, io_u->ddir, slat_time, io_u->xfer_buflen);
1390 }
1391}
1392
1393/*
1394 * "randomly" fill the buffer contents
1395 */
1396void io_u_fill_buffer(struct thread_data *td, struct io_u *io_u,
1397 unsigned int max_bs)
1398{
1399 io_u->buf_filled_len = 0;
1400
1401 if (!td->o.zero_buffers)
1402 fill_random_buf(io_u->buf, max_bs);
1403 else
1404 memset(io_u->buf, 0, max_bs);
1405}