io_u.c

   1 #include <unistd.h>
   2 #include <fcntl.h>
   3 #include <string.h>
   4 #include <signal.h>
   5 #include <time.h>
   6 #include <assert.h>
   7
   8 #include "fio.h"
   9 #include "hash.h"
  10 #include "verify.h"
  11 #include "lib/rand.h"
  12
  13 struct io_completion_data {
  14         int nr;                         /* input */
  15
  16         int error;                      /* output */
  17         unsigned long bytes_done[2];    /* output */
  18         struct timeval time;            /* output */
  19 };
  20
  21 /*
  22  * The ->file_map[] contains a map of blocks we have or have not done io
  23  * to yet. Used to make sure we cover the entire range in a fair fashion.
  24  */
  25 static int random_map_free(struct fio_file *f, const unsigned long long block)
  26 {
  27         unsigned int idx = RAND_MAP_IDX(f, block);
  28         unsigned int bit = RAND_MAP_BIT(f, block);
  29
  30         dprint(FD_RANDOM, "free: b=%llu, idx=%u, bit=%u\n", block, idx, bit);
  31
  32         return (f->file_map[idx] & (1 << bit)) == 0;
  33 }
  34
  35 /*
  36  * Mark a given offset as used in the map.
  37  */
  38 static void mark_random_map(struct thread_data *td, struct io_u *io_u)
  39 {
  40         unsigned int min_bs = td->o.rw_min_bs;
  41         struct fio_file *f = io_u->file;
  42         unsigned long long block;
  43         unsigned int blocks, nr_blocks;
  44
  45         block = (io_u->offset - f->file_offset) / (unsigned long long) min_bs;
  46         nr_blocks = (io_u->buflen + min_bs - 1) / min_bs;
  47         blocks = 0;
  48
  49         while (nr_blocks) {
  50                 unsigned int this_blocks, mask;
  51                 unsigned int idx, bit;
  52
  53                 /*
  54                  * If we have a mixed random workload, we may
  55                  * encounter blocks we already did IO to.
  56                  */
  57                 if ((td->o.ddir_nr == 1) && !random_map_free(f, block)) {
  58                         if (!blocks)
  59                                 blocks = 1;
  60                         break;
  61                 }
  62
  63                 idx = RAND_MAP_IDX(f, block);
  64                 bit = RAND_MAP_BIT(f, block);
  65
  66                 fio_assert(td, idx < f->num_maps);
  67
  68                 this_blocks = nr_blocks;
  69                 if (this_blocks + bit > BLOCKS_PER_MAP)
  70                         this_blocks = BLOCKS_PER_MAP - bit;
  71
  72                 if (this_blocks == BLOCKS_PER_MAP)
  73                         mask = -1U;
  74                 else
  75                         mask = ((1U << this_blocks) - 1) << bit;
  76
  77                 f->file_map[idx] |= mask;
  78                 nr_blocks -= this_blocks;
  79                 blocks += this_blocks;
  80                 block += this_blocks;
  81         }
  82
  83         if ((blocks * min_bs) < io_u->buflen)
  84                 io_u->buflen = blocks * min_bs;
  85 }
  86
  87 static unsigned long long last_block(struct thread_data *td, struct fio_file *f,
  88                                      enum fio_ddir ddir)
  89 {
  90         unsigned long long max_blocks;
  91         unsigned long long max_size;
  92
  93         /*
  94          * Hmm, should we make sure that ->io_size <= ->real_file_size?
  95          */
  96         max_size = f->io_size;
  97         if (max_size > f->real_file_size)
  98                 max_size = f->real_file_size;
  99
 100         max_blocks = max_size / (unsigned long long) td->o.ba[ddir];
 101         if (!max_blocks)
 102                 return 0;
 103
 104         return max_blocks;
 105 }
 106
 107 /*
 108  * Return the next free block in the map.
 109  */
 110 static int get_next_free_block(struct thread_data *td, struct fio_file *f,
 111                                enum fio_ddir ddir, unsigned long long *b)
 112 {
 113         unsigned long long min_bs = td->o.rw_min_bs;
 114         int i;
 115
 116         i = f->last_free_lookup;
 117         *b = (i * BLOCKS_PER_MAP);
 118         while ((*b) * min_bs < f->real_file_size &&
 119                 (*b) * min_bs < f->io_size) {
 120                 if (f->file_map[i] != (unsigned int) -1) {
 121                         *b += ffz(f->file_map[i]);
 122                         if (*b > last_block(td, f, ddir))
 123                                 break;
 124                         f->last_free_lookup = i;
 125                         return 0;
 126                 }
 127
 128                 *b += BLOCKS_PER_MAP;
 129                 i++;
 130         }
 131
 132         dprint(FD_IO, "failed finding a free block\n");
 133         return 1;
 134 }
 135
 136 static int get_next_rand_offset(struct thread_data *td, struct fio_file *f,
 137                                 enum fio_ddir ddir, unsigned long long *b)
 138 {
 139         unsigned long long r;
 140         int loops = 5;
 141
 142         do {
 143                 r = os_random_long(&td->random_state);
 144                 dprint(FD_RANDOM, "off rand %llu\n", r);
 145                 *b = (last_block(td, f, ddir) - 1)
 146                         * (r / ((unsigned long long) OS_RAND_MAX + 1.0));
 147
 148                 /*
 149                  * if we are not maintaining a random map, we are done.
 150                  */
 151                 if (!file_randommap(td, f))
 152                         return 0;
 153
 154                 /*
 155                  * calculate map offset and check if it's free
 156                  */
 157                 if (random_map_free(f, *b))
 158                         return 0;
 159
 160                 dprint(FD_RANDOM, "get_next_rand_offset: offset %llu busy\n",
 161                                                                         *b);
 162         } while (--loops);
 163
 164         /*
 165          * we get here, if we didn't suceed in looking up a block. generate
 166          * a random start offset into the filemap, and find the first free
 167          * block from there.
 168          */
 169         loops = 10;
 170         do {
 171                 f->last_free_lookup = (f->num_maps - 1) *
 172                                         (r / (OS_RAND_MAX + 1.0));
 173                 if (!get_next_free_block(td, f, ddir, b))
 174                         return 0;
 175
 176                 r = os_random_long(&td->random_state);
 177         } while (--loops);
 178
 179         /*
 180          * that didn't work either, try exhaustive search from the start
 181          */
 182         f->last_free_lookup = 0;
 183         return get_next_free_block(td, f, ddir, b);
 184 }
 185
 186 /*
 187  * For random io, generate a random new block and see if it's used. Repeat
 188  * until we find a free one. For sequential io, just return the end of
 189  * the last io issued.
 190  */
 191 static int __get_next_offset(struct thread_data *td, struct io_u *io_u)
 192 {
 193         struct fio_file *f = io_u->file;
 194         unsigned long long b;
 195         enum fio_ddir ddir = io_u->ddir;
 196
 197         if (td_random(td) && (td->o.ddir_nr && !--td->ddir_nr)) {
 198                 td->ddir_nr = td->o.ddir_nr;
 199
 200                 if (get_next_rand_offset(td, f, ddir, &b)) {
 201                         dprint(FD_IO, "%s: getting rand offset failed\n",
 202                                 f->file_name);
 203                         return 1;
 204                 }
 205         } else {
 206                 if (f->last_pos >= f->real_file_size) {
 207                         if (!td_random(td) ||
 208                              get_next_rand_offset(td, f, ddir, &b)) {
 209                                 dprint(FD_IO, "%s: pos %llu > size %llu\n",
 210                                                 f->file_name, f->last_pos,
 211                                                 f->real_file_size);
 212                                 return 1;
 213                         }
 214                 } else
 215                         b = (f->last_pos - f->file_offset) / td->o.min_bs[ddir];
 216         }
 217
 218         io_u->offset = b * td->o.ba[ddir];
 219         if (io_u->offset >= f->io_size) {
 220                 dprint(FD_IO, "get_next_offset: offset %llu >= io_size %llu\n",
 221                                         io_u->offset, f->io_size);
 222                 return 1;
 223         }
 224
 225         io_u->offset += f->file_offset;
 226         if (io_u->offset >= f->real_file_size) {
 227                 dprint(FD_IO, "get_next_offset: offset %llu >= size %llu\n",
 228                                         io_u->offset, f->real_file_size);
 229                 return 1;
 230         }
 231
 232         return 0;
 233 }
 234
 235 static int get_next_offset(struct thread_data *td, struct io_u *io_u)
 236 {
 237         struct prof_io_ops *ops = &td->prof_io_ops;
 238
 239         if (ops->fill_io_u_off)
 240                 return ops->fill_io_u_off(td, io_u);
 241
 242         return __get_next_offset(td, io_u);
 243 }
 244
 245 static unsigned int __get_next_buflen(struct thread_data *td, struct io_u *io_u)
 246 {
 247         const int ddir = io_u->ddir;
 248         unsigned int uninitialized_var(buflen);
 249         unsigned int minbs, maxbs;
 250         long r;
 251
 252         minbs = td->o.min_bs[ddir];
 253         maxbs = td->o.max_bs[ddir];
 254
 255         if (minbs == maxbs)
 256                 buflen = minbs;
 257         else {
 258                 r = os_random_long(&td->bsrange_state);
 259                 if (!td->o.bssplit_nr[ddir]) {
 260                         buflen = 1 + (unsigned int) ((double) maxbs *
 261                                         (r / (OS_RAND_MAX + 1.0)));
 262                         if (buflen < minbs)
 263                                 buflen = minbs;
 264                 } else {
 265                         long perc = 0;
 266                         unsigned int i;
 267
 268                         for (i = 0; i < td->o.bssplit_nr[ddir]; i++) {
 269                                 struct bssplit *bsp = &td->o.bssplit[ddir][i];
 270
 271                                 buflen = bsp->bs;
 272                                 perc += bsp->perc;
 273                                 if (r <= ((OS_RAND_MAX / 100L) * perc))
 274                                         break;
 275                         }
 276                 }
 277                 if (!td->o.bs_unaligned && is_power_of_2(minbs))
 278                         buflen = (buflen + minbs - 1) & ~(minbs - 1);
 279         }
 280
 281         if (io_u->offset + buflen > io_u->file->real_file_size) {
 282                 dprint(FD_IO, "lower buflen %u -> %u (ddir=%d)\n", buflen,
 283                                                 minbs, ddir);
 284                 buflen = minbs;
 285         }
 286
 287         return buflen;
 288 }
 289
 290 static unsigned int get_next_buflen(struct thread_data *td, struct io_u *io_u)
 291 {
 292         struct prof_io_ops *ops = &td->prof_io_ops;
 293
 294         if (ops->fill_io_u_size)
 295                 return ops->fill_io_u_size(td, io_u);
 296
 297         return __get_next_buflen(td, io_u);
 298 }
 299
 300 static void set_rwmix_bytes(struct thread_data *td)
 301 {
 302         unsigned int diff;
 303
 304         /*
 305          * we do time or byte based switch. this is needed because
 306          * buffered writes may issue a lot quicker than they complete,
 307          * whereas reads do not.
 308          */
 309         diff = td->o.rwmix[td->rwmix_ddir ^ 1];
 310         td->rwmix_issues = (td->io_issues[td->rwmix_ddir] * diff) / 100;
 311 }
 312
 313 static inline enum fio_ddir get_rand_ddir(struct thread_data *td)
 314 {
 315         unsigned int v;
 316         long r;
 317
 318         r = os_random_long(&td->rwmix_state);
 319         v = 1 + (int) (100.0 * (r / (OS_RAND_MAX + 1.0)));
 320         if (v <= td->o.rwmix[DDIR_READ])
 321                 return DDIR_READ;
 322
 323         return DDIR_WRITE;
 324 }
 325
 326 static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir)
 327 {
 328         enum fio_ddir odir = ddir ^ 1;
 329         struct timeval t;
 330         long usec;
 331
 332         if (td->rate_pending_usleep[ddir] <= 0)
 333                 return ddir;
 334
 335         /*
 336          * We have too much pending sleep in this direction. See if we
 337          * should switch.
 338          */
 339         if (td_rw(td)) {
 340                 /*
 341                  * Other direction does not have too much pending, switch
 342                  */
 343                 if (td->rate_pending_usleep[odir] < 100000)
 344                         return odir;
 345
 346                 /*
 347                  * Both directions have pending sleep. Sleep the minimum time
 348                  * and deduct from both.
 349                  */
 350                 if (td->rate_pending_usleep[ddir] <=
 351                         td->rate_pending_usleep[odir]) {
 352                         usec = td->rate_pending_usleep[ddir];
 353                 } else {
 354                         usec = td->rate_pending_usleep[odir];
 355                         ddir = odir;
 356                 }
 357         } else
 358                 usec = td->rate_pending_usleep[ddir];
 359
 360         fio_gettime(&t, NULL);
 361         usec_sleep(td, usec);
 362         usec = utime_since_now(&t);
 363
 364         td->rate_pending_usleep[ddir] -= usec;
 365
 366         odir = ddir ^ 1;
 367         if (td_rw(td) && __should_check_rate(td, odir))
 368                 td->rate_pending_usleep[odir] -= usec;
 369
 370         return ddir;
 371 }
 372
 373 /*
 374  * Return the data direction for the next io_u. If the job is a
 375  * mixed read/write workload, check the rwmix cycle and switch if
 376  * necessary.
 377  */
 378 static enum fio_ddir get_rw_ddir(struct thread_data *td)
 379 {
 380         enum fio_ddir ddir;
 381
 382         /*
 383          * see if it's time to fsync
 384          */
 385         if (td->o.fsync_blocks &&
 386            !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks) &&
 387              td->io_issues[DDIR_WRITE] && should_fsync(td))
 388                 return DDIR_SYNC;
 389
 390         /*
 391          * see if it's time to fdatasync
 392          */
 393         if (td->o.fdatasync_blocks &&
 394            !(td->io_issues[DDIR_WRITE] % td->o.fdatasync_blocks) &&
 395              td->io_issues[DDIR_WRITE] && should_fsync(td))
 396                 return DDIR_DATASYNC;
 397
 398         /*
 399          * see if it's time to sync_file_range
 400          */
 401         if (td->sync_file_range_nr &&
 402            !(td->io_issues[DDIR_WRITE] % td->sync_file_range_nr) &&
 403              td->io_issues[DDIR_WRITE] && should_fsync(td))
 404                 return DDIR_SYNC_FILE_RANGE;
 405
 406         if (td_rw(td)) {
 407                 /*
 408                  * Check if it's time to seed a new data direction.
 409                  */
 410                 if (td->io_issues[td->rwmix_ddir] >= td->rwmix_issues) {
 411                         /*
 412                          * Put a top limit on how many bytes we do for
 413                          * one data direction, to avoid overflowing the
 414                          * ranges too much
 415                          */
 416                         ddir = get_rand_ddir(td);
 417
 418                         if (ddir != td->rwmix_ddir)
 419                                 set_rwmix_bytes(td);
 420
 421                         td->rwmix_ddir = ddir;
 422                 }
 423                 ddir = td->rwmix_ddir;
 424         } else if (td_read(td))
 425                 ddir = DDIR_READ;
 426         else
 427                 ddir = DDIR_WRITE;
 428
 429         td->rwmix_ddir = rate_ddir(td, ddir);
 430         return td->rwmix_ddir;
 431 }
 432
 433 void put_file_log(struct thread_data *td, struct fio_file *f)
 434 {
 435         int ret = put_file(td, f);
 436
 437         if (ret)
 438                 td_verror(td, ret, "file close");
 439 }
 440
 441 void put_io_u(struct thread_data *td, struct io_u *io_u)
 442 {
 443         td_io_u_lock(td);
 444
 445         io_u->flags |= IO_U_F_FREE;
 446         io_u->flags &= ~IO_U_F_FREE_DEF;
 447
 448         if (io_u->file)
 449                 put_file_log(td, io_u->file);
 450
 451         io_u->file = NULL;
 452         if (io_u->flags & IO_U_F_IN_CUR_DEPTH)
 453                 td->cur_depth--;
 454         flist_del_init(&io_u->list);
 455         flist_add(&io_u->list, &td->io_u_freelist);
 456         td_io_u_unlock(td);
 457         td_io_u_free_notify(td);
 458 }
 459
 460 void clear_io_u(struct thread_data *td, struct io_u *io_u)
 461 {
 462         io_u->flags &= ~IO_U_F_FLIGHT;
 463         put_io_u(td, io_u);
 464 }
 465
 466 void requeue_io_u(struct thread_data *td, struct io_u **io_u)
 467 {
 468         struct io_u *__io_u = *io_u;
 469
 470         dprint(FD_IO, "requeue %p\n", __io_u);
 471
 472         td_io_u_lock(td);
 473
 474         __io_u->flags |= IO_U_F_FREE;
 475         if ((__io_u->flags & IO_U_F_FLIGHT) && !ddir_sync(__io_u->ddir))
 476                 td->io_issues[__io_u->ddir]--;
 477
 478         __io_u->flags &= ~IO_U_F_FLIGHT;
 479         if (__io_u->flags & IO_U_F_IN_CUR_DEPTH)
 480                 td->cur_depth--;
 481         flist_del(&__io_u->list);
 482         flist_add_tail(&__io_u->list, &td->io_u_requeues);
 483         td_io_u_unlock(td);
 484         *io_u = NULL;
 485 }
 486
 487 static int fill_io_u(struct thread_data *td, struct io_u *io_u)
 488 {
 489         if (td->io_ops->flags & FIO_NOIO)
 490                 goto out;
 491
 492         io_u->ddir = get_rw_ddir(td);
 493
 494         /*
 495          * fsync() or fdatasync(), we are done
 496          */
 497         if (ddir_sync(io_u->ddir))
 498                 goto out;
 499
 500         /*
 501          * See if it's time to switch to a new zone
 502          */
 503         if (td->zone_bytes >= td->o.zone_size) {
 504                 td->zone_bytes = 0;
 505                 io_u->file->last_pos += td->o.zone_skip;
 506                 td->io_skip_bytes += td->o.zone_skip;
 507         }
 508
 509         /*
 510          * No log, let the seq/rand engine retrieve the next buflen and
 511          * position.
 512          */
 513         if (get_next_offset(td, io_u)) {
 514                 dprint(FD_IO, "io_u %p, failed getting offset\n", io_u);
 515                 return 1;
 516         }
 517
 518         io_u->buflen = get_next_buflen(td, io_u);
 519         if (!io_u->buflen) {
 520                 dprint(FD_IO, "io_u %p, failed getting buflen\n", io_u);
 521                 return 1;
 522         }
 523
 524         if (io_u->offset + io_u->buflen > io_u->file->real_file_size) {
 525                 dprint(FD_IO, "io_u %p, offset too large\n", io_u);
 526                 dprint(FD_IO, "  off=%llu/%lu > %llu\n", io_u->offset,
 527                                 io_u->buflen, io_u->file->real_file_size);
 528                 return 1;
 529         }
 530
 531         /*
 532          * mark entry before potentially trimming io_u
 533          */
 534         if (td_random(td) && file_randommap(td, io_u->file))
 535                 mark_random_map(td, io_u);
 536
 537         /*
 538          * If using a write iolog, store this entry.
 539          */
 540 out:
 541         dprint_io_u(io_u, "fill_io_u");
 542         td->zone_bytes += io_u->buflen;
 543         log_io_u(td, io_u);
 544         return 0;
 545 }
 546
 547 static void __io_u_mark_map(unsigned int *map, unsigned int nr)
 548 {
 549         int index = 0;
 550
 551         switch (nr) {
 552         default:
 553                 index = 6;
 554                 break;
 555         case 33 ... 64:
 556                 index = 5;
 557                 break;
 558         case 17 ... 32:
 559                 index = 4;
 560                 break;
 561         case 9 ... 16:
 562                 index = 3;
 563                 break;
 564         case 5 ... 8:
 565                 index = 2;
 566                 break;
 567         case 1 ... 4:
 568                 index = 1;
 569         case 0:
 570                 break;
 571         }
 572
 573         map[index]++;
 574 }
 575
 576 void io_u_mark_submit(struct thread_data *td, unsigned int nr)
 577 {
 578         __io_u_mark_map(td->ts.io_u_submit, nr);
 579         td->ts.total_submit++;
 580 }
 581
 582 void io_u_mark_complete(struct thread_data *td, unsigned int nr)
 583 {
 584         __io_u_mark_map(td->ts.io_u_complete, nr);
 585         td->ts.total_complete++;
 586 }
 587
 588 void io_u_mark_depth(struct thread_data *td, unsigned int nr)
 589 {
 590         int index = 0;
 591
 592         switch (td->cur_depth) {
 593         default:
 594                 index = 6;
 595                 break;
 596         case 32 ... 63:
 597                 index = 5;
 598                 break;
 599         case 16 ... 31:
 600                 index = 4;
 601                 break;
 602         case 8 ... 15:
 603                 index = 3;
 604                 break;
 605         case 4 ... 7:
 606                 index = 2;
 607                 break;
 608         case 2 ... 3:
 609                 index = 1;
 610         case 1:
 611                 break;
 612         }
 613
 614         td->ts.io_u_map[index] += nr;
 615 }
 616
 617 static void io_u_mark_lat_usec(struct thread_data *td, unsigned long usec)
 618 {
 619         int index = 0;
 620
 621         assert(usec < 1000);
 622
 623         switch (usec) {
 624         case 750 ... 999:
 625                 index = 9;
 626                 break;
 627         case 500 ... 749:
 628                 index = 8;
 629                 break;
 630         case 250 ... 499:
 631                 index = 7;
 632                 break;
 633         case 100 ... 249:
 634                 index = 6;
 635                 break;
 636         case 50 ... 99:
 637                 index = 5;
 638                 break;
 639         case 20 ... 49:
 640                 index = 4;
 641                 break;
 642         case 10 ... 19:
 643                 index = 3;
 644                 break;
 645         case 4 ... 9:
 646                 index = 2;
 647                 break;
 648         case 2 ... 3:
 649                 index = 1;
 650         case 0 ... 1:
 651                 break;
 652         }
 653
 654         assert(index < FIO_IO_U_LAT_U_NR);
 655         td->ts.io_u_lat_u[index]++;
 656 }
 657
 658 static void io_u_mark_lat_msec(struct thread_data *td, unsigned long msec)
 659 {
 660         int index = 0;
 661
 662         switch (msec) {
 663         default:
 664                 index = 11;
 665                 break;
 666         case 1000 ... 1999:
 667                 index = 10;
 668                 break;
 669         case 750 ... 999:
 670                 index = 9;
 671                 break;
 672         case 500 ... 749:
 673                 index = 8;
 674                 break;
 675         case 250 ... 499:
 676                 index = 7;
 677                 break;
 678         case 100 ... 249:
 679                 index = 6;
 680                 break;
 681         case 50 ... 99:
 682                 index = 5;
 683                 break;
 684         case 20 ... 49:
 685                 index = 4;
 686                 break;
 687         case 10 ... 19:
 688                 index = 3;
 689                 break;
 690         case 4 ... 9:
 691                 index = 2;
 692                 break;
 693         case 2 ... 3:
 694                 index = 1;
 695         case 0 ... 1:
 696                 break;
 697         }
 698
 699         assert(index < FIO_IO_U_LAT_M_NR);
 700         td->ts.io_u_lat_m[index]++;
 701 }
 702
 703 static void io_u_mark_latency(struct thread_data *td, unsigned long usec)
 704 {
 705         if (usec < 1000)
 706                 io_u_mark_lat_usec(td, usec);
 707         else
 708                 io_u_mark_lat_msec(td, usec / 1000);
 709 }
 710
 711 /*
 712  * Get next file to service by choosing one at random
 713  */
 714 static struct fio_file *get_next_file_rand(struct thread_data *td,
 715                                            enum fio_file_flags goodf,
 716                                            enum fio_file_flags badf)
 717 {
 718         struct fio_file *f;
 719         int fno;
 720
 721         do {
 722                 long r = os_random_long(&td->next_file_state);
 723                 int opened = 0;
 724
 725                 fno = (unsigned int) ((double) td->o.nr_files
 726                         * (r / (OS_RAND_MAX + 1.0)));
 727                 f = td->files[fno];
 728                 if (fio_file_done(f))
 729                         continue;
 730
 731                 if (!fio_file_open(f)) {
 732                         int err;
 733
 734                         err = td_io_open_file(td, f);
 735                         if (err)
 736                                 continue;
 737                         opened = 1;
 738                 }
 739
 740                 if ((!goodf || (f->flags & goodf)) && !(f->flags & badf)) {
 741                         dprint(FD_FILE, "get_next_file_rand: %p\n", f);
 742                         return f;
 743                 }
 744                 if (opened)
 745                         td_io_close_file(td, f);
 746         } while (1);
 747 }
 748
 749 /*
 750  * Get next file to service by doing round robin between all available ones
 751  */
 752 static struct fio_file *get_next_file_rr(struct thread_data *td, int goodf,
 753                                          int badf)
 754 {
 755         unsigned int old_next_file = td->next_file;
 756         struct fio_file *f;
 757
 758         do {
 759                 int opened = 0;
 760
 761                 f = td->files[td->next_file];
 762
 763                 td->next_file++;
 764                 if (td->next_file >= td->o.nr_files)
 765                         td->next_file = 0;
 766
 767                 dprint(FD_FILE, "trying file %s %x\n", f->file_name, f->flags);
 768                 if (fio_file_done(f)) {
 769                         f = NULL;
 770                         continue;
 771                 }
 772
 773                 if (!fio_file_open(f)) {
 774                         int err;
 775
 776                         err = td_io_open_file(td, f);
 777                         if (err) {
 778                                 dprint(FD_FILE, "error %d on open of %s\n",
 779                                         err, f->file_name);
 780                                 f = NULL;
 781                                 continue;
 782                         }
 783                         opened = 1;
 784                 }
 785
 786                 dprint(FD_FILE, "goodf=%x, badf=%x, ff=%x\n", goodf, badf,
 787                                                                 f->flags);
 788                 if ((!goodf || (f->flags & goodf)) && !(f->flags & badf))
 789                         break;
 790
 791                 if (opened)
 792                         td_io_close_file(td, f);
 793
 794                 f = NULL;
 795         } while (td->next_file != old_next_file);
 796
 797         dprint(FD_FILE, "get_next_file_rr: %p\n", f);
 798         return f;
 799 }
 800
 801 static struct fio_file *__get_next_file(struct thread_data *td)
 802 {
 803         struct fio_file *f;
 804
 805         assert(td->o.nr_files <= td->files_index);
 806
 807         if (td->nr_done_files >= td->o.nr_files) {
 808                 dprint(FD_FILE, "get_next_file: nr_open=%d, nr_done=%d,"
 809                                 " nr_files=%d\n", td->nr_open_files,
 810                                                   td->nr_done_files,
 811                                                   td->o.nr_files);
 812                 return NULL;
 813         }
 814
 815         f = td->file_service_file;
 816         if (f && fio_file_open(f) && !fio_file_closing(f)) {
 817                 if (td->o.file_service_type == FIO_FSERVICE_SEQ)
 818                         goto out;
 819                 if (td->file_service_left--)
 820                         goto out;
 821         }
 822
 823         if (td->o.file_service_type == FIO_FSERVICE_RR ||
 824             td->o.file_service_type == FIO_FSERVICE_SEQ)
 825                 f = get_next_file_rr(td, FIO_FILE_open, FIO_FILE_closing);
 826         else
 827                 f = get_next_file_rand(td, FIO_FILE_open, FIO_FILE_closing);
 828
 829         td->file_service_file = f;
 830         td->file_service_left = td->file_service_nr - 1;
 831 out:
 832         dprint(FD_FILE, "get_next_file: %p [%s]\n", f, f->file_name);
 833         return f;
 834 }
 835
 836 static struct fio_file *get_next_file(struct thread_data *td)
 837 {
 838         struct prof_io_ops *ops = &td->prof_io_ops;
 839
 840         if (ops->get_next_file)
 841                 return ops->get_next_file(td);
 842
 843         return __get_next_file(td);
 844 }
 845
 846 static int set_io_u_file(struct thread_data *td, struct io_u *io_u)
 847 {
 848         struct fio_file *f;
 849
 850         do {
 851                 f = get_next_file(td);
 852                 if (!f)
 853                         return 1;
 854
 855                 io_u->file = f;
 856                 get_file(f);
 857
 858                 if (!fill_io_u(td, io_u))
 859                         break;
 860
 861                 put_file_log(td, f);
 862                 td_io_close_file(td, f);
 863                 io_u->file = NULL;
 864                 fio_file_set_done(f);
 865                 td->nr_done_files++;
 866                 dprint(FD_FILE, "%s: is done (%d of %d)\n", f->file_name,
 867                                         td->nr_done_files, td->o.nr_files);
 868         } while (1);
 869
 870         return 0;
 871 }
 872
 873
 874 struct io_u *__get_io_u(struct thread_data *td)
 875 {
 876         struct io_u *io_u = NULL;
 877
 878         td_io_u_lock(td);
 879
 880 again:
 881         if (!flist_empty(&td->io_u_requeues))
 882                 io_u = flist_entry(td->io_u_requeues.next, struct io_u, list);
 883         else if (!queue_full(td)) {
 884                 io_u = flist_entry(td->io_u_freelist.next, struct io_u, list);
 885
 886                 io_u->buflen = 0;
 887                 io_u->resid = 0;
 888                 io_u->file = NULL;
 889                 io_u->end_io = NULL;
 890         }
 891
 892         if (io_u) {
 893                 assert(io_u->flags & IO_U_F_FREE);
 894                 io_u->flags &= ~(IO_U_F_FREE | IO_U_F_FREE_DEF);
 895
 896                 io_u->error = 0;
 897                 flist_del(&io_u->list);
 898                 flist_add(&io_u->list, &td->io_u_busylist);
 899                 td->cur_depth++;
 900                 io_u->flags |= IO_U_F_IN_CUR_DEPTH;
 901         } else if (td->o.verify_async) {
 902                 /*
 903                  * We ran out, wait for async verify threads to finish and
 904                  * return one
 905                  */
 906                 pthread_cond_wait(&td->free_cond, &td->io_u_lock);
 907                 goto again;
 908         }
 909
 910         td_io_u_unlock(td);
 911         return io_u;
 912 }
 913
 914 /*
 915  * Return an io_u to be processed. Gets a buflen and offset, sets direction,
 916  * etc. The returned io_u is fully ready to be prepped and submitted.
 917  */
 918 struct io_u *get_io_u(struct thread_data *td)
 919 {
 920         struct fio_file *f;
 921         struct io_u *io_u;
 922
 923         io_u = __get_io_u(td);
 924         if (!io_u) {
 925                 dprint(FD_IO, "__get_io_u failed\n");
 926                 return NULL;
 927         }
 928
 929         if (td->o.verify_backlog && td->io_hist_len) {
 930                 int get_verify = 0;
 931
 932                 if (td->verify_batch) {
 933                         td->verify_batch--;
 934                         get_verify = 1;
 935                 } else if (!(td->io_hist_len % td->o.verify_backlog) &&
 936                          td->last_ddir != DDIR_READ) {
 937                         td->verify_batch = td->o.verify_batch;
 938                         get_verify = 1;
 939                 }
 940
 941                 if (get_verify && !get_next_verify(td, io_u))
 942                         goto out;
 943         }
 944
 945         /*
 946          * from a requeue, io_u already setup
 947          */
 948         if (io_u->file)
 949                 goto out;
 950
 951         /*
 952          * If using an iolog, grab next piece if any available.
 953          */
 954         if (td->o.read_iolog_file) {
 955                 if (read_iolog_get(td, io_u))
 956                         goto err_put;
 957         } else if (set_io_u_file(td, io_u)) {
 958                 dprint(FD_IO, "io_u %p, setting file failed\n", io_u);
 959                 goto err_put;
 960         }
 961
 962         f = io_u->file;
 963         assert(fio_file_open(f));
 964
 965         if (!ddir_sync(io_u->ddir)) {
 966                 if (!io_u->buflen && !(td->io_ops->flags & FIO_NOIO)) {
 967                         dprint(FD_IO, "get_io_u: zero buflen on %p\n", io_u);
 968                         goto err_put;
 969                 }
 970
 971                 f->last_pos = io_u->offset + io_u->buflen;
 972
 973                 if (td->o.verify != VERIFY_NONE && io_u->ddir == DDIR_WRITE)
 974                         populate_verify_io_u(td, io_u);
 975                 else if (td->o.refill_buffers && io_u->ddir == DDIR_WRITE)
 976                         io_u_fill_buffer(td, io_u, io_u->xfer_buflen);
 977         }
 978
 979         /*
 980          * Set io data pointers.
 981          */
 982         io_u->xfer_buf = io_u->buf;
 983         io_u->xfer_buflen = io_u->buflen;
 984
 985 out:
 986         if (!td_io_prep(td, io_u)) {
 987                 if (!td->o.disable_slat)
 988                         fio_gettime(&io_u->start_time, NULL);
 989                 return io_u;
 990         }
 991 err_put:
 992         dprint(FD_IO, "get_io_u failed\n");
 993         put_io_u(td, io_u);
 994         return NULL;
 995 }
 996
 997 void io_u_log_error(struct thread_data *td, struct io_u *io_u)
 998 {
 999         const char *msg[] = { "read", "write", "sync" };
1000
1001         log_err("fio: io_u error");
1002
1003         if (io_u->file)
1004                 log_err(" on file %s", io_u->file->file_name);
1005
1006         log_err(": %s\n", strerror(io_u->error));
1007
1008         log_err("     %s offset=%llu, buflen=%lu\n", msg[io_u->ddir],
1009                                         io_u->offset, io_u->xfer_buflen);
1010
1011         if (!td->error)
1012                 td_verror(td, io_u->error, "io_u error");
1013 }
1014
1015 static void io_completed(struct thread_data *td, struct io_u *io_u,
1016                          struct io_completion_data *icd)
1017 {
1018         /*
1019          * Older gcc's are too dumb to realize that usec is always used
1020          * initialized, silence that warning.
1021          */
1022         unsigned long uninitialized_var(usec);
1023         struct fio_file *f;
1024
1025         dprint_io_u(io_u, "io complete");
1026
1027         td_io_u_lock(td);
1028         assert(io_u->flags & IO_U_F_FLIGHT);
1029         io_u->flags &= ~IO_U_F_FLIGHT;
1030         td_io_u_unlock(td);
1031
1032         if (ddir_sync(io_u->ddir)) {
1033                 td->last_was_sync = 1;
1034                 f = io_u->file;
1035                 if (f) {
1036                         f->first_write = -1ULL;
1037                         f->last_write = -1ULL;
1038                 }
1039                 return;
1040         }
1041
1042         td->last_was_sync = 0;
1043         td->last_ddir = io_u->ddir;
1044
1045         if (!io_u->error) {
1046                 unsigned int bytes = io_u->buflen - io_u->resid;
1047                 const enum fio_ddir idx = io_u->ddir;
1048                 const enum fio_ddir odx = io_u->ddir ^ 1;
1049                 int ret;
1050
1051                 td->io_blocks[idx]++;
1052                 td->io_bytes[idx] += bytes;
1053                 td->this_io_bytes[idx] += bytes;
1054
1055                 if (idx == DDIR_WRITE) {
1056                         f = io_u->file;
1057                         if (f) {
1058                                 if (f->first_write == -1ULL ||
1059                                     io_u->offset < f->first_write)
1060                                         f->first_write = io_u->offset;
1061                                 if (f->last_write == -1ULL ||
1062                                     ((io_u->offset + bytes) > f->last_write))
1063                                         f->last_write = io_u->offset + bytes;
1064                         }
1065                 }
1066
1067                 if (ramp_time_over(td)) {
1068                         unsigned long uninitialized_var(lusec);
1069
1070                         if (!td->o.disable_clat || !td->o.disable_bw)
1071                                 lusec = utime_since(&io_u->issue_time,
1072                                                         &icd->time);
1073
1074                         if (!td->o.disable_clat) {
1075                                 add_clat_sample(td, idx, lusec, bytes);
1076                                 io_u_mark_latency(td, lusec);
1077                         }
1078                         if (!td->o.disable_bw)
1079                                 add_bw_sample(td, idx, bytes, &icd->time);
1080                         if (__should_check_rate(td, idx)) {
1081                                 td->rate_pending_usleep[idx] =
1082                                         ((td->this_io_bytes[idx] *
1083                                           td->rate_nsec_cycle[idx]) / 1000 -
1084                                          utime_since_now(&td->start));
1085                         }
1086                         if (__should_check_rate(td, idx ^ 1))
1087                                 td->rate_pending_usleep[odx] =
1088                                         ((td->this_io_bytes[odx] *
1089                                           td->rate_nsec_cycle[odx]) / 1000 -
1090                                          utime_since_now(&td->start));
1091                 }
1092
1093                 if (td_write(td) && idx == DDIR_WRITE &&
1094                     td->o.do_verify &&
1095                     td->o.verify != VERIFY_NONE)
1096                         log_io_piece(td, io_u);
1097
1098                 icd->bytes_done[idx] += bytes;
1099
1100                 if (io_u->end_io) {
1101                         ret = io_u->end_io(td, io_u);
1102                         if (ret && !icd->error)
1103                                 icd->error = ret;
1104                 }
1105         } else {
1106                 icd->error = io_u->error;
1107                 io_u_log_error(td, io_u);
1108         }
1109         if (td->o.continue_on_error && icd->error &&
1110             td_non_fatal_error(icd->error)) {
1111                 /*
1112                  * If there is a non_fatal error, then add to the error count
1113                  * and clear all the errors.
1114                  */
1115                 update_error_count(td, icd->error);
1116                 td_clear_error(td);
1117                 icd->error = 0;
1118                 io_u->error = 0;
1119         }
1120 }
1121
1122 static void init_icd(struct thread_data *td, struct io_completion_data *icd,
1123                      int nr)
1124 {
1125         if (!td->o.disable_clat || !td->o.disable_bw)
1126                 fio_gettime(&icd->time, NULL);
1127
1128         icd->nr = nr;
1129
1130         icd->error = 0;
1131         icd->bytes_done[0] = icd->bytes_done[1] = 0;
1132 }
1133
1134 static void ios_completed(struct thread_data *td,
1135                           struct io_completion_data *icd)
1136 {
1137         struct io_u *io_u;
1138         int i;
1139
1140         for (i = 0; i < icd->nr; i++) {
1141                 io_u = td->io_ops->event(td, i);
1142
1143                 io_completed(td, io_u, icd);
1144
1145                 if (!(io_u->flags & IO_U_F_FREE_DEF))
1146                         put_io_u(td, io_u);
1147         }
1148 }
1149
1150 /*
1151  * Complete a single io_u for the sync engines.
1152  */
1153 int io_u_sync_complete(struct thread_data *td, struct io_u *io_u,
1154                        unsigned long *bytes)
1155 {
1156         struct io_completion_data icd;
1157
1158         init_icd(td, &icd, 1);
1159         io_completed(td, io_u, &icd);
1160
1161         if (!(io_u->flags & IO_U_F_FREE_DEF))
1162                 put_io_u(td, io_u);
1163
1164         if (icd.error) {
1165                 td_verror(td, icd.error, "io_u_sync_complete");
1166                 return -1;
1167         }
1168
1169         if (bytes) {
1170                 bytes[0] += icd.bytes_done[0];
1171                 bytes[1] += icd.bytes_done[1];
1172         }
1173
1174         return 0;
1175 }
1176
1177 /*
1178  * Called to complete min_events number of io for the async engines.
1179  */
1180 int io_u_queued_complete(struct thread_data *td, int min_evts,
1181                          unsigned long *bytes)
1182 {
1183         struct io_completion_data icd;
1184         struct timespec *tvp = NULL;
1185         int ret;
1186         struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, };
1187
1188         dprint(FD_IO, "io_u_queued_completed: min=%d\n", min_evts);
1189
1190         if (!min_evts)
1191                 tvp = &ts;
1192
1193         ret = td_io_getevents(td, min_evts, td->o.iodepth_batch_complete, tvp);
1194         if (ret < 0) {
1195                 td_verror(td, -ret, "td_io_getevents");
1196                 return ret;
1197         } else if (!ret)
1198                 return ret;
1199
1200         init_icd(td, &icd, ret);
1201         ios_completed(td, &icd);
1202         if (icd.error) {
1203                 td_verror(td, icd.error, "io_u_queued_complete");
1204                 return -1;
1205         }
1206
1207         if (bytes) {
1208                 bytes[0] += icd.bytes_done[0];
1209                 bytes[1] += icd.bytes_done[1];
1210         }
1211
1212         return 0;
1213 }
1214
1215 /*
1216  * Call when io_u is really queued, to update the submission latency.
1217  */
1218 void io_u_queued(struct thread_data *td, struct io_u *io_u)
1219 {
1220         if (!td->o.disable_slat) {
1221                 unsigned long slat_time;
1222
1223                 slat_time = utime_since(&io_u->start_time, &io_u->issue_time);
1224                 add_slat_sample(td, io_u->ddir, slat_time, io_u->xfer_buflen);
1225         }
1226 }
1227
1228 /*
1229  * "randomly" fill the buffer contents
1230  */
1231 void io_u_fill_buffer(struct thread_data *td, struct io_u *io_u,
1232                       unsigned int max_bs)
1233 {
1234         long *ptr = io_u->buf;
1235
1236         if (!td->o.zero_buffers) {
1237                 unsigned long r = __rand(&__fio_rand_state);
1238
1239                 if (sizeof(int) != sizeof(*ptr))
1240                         r *= (unsigned long) __rand(&__fio_rand_state);
1241
1242                 while ((void *) ptr - io_u->buf < max_bs) {
1243                         *ptr = r;
1244                         ptr++;
1245                         r *= GOLDEN_RATIO_PRIME;
1246                         r >>= 3;
1247                 }
1248         } else
1249                 memset(ptr, 0, max_bs);
1250 }