zbd.c

   1 /*
   2  * Copyright (C) 2018 Western Digital Corporation or its affiliates.
   3  *
   4  * This file is released under the GPL.
   5  */
   6
   7 #include <errno.h>
   8 #include <string.h>
   9 #include <stdlib.h>
  10 #include <fcntl.h>
  11 #include <sys/stat.h>
  12 #include <unistd.h>
  13
  14 #include "os/os.h"
  15 #include "file.h"
  16 #include "fio.h"
  17 #include "lib/pow2.h"
  18 #include "log.h"
  19 #include "oslib/asprintf.h"
  20 #include "smalloc.h"
  21 #include "verify.h"
  22 #include "pshared.h"
  23 #include "zbd.h"
  24
  25 static bool is_valid_offset(const struct fio_file *f, uint64_t offset)
  26 {
  27         return (uint64_t)(offset - f->file_offset) < f->io_size;
  28 }
  29
  30 static inline unsigned int zbd_zone_idx(const struct fio_file *f,
  31                                         struct fio_zone_info *zone)
  32 {
  33         return zone - f->zbd_info->zone_info;
  34 }
  35
  36 /**
  37  * zbd_offset_to_zone_idx - convert an offset into a zone number
  38  * @f: file pointer.
  39  * @offset: offset in bytes. If this offset is in the first zone_size bytes
  40  *          past the disk size then the index of the sentinel is returned.
  41  */
  42 static unsigned int zbd_offset_to_zone_idx(const struct fio_file *f,
  43                                            uint64_t offset)
  44 {
  45         uint32_t zone_idx;
  46
  47         if (f->zbd_info->zone_size_log2 > 0)
  48                 zone_idx = offset >> f->zbd_info->zone_size_log2;
  49         else
  50                 zone_idx = offset / f->zbd_info->zone_size;
  51
  52         return min(zone_idx, f->zbd_info->nr_zones);
  53 }
  54
  55 /**
  56  * zbd_zone_end - Return zone end location
  57  * @z: zone info pointer.
  58  */
  59 static inline uint64_t zbd_zone_end(const struct fio_zone_info *z)
  60 {
  61         return (z+1)->start;
  62 }
  63
  64 /**
  65  * zbd_zone_capacity_end - Return zone capacity limit end location
  66  * @z: zone info pointer.
  67  */
  68 static inline uint64_t zbd_zone_capacity_end(const struct fio_zone_info *z)
  69 {
  70         return z->start + z->capacity;
  71 }
  72
  73 /**
  74  * zbd_zone_remainder - Return the number of bytes that are still available for
  75  *                      writing before the zone gets full
  76  * @z: zone info pointer.
  77  */
  78 static inline uint64_t zbd_zone_remainder(struct fio_zone_info *z)
  79 {
  80         if (z->wp >= zbd_zone_capacity_end(z))
  81                 return 0;
  82
  83         return zbd_zone_capacity_end(z) - z->wp;
  84 }
  85
  86 /**
  87  * zbd_zone_full - verify whether a minimum number of bytes remain in a zone
  88  * @f: file pointer.
  89  * @z: zone info pointer.
  90  * @required: minimum number of bytes that must remain in a zone.
  91  *
  92  * The caller must hold z->mutex.
  93  */
  94 static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z,
  95                           uint64_t required)
  96 {
  97         assert((required & 511) == 0);
  98
  99         return z->has_wp && required > zbd_zone_remainder(z);
 100 }
 101
 102 static void zone_lock(struct thread_data *td, const struct fio_file *f,
 103                       struct fio_zone_info *z)
 104 {
 105         struct zoned_block_device_info *zbd = f->zbd_info;
 106         uint32_t nz = z - zbd->zone_info;
 107
 108         /* A thread should never lock zones outside its working area. */
 109         assert(f->min_zone <= nz && nz < f->max_zone);
 110
 111         assert(z->has_wp);
 112
 113         /*
 114          * Lock the io_u target zone. The zone will be unlocked if io_u offset
 115          * is changed or when io_u completes and zbd_put_io() executed.
 116          * To avoid multiple jobs doing asynchronous I/Os from deadlocking each
 117          * other waiting for zone locks when building an io_u batch, first
 118          * only trylock the zone. If the zone is already locked by another job,
 119          * process the currently queued I/Os so that I/O progress is made and
 120          * zones unlocked.
 121          */
 122         if (pthread_mutex_trylock(&z->mutex) != 0) {
 123                 if (!td_ioengine_flagged(td, FIO_SYNCIO))
 124                         io_u_quiesce(td);
 125                 pthread_mutex_lock(&z->mutex);
 126         }
 127 }
 128
 129 static inline void zone_unlock(struct fio_zone_info *z)
 130 {
 131         int ret;
 132
 133         assert(z->has_wp);
 134         ret = pthread_mutex_unlock(&z->mutex);
 135         assert(!ret);
 136 }
 137
 138 static inline struct fio_zone_info *zbd_get_zone(const struct fio_file *f,
 139                                                  unsigned int zone_idx)
 140 {
 141         return &f->zbd_info->zone_info[zone_idx];
 142 }
 143
 144 static inline struct fio_zone_info *
 145 zbd_offset_to_zone(const struct fio_file *f,  uint64_t offset)
 146 {
 147         return zbd_get_zone(f, zbd_offset_to_zone_idx(f, offset));
 148 }
 149
 150 /**
 151  * zbd_get_zoned_model - Get a device zoned model
 152  * @td: FIO thread data
 153  * @f: FIO file for which to get model information
 154  */
 155 static int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f,
 156                                enum zbd_zoned_model *model)
 157 {
 158         int ret;
 159
 160         if (f->filetype == FIO_TYPE_PIPE) {
 161                 log_err("zonemode=zbd does not support pipes\n");
 162                 return -EINVAL;
 163         }
 164
 165         /* If regular file, always emulate zones inside the file. */
 166         if (f->filetype == FIO_TYPE_FILE) {
 167                 *model = ZBD_NONE;
 168                 return 0;
 169         }
 170
 171         if (td->io_ops && td->io_ops->get_zoned_model)
 172                 ret = td->io_ops->get_zoned_model(td, f, model);
 173         else
 174                 ret = blkzoned_get_zoned_model(td, f, model);
 175         if (ret < 0) {
 176                 td_verror(td, errno, "get zoned model failed");
 177                 log_err("%s: get zoned model failed (%d).\n",
 178                         f->file_name, errno);
 179         }
 180
 181         return ret;
 182 }
 183
 184 /**
 185  * zbd_report_zones - Get zone information
 186  * @td: FIO thread data.
 187  * @f: FIO file for which to get zone information
 188  * @offset: offset from which to report zones
 189  * @zones: Array of struct zbd_zone
 190  * @nr_zones: Size of @zones array
 191  *
 192  * Get zone information into @zones starting from the zone at offset @offset
 193  * for the device specified by @f.
 194  *
 195  * Returns the number of zones reported upon success and a negative error code
 196  * upon failure. If the zone report is empty, always assume an error (device
 197  * problem) and return -EIO.
 198  */
 199 static int zbd_report_zones(struct thread_data *td, struct fio_file *f,
 200                             uint64_t offset, struct zbd_zone *zones,
 201                             unsigned int nr_zones)
 202 {
 203         int ret;
 204
 205         if (td->io_ops && td->io_ops->report_zones)
 206                 ret = td->io_ops->report_zones(td, f, offset, zones, nr_zones);
 207         else
 208                 ret = blkzoned_report_zones(td, f, offset, zones, nr_zones);
 209         if (ret < 0) {
 210                 td_verror(td, errno, "report zones failed");
 211                 log_err("%s: report zones from sector %"PRIu64" failed (%d).\n",
 212                         f->file_name, offset >> 9, errno);
 213         } else if (ret == 0) {
 214                 td_verror(td, errno, "Empty zone report");
 215                 log_err("%s: report zones from sector %"PRIu64" is empty.\n",
 216                         f->file_name, offset >> 9);
 217                 ret = -EIO;
 218         }
 219
 220         return ret;
 221 }
 222
 223 /**
 224  * zbd_reset_wp - reset the write pointer of a range of zones
 225  * @td: FIO thread data.
 226  * @f: FIO file for which to reset zones
 227  * @offset: Starting offset of the first zone to reset
 228  * @length: Length of the range of zones to reset
 229  *
 230  * Reset the write pointer of all zones in the range @offset...@offset+@length.
 231  * Returns 0 upon success and a negative error code upon failure.
 232  */
 233 static int zbd_reset_wp(struct thread_data *td, struct fio_file *f,
 234                         uint64_t offset, uint64_t length)
 235 {
 236         int ret;
 237
 238         if (td->io_ops && td->io_ops->reset_wp)
 239                 ret = td->io_ops->reset_wp(td, f, offset, length);
 240         else
 241                 ret = blkzoned_reset_wp(td, f, offset, length);
 242         if (ret < 0) {
 243                 td_verror(td, errno, "resetting wp failed");
 244                 log_err("%s: resetting wp for %"PRIu64" sectors at sector %"PRIu64" failed (%d).\n",
 245                         f->file_name, length >> 9, offset >> 9, errno);
 246         }
 247
 248         return ret;
 249 }
 250
 251 /**
 252  * zbd_reset_zone - reset the write pointer of a single zone
 253  * @td: FIO thread data.
 254  * @f: FIO file associated with the disk for which to reset a write pointer.
 255  * @z: Zone to reset.
 256  *
 257  * Returns 0 upon success and a negative error code upon failure.
 258  *
 259  * The caller must hold z->mutex.
 260  */
 261 static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
 262                           struct fio_zone_info *z)
 263 {
 264         uint64_t offset = z->start;
 265         uint64_t length = (z+1)->start - offset;
 266         uint64_t data_in_zone = z->wp - z->start;
 267         int ret = 0;
 268
 269         if (!data_in_zone)
 270                 return 0;
 271
 272         assert(is_valid_offset(f, offset + length - 1));
 273
 274         dprint(FD_ZBD, "%s: resetting wp of zone %u.\n",
 275                f->file_name, zbd_zone_idx(f, z));
 276
 277         switch (f->zbd_info->model) {
 278         case ZBD_HOST_AWARE:
 279         case ZBD_HOST_MANAGED:
 280                 ret = zbd_reset_wp(td, f, offset, length);
 281                 if (ret < 0)
 282                         return ret;
 283                 break;
 284         default:
 285                 break;
 286         }
 287
 288         pthread_mutex_lock(&f->zbd_info->mutex);
 289         f->zbd_info->sectors_with_data -= data_in_zone;
 290         f->zbd_info->wp_sectors_with_data -= data_in_zone;
 291         pthread_mutex_unlock(&f->zbd_info->mutex);
 292
 293         z->wp = z->start;
 294         z->verify_block = 0;
 295
 296         td->ts.nr_zone_resets++;
 297
 298         return ret;
 299 }
 300
 301 /**
 302  * zbd_close_zone - Remove a zone from the open zones array.
 303  * @td: FIO thread data.
 304  * @f: FIO file associated with the disk for which to reset a write pointer.
 305  * @zone_idx: Index of the zone to remove.
 306  *
 307  * The caller must hold f->zbd_info->mutex.
 308  */
 309 static void zbd_close_zone(struct thread_data *td, const struct fio_file *f,
 310                            struct fio_zone_info *z)
 311 {
 312         uint32_t ozi;
 313
 314         if (!z->open)
 315                 return;
 316
 317         for (ozi = 0; ozi < f->zbd_info->num_open_zones; ozi++) {
 318                 if (zbd_get_zone(f, f->zbd_info->open_zones[ozi]) == z)
 319                         break;
 320         }
 321         if (ozi == f->zbd_info->num_open_zones)
 322                 return;
 323
 324         dprint(FD_ZBD, "%s: closing zone %u\n",
 325                f->file_name, zbd_zone_idx(f, z));
 326
 327         memmove(f->zbd_info->open_zones + ozi,
 328                 f->zbd_info->open_zones + ozi + 1,
 329                 (ZBD_MAX_OPEN_ZONES - (ozi + 1)) *
 330                 sizeof(f->zbd_info->open_zones[0]));
 331
 332         f->zbd_info->num_open_zones--;
 333         td->num_open_zones--;
 334         z->open = 0;
 335 }
 336
 337 /**
 338  * zbd_reset_zones - Reset a range of zones.
 339  * @td: fio thread data.
 340  * @f: fio file for which to reset zones
 341  * @zb: first zone to reset.
 342  * @ze: first zone not to reset.
 343  *
 344  * Returns 0 upon success and 1 upon failure.
 345  */
 346 static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
 347                            struct fio_zone_info *const zb,
 348                            struct fio_zone_info *const ze)
 349 {
 350         struct fio_zone_info *z;
 351         const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
 352         int res = 0;
 353
 354         assert(min_bs);
 355
 356         dprint(FD_ZBD, "%s: examining zones %u .. %u\n",
 357                f->file_name, zbd_zone_idx(f, zb), zbd_zone_idx(f, ze));
 358
 359         for (z = zb; z < ze; z++) {
 360                 if (!z->has_wp)
 361                         continue;
 362
 363                 zone_lock(td, f, z);
 364                 pthread_mutex_lock(&f->zbd_info->mutex);
 365                 zbd_close_zone(td, f, z);
 366                 pthread_mutex_unlock(&f->zbd_info->mutex);
 367
 368                 if (z->wp != z->start) {
 369                         dprint(FD_ZBD, "%s: resetting zone %u\n",
 370                                f->file_name, zbd_zone_idx(f, z));
 371                         if (zbd_reset_zone(td, f, z) < 0)
 372                                 res = 1;
 373                 }
 374
 375                 zone_unlock(z);
 376         }
 377
 378         return res;
 379 }
 380
 381 /**
 382  * zbd_get_max_open_zones - Get the maximum number of open zones
 383  * @td: FIO thread data
 384  * @f: FIO file for which to get max open zones
 385  * @max_open_zones: Upon success, result will be stored here.
 386  *
 387  * A @max_open_zones value set to zero means no limit.
 388  *
 389  * Returns 0 upon success and a negative error code upon failure.
 390  */
 391 static int zbd_get_max_open_zones(struct thread_data *td, struct fio_file *f,
 392                                   unsigned int *max_open_zones)
 393 {
 394         int ret;
 395
 396         if (td->io_ops && td->io_ops->get_max_open_zones)
 397                 ret = td->io_ops->get_max_open_zones(td, f, max_open_zones);
 398         else
 399                 ret = blkzoned_get_max_open_zones(td, f, max_open_zones);
 400         if (ret < 0) {
 401                 td_verror(td, errno, "get max open zones failed");
 402                 log_err("%s: get max open zones failed (%d).\n",
 403                         f->file_name, errno);
 404         }
 405
 406         return ret;
 407 }
 408
 409 /**
 410  * zbd_open_zone - Add a zone to the array of open zones.
 411  * @td: fio thread data.
 412  * @f: fio file that has the open zones to add.
 413  * @zone_idx: Index of the zone to add.
 414  *
 415  * Open a ZBD zone if it is not already open. Returns true if either the zone
 416  * was already open or if the zone was successfully added to the array of open
 417  * zones without exceeding the maximum number of open zones. Returns false if
 418  * the zone was not already open and opening the zone would cause the zone limit
 419  * to be exceeded.
 420  */
 421 static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f,
 422                           struct fio_zone_info *z)
 423 {
 424         const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
 425         struct zoned_block_device_info *zbdi = f->zbd_info;
 426         uint32_t zone_idx = zbd_zone_idx(f, z);
 427         bool res = true;
 428
 429         if (z->cond == ZBD_ZONE_COND_OFFLINE)
 430                 return false;
 431
 432         /*
 433          * Skip full zones with data verification enabled because resetting a
 434          * zone causes data loss and hence causes verification to fail.
 435          */
 436         if (td->o.verify != VERIFY_NONE && zbd_zone_full(f, z, min_bs))
 437                 return false;
 438
 439         /*
 440          * zbdi->max_open_zones == 0 means that there is no limit on the maximum
 441          * number of open zones. In this case, do no track open zones in
 442          * zbdi->open_zones array.
 443          */
 444         if (!zbdi->max_open_zones)
 445                 return true;
 446
 447         pthread_mutex_lock(&zbdi->mutex);
 448
 449         if (z->open) {
 450                 /*
 451                  * If the zone is going to be completely filled by writes
 452                  * already in-flight, handle it as a full zone instead of an
 453                  * open zone.
 454                  */
 455                 if (!zbd_zone_remainder(z))
 456                         res = false;
 457                 goto out;
 458         }
 459
 460         res = false;
 461         /* Zero means no limit */
 462         if (td->o.job_max_open_zones > 0 &&
 463             td->num_open_zones >= td->o.job_max_open_zones)
 464                 goto out;
 465         if (zbdi->num_open_zones >= zbdi->max_open_zones)
 466                 goto out;
 467
 468         dprint(FD_ZBD, "%s: opening zone %u\n",
 469                f->file_name, zone_idx);
 470
 471         zbdi->open_zones[zbdi->num_open_zones++] = zone_idx;
 472         td->num_open_zones++;
 473         z->open = 1;
 474         res = true;
 475
 476 out:
 477         pthread_mutex_unlock(&zbdi->mutex);
 478         return res;
 479 }
 480
 481 /* Verify whether direct I/O is used for all host-managed zoned block drives. */
 482 static bool zbd_using_direct_io(void)
 483 {
 484         struct thread_data *td;
 485         struct fio_file *f;
 486         int i, j;
 487
 488         for_each_td(td, i) {
 489                 if (td->o.odirect || !(td->o.td_ddir & TD_DDIR_WRITE))
 490                         continue;
 491                 for_each_file(td, f, j) {
 492                         if (f->zbd_info && f->filetype == FIO_TYPE_BLOCK &&
 493                             f->zbd_info->model == ZBD_HOST_MANAGED)
 494                                 return false;
 495                 }
 496         }
 497
 498         return true;
 499 }
 500
 501 /* Whether or not the I/O range for f includes one or more sequential zones */
 502 static bool zbd_is_seq_job(struct fio_file *f)
 503 {
 504         uint32_t zone_idx, zone_idx_b, zone_idx_e;
 505
 506         assert(f->zbd_info);
 507
 508         if (f->io_size == 0)
 509                 return false;
 510
 511         zone_idx_b = zbd_offset_to_zone_idx(f, f->file_offset);
 512         zone_idx_e =
 513                 zbd_offset_to_zone_idx(f, f->file_offset + f->io_size - 1);
 514         for (zone_idx = zone_idx_b; zone_idx <= zone_idx_e; zone_idx++)
 515                 if (zbd_get_zone(f, zone_idx)->has_wp)
 516                         return true;
 517
 518         return false;
 519 }
 520
 521 /*
 522  * Verify whether the file offset and size parameters are aligned with zone
 523  * boundaries. If the file offset is not aligned, align it down to the start of
 524  * the zone containing the start offset and align up the file io_size parameter.
 525  */
 526 static bool zbd_zone_align_file_sizes(struct thread_data *td,
 527                                       struct fio_file *f)
 528 {
 529         const struct fio_zone_info *z;
 530         uint64_t new_offset, new_end;
 531
 532         if (!f->zbd_info)
 533                 return true;
 534         if (f->file_offset >= f->real_file_size)
 535                 return true;
 536         if (!zbd_is_seq_job(f))
 537                 return true;
 538
 539         if (!td->o.zone_size) {
 540                 td->o.zone_size = f->zbd_info->zone_size;
 541                 if (!td->o.zone_size) {
 542                         log_err("%s: invalid 0 zone size\n",
 543                                 f->file_name);
 544                         return false;
 545                 }
 546         } else if (td->o.zone_size != f->zbd_info->zone_size) {
 547                 log_err("%s: zonesize %llu does not match the device zone size %"PRIu64".\n",
 548                         f->file_name, td->o.zone_size,
 549                         f->zbd_info->zone_size);
 550                 return false;
 551         }
 552
 553         if (td->o.zone_skip % td->o.zone_size) {
 554                 log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n",
 555                         f->file_name, td->o.zone_skip,
 556                         td->o.zone_size);
 557                 return false;
 558         }
 559
 560         z = zbd_offset_to_zone(f, f->file_offset);
 561         if ((f->file_offset != z->start) &&
 562             (td->o.td_ddir != TD_DDIR_READ)) {
 563                 new_offset = zbd_zone_end(z);
 564                 if (new_offset >= f->file_offset + f->io_size) {
 565                         log_info("%s: io_size must be at least one zone\n",
 566                                  f->file_name);
 567                         return false;
 568                 }
 569                 log_info("%s: rounded up offset from %"PRIu64" to %"PRIu64"\n",
 570                          f->file_name, f->file_offset,
 571                          new_offset);
 572                 f->io_size -= (new_offset - f->file_offset);
 573                 f->file_offset = new_offset;
 574         }
 575
 576         z = zbd_offset_to_zone(f, f->file_offset + f->io_size);
 577         new_end = z->start;
 578         if ((td->o.td_ddir != TD_DDIR_READ) &&
 579             (f->file_offset + f->io_size != new_end)) {
 580                 if (new_end <= f->file_offset) {
 581                         log_info("%s: io_size must be at least one zone\n",
 582                                  f->file_name);
 583                         return false;
 584                 }
 585                 log_info("%s: rounded down io_size from %"PRIu64" to %"PRIu64"\n",
 586                          f->file_name, f->io_size,
 587                          new_end - f->file_offset);
 588                 f->io_size = new_end - f->file_offset;
 589         }
 590
 591         return true;
 592 }
 593
 594 /*
 595  * Verify whether offset and size parameters are aligned with zone boundaries.
 596  */
 597 static bool zbd_verify_sizes(void)
 598 {
 599         struct thread_data *td;
 600         struct fio_file *f;
 601         int i, j;
 602
 603         for_each_td(td, i) {
 604                 for_each_file(td, f, j) {
 605                         if (!zbd_zone_align_file_sizes(td, f))
 606                                 return false;
 607                 }
 608         }
 609
 610         return true;
 611 }
 612
 613 static bool zbd_verify_bs(void)
 614 {
 615         struct thread_data *td;
 616         struct fio_file *f;
 617         int i, j, k;
 618
 619         for_each_td(td, i) {
 620                 if (td_trim(td) &&
 621                     (td->o.min_bs[DDIR_TRIM] != td->o.max_bs[DDIR_TRIM] ||
 622                      td->o.bssplit_nr[DDIR_TRIM])) {
 623                         log_info("bsrange and bssplit are not allowed for trim with zonemode=zbd\n");
 624                         return false;
 625                 }
 626                 for_each_file(td, f, j) {
 627                         uint64_t zone_size;
 628
 629                         if (!f->zbd_info)
 630                                 continue;
 631
 632                         zone_size = f->zbd_info->zone_size;
 633                         if (td_trim(td) && td->o.bs[DDIR_TRIM] != zone_size) {
 634                                 log_info("%s: trim block size %llu is not the zone size %"PRIu64"\n",
 635                                          f->file_name, td->o.bs[DDIR_TRIM],
 636                                          zone_size);
 637                                 return false;
 638                         }
 639                         for (k = 0; k < FIO_ARRAY_SIZE(td->o.bs); k++) {
 640                                 if (td->o.verify != VERIFY_NONE &&
 641                                     zone_size % td->o.bs[k] != 0) {
 642                                         log_info("%s: block size %llu is not a divisor of the zone size %"PRIu64"\n",
 643                                                  f->file_name, td->o.bs[k],
 644                                                  zone_size);
 645                                         return false;
 646                                 }
 647                         }
 648                 }
 649         }
 650         return true;
 651 }
 652
 653 static int ilog2(uint64_t i)
 654 {
 655         int log = -1;
 656
 657         while (i) {
 658                 i >>= 1;
 659                 log++;
 660         }
 661         return log;
 662 }
 663
 664 /*
 665  * Initialize f->zbd_info for devices that are not zoned block devices. This
 666  * allows to execute a ZBD workload against a non-ZBD device.
 667  */
 668 static int init_zone_info(struct thread_data *td, struct fio_file *f)
 669 {
 670         uint32_t nr_zones;
 671         struct fio_zone_info *p;
 672         uint64_t zone_size = td->o.zone_size;
 673         uint64_t zone_capacity = td->o.zone_capacity;
 674         struct zoned_block_device_info *zbd_info = NULL;
 675         int i;
 676
 677         if (zone_size == 0) {
 678                 log_err("%s: Specifying the zone size is mandatory for regular file/block device with --zonemode=zbd\n\n",
 679                         f->file_name);
 680                 return 1;
 681         }
 682
 683         if (zone_size < 512) {
 684                 log_err("%s: zone size must be at least 512 bytes for --zonemode=zbd\n\n",
 685                         f->file_name);
 686                 return 1;
 687         }
 688
 689         if (zone_capacity == 0)
 690                 zone_capacity = zone_size;
 691
 692         if (zone_capacity > zone_size) {
 693                 log_err("%s: job parameter zonecapacity %llu is larger than zone size %llu\n",
 694                         f->file_name, td->o.zone_capacity, td->o.zone_size);
 695                 return 1;
 696         }
 697
 698         if (f->real_file_size < zone_size) {
 699                 log_err("%s: file/device size %"PRIu64" is smaller than zone size %"PRIu64"\n",
 700                         f->file_name, f->real_file_size, zone_size);
 701                 return -EINVAL;
 702         }
 703
 704         nr_zones = (f->real_file_size + zone_size - 1) / zone_size;
 705         zbd_info = scalloc(1, sizeof(*zbd_info) +
 706                            (nr_zones + 1) * sizeof(zbd_info->zone_info[0]));
 707         if (!zbd_info)
 708                 return -ENOMEM;
 709
 710         mutex_init_pshared(&zbd_info->mutex);
 711         zbd_info->refcount = 1;
 712         p = &zbd_info->zone_info[0];
 713         for (i = 0; i < nr_zones; i++, p++) {
 714                 mutex_init_pshared_with_type(&p->mutex,
 715                                              PTHREAD_MUTEX_RECURSIVE);
 716                 p->start = i * zone_size;
 717                 p->wp = p->start;
 718                 p->type = ZBD_ZONE_TYPE_SWR;
 719                 p->cond = ZBD_ZONE_COND_EMPTY;
 720                 p->capacity = zone_capacity;
 721                 p->has_wp = 1;
 722         }
 723         /* a sentinel */
 724         p->start = nr_zones * zone_size;
 725
 726         f->zbd_info = zbd_info;
 727         f->zbd_info->zone_size = zone_size;
 728         f->zbd_info->zone_size_log2 = is_power_of_2(zone_size) ?
 729                 ilog2(zone_size) : 0;
 730         f->zbd_info->nr_zones = nr_zones;
 731         return 0;
 732 }
 733
 734 /*
 735  * Maximum number of zones to report in one operation.
 736  */
 737 #define ZBD_REPORT_MAX_ZONES    8192U
 738
 739 /*
 740  * Parse the device zone report and store it in f->zbd_info. Must be called
 741  * only for devices that are zoned, namely those with a model != ZBD_NONE.
 742  */
 743 static int parse_zone_info(struct thread_data *td, struct fio_file *f)
 744 {
 745         int nr_zones, nrz;
 746         struct zbd_zone *zones, *z;
 747         struct fio_zone_info *p;
 748         uint64_t zone_size, offset;
 749         struct zoned_block_device_info *zbd_info = NULL;
 750         int i, j, ret = -ENOMEM;
 751
 752         zones = calloc(ZBD_REPORT_MAX_ZONES, sizeof(struct zbd_zone));
 753         if (!zones)
 754                 goto out;
 755
 756         nrz = zbd_report_zones(td, f, 0, zones, ZBD_REPORT_MAX_ZONES);
 757         if (nrz < 0) {
 758                 ret = nrz;
 759                 log_info("fio: report zones (offset 0) failed for %s (%d).\n",
 760                          f->file_name, -ret);
 761                 goto out;
 762         }
 763
 764         zone_size = zones[0].len;
 765         nr_zones = (f->real_file_size + zone_size - 1) / zone_size;
 766
 767         if (td->o.zone_size == 0) {
 768                 td->o.zone_size = zone_size;
 769         } else if (td->o.zone_size != zone_size) {
 770                 log_err("fio: %s job parameter zonesize %llu does not match disk zone size %"PRIu64".\n",
 771                         f->file_name, td->o.zone_size, zone_size);
 772                 ret = -EINVAL;
 773                 goto out;
 774         }
 775
 776         dprint(FD_ZBD, "Device %s has %d zones of size %"PRIu64" KB\n",
 777                f->file_name, nr_zones, zone_size / 1024);
 778
 779         zbd_info = scalloc(1, sizeof(*zbd_info) +
 780                            (nr_zones + 1) * sizeof(zbd_info->zone_info[0]));
 781         if (!zbd_info)
 782                 goto out;
 783         mutex_init_pshared(&zbd_info->mutex);
 784         zbd_info->refcount = 1;
 785         p = &zbd_info->zone_info[0];
 786         for (offset = 0, j = 0; j < nr_zones;) {
 787                 z = &zones[0];
 788                 for (i = 0; i < nrz; i++, j++, z++, p++) {
 789                         mutex_init_pshared_with_type(&p->mutex,
 790                                                      PTHREAD_MUTEX_RECURSIVE);
 791                         p->start = z->start;
 792                         p->capacity = z->capacity;
 793
 794                         switch (z->cond) {
 795                         case ZBD_ZONE_COND_NOT_WP:
 796                         case ZBD_ZONE_COND_FULL:
 797                                 p->wp = p->start + p->capacity;
 798                                 break;
 799                         default:
 800                                 assert(z->start <= z->wp);
 801                                 assert(z->wp <= z->start + zone_size);
 802                                 p->wp = z->wp;
 803                                 break;
 804                         }
 805
 806                         switch (z->type) {
 807                         case ZBD_ZONE_TYPE_SWR:
 808                                 p->has_wp = 1;
 809                                 break;
 810                         default:
 811                                 p->has_wp = 0;
 812                         }
 813                         p->type = z->type;
 814                         p->cond = z->cond;
 815
 816                         if (j > 0 && p->start != p[-1].start + zone_size) {
 817                                 log_info("%s: invalid zone data\n",
 818                                          f->file_name);
 819                                 ret = -EINVAL;
 820                                 goto out;
 821                         }
 822                 }
 823                 z--;
 824                 offset = z->start + z->len;
 825                 if (j >= nr_zones)
 826                         break;
 827
 828                 nrz = zbd_report_zones(td, f, offset, zones,
 829                                        min((uint32_t)(nr_zones - j),
 830                                            ZBD_REPORT_MAX_ZONES));
 831                 if (nrz < 0) {
 832                         ret = nrz;
 833                         log_info("fio: report zones (offset %"PRIu64") failed for %s (%d).\n",
 834                                  offset, f->file_name, -ret);
 835                         goto out;
 836                 }
 837         }
 838
 839         /* a sentinel */
 840         zbd_info->zone_info[nr_zones].start = offset;
 841
 842         f->zbd_info = zbd_info;
 843         f->zbd_info->zone_size = zone_size;
 844         f->zbd_info->zone_size_log2 = is_power_of_2(zone_size) ?
 845                 ilog2(zone_size) : 0;
 846         f->zbd_info->nr_zones = nr_zones;
 847         zbd_info = NULL;
 848         ret = 0;
 849
 850 out:
 851         sfree(zbd_info);
 852         free(zones);
 853         return ret;
 854 }
 855
 856 static int zbd_set_max_open_zones(struct thread_data *td, struct fio_file *f)
 857 {
 858         struct zoned_block_device_info *zbd = f->zbd_info;
 859         unsigned int max_open_zones;
 860         int ret;
 861
 862         if (zbd->model != ZBD_HOST_MANAGED || td->o.ignore_zone_limits) {
 863                 /* Only host-managed devices have a max open limit */
 864                 zbd->max_open_zones = td->o.max_open_zones;
 865                 goto out;
 866         }
 867
 868         /* If host-managed, get the max open limit */
 869         ret = zbd_get_max_open_zones(td, f, &max_open_zones);
 870         if (ret)
 871                 return ret;
 872
 873         if (!max_open_zones) {
 874                 /* No device limit */
 875                 zbd->max_open_zones = td->o.max_open_zones;
 876         } else if (!td->o.max_open_zones) {
 877                 /* No user limit. Set limit to device limit */
 878                 zbd->max_open_zones = max_open_zones;
 879         } else if (td->o.max_open_zones <= max_open_zones) {
 880                 /* Both user limit and dev limit. User limit not too large */
 881                 zbd->max_open_zones = td->o.max_open_zones;
 882         } else {
 883                 /* Both user limit and dev limit. User limit too large */
 884                 td_verror(td, EINVAL,
 885                           "Specified --max_open_zones is too large");
 886                 log_err("Specified --max_open_zones (%d) is larger than max (%u)\n",
 887                         td->o.max_open_zones, max_open_zones);
 888                 return -EINVAL;
 889         }
 890
 891 out:
 892         /* Ensure that the limit is not larger than FIO's internal limit */
 893         if (zbd->max_open_zones > ZBD_MAX_OPEN_ZONES) {
 894                 td_verror(td, EINVAL, "'max_open_zones' value is too large");
 895                 log_err("'max_open_zones' value is larger than %u\n",
 896                         ZBD_MAX_OPEN_ZONES);
 897                 return -EINVAL;
 898         }
 899
 900         dprint(FD_ZBD, "%s: using max open zones limit: %"PRIu32"\n",
 901                f->file_name, zbd->max_open_zones);
 902
 903         return 0;
 904 }
 905
 906 /*
 907  * Allocate zone information and store it into f->zbd_info if zonemode=zbd.
 908  *
 909  * Returns 0 upon success and a negative error code upon failure.
 910  */
 911 static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
 912 {
 913         enum zbd_zoned_model zbd_model;
 914         int ret;
 915
 916         assert(td->o.zone_mode == ZONE_MODE_ZBD);
 917
 918         ret = zbd_get_zoned_model(td, f, &zbd_model);
 919         if (ret)
 920                 return ret;
 921
 922         switch (zbd_model) {
 923         case ZBD_HOST_AWARE:
 924         case ZBD_HOST_MANAGED:
 925                 ret = parse_zone_info(td, f);
 926                 if (ret)
 927                         return ret;
 928                 break;
 929         case ZBD_NONE:
 930                 ret = init_zone_info(td, f);
 931                 if (ret)
 932                         return ret;
 933                 break;
 934         default:
 935                 td_verror(td, EINVAL, "Unsupported zoned model");
 936                 log_err("Unsupported zoned model\n");
 937                 return -EINVAL;
 938         }
 939
 940         assert(f->zbd_info);
 941         f->zbd_info->model = zbd_model;
 942
 943         ret = zbd_set_max_open_zones(td, f);
 944         if (ret) {
 945                 zbd_free_zone_info(f);
 946                 return ret;
 947         }
 948
 949         return 0;
 950 }
 951
 952 void zbd_free_zone_info(struct fio_file *f)
 953 {
 954         uint32_t refcount;
 955
 956         assert(f->zbd_info);
 957
 958         pthread_mutex_lock(&f->zbd_info->mutex);
 959         refcount = --f->zbd_info->refcount;
 960         pthread_mutex_unlock(&f->zbd_info->mutex);
 961
 962         assert((int32_t)refcount >= 0);
 963         if (refcount == 0)
 964                 sfree(f->zbd_info);
 965         f->zbd_info = NULL;
 966 }
 967
 968 /*
 969  * Initialize f->zbd_info.
 970  *
 971  * Returns 0 upon success and a negative error code upon failure.
 972  *
 973  * Note: this function can only work correctly if it is called before the first
 974  * fio fork() call.
 975  */
 976 static int zbd_init_zone_info(struct thread_data *td, struct fio_file *file)
 977 {
 978         struct thread_data *td2;
 979         struct fio_file *f2;
 980         int i, j, ret;
 981
 982         for_each_td(td2, i) {
 983                 for_each_file(td2, f2, j) {
 984                         if (td2 == td && f2 == file)
 985                                 continue;
 986                         if (!f2->zbd_info ||
 987                             strcmp(f2->file_name, file->file_name) != 0)
 988                                 continue;
 989                         file->zbd_info = f2->zbd_info;
 990                         file->zbd_info->refcount++;
 991                         return 0;
 992                 }
 993         }
 994
 995         ret = zbd_create_zone_info(td, file);
 996         if (ret < 0)
 997                 td_verror(td, -ret, "zbd_create_zone_info() failed");
 998
 999         return ret;
1000 }
1001
1002 int zbd_init_files(struct thread_data *td)
1003 {
1004         struct fio_file *f;
1005         int i;
1006
1007         for_each_file(td, f, i) {
1008                 if (zbd_init_zone_info(td, f))
1009                         return 1;
1010         }
1011
1012         return 0;
1013 }
1014
1015 void zbd_recalc_options_with_zone_granularity(struct thread_data *td)
1016 {
1017         struct fio_file *f;
1018         int i;
1019
1020         for_each_file(td, f, i) {
1021                 struct zoned_block_device_info *zbd = f->zbd_info;
1022                 uint64_t zone_size;
1023
1024                 /* zonemode=strided doesn't get per-file zone size. */
1025                 zone_size = zbd ? zbd->zone_size : td->o.zone_size;
1026                 if (zone_size == 0)
1027                         continue;
1028
1029                 if (td->o.size_nz > 0)
1030                         td->o.size = td->o.size_nz * zone_size;
1031                 if (td->o.io_size_nz > 0)
1032                         td->o.io_size = td->o.io_size_nz * zone_size;
1033                 if (td->o.start_offset_nz > 0)
1034                         td->o.start_offset = td->o.start_offset_nz * zone_size;
1035                 if (td->o.offset_increment_nz > 0)
1036                         td->o.offset_increment =
1037                                 td->o.offset_increment_nz * zone_size;
1038                 if (td->o.zone_skip_nz > 0)
1039                         td->o.zone_skip = td->o.zone_skip_nz * zone_size;
1040         }
1041 }
1042
1043 int zbd_setup_files(struct thread_data *td)
1044 {
1045         struct fio_file *f;
1046         int i;
1047
1048         if (!zbd_using_direct_io()) {
1049                 log_err("Using direct I/O is mandatory for writing to ZBD drives\n\n");
1050                 return 1;
1051         }
1052
1053         if (!zbd_verify_sizes())
1054                 return 1;
1055
1056         if (!zbd_verify_bs())
1057                 return 1;
1058
1059         for_each_file(td, f, i) {
1060                 struct zoned_block_device_info *zbd = f->zbd_info;
1061                 struct fio_zone_info *z;
1062                 int zi;
1063
1064                 assert(zbd);
1065
1066                 f->min_zone = zbd_offset_to_zone_idx(f, f->file_offset);
1067                 f->max_zone =
1068                         zbd_offset_to_zone_idx(f, f->file_offset + f->io_size);
1069
1070                 /*
1071                  * When all zones in the I/O range are conventional, io_size
1072                  * can be smaller than zone size, making min_zone the same
1073                  * as max_zone. This is why the assert below needs to be made
1074                  * conditional.
1075                  */
1076                 if (zbd_is_seq_job(f))
1077                         assert(f->min_zone < f->max_zone);
1078
1079                 if (td->o.max_open_zones > 0 &&
1080                     zbd->max_open_zones != td->o.max_open_zones) {
1081                         log_err("Different 'max_open_zones' values\n");
1082                         return 1;
1083                 }
1084
1085                 /*
1086                  * The per job max open zones limit cannot be used without a
1087                  * global max open zones limit. (As the tracking of open zones
1088                  * is disabled when there is no global max open zones limit.)
1089                  */
1090                 if (td->o.job_max_open_zones && !zbd->max_open_zones) {
1091                         log_err("'job_max_open_zones' cannot be used without a global open zones limit\n");
1092                         return 1;
1093                 }
1094
1095                 /*
1096                  * zbd->max_open_zones is the global limit shared for all jobs
1097                  * that target the same zoned block device. Force sync the per
1098                  * thread global limit with the actual global limit. (The real
1099                  * per thread/job limit is stored in td->o.job_max_open_zones).
1100                  */
1101                 td->o.max_open_zones = zbd->max_open_zones;
1102
1103                 for (zi = f->min_zone; zi < f->max_zone; zi++) {
1104                         z = &zbd->zone_info[zi];
1105                         if (z->cond != ZBD_ZONE_COND_IMP_OPEN &&
1106                             z->cond != ZBD_ZONE_COND_EXP_OPEN)
1107                                 continue;
1108                         if (zbd_open_zone(td, f, z))
1109                                 continue;
1110                         /*
1111                          * If the number of open zones exceeds specified limits,
1112                          * reset all extra open zones.
1113                          */
1114                         if (zbd_reset_zone(td, f, z) < 0) {
1115                                 log_err("Failed to reest zone %d\n", zi);
1116                                 return 1;
1117                         }
1118                 }
1119         }
1120
1121         return 0;
1122 }
1123
1124 /*
1125  * Reset zbd_info.write_cnt, the counter that counts down towards the next
1126  * zone reset.
1127  */
1128 static void _zbd_reset_write_cnt(const struct thread_data *td,
1129                                  const struct fio_file *f)
1130 {
1131         assert(0 <= td->o.zrf.u.f && td->o.zrf.u.f <= 1);
1132
1133         f->zbd_info->write_cnt = td->o.zrf.u.f ?
1134                 min(1.0 / td->o.zrf.u.f, 0.0 + UINT_MAX) : UINT_MAX;
1135 }
1136
1137 static void zbd_reset_write_cnt(const struct thread_data *td,
1138                                 const struct fio_file *f)
1139 {
1140         pthread_mutex_lock(&f->zbd_info->mutex);
1141         _zbd_reset_write_cnt(td, f);
1142         pthread_mutex_unlock(&f->zbd_info->mutex);
1143 }
1144
1145 static bool zbd_dec_and_reset_write_cnt(const struct thread_data *td,
1146                                         const struct fio_file *f)
1147 {
1148         uint32_t write_cnt = 0;
1149
1150         pthread_mutex_lock(&f->zbd_info->mutex);
1151         assert(f->zbd_info->write_cnt);
1152         if (f->zbd_info->write_cnt)
1153                 write_cnt = --f->zbd_info->write_cnt;
1154         if (write_cnt == 0)
1155                 _zbd_reset_write_cnt(td, f);
1156         pthread_mutex_unlock(&f->zbd_info->mutex);
1157
1158         return write_cnt == 0;
1159 }
1160
1161 enum swd_action {
1162         CHECK_SWD,
1163         SET_SWD,
1164 };
1165
1166 /* Calculate the number of sectors with data (swd) and perform action 'a' */
1167 static uint64_t zbd_process_swd(struct thread_data *td,
1168                                 const struct fio_file *f, enum swd_action a)
1169 {
1170         struct fio_zone_info *zb, *ze, *z;
1171         uint64_t swd = 0;
1172         uint64_t wp_swd = 0;
1173
1174         zb = zbd_get_zone(f, f->min_zone);
1175         ze = zbd_get_zone(f, f->max_zone);
1176         for (z = zb; z < ze; z++) {
1177                 if (z->has_wp) {
1178                         zone_lock(td, f, z);
1179                         wp_swd += z->wp - z->start;
1180                 }
1181                 swd += z->wp - z->start;
1182         }
1183
1184         pthread_mutex_lock(&f->zbd_info->mutex);
1185         switch (a) {
1186         case CHECK_SWD:
1187                 assert(f->zbd_info->sectors_with_data == swd);
1188                 assert(f->zbd_info->wp_sectors_with_data == wp_swd);
1189                 break;
1190         case SET_SWD:
1191                 f->zbd_info->sectors_with_data = swd;
1192                 f->zbd_info->wp_sectors_with_data = wp_swd;
1193                 break;
1194         }
1195         pthread_mutex_unlock(&f->zbd_info->mutex);
1196
1197         for (z = zb; z < ze; z++)
1198                 if (z->has_wp)
1199                         zone_unlock(z);
1200
1201         return swd;
1202 }
1203
1204 /*
1205  * The swd check is useful for debugging but takes too much time to leave
1206  * it enabled all the time. Hence it is disabled by default.
1207  */
1208 static const bool enable_check_swd = false;
1209
1210 /* Check whether the values of zbd_info.*sectors_with_data are correct. */
1211 static void zbd_check_swd(struct thread_data *td, const struct fio_file *f)
1212 {
1213         if (!enable_check_swd)
1214                 return;
1215
1216         zbd_process_swd(td, f, CHECK_SWD);
1217 }
1218
1219 void zbd_file_reset(struct thread_data *td, struct fio_file *f)
1220 {
1221         struct fio_zone_info *zb, *ze;
1222         uint64_t swd;
1223
1224         if (!f->zbd_info || !td_write(td))
1225                 return;
1226
1227         zb = zbd_get_zone(f, f->min_zone);
1228         ze = zbd_get_zone(f, f->max_zone);
1229         swd = zbd_process_swd(td, f, SET_SWD);
1230
1231         dprint(FD_ZBD, "%s(%s): swd = %" PRIu64 "\n",
1232                __func__, f->file_name, swd);
1233
1234         /*
1235          * If data verification is enabled reset the affected zones before
1236          * writing any data to avoid that a zone reset has to be issued while
1237          * writing data, which causes data loss.
1238          */
1239         if (td->o.verify != VERIFY_NONE && td->runstate != TD_VERIFYING)
1240                 zbd_reset_zones(td, f, zb, ze);
1241         zbd_reset_write_cnt(td, f);
1242 }
1243
1244 /* Return random zone index for one of the open zones. */
1245 static uint32_t pick_random_zone_idx(const struct fio_file *f,
1246                                      const struct io_u *io_u)
1247 {
1248         return (io_u->offset - f->file_offset) *
1249                 f->zbd_info->num_open_zones / f->io_size;
1250 }
1251
1252 static bool any_io_in_flight(void)
1253 {
1254         struct thread_data *td;
1255         int i;
1256
1257         for_each_td(td, i) {
1258                 if (td->io_u_in_flight)
1259                         return true;
1260         }
1261
1262         return false;
1263 }
1264
/*
 * Modify the offset of an I/O unit that does not refer to an open zone such
 * that it refers to an open zone. Close an open zone and open a new zone if
 * necessary. The open zone is searched across sequential zones.
 * This algorithm can only work correctly if all write pointers are
 * a multiple of the fio block size. The caller must neither hold z->mutex
 * nor f->zbd_info->mutex. Returns with z->mutex held upon success.
 *
 * Returns the selected zone, with io_u->offset set to the start of that zone,
 * or NULL if no candidate zone was found or no other zone could be opened.
 */
static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
                                                      struct io_u *io_u)
{
        const uint64_t min_bs = td->o.min_bs[io_u->ddir];
        struct fio_file *f = io_u->file;
        struct zoned_block_device_info *zbdi = f->zbd_info;
        struct fio_zone_info *z;
        unsigned int open_zone_idx = -1;
        uint32_t zone_idx, new_zone_idx;
        int i;
        bool wait_zone_close;
        bool in_flight;
        /* Permits one pass through 'retry' even when no I/O is in flight. */
        bool should_retry = true;

        assert(is_valid_offset(f, io_u->offset));

        if (zbdi->max_open_zones || td->o.job_max_open_zones) {
                /*
                 * This statement accesses zbdi->open_zones[] on purpose
                 * without locking.
                 */
                zone_idx = zbdi->open_zones[pick_random_zone_idx(f, io_u)];
        } else {
                zone_idx = zbd_offset_to_zone_idx(f, io_u->offset);
        }
        /* Clamp the starting zone index to the range [min_zone, max_zone). */
        if (zone_idx < f->min_zone)
                zone_idx = f->min_zone;
        else if (zone_idx >= f->max_zone)
                zone_idx = f->max_zone - 1;

        dprint(FD_ZBD,
               "%s(%s): starting from zone %d (offset %lld, buflen %lld)\n",
               __func__, f->file_name, zone_idx, io_u->offset, io_u->buflen);

        /*
         * Since z->mutex is the outer lock and zbdi->mutex the inner
         * lock it can happen that the state of the zone with index zone_idx
         * has changed after 'z' has been assigned and before zbdi->mutex
         * has been obtained. Hence the loop.
         */
        for (;;) {
                uint32_t tmp_idx;

                z = zbd_get_zone(f, zone_idx);
                if (z->has_wp)
                        zone_lock(td, f, z);

                pthread_mutex_lock(&zbdi->mutex);

                if (z->has_wp) {
                        /* No open zone limits: use the zone as it is. */
                        if (z->cond != ZBD_ZONE_COND_OFFLINE &&
                            zbdi->max_open_zones == 0 &&
                            td->o.job_max_open_zones == 0)
                                goto examine_zone;
                        if (zbdi->num_open_zones == 0) {
                                dprint(FD_ZBD, "%s(%s): no zones are open\n",
                                       __func__, f->file_name);
                                goto open_other_zone;
                        }
                }

                /*
                 * List of opened zones is per-device, shared across all
                 * threads. Start with quasi-random candidate zone. Ignore
                 * zones which don't belong to thread's offset/size area.
                 */
                open_zone_idx = pick_random_zone_idx(f, io_u);
                assert(!open_zone_idx ||
                       open_zone_idx < zbdi->num_open_zones);
                tmp_idx = open_zone_idx;

                /* Scan open_zones[] circularly, starting at tmp_idx. */
                for (i = 0; i < zbdi->num_open_zones; i++) {
                        uint32_t tmpz;

                        if (tmp_idx >= zbdi->num_open_zones)
                                tmp_idx = 0;
                        tmpz = zbdi->open_zones[tmp_idx];
                        if (f->min_zone <= tmpz && tmpz < f->max_zone) {
                                open_zone_idx = tmp_idx;
                                goto found_candidate_zone;
                        }

                        tmp_idx++;
                }

                dprint(FD_ZBD, "%s(%s): no candidate zone\n",
                        __func__, f->file_name);

                pthread_mutex_unlock(&zbdi->mutex);

                if (z->has_wp)
                        zone_unlock(z);

                return NULL;

found_candidate_zone:
                /*
                 * Stop once the zone that has been locked is the chosen
                 * candidate; otherwise drop both locks and start over with
                 * the candidate zone.
                 */
                new_zone_idx = zbdi->open_zones[open_zone_idx];
                if (new_zone_idx == zone_idx)
                        break;
                zone_idx = new_zone_idx;

                pthread_mutex_unlock(&zbdi->mutex);

                if (z->has_wp)
                        zone_unlock(z);
        }

        /* Both z->mutex and zbdi->mutex are held. */

examine_zone:
        /* The zone still has room for at least one minimum-sized block. */
        if (zbd_zone_remainder(z) >= min_bs) {
                pthread_mutex_unlock(&zbdi->mutex);
                goto out;
        }

open_other_zone:
        /* Check if number of open zones reaches one of limits. */
        wait_zone_close =
                zbdi->num_open_zones == f->max_zone - f->min_zone ||
                (zbdi->max_open_zones &&
                 zbdi->num_open_zones == zbdi->max_open_zones) ||
                (td->o.job_max_open_zones &&
                 td->num_open_zones == td->o.job_max_open_zones);

        pthread_mutex_unlock(&zbdi->mutex);

        /* Only z->mutex is held. */

        /*
         * When number of open zones reaches to one of limits, wait for
         * zone close before opening a new zone.
         */
        if (wait_zone_close) {
                dprint(FD_ZBD,
                       "%s(%s): quiesce to allow open zones to close\n",
                       __func__, f->file_name);
                io_u_quiesce(td);
        }

retry:
        /*
         * Zone 'z' is full, so try to open a new zone. Visit up to
         * io_size / zone_size zones after 'z', wrapping around to min_zone
         * once the end of the I/O range has been passed.
         */
        for (i = f->io_size / zbdi->zone_size; i > 0; i--) {
                zone_idx++;
                if (z->has_wp)
                        zone_unlock(z);
                z++;
                if (!is_valid_offset(f, z->start)) {
                        /* Wrap-around. */
                        zone_idx = f->min_zone;
                        z = zbd_get_zone(f, zone_idx);
                }
                assert(is_valid_offset(f, z->start));
                if (!z->has_wp)
                        continue;
                zone_lock(td, f, z);
                if (z->open)
                        continue;
                if (zbd_open_zone(td, f, z))
                        goto out;
        }

        /* Only z->mutex is held. */

        /*
         * NOTE(review): if the loop above ended on a zone without a write
         * pointer, z->mutex has not been taken for that zone, yet the code
         * below calls zone_unlock(z) unconditionally -- confirm that this is
         * safe.
         */

        /* Check whether the write fits in any of the already opened zones. */
        pthread_mutex_lock(&zbdi->mutex);
        for (i = 0; i < zbdi->num_open_zones; i++) {
                zone_idx = zbdi->open_zones[i];
                if (zone_idx < f->min_zone || zone_idx >= f->max_zone)
                        continue;
                /* Drop zbdi->mutex before switching to another zone lock. */
                pthread_mutex_unlock(&zbdi->mutex);
                zone_unlock(z);

                z = zbd_get_zone(f, zone_idx);

                zone_lock(td, f, z);
                if (zbd_zone_remainder(z) >= min_bs)
                        goto out;
                pthread_mutex_lock(&zbdi->mutex);
        }

        /*
         * When any I/O is in-flight or when all I/Os in-flight get completed,
         * the I/Os might have closed zones then retry the steps to open a zone.
         * Before retry, call io_u_quiesce() to complete in-flight writes.
         */
        in_flight = any_io_in_flight();
        if (in_flight || should_retry) {
                dprint(FD_ZBD,
                       "%s(%s): wait zone close and retry open zones\n",
                       __func__, f->file_name);
                pthread_mutex_unlock(&zbdi->mutex);
                zone_unlock(z);
                io_u_quiesce(td);
                zone_lock(td, f, z);
                /*
                 * A further retry is unconditional only if I/O was in flight
                 * on this pass.
                 */
                should_retry = in_flight;
                goto retry;
        }

        pthread_mutex_unlock(&zbdi->mutex);

        zone_unlock(z);

        dprint(FD_ZBD, "%s(%s): did not open another zone\n",
               __func__, f->file_name);

        return NULL;

out:
        dprint(FD_ZBD, "%s(%s): returning zone %d\n",
               __func__, f->file_name, zone_idx);

        /* Redirect the I/O unit to the start of the selected zone. */
        io_u->offset = z->start;
        assert(z->has_wp);
        assert(z->cond != ZBD_ZONE_COND_OFFLINE);

        return z;
}
1490
1491 /* The caller must hold z->mutex. */
1492 static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td,
1493                                                     struct io_u *io_u,
1494                                                     struct fio_zone_info *z)
1495 {
1496         const struct fio_file *f = io_u->file;
1497         const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
1498
1499         if (!zbd_open_zone(td, f, z)) {
1500                 zone_unlock(z);
1501                 z = zbd_convert_to_open_zone(td, io_u);
1502                 assert(z);
1503         }
1504
1505         if (z->verify_block * min_bs >= z->capacity) {
1506                 log_err("%s: %d * %"PRIu64" >= %"PRIu64"\n",
1507                         f->file_name, z->verify_block, min_bs, z->capacity);
1508                 /*
1509                  * If the assertion below fails during a test run, adding
1510                  * "--experimental_verify=1" to the command line may help.
1511                  */
1512                 assert(false);
1513         }
1514
1515         io_u->offset = z->start + z->verify_block * min_bs;
1516         if (io_u->offset + io_u->buflen >= zbd_zone_capacity_end(z)) {
1517                 log_err("%s: %llu + %llu >= %"PRIu64"\n",
1518                         f->file_name, io_u->offset, io_u->buflen,
1519                         zbd_zone_capacity_end(z));
1520                 assert(false);
1521         }
1522         z->verify_block += io_u->buflen / min_bs;
1523
1524         return z;
1525 }
1526
1527 /*
1528  * Find another zone which has @min_bytes of readable data. Search in zones
1529  * @zb + 1 .. @zl. For random workload, also search in zones @zb - 1 .. @zf.
1530  *
1531  * Either returns NULL or returns a zone pointer. When the zone has write
1532  * pointer, hold the mutex for the zone.
1533  */
1534 static struct fio_zone_info *
1535 zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint64_t min_bytes,
1536               struct fio_zone_info *zb, struct fio_zone_info *zl)
1537 {
1538         struct fio_file *f = io_u->file;
1539         struct fio_zone_info *z1, *z2;
1540         const struct fio_zone_info *const zf = zbd_get_zone(f, f->min_zone);
1541
1542         /*
1543          * Skip to the next non-empty zone in case of sequential I/O and to
1544          * the nearest non-empty zone in case of random I/O.
1545          */
1546         for (z1 = zb + 1, z2 = zb - 1; z1 < zl || z2 >= zf; z1++, z2--) {
1547                 if (z1 < zl && z1->cond != ZBD_ZONE_COND_OFFLINE) {
1548                         if (z1->has_wp)
1549                                 zone_lock(td, f, z1);
1550                         if (z1->start + min_bytes <= z1->wp)
1551                                 return z1;
1552                         if (z1->has_wp)
1553                                 zone_unlock(z1);
1554                 } else if (!td_random(td)) {
1555                         break;
1556                 }
1557
1558                 if (td_random(td) && z2 >= zf &&
1559                     z2->cond != ZBD_ZONE_COND_OFFLINE) {
1560                         if (z2->has_wp)
1561                                 zone_lock(td, f, z2);
1562                         if (z2->start + min_bytes <= z2->wp)
1563                                 return z2;
1564                         if (z2->has_wp)
1565                                 zone_unlock(z2);
1566                 }
1567         }
1568
1569         dprint(FD_ZBD,
1570                "%s: no zone has %"PRIu64" bytes of readable data\n",
1571                f->file_name, min_bytes);
1572
1573         return NULL;
1574 }
1575
1576 /**
1577  * zbd_end_zone_io - update zone status at command completion
1578  * @io_u: I/O unit
1579  * @z: zone info pointer
1580  *
1581  * If the write command made the zone full, close it.
1582  *
1583  * The caller must hold z->mutex.
1584  */
1585 static void zbd_end_zone_io(struct thread_data *td, const struct io_u *io_u,
1586                             struct fio_zone_info *z)
1587 {
1588         const struct fio_file *f = io_u->file;
1589
1590         if (io_u->ddir == DDIR_WRITE &&
1591             io_u->offset + io_u->buflen >= zbd_zone_capacity_end(z)) {
1592                 pthread_mutex_lock(&f->zbd_info->mutex);
1593                 zbd_close_zone(td, f, z);
1594                 pthread_mutex_unlock(&f->zbd_info->mutex);
1595         }
1596 }
1597
1598 /**
1599  * zbd_queue_io - update the write pointer of a sequential zone
1600  * @io_u: I/O unit
1601  * @success: Whether or not the I/O unit has been queued successfully
1602  * @q: queueing status (busy, completed or queued).
1603  *
1604  * For write and trim operations, update the write pointer of the I/O unit
1605  * target zone.
1606  */
1607 static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int q,
1608                          bool success)
1609 {
1610         const struct fio_file *f = io_u->file;
1611         struct zoned_block_device_info *zbd_info = f->zbd_info;
1612         struct fio_zone_info *z;
1613         uint64_t zone_end;
1614
1615         assert(zbd_info);
1616
1617         z = zbd_offset_to_zone(f, io_u->offset);
1618         assert(z->has_wp);
1619
1620         if (!success)
1621                 goto unlock;
1622
1623         dprint(FD_ZBD,
1624                "%s: queued I/O (%lld, %llu) for zone %u\n",
1625                f->file_name, io_u->offset, io_u->buflen, zbd_zone_idx(f, z));
1626
1627         switch (io_u->ddir) {
1628         case DDIR_WRITE:
1629                 zone_end = min((uint64_t)(io_u->offset + io_u->buflen),
1630                                zbd_zone_capacity_end(z));
1631
1632                 /*
1633                  * z->wp > zone_end means that one or more I/O errors
1634                  * have occurred.
1635                  */
1636                 pthread_mutex_lock(&zbd_info->mutex);
1637                 if (z->wp <= zone_end) {
1638                         zbd_info->sectors_with_data += zone_end - z->wp;
1639                         zbd_info->wp_sectors_with_data += zone_end - z->wp;
1640                 }
1641                 pthread_mutex_unlock(&zbd_info->mutex);
1642                 z->wp = zone_end;
1643                 break;
1644         default:
1645                 break;
1646         }
1647
1648         if (q == FIO_Q_COMPLETED && !io_u->error)
1649                 zbd_end_zone_io(td, io_u, z);
1650
1651 unlock:
1652         if (!success || q != FIO_Q_QUEUED) {
1653                 /* BUSY or COMPLETED: unlock the zone */
1654                 zone_unlock(z);
1655                 io_u->zbd_put_io = NULL;
1656         }
1657 }
1658
1659 /**
1660  * zbd_put_io - Unlock an I/O unit target zone lock
1661  * @io_u: I/O unit
1662  */
1663 static void zbd_put_io(struct thread_data *td, const struct io_u *io_u)
1664 {
1665         const struct fio_file *f = io_u->file;
1666         struct zoned_block_device_info *zbd_info = f->zbd_info;
1667         struct fio_zone_info *z;
1668
1669         assert(zbd_info);
1670
1671         z = zbd_offset_to_zone(f, io_u->offset);
1672         assert(z->has_wp);
1673
1674         dprint(FD_ZBD,
1675                "%s: terminate I/O (%lld, %llu) for zone %u\n",
1676                f->file_name, io_u->offset, io_u->buflen, zbd_zone_idx(f, z));
1677
1678         zbd_end_zone_io(td, io_u, z);
1679
1680         zone_unlock(z);
1681         zbd_check_swd(td, f);
1682 }
1683
/*
 * Windows and MacOS do not define EREMOTEIO, so provide a fallback value
 * for those platforms.
 */
#ifndef EREMOTEIO
#define EREMOTEIO       121
#endif

/**
 * zbd_unaligned_write - test whether an error code is EIO or EREMOTEIO
 * @error_code: positive errno value to classify.
 *
 * Return true if @error_code is EIO or EREMOTEIO, false for any other value.
 */
bool zbd_unaligned_write(int error_code)
{
        return error_code == EIO || error_code == EREMOTEIO;
}
1700
1701 /**
1702  * setup_zbd_zone_mode - handle zoneskip as necessary for ZBD drives
1703  * @td: FIO thread data.
1704  * @io_u: FIO I/O unit.
1705  *
1706  * For sequential workloads, change the file offset to skip zoneskip bytes when
1707  * no more IO can be performed in the current zone.
1708  * - For read workloads, zoneskip is applied when the io has reached the end of
1709  *   the zone or the zone write position (when td->o.read_beyond_wp is false).
1710  * - For write workloads, zoneskip is applied when the zone is full.
1711  * This applies only to read and write operations.
1712  */
1713 void setup_zbd_zone_mode(struct thread_data *td, struct io_u *io_u)
1714 {
1715         struct fio_file *f = io_u->file;
1716         enum fio_ddir ddir = io_u->ddir;
1717         struct fio_zone_info *z;
1718
1719         assert(td->o.zone_mode == ZONE_MODE_ZBD);
1720         assert(td->o.zone_size);
1721         assert(f->zbd_info);
1722
1723         z = zbd_offset_to_zone(f, f->last_pos[ddir]);
1724
1725         /*
1726          * When the zone capacity is smaller than the zone size and the I/O is
1727          * sequential write, skip to zone end if the latest position is at the
1728          * zone capacity limit.
1729          */
1730         if (z->capacity < f->zbd_info->zone_size &&
1731             !td_random(td) && ddir == DDIR_WRITE &&
1732             f->last_pos[ddir] >= zbd_zone_capacity_end(z)) {
1733                 dprint(FD_ZBD,
1734                        "%s: Jump from zone capacity limit to zone end:"
1735                        " (%"PRIu64" -> %"PRIu64") for zone %u (%"PRIu64")\n",
1736                        f->file_name, f->last_pos[ddir],
1737                        zbd_zone_end(z), zbd_zone_idx(f, z), z->capacity);
1738                 td->io_skip_bytes += zbd_zone_end(z) - f->last_pos[ddir];
1739                 f->last_pos[ddir] = zbd_zone_end(z);
1740         }
1741
1742         /*
1743          * zone_skip is valid only for sequential workloads.
1744          */
1745         if (td_random(td) || !td->o.zone_skip)
1746                 return;
1747
1748         /*
1749          * It is time to switch to a new zone if:
1750          * - zone_bytes == zone_size bytes have already been accessed
1751          * - The last position reached the end of the current zone.
1752          * - For reads with td->o.read_beyond_wp == false, the last position
1753          *   reached the zone write pointer.
1754          */
1755         if (td->zone_bytes >= td->o.zone_size ||
1756             f->last_pos[ddir] >= zbd_zone_end(z) ||
1757             (ddir == DDIR_READ &&
1758              (!td->o.read_beyond_wp) && f->last_pos[ddir] >= z->wp)) {
1759                 /*
1760                  * Skip zones.
1761                  */
1762                 td->zone_bytes = 0;
1763                 f->file_offset += td->o.zone_size + td->o.zone_skip;
1764
1765                 /*
1766                  * Wrap from the beginning, if we exceed the file size
1767                  */
1768                 if (f->file_offset >= f->real_file_size)
1769                         f->file_offset = get_start_offset(td, f);
1770
1771                 f->last_pos[ddir] = f->file_offset;
1772                 td->io_skip_bytes += td->o.zone_skip;
1773         }
1774 }
1775
1776 /**
1777  * zbd_adjust_ddir - Adjust an I/O direction for zonemode=zbd.
1778  *
1779  * @td: FIO thread data.
1780  * @io_u: FIO I/O unit.
1781  * @ddir: I/O direction before adjustment.
1782  *
1783  * Return adjusted I/O direction.
1784  */
1785 enum fio_ddir zbd_adjust_ddir(struct thread_data *td, struct io_u *io_u,
1786                               enum fio_ddir ddir)
1787 {
1788         /*
1789          * In case read direction is chosen for the first random I/O, fio with
1790          * zonemode=zbd stops because no data can be read from zoned block
1791          * devices with all empty zones. Overwrite the first I/O direction as
1792          * write to make sure data to read exists.
1793          */
1794         assert(io_u->file->zbd_info);
1795         if (ddir != DDIR_READ || !td_rw(td))
1796                 return ddir;
1797
1798         if (io_u->file->zbd_info->sectors_with_data ||
1799             td->o.read_beyond_wp)
1800                 return DDIR_READ;
1801
1802         return DDIR_WRITE;
1803 }
1804
1805 /**
1806  * zbd_adjust_block - adjust the offset and length as necessary for ZBD drives
1807  * @td: FIO thread data.
1808  * @io_u: FIO I/O unit.
1809  *
1810  * Locking strategy: returns with z->mutex locked if and only if z refers
1811  * to a sequential zone and if io_u_accept is returned. z is the zone that
1812  * corresponds to io_u->offset at the end of this function.
1813  */
1814 enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
1815 {
1816         struct fio_file *f = io_u->file;
1817         struct zoned_block_device_info *zbdi = f->zbd_info;
1818         struct fio_zone_info *zb, *zl, *orig_zb;
1819         uint32_t orig_len = io_u->buflen;
1820         uint64_t min_bs = td->o.min_bs[io_u->ddir];
1821         uint64_t new_len;
1822         int64_t range;
1823
1824         assert(zbdi);
1825         assert(min_bs);
1826         assert(is_valid_offset(f, io_u->offset));
1827         assert(io_u->buflen);
1828
1829         zb = zbd_offset_to_zone(f, io_u->offset);
1830         orig_zb = zb;
1831
1832         if (!zb->has_wp) {
1833                 /* Accept non-write I/Os for conventional zones. */
1834                 if (io_u->ddir != DDIR_WRITE)
1835                         return io_u_accept;
1836
1837                 /*
1838                  * Make sure that writes to conventional zones
1839                  * don't cross over to any sequential zones.
1840                  */
1841                 if (!(zb + 1)->has_wp ||
1842                     io_u->offset + io_u->buflen <= (zb + 1)->start)
1843                         return io_u_accept;
1844
1845                 if (io_u->offset + min_bs > (zb + 1)->start) {
1846                         dprint(FD_IO,
1847                                "%s: off=%llu + min_bs=%"PRIu64" > next zone %"PRIu64"\n",
1848                                f->file_name, io_u->offset,
1849                                min_bs, (zb + 1)->start);
1850                         io_u->offset =
1851                                 zb->start + (zb + 1)->start - io_u->offset;
1852                         new_len = min(io_u->buflen,
1853                                       (zb + 1)->start - io_u->offset);
1854                 } else {
1855                         new_len = (zb + 1)->start - io_u->offset;
1856                 }
1857
1858                 io_u->buflen = new_len / min_bs * min_bs;
1859
1860                 return io_u_accept;
1861         }
1862
1863         /*
1864          * Accept the I/O offset for reads if reading beyond the write pointer
1865          * is enabled.
1866          */
1867         if (zb->cond != ZBD_ZONE_COND_OFFLINE &&
1868             io_u->ddir == DDIR_READ && td->o.read_beyond_wp)
1869                 return io_u_accept;
1870
1871         zbd_check_swd(td, f);
1872
1873         zone_lock(td, f, zb);
1874
1875         switch (io_u->ddir) {
1876         case DDIR_READ:
1877                 if (td->runstate == TD_VERIFYING && td_write(td)) {
1878                         zb = zbd_replay_write_order(td, io_u, zb);
1879                         goto accept;
1880                 }
1881
1882                 /*
1883                  * Check that there is enough written data in the zone to do an
1884                  * I/O of at least min_bs B. If there isn't, find a new zone for
1885                  * the I/O.
1886                  */
1887                 range = zb->cond != ZBD_ZONE_COND_OFFLINE ?
1888                         zb->wp - zb->start : 0;
1889                 if (range < min_bs ||
1890                     ((!td_random(td)) && (io_u->offset + min_bs > zb->wp))) {
1891                         zone_unlock(zb);
1892                         zl = zbd_get_zone(f, f->max_zone);
1893                         zb = zbd_find_zone(td, io_u, min_bs, zb, zl);
1894                         if (!zb) {
1895                                 dprint(FD_ZBD,
1896                                        "%s: zbd_find_zone(%lld, %llu) failed\n",
1897                                        f->file_name, io_u->offset,
1898                                        io_u->buflen);
1899                                 goto eof;
1900                         }
1901                         /*
1902                          * zbd_find_zone() returned a zone with a range of at
1903                          * least min_bs.
1904                          */
1905                         range = zb->wp - zb->start;
1906                         assert(range >= min_bs);
1907
1908                         if (!td_random(td))
1909                                 io_u->offset = zb->start;
1910                 }
1911
1912                 /*
1913                  * Make sure the I/O is within the zone valid data range while
1914                  * maximizing the I/O size and preserving randomness.
1915                  */
1916                 if (range <= io_u->buflen)
1917                         io_u->offset = zb->start;
1918                 else if (td_random(td))
1919                         io_u->offset = zb->start +
1920                                 ((io_u->offset - orig_zb->start) %
1921                                  (range - io_u->buflen)) / min_bs * min_bs;
1922
1923                 /*
1924                  * When zbd_find_zone() returns a conventional zone,
1925                  * we can simply accept the new i/o offset here.
1926                  */
1927                 if (!zb->has_wp)
1928                         return io_u_accept;
1929
1930                 /*
1931                  * Make sure the I/O does not cross over the zone wp position.
1932                  */
1933                 new_len = min((unsigned long long)io_u->buflen,
1934                               (unsigned long long)(zb->wp - io_u->offset));
1935                 new_len = new_len / min_bs * min_bs;
1936                 if (new_len < io_u->buflen) {
1937                         io_u->buflen = new_len;
1938                         dprint(FD_IO, "Changed length from %u into %llu\n",
1939                                orig_len, io_u->buflen);
1940                 }
1941
1942                 assert(zb->start <= io_u->offset);
1943                 assert(io_u->offset + io_u->buflen <= zb->wp);
1944
1945                 goto accept;
1946
1947         case DDIR_WRITE:
1948                 if (io_u->buflen > zbdi->zone_size) {
1949                         td_verror(td, EINVAL, "I/O buflen exceeds zone size");
1950                         dprint(FD_IO,
1951                                "%s: I/O buflen %llu exceeds zone size %"PRIu64"\n",
1952                                f->file_name, io_u->buflen, zbdi->zone_size);
1953                         goto eof;
1954                 }
1955
1956                 if (!zbd_open_zone(td, f, zb)) {
1957                         zone_unlock(zb);
1958                         zb = zbd_convert_to_open_zone(td, io_u);
1959                         if (!zb) {
1960                                 dprint(FD_IO, "%s: can't convert to open zone",
1961                                        f->file_name);
1962                                 goto eof;
1963                         }
1964                 }
1965
1966                 /* Check whether the zone reset threshold has been exceeded */
1967                 if (td->o.zrf.u.f) {
1968                         if (zbdi->wp_sectors_with_data >= f->io_size * td->o.zrt.u.f &&
1969                             zbd_dec_and_reset_write_cnt(td, f))
1970                                 zb->reset_zone = 1;
1971                 }
1972
1973                 /* Reset the zone pointer if necessary */
1974                 if (zb->reset_zone || zbd_zone_full(f, zb, min_bs)) {
1975                         assert(td->o.verify == VERIFY_NONE);
1976                         /*
1977                          * Since previous write requests may have been submitted
1978                          * asynchronously and since we will submit the zone
1979                          * reset synchronously, wait until previously submitted
1980                          * write requests have completed before issuing a
1981                          * zone reset.
1982                          */
1983                         io_u_quiesce(td);
1984                         zb->reset_zone = 0;
1985                         if (zbd_reset_zone(td, f, zb) < 0)
1986                                 goto eof;
1987
1988                         if (zb->capacity < min_bs) {
1989                                 td_verror(td, EINVAL, "ZCAP is less min_bs");
1990                                 log_err("zone capacity %"PRIu64" smaller than minimum block size %"PRIu64"\n",
1991                                         zb->capacity, min_bs);
1992                                 goto eof;
1993                         }
1994                 }
1995
1996                 /* Make writes occur at the write pointer */
1997                 assert(!zbd_zone_full(f, zb, min_bs));
1998                 io_u->offset = zb->wp;
1999                 if (!is_valid_offset(f, io_u->offset)) {
2000                         td_verror(td, EINVAL, "invalid WP value");
2001                         dprint(FD_ZBD, "%s: dropped request with offset %llu\n",
2002                                f->file_name, io_u->offset);
2003                         goto eof;
2004                 }
2005
2006                 /*
2007                  * Make sure that the buflen is a multiple of the minimal
2008                  * block size. Give up if shrinking would make the request too
2009                  * small.
2010                  */
2011                 new_len = min((unsigned long long)io_u->buflen,
2012                               zbd_zone_capacity_end(zb) - io_u->offset);
2013                 new_len = new_len / min_bs * min_bs;
2014                 if (new_len == io_u->buflen)
2015                         goto accept;
2016                 if (new_len >= min_bs) {
2017                         io_u->buflen = new_len;
2018                         dprint(FD_IO, "Changed length from %u into %llu\n",
2019                                orig_len, io_u->buflen);
2020                         goto accept;
2021                 }
2022
2023                 td_verror(td, EIO, "zone remainder too small");
2024                 log_err("zone remainder %lld smaller than min block size %"PRIu64"\n",
2025                         (zbd_zone_capacity_end(zb) - io_u->offset), min_bs);
2026
2027                 goto eof;
2028
2029         case DDIR_TRIM:
2030                 /* Check random trim targets a non-empty zone */
2031                 if (!td_random(td) || zb->wp > zb->start)
2032                         goto accept;
2033
2034                 /* Find out a non-empty zone to trim */
2035                 zone_unlock(zb);
2036                 zl = zbd_get_zone(f, f->max_zone);
2037                 zb = zbd_find_zone(td, io_u, 1, zb, zl);
2038                 if (zb) {
2039                         io_u->offset = zb->start;
2040                         dprint(FD_ZBD, "%s: found new zone(%lld) for trim\n",
2041                                f->file_name, io_u->offset);
2042                         goto accept;
2043                 }
2044
2045                 goto eof;
2046
2047         case DDIR_SYNC:
2048                 /* fall-through */
2049         case DDIR_DATASYNC:
2050         case DDIR_SYNC_FILE_RANGE:
2051         case DDIR_WAIT:
2052         case DDIR_LAST:
2053         case DDIR_INVAL:
2054                 goto accept;
2055         }
2056
2057         assert(false);
2058
2059 accept:
2060         assert(zb->has_wp);
2061         assert(zb->cond != ZBD_ZONE_COND_OFFLINE);
2062         assert(!io_u->zbd_queue_io);
2063         assert(!io_u->zbd_put_io);
2064
2065         io_u->zbd_queue_io = zbd_queue_io;
2066         io_u->zbd_put_io = zbd_put_io;
2067
2068         /*
2069          * Since we return with the zone lock still held,
2070          * add an annotation to let Coverity know that it
2071          * is intentional.
2072          */
2073         /* coverity[missing_unlock] */
2074
2075         return io_u_accept;
2076
2077 eof:
2078         if (zb && zb->has_wp)
2079                 zone_unlock(zb);
2080
2081         return io_u_eof;
2082 }
2083
2084 /* Return a string with ZBD statistics */
2085 char *zbd_write_status(const struct thread_stat *ts)
2086 {
2087         char *res;
2088
2089         if (asprintf(&res, "; %"PRIu64" zone resets", ts->nr_zone_resets) < 0)
2090                 return NULL;
2091         return res;
2092 }
2093
2094 /**
2095  * zbd_do_io_u_trim - If reset zone is applicable, do reset zone instead of trim
2096  *
2097  * @td: FIO thread data.
2098  * @io_u: FIO I/O unit.
2099  *
2100  * It is assumed that z->mutex is already locked.
2101  * Return io_u_completed when reset zone succeeds. Return 0 when the target zone
2102  * does not have write pointer. On error, return negative errno.
2103  */
2104 int zbd_do_io_u_trim(const struct thread_data *td, struct io_u *io_u)
2105 {
2106         struct fio_file *f = io_u->file;
2107         struct fio_zone_info *z;
2108         int ret;
2109
2110         z = zbd_offset_to_zone(f, io_u->offset);
2111         if (!z->has_wp)
2112                 return 0;
2113
2114         if (io_u->offset != z->start) {
2115                 log_err("Trim offset not at zone start (%lld)\n",
2116                         io_u->offset);
2117                 return -EINVAL;
2118         }
2119
2120         ret = zbd_reset_zone((struct thread_data *)td, f, z);
2121         if (ret < 0)
2122                 return ret;
2123
2124         return io_u_completed;
2125 }