t/io_uring: only calculate per-file depth if we have files
[fio.git] / zbd.c
CommitLineData
bfbdd35b
BVA
1/*
2 * Copyright (C) 2018 Western Digital Corporation or its affiliates.
3 *
4 * This file is released under the GPL.
5 */
6
7#include <errno.h>
8#include <string.h>
9#include <stdlib.h>
bfbdd35b 10#include <fcntl.h>
bfbdd35b
BVA
11#include <sys/stat.h>
12#include <unistd.h>
f5bff36e 13
83276370 14#include "compiler/compiler.h"
cf42d79e 15#include "os/os.h"
bfbdd35b
BVA
16#include "file.h"
17#include "fio.h"
18#include "lib/pow2.h"
19#include "log.h"
f5bff36e 20#include "oslib/asprintf.h"
bfbdd35b
BVA
21#include "smalloc.h"
22#include "verify.h"
44ec32cb 23#include "pshared.h"
bfbdd35b
BVA
24#include "zbd.h"
25
410a071c
DLM
26static bool is_valid_offset(const struct fio_file *f, uint64_t offset)
27{
28 return (uint64_t)(offset - f->file_offset) < f->io_size;
29}
30
dc8a3d62
DLM
31static inline unsigned int zbd_zone_idx(const struct fio_file *f,
32 struct fio_zone_info *zone)
410a071c
DLM
33{
34 return zone - f->zbd_info->zone_info;
35}
36
37/**
dc8a3d62 38 * zbd_offset_to_zone_idx - convert an offset into a zone number
410a071c
DLM
39 * @f: file pointer.
40 * @offset: offset in bytes. If this offset is in the first zone_size bytes
41 * past the disk size then the index of the sentinel is returned.
42 */
dc8a3d62
DLM
43static unsigned int zbd_offset_to_zone_idx(const struct fio_file *f,
44 uint64_t offset)
410a071c
DLM
45{
46 uint32_t zone_idx;
47
48 if (f->zbd_info->zone_size_log2 > 0)
49 zone_idx = offset >> f->zbd_info->zone_size_log2;
50 else
51 zone_idx = offset / f->zbd_info->zone_size;
52
53 return min(zone_idx, f->zbd_info->nr_zones);
54}
55
56/**
57 * zbd_zone_end - Return zone end location
58 * @z: zone info pointer.
59 */
60static inline uint64_t zbd_zone_end(const struct fio_zone_info *z)
61{
62 return (z+1)->start;
63}
64
65/**
66 * zbd_zone_capacity_end - Return zone capacity limit end location
67 * @z: zone info pointer.
68 */
69static inline uint64_t zbd_zone_capacity_end(const struct fio_zone_info *z)
70{
71 return z->start + z->capacity;
72}
73
df67bf1e
SK
74/**
75 * zbd_zone_remainder - Return the number of bytes that are still available for
76 * writing before the zone gets full
77 * @z: zone info pointer.
78 */
79static inline uint64_t zbd_zone_remainder(struct fio_zone_info *z)
80{
81 if (z->wp >= zbd_zone_capacity_end(z))
82 return 0;
83
84 return zbd_zone_capacity_end(z) - z->wp;
85}
86
410a071c
DLM
87/**
88 * zbd_zone_full - verify whether a minimum number of bytes remain in a zone
89 * @f: file pointer.
90 * @z: zone info pointer.
91 * @required: minimum number of bytes that must remain in a zone.
92 *
93 * The caller must hold z->mutex.
94 */
95static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z,
96 uint64_t required)
97{
98 assert((required & 511) == 0);
99
df67bf1e 100 return z->has_wp && required > zbd_zone_remainder(z);
410a071c
DLM
101}
102
/**
 * zone_lock - acquire the per-zone mutex without stalling queued async I/O
 * @td: fio thread data.
 * @f: file owning zone @z.
 * @z: zone to lock; must be a write-pointer zone inside this job's working
 *     area (debug builds assert both).
 */
static void zone_lock(struct thread_data *td, const struct fio_file *f,
		      struct fio_zone_info *z)
{
#ifndef NDEBUG
	unsigned int const nz = zbd_zone_idx(f, z);
	/* A thread should never lock zones outside its working area. */
	assert(f->min_zone <= nz && nz < f->max_zone);
	assert(z->has_wp);
#endif

	/*
	 * Lock the io_u target zone. The zone will be unlocked if io_u offset
	 * is changed or when io_u completes and zbd_put_io() executed.
	 * To avoid multiple jobs doing asynchronous I/Os from deadlocking each
	 * other waiting for zone locks when building an io_u batch, first
	 * only trylock the zone. If the zone is already locked by another job,
	 * process the currently queued I/Os so that I/O progress is made and
	 * zones unlocked.
	 */
	if (pthread_mutex_trylock(&z->mutex) != 0) {
		/* Only async engines queue I/O that must be drained first. */
		if (!td_ioengine_flagged(td, FIO_SYNCIO))
			io_u_quiesce(td);
		pthread_mutex_lock(&z->mutex);
	}
}
128
129static inline void zone_unlock(struct fio_zone_info *z)
130{
410a071c 131 assert(z->has_wp);
83276370 132 pthread_mutex_unlock(&z->mutex);
410a071c
DLM
133}
134
39e06ee7
DLM
135static inline struct fio_zone_info *zbd_get_zone(const struct fio_file *f,
136 unsigned int zone_idx)
410a071c 137{
39e06ee7 138 return &f->zbd_info->zone_info[zone_idx];
410a071c
DLM
139}
140
53aa6171
DLM
/* Return the zone containing byte offset @offset of file @f. */
static inline struct fio_zone_info *
zbd_offset_to_zone(const struct fio_file *f, uint64_t offset)
{
	unsigned int idx = zbd_offset_to_zone_idx(f, offset);

	return zbd_get_zone(f, idx);
}
146
2fb29f27
SK
147static bool accounting_vdb(struct thread_data *td, const struct fio_file *f)
148{
149 return td->o.zrt.u.f && td_write(td);
150}
151
b7694961
DLM
152/**
153 * zbd_get_zoned_model - Get a device zoned model
154 * @td: FIO thread data
155 * @f: FIO file for which to get model information
156 */
38334c13
DLM
157static int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f,
158 enum zbd_zoned_model *model)
b7694961
DLM
159{
160 int ret;
161
50cc48d5
NC
162 if (f->filetype == FIO_TYPE_PIPE) {
163 log_err("zonemode=zbd does not support pipes\n");
164 return -EINVAL;
165 }
166
9db0cde8
NC
167 /* If regular file, always emulate zones inside the file. */
168 if (f->filetype == FIO_TYPE_FILE) {
169 *model = ZBD_NONE;
170 return 0;
171 }
172
6c5b11d3
DLM
173 if (td->io_ops && td->io_ops->get_zoned_model)
174 ret = td->io_ops->get_zoned_model(td, f, model);
175 else
176 ret = blkzoned_get_zoned_model(td, f, model);
b7694961
DLM
177 if (ret < 0) {
178 td_verror(td, errno, "get zoned model failed");
179 log_err("%s: get zoned model failed (%d).\n",
180 f->file_name, errno);
181 }
182
183 return ret;
184}
185
186/**
187 * zbd_report_zones - Get zone information
188 * @td: FIO thread data.
189 * @f: FIO file for which to get zone information
190 * @offset: offset from which to report zones
191 * @zones: Array of struct zbd_zone
192 * @nr_zones: Size of @zones array
193 *
194 * Get zone information into @zones starting from the zone at offset @offset
195 * for the device specified by @f.
196 *
197 * Returns the number of zones reported upon success and a negative error code
198 * upon failure. If the zone report is empty, always assume an error (device
199 * problem) and return -EIO.
200 */
38334c13
DLM
201static int zbd_report_zones(struct thread_data *td, struct fio_file *f,
202 uint64_t offset, struct zbd_zone *zones,
203 unsigned int nr_zones)
b7694961
DLM
204{
205 int ret;
206
6c5b11d3
DLM
207 if (td->io_ops && td->io_ops->report_zones)
208 ret = td->io_ops->report_zones(td, f, offset, zones, nr_zones);
209 else
210 ret = blkzoned_report_zones(td, f, offset, zones, nr_zones);
b7694961
DLM
211 if (ret < 0) {
212 td_verror(td, errno, "report zones failed");
362ce037
BVA
213 log_err("%s: report zones from sector %"PRIu64" failed (nr_zones=%d; errno=%d).\n",
214 f->file_name, offset >> 9, nr_zones, errno);
b7694961
DLM
215 } else if (ret == 0) {
216 td_verror(td, errno, "Empty zone report");
ee5e3436
SK
217 log_err("%s: report zones from sector %"PRIu64" is empty.\n",
218 f->file_name, offset >> 9);
b7694961
DLM
219 ret = -EIO;
220 }
221
222 return ret;
223}
224
225/**
226 * zbd_reset_wp - reset the write pointer of a range of zones
227 * @td: FIO thread data.
228 * @f: FIO file for which to reset zones
229 * @offset: Starting offset of the first zone to reset
230 * @length: Length of the range of zones to reset
231 *
232 * Reset the write pointer of all zones in the range @offset...@offset+@length.
233 * Returns 0 upon success and a negative error code upon failure.
234 */
38334c13
DLM
235static int zbd_reset_wp(struct thread_data *td, struct fio_file *f,
236 uint64_t offset, uint64_t length)
b7694961
DLM
237{
238 int ret;
239
6c5b11d3
DLM
240 if (td->io_ops && td->io_ops->reset_wp)
241 ret = td->io_ops->reset_wp(td, f, offset, length);
242 else
243 ret = blkzoned_reset_wp(td, f, offset, length);
b7694961
DLM
244 if (ret < 0) {
245 td_verror(td, errno, "resetting wp failed");
ee5e3436
SK
246 log_err("%s: resetting wp for %"PRIu64" sectors at sector %"PRIu64" failed (%d).\n",
247 f->file_name, length >> 9, offset >> 9, errno);
b7694961
DLM
248 }
249
250 return ret;
251}
252
410a071c 253/**
67282020 254 * __zbd_reset_zone - reset the write pointer of a single zone
410a071c
DLM
255 * @td: FIO thread data.
256 * @f: FIO file associated with the disk for which to reset a write pointer.
257 * @z: Zone to reset.
258 *
259 * Returns 0 upon success and a negative error code upon failure.
260 *
261 * The caller must hold z->mutex.
262 */
67282020
SK
static int __zbd_reset_zone(struct thread_data *td, struct fio_file *f,
			    struct fio_zone_info *z)
{
	uint64_t offset = z->start;
	/* Reset the full zone; the next zone's start bounds this one. */
	uint64_t length = (z+1)->start - offset;
	uint64_t data_in_zone = z->wp - z->start;
	int ret = 0;

	/* Nothing written since the last reset: no device call needed. */
	if (!data_in_zone)
		return 0;

	assert(is_valid_offset(f, offset + length - 1));

	dprint(FD_ZBD, "%s: resetting wp of zone %u.\n",
	       f->file_name, zbd_zone_idx(f, z));

	switch (f->zbd_info->model) {
	case ZBD_HOST_AWARE:
	case ZBD_HOST_MANAGED:
		/* Real zoned devices need an explicit write pointer reset. */
		ret = zbd_reset_wp(td, f, offset, length);
		if (ret < 0)
			return ret;
		break;
	default:
		/* Emulated zones (ZBD_NONE): only the software state changes. */
		break;
	}

	/* Keep the valid-data-bytes counter consistent under zbd_info->mutex. */
	if (accounting_vdb(td, f)) {
		pthread_mutex_lock(&f->zbd_info->mutex);
		f->zbd_info->wp_valid_data_bytes -= data_in_zone;
		pthread_mutex_unlock(&f->zbd_info->mutex);
	}

	z->wp = z->start;

	td->ts.nr_zone_resets++;

	return ret;
}
302
303/**
a4807046 304 * zbd_write_zone_put - Remove a zone from the write target zones array.
410a071c 305 * @td: FIO thread data.
a4807046 306 * @f: FIO file that has the write zones array to remove.
410a071c
DLM
307 * @zone_idx: Index of the zone to remove.
308 *
309 * The caller must hold f->zbd_info->mutex.
310 */
a4807046
SK
static void zbd_write_zone_put(struct thread_data *td, const struct fio_file *f,
			       struct fio_zone_info *z)
{
	uint32_t zi;

	/* Not in the write target array: nothing to remove. */
	if (!z->write)
		return;

	/* Find the slot of @z in the write zone array. */
	for (zi = 0; zi < f->zbd_info->num_write_zones; zi++) {
		if (zbd_get_zone(f, f->zbd_info->write_zones[zi]) == z)
			break;
	}
	if (zi == f->zbd_info->num_write_zones)
		return;

	dprint(FD_ZBD, "%s: removing zone %u from write zone array\n",
	       f->file_name, zbd_zone_idx(f, z));

	/* Close the gap by shifting the remaining entries down one slot. */
	memmove(f->zbd_info->write_zones + zi,
		f->zbd_info->write_zones + zi + 1,
		(ZBD_MAX_WRITE_ZONES - (zi + 1)) *
		sizeof(f->zbd_info->write_zones[0]));

	f->zbd_info->num_write_zones--;
	td->num_write_zones--;
	z->write = 0;
}
338
67282020
SK
339/**
340 * zbd_reset_zone - reset the write pointer of a single zone and remove the zone
341 * from the array of write zones.
342 * @td: FIO thread data.
343 * @f: FIO file associated with the disk for which to reset a write pointer.
344 * @z: Zone to reset.
345 *
346 * Returns 0 upon success and a negative error code upon failure.
347 *
348 * The caller must hold z->mutex.
349 */
350static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
351 struct fio_zone_info *z)
352{
353 int ret;
354
355 ret = __zbd_reset_zone(td, f, z);
356 if (ret)
357 return ret;
358
359 pthread_mutex_lock(&f->zbd_info->mutex);
360 zbd_write_zone_put(td, f, z);
361 pthread_mutex_unlock(&f->zbd_info->mutex);
362 return 0;
363}
364
e1a1b59b
SK
365/**
366 * zbd_finish_zone - finish the specified zone
367 * @td: FIO thread data.
368 * @f: FIO file for which to finish a zone
369 * @z: Zone to finish.
370 *
371 * Finish the zone at @offset with open or close status.
372 */
373static int zbd_finish_zone(struct thread_data *td, struct fio_file *f,
374 struct fio_zone_info *z)
375{
376 uint64_t offset = z->start;
377 uint64_t length = f->zbd_info->zone_size;
378 int ret = 0;
379
380 switch (f->zbd_info->model) {
381 case ZBD_HOST_AWARE:
382 case ZBD_HOST_MANAGED:
383 if (td->io_ops && td->io_ops->finish_zone)
384 ret = td->io_ops->finish_zone(td, f, offset, length);
385 else
386 ret = blkzoned_finish_zone(td, f, offset, length);
387 break;
388 default:
389 break;
390 }
391
392 if (ret < 0) {
393 td_verror(td, errno, "finish zone failed");
394 log_err("%s: finish zone at sector %"PRIu64" failed (%d).\n",
395 f->file_name, offset >> 9, errno);
396 } else {
397 z->wp = (z+1)->start;
398 }
399
400 return ret;
401}
402
410a071c
DLM
403/**
404 * zbd_reset_zones - Reset a range of zones.
405 * @td: fio thread data.
406 * @f: fio file for which to reset zones
407 * @zb: first zone to reset.
408 * @ze: first zone not to reset.
409 *
410 * Returns 0 upon success and 1 upon failure.
411 */
static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
			   struct fio_zone_info *const zb,
			   struct fio_zone_info *const ze)
{
	struct fio_zone_info *z;
	const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
	int res = 0;

	/* Guard against a zero write block size (would be a config bug). */
	if (fio_unlikely(0 == min_bs))
		return 1;

	dprint(FD_ZBD, "%s: examining zones %u .. %u\n",
	       f->file_name, zbd_zone_idx(f, zb), zbd_zone_idx(f, ze));

	for (z = zb; z < ze; z++) {
		/* Conventional zones have no write pointer to reset. */
		if (!z->has_wp)
			continue;

		zone_lock(td, f, z);

		/* Only reset zones that actually contain data. */
		if (z->wp != z->start) {
			dprint(FD_ZBD, "%s: resetting zone %u\n",
			       f->file_name, zbd_zone_idx(f, z));
			/* Record failure but keep resetting the others. */
			if (zbd_reset_zone(td, f, z) < 0)
				res = 1;
		}

		zone_unlock(z);
	}

	return res;
}
444
d2f442bc
NC
445/**
446 * zbd_get_max_open_zones - Get the maximum number of open zones
447 * @td: FIO thread data
448 * @f: FIO file for which to get max open zones
449 * @max_open_zones: Upon success, result will be stored here.
450 *
451 * A @max_open_zones value set to zero means no limit.
452 *
453 * Returns 0 upon success and a negative error code upon failure.
454 */
38334c13
DLM
455static int zbd_get_max_open_zones(struct thread_data *td, struct fio_file *f,
456 unsigned int *max_open_zones)
d2f442bc
NC
457{
458 int ret;
459
460 if (td->io_ops && td->io_ops->get_max_open_zones)
461 ret = td->io_ops->get_max_open_zones(td, f, max_open_zones);
462 else
463 ret = blkzoned_get_max_open_zones(td, f, max_open_zones);
464 if (ret < 0) {
465 td_verror(td, errno, "get max open zones failed");
466 log_err("%s: get max open zones failed (%d).\n",
467 f->file_name, errno);
468 }
469
470 return ret;
471}
472
9e523ef8
SK
473/**
474 * zbd_get_max_active_zones - Get the maximum number of active zones
475 * @td: FIO thread data
476 * @f: FIO file for which to get max active zones
477 *
478 * Returns max_active_zones limit value of the target file if it is available.
479 * Otherwise return zero, which means no limit.
480 */
481static unsigned int zbd_get_max_active_zones(struct thread_data *td,
482 struct fio_file *f)
483{
484 unsigned int max_active_zones;
485 int ret;
486
487 if (td->io_ops && td->io_ops->get_max_active_zones)
488 ret = td->io_ops->get_max_active_zones(td, f,
489 &max_active_zones);
490 else
491 ret = blkzoned_get_max_active_zones(td, f, &max_active_zones);
492 if (ret < 0) {
493 dprint(FD_ZBD, "%s: max_active_zones is not available\n",
494 f->file_name);
495 return 0;
496 }
497
498 return max_active_zones;
499}
500
bfbdd35b 501/**
f539b98c 502 * __zbd_write_zone_get - Add a zone to the array of write zones.
410a071c 503 * @td: fio thread data.
a4807046 504 * @f: fio file that has the write zones array to add.
410a071c 505 * @zone_idx: Index of the zone to add.
bfbdd35b 506 *
f539b98c
SK
507 * Do same operation as @zbd_write_zone_get, except it adds the zone at
508 * @zone_idx to write target zones array even when it does not have remainder
509 * space to write one block.
bfbdd35b 510 */
f539b98c
SK
static bool __zbd_write_zone_get(struct thread_data *td,
				 const struct fio_file *f,
				 struct fio_zone_info *z)
{
	struct zoned_block_device_info *zbdi = f->zbd_info;
	uint32_t zone_idx = zbd_zone_idx(f, z);
	bool res = true;

	/* Offline zones can never be written. */
	if (z->cond == ZBD_ZONE_COND_OFFLINE)
		return false;

	/*
	 * Skip full zones with data verification enabled because resetting a
	 * zone causes data loss and hence causes verification to fail.
	 */
	if (td->o.verify != VERIFY_NONE && zbd_zone_remainder(z) == 0)
		return false;

	/*
	 * zbdi->max_write_zones == 0 means that there is no limit on the
	 * maximum number of write target zones. In this case, do no track write
	 * target zones in zbdi->write_zones array.
	 */
	if (!zbdi->max_write_zones)
		return true;

	/* The write zone array and counters are shared: serialize access. */
	pthread_mutex_lock(&zbdi->mutex);

	if (z->write) {
		/*
		 * If the zone is going to be completely filled by writes
		 * already in-flight, handle it as a full zone instead of a
		 * write target zone.
		 */
		if (!zbd_zone_remainder(z))
			res = false;
		goto out;
	}

	res = false;
	/* Zero means no limit */
	if (td->o.job_max_open_zones > 0 &&
	    td->num_write_zones >= td->o.job_max_open_zones)
		goto out;
	if (zbdi->num_write_zones >= zbdi->max_write_zones)
		goto out;

	dprint(FD_ZBD, "%s: adding zone %u to write zone array\n",
	       f->file_name, zone_idx);

	/* Claim a slot and mark the zone as a write target. */
	zbdi->write_zones[zbdi->num_write_zones++] = zone_idx;
	td->num_write_zones++;
	z->write = 1;
	res = true;

out:
	pthread_mutex_unlock(&zbdi->mutex);
	return res;
}
570
f539b98c
SK
571/**
572 * zbd_write_zone_get - Add a zone to the array of write zones.
573 * @td: fio thread data.
574 * @f: fio file that has the open zones to add.
575 * @zone_idx: Index of the zone to add.
576 *
577 * Add a ZBD zone to write target zones array, if it is not yet added. Returns
578 * true if either the zone was already added or if the zone was successfully
579 * added to the array without exceeding the maximum number of write zones.
580 * Returns false if the zone was not already added and addition of the zone
581 * would cause the zone limit to be exceeded.
582 */
583static bool zbd_write_zone_get(struct thread_data *td, const struct fio_file *f,
584 struct fio_zone_info *z)
585{
586 const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
587
588 /*
589 * Skip full zones with data verification enabled because resetting a
590 * zone causes data loss and hence causes verification to fail.
591 */
592 if (td->o.verify != VERIFY_NONE && zbd_zone_full(f, z, min_bs))
593 return false;
594
595 return __zbd_write_zone_get(td, f, z);
596}
597
59c3200d 598/* Verify whether direct I/O is used for all host-managed zoned block drives. */
bfbdd35b
BVA
static bool zbd_using_direct_io(void)
{
	struct fio_file *f;
	int j;

	for_each_td(td) {
		/* Jobs that use O_DIRECT or never write are fine as-is. */
		if (td->o.odirect || !(td->o.td_ddir & TD_DDIR_WRITE))
			continue;
		for_each_file(td, f, j) {
			/* Buffered writes to a host-managed drive: reject. */
			if (f->zbd_info && f->filetype == FIO_TYPE_BLOCK &&
			    f->zbd_info->model == ZBD_HOST_MANAGED)
				return false;
		}
	} end_for_each();

	return true;
}
616
617/* Whether or not the I/O range for f includes one or more sequential zones */
b3e9bd03 618static bool zbd_is_seq_job(const struct fio_file *f)
bfbdd35b
BVA
619{
620 uint32_t zone_idx, zone_idx_b, zone_idx_e;
621
622 assert(f->zbd_info);
139d8dc6 623
bfbdd35b
BVA
624 if (f->io_size == 0)
625 return false;
139d8dc6 626
dc8a3d62
DLM
627 zone_idx_b = zbd_offset_to_zone_idx(f, f->file_offset);
628 zone_idx_e =
629 zbd_offset_to_zone_idx(f, f->file_offset + f->io_size - 1);
bfbdd35b 630 for (zone_idx = zone_idx_b; zone_idx <= zone_idx_e; zone_idx++)
39e06ee7 631 if (zbd_get_zone(f, zone_idx)->has_wp)
bfbdd35b
BVA
632 return true;
633
634 return false;
635}
636
0bf93a1a
DLM
637/*
638 * Verify whether the file offset and size parameters are aligned with zone
639 * boundaries. If the file offset is not aligned, align it down to the start of
640 * the zone containing the start offset and align up the file io_size parameter.
641 */
static bool zbd_zone_align_file_sizes(struct thread_data *td,
				      struct fio_file *f)
{
	const struct fio_zone_info *z;
	uint64_t new_offset, new_end;

	/* Nothing to align for non-zoned files or degenerate ranges. */
	if (!f->zbd_info)
		return true;
	if (f->file_offset >= f->real_file_size)
		return true;
	if (!zbd_is_seq_job(f))
		return true;

	/* Resolve the effective zone size, or validate the user's value. */
	if (!td->o.zone_size) {
		td->o.zone_size = f->zbd_info->zone_size;
		if (!td->o.zone_size) {
			log_err("%s: invalid 0 zone size\n",
				f->file_name);
			return false;
		}
	} else if (td->o.zone_size != f->zbd_info->zone_size) {
		log_err("%s: zonesize %llu does not match the device zone size %"PRIu64".\n",
			f->file_name, td->o.zone_size,
			f->zbd_info->zone_size);
		return false;
	}

	if (td->o.zone_skip % td->o.zone_size) {
		log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n",
			f->file_name, td->o.zone_skip,
			td->o.zone_size);
		return false;
	}

	/* Read-only jobs: only round io_size down to a zone boundary. */
	if (td->o.td_ddir == TD_DDIR_READ) {
		z = zbd_offset_to_zone(f, f->file_offset + f->io_size);
		new_end = z->start;
		if (f->file_offset + f->io_size > new_end) {
			log_info("%s: rounded io_size from %"PRIu64" to %"PRIu64"\n",
				 f->file_name, f->io_size,
				 new_end - f->file_offset);
			f->io_size = new_end - f->file_offset;
		}
		return true;
	}

	/* Align the start offset up to the next zone boundary. */
	z = zbd_offset_to_zone(f, f->file_offset);
	if (f->file_offset != z->start) {
		new_offset = zbd_zone_end(z);
		/* Rounding up must leave at least one whole zone. */
		if (new_offset >= f->file_offset + f->io_size) {
			log_info("%s: io_size must be at least one zone\n",
				 f->file_name);
			return false;
		}
		log_info("%s: rounded up offset from %"PRIu64" to %"PRIu64"\n",
			 f->file_name, f->file_offset,
			 new_offset);
		f->io_size -= (new_offset - f->file_offset);
		f->file_offset = new_offset;
	}

	/* Align the end of the range down to a zone boundary. */
	z = zbd_offset_to_zone(f, f->file_offset + f->io_size);
	new_end = z->start;
	if (f->file_offset + f->io_size != new_end) {
		/* Rounding down must leave at least one whole zone. */
		if (new_end <= f->file_offset) {
			log_info("%s: io_size must be at least one zone\n",
				 f->file_name);
			return false;
		}
		log_info("%s: rounded down io_size from %"PRIu64" to %"PRIu64"\n",
			 f->file_name, f->io_size,
			 new_end - f->file_offset);
		f->io_size = new_end - f->file_offset;
	}

	return true;
}
719
bfbdd35b
BVA
720/*
721 * Verify whether offset and size parameters are aligned with zone boundaries.
722 */
static bool zbd_verify_sizes(void)
{
	struct fio_file *f;
	int j;

	/* Align every file of every job; fail fast on the first bad one. */
	for_each_td(td) {
		for_each_file(td, f, j) {
			if (!zbd_zone_align_file_sizes(td, f))
				return false;
		}
	} end_for_each();

	return true;
}
737
/* Verify that the block size options of all jobs are valid for zonemode=zbd. */
static bool zbd_verify_bs(void)
{
	struct fio_file *f;
	int j;

	for_each_td(td) {
		/* Trim requires a single fixed block size (the zone size). */
		if (td_trim(td) &&
		    (td->o.min_bs[DDIR_TRIM] != td->o.max_bs[DDIR_TRIM] ||
		     td->o.bssplit_nr[DDIR_TRIM])) {
			log_info("bsrange and bssplit are not allowed for trim with zonemode=zbd\n");
			return false;
		}
		for_each_file(td, f, j) {
			uint64_t zone_size;

			/* Non-zoned files are unconstrained. */
			if (!f->zbd_info)
				continue;

			zone_size = f->zbd_info->zone_size;
			/* Trims must cover exactly one zone. */
			if (td_trim(td) && td->o.bs[DDIR_TRIM] != zone_size) {
				log_info("%s: trim block size %llu is not the zone size %"PRIu64"\n",
					 f->file_name, td->o.bs[DDIR_TRIM],
					 zone_size);
				return false;
			}
		}
	} end_for_each();
	return true;
}
767
bfbdd35b
BVA
/*
 * Integer base-2 logarithm: index of the highest set bit of @i,
 * or -1 when @i is zero.
 */
static int ilog2(uint64_t i)
{
	int log;

	for (log = -1; i != 0; i >>= 1)
		log++;

	return log;
}
778
779/*
780 * Initialize f->zbd_info for devices that are not zoned block devices. This
781 * allows to execute a ZBD workload against a non-ZBD device.
782 */
static int init_zone_info(struct thread_data *td, struct fio_file *f)
{
	uint32_t nr_zones;
	struct fio_zone_info *p;
	uint64_t zone_size = td->o.zone_size;
	uint64_t zone_capacity = td->o.zone_capacity;
	struct zoned_block_device_info *zbd_info = NULL;
	int i;

	/* Emulation has no device geometry: the user must supply a zone size. */
	if (zone_size == 0) {
		log_err("%s: Specifying the zone size is mandatory for regular file/block device with --zonemode=zbd\n\n",
			f->file_name);
		return 1;
	}

	if (zone_size < 512) {
		log_err("%s: zone size must be at least 512 bytes for --zonemode=zbd\n\n",
			f->file_name);
		return 1;
	}

	/* Default capacity equals the zone size (no unusable tail). */
	if (zone_capacity == 0)
		zone_capacity = zone_size;

	if (zone_capacity > zone_size) {
		log_err("%s: job parameter zonecapacity %llu is larger than zone size %llu\n",
			f->file_name, td->o.zone_capacity, td->o.zone_size);
		return 1;
	}

	if (f->real_file_size < zone_size) {
		log_err("%s: file/device size %"PRIu64" is smaller than zone size %"PRIu64"\n",
			f->file_name, f->real_file_size, zone_size);
		return -EINVAL;
	}

	/* Round up so a partial trailing zone is still represented. */
	nr_zones = (f->real_file_size + zone_size - 1) / zone_size;
	/* One extra element serves as the end sentinel. */
	zbd_info = scalloc(1, sizeof(*zbd_info) +
			   (nr_zones + 1) * sizeof(zbd_info->zone_info[0]));
	if (!zbd_info)
		return -ENOMEM;

	/* Process-shared mutexes: zone state may be shared across jobs. */
	mutex_init_pshared(&zbd_info->mutex);
	zbd_info->refcount = 1;
	p = &zbd_info->zone_info[0];
	for (i = 0; i < nr_zones; i++, p++) {
		mutex_init_pshared_with_type(&p->mutex,
					     PTHREAD_MUTEX_RECURSIVE);
		p->start = i * zone_size;
		/* All emulated zones start empty. */
		p->wp = p->start;
		p->type = ZBD_ZONE_TYPE_SWR;
		p->cond = ZBD_ZONE_COND_EMPTY;
		p->capacity = zone_capacity;
		p->has_wp = 1;
	}
	/* a sentinel */
	p->start = nr_zones * zone_size;

	f->zbd_info = zbd_info;
	f->zbd_info->zone_size = zone_size;
	/* Power-of-two zone sizes enable shift-based index math. */
	f->zbd_info->zone_size_log2 = is_power_of_2(zone_size) ?
		ilog2(zone_size) : 0;
	f->zbd_info->nr_zones = nr_zones;
	return 0;
}
848
849/*
b7694961
DLM
850 * Maximum number of zones to report in one operation.
851 */
852#define ZBD_REPORT_MAX_ZONES 8192U
853
854/*
855 * Parse the device zone report and store it in f->zbd_info. Must be called
856 * only for devices that are zoned, namely those with a model != ZBD_NONE.
bfbdd35b
BVA
857 */
static int parse_zone_info(struct thread_data *td, struct fio_file *f)
{
	int nr_zones, nrz;
	struct zbd_zone *zones, *z;
	struct fio_zone_info *p;
	uint64_t zone_size, offset, capacity;
	bool same_zone_cap = true;
	struct zoned_block_device_info *zbd_info = NULL;
	int i, j, ret = -ENOMEM;

	/* Scratch buffer for one batch of reported zones. */
	zones = calloc(ZBD_REPORT_MAX_ZONES, sizeof(struct zbd_zone));
	if (!zones)
		goto out;

	nrz = zbd_report_zones(td, f, 0, zones, ZBD_REPORT_MAX_ZONES);
	if (nrz < 0) {
		ret = nrz;
		log_info("fio: report zones (offset 0) failed for %s (%d).\n",
			 f->file_name, -ret);
		goto out;
	}

	/* Assume all zones share the first zone's size (validated below). */
	zone_size = zones[0].len;
	capacity = zones[0].capacity;
	nr_zones = (f->real_file_size + zone_size - 1) / zone_size;

	if (td->o.zone_size == 0) {
		td->o.zone_size = zone_size;
	} else if (td->o.zone_size != zone_size) {
		log_err("fio: %s job parameter zonesize %llu does not match disk zone size %"PRIu64".\n",
			f->file_name, td->o.zone_size, zone_size);
		ret = -EINVAL;
		goto out;
	}

	dprint(FD_ZBD, "Device %s has %d zones of size %"PRIu64" KB\n",
	       f->file_name, nr_zones, zone_size / 1024);

	/* One extra element serves as the end sentinel. */
	zbd_info = scalloc(1, sizeof(*zbd_info) +
			   (nr_zones + 1) * sizeof(zbd_info->zone_info[0]));
	if (!zbd_info)
		goto out;
	mutex_init_pshared(&zbd_info->mutex);
	zbd_info->refcount = 1;
	p = &zbd_info->zone_info[0];
	/* Outer loop: one report-zones batch per iteration. */
	for (offset = 0, j = 0; j < nr_zones;) {
		z = &zones[0];
		for (i = 0; i < nrz; i++, j++, z++, p++) {
			mutex_init_pshared_with_type(&p->mutex,
						     PTHREAD_MUTEX_RECURSIVE);
			p->start = z->start;
			p->capacity = z->capacity;
			if (capacity != z->capacity)
				same_zone_cap = false;

			switch (z->cond) {
			case ZBD_ZONE_COND_NOT_WP:
			case ZBD_ZONE_COND_FULL:
				/* No usable write pointer: treat as full. */
				p->wp = p->start + p->capacity;
				break;
			default:
				assert(z->start <= z->wp);
				assert(z->wp <= z->start + zone_size);
				p->wp = z->wp;
				break;
			}

			/* Only sequential-write-required zones track a wp. */
			switch (z->type) {
			case ZBD_ZONE_TYPE_SWR:
				p->has_wp = 1;
				break;
			default:
				p->has_wp = 0;
			}
			p->type = z->type;
			p->cond = z->cond;

			/* Zones must be contiguous and uniformly sized. */
			if (j > 0 && p->start != p[-1].start + zone_size) {
				log_info("%s: invalid zone data [%d:%d]: %"PRIu64" + %"PRIu64" != %"PRIu64"\n",
					 f->file_name, j, i,
					 p[-1].start, zone_size, p->start);
				ret = -EINVAL;
				goto out;
			}
		}
		z--;
		/* Next batch starts right after the last reported zone. */
		offset = z->start + z->len;
		if (j >= nr_zones)
			break;

		nrz = zbd_report_zones(td, f, offset, zones,
				       min((uint32_t)(nr_zones - j),
					   ZBD_REPORT_MAX_ZONES));
		if (nrz < 0) {
			ret = nrz;
			log_info("fio: report zones (offset %"PRIu64") failed for %s (%d).\n",
				 offset, f->file_name, -ret);
			goto out;
		}
	}

	/* a sentinel */
	zbd_info->zone_info[nr_zones].start = offset;

	f->zbd_info = zbd_info;
	f->zbd_info->zone_size = zone_size;
	/* Power-of-two zone sizes enable shift-based index math. */
	f->zbd_info->zone_size_log2 = is_power_of_2(zone_size) ?
		ilog2(zone_size) : 0;
	f->zbd_info->nr_zones = nr_zones;
	f->zbd_info->max_active_zones = zbd_get_max_active_zones(td, f);

	if (same_zone_cap)
		dprint(FD_ZBD, "Zone capacity = %"PRIu64" KB\n",
		       capacity / 1024);

	/* Ownership transferred to f->zbd_info; keep it out of cleanup. */
	zbd_info = NULL;
	ret = 0;

out:
	sfree(zbd_info);
	free(zones);
	return ret;
}
981
a4807046 982static int zbd_set_max_write_zones(struct thread_data *td, struct fio_file *f)
d2f442bc
NC
983{
984 struct zoned_block_device_info *zbd = f->zbd_info;
985 unsigned int max_open_zones;
986 int ret;
987
575686bb 988 if (zbd->model != ZBD_HOST_MANAGED || td->o.ignore_zone_limits) {
d2f442bc 989 /* Only host-managed devices have a max open limit */
a4807046 990 zbd->max_write_zones = td->o.max_open_zones;
d2f442bc
NC
991 goto out;
992 }
993
994 /* If host-managed, get the max open limit */
995 ret = zbd_get_max_open_zones(td, f, &max_open_zones);
996 if (ret)
997 return ret;
998
999 if (!max_open_zones) {
1000 /* No device limit */
a4807046 1001 zbd->max_write_zones = td->o.max_open_zones;
d2f442bc
NC
1002 } else if (!td->o.max_open_zones) {
1003 /* No user limit. Set limit to device limit */
a4807046 1004 zbd->max_write_zones = max_open_zones;
d2f442bc
NC
1005 } else if (td->o.max_open_zones <= max_open_zones) {
1006 /* Both user limit and dev limit. User limit not too large */
a4807046 1007 zbd->max_write_zones = td->o.max_open_zones;
d2f442bc
NC
1008 } else {
1009 /* Both user limit and dev limit. User limit too large */
1010 td_verror(td, EINVAL,
1011 "Specified --max_open_zones is too large");
1012 log_err("Specified --max_open_zones (%d) is larger than max (%u)\n",
1013 td->o.max_open_zones, max_open_zones);
1014 return -EINVAL;
1015 }
1016
1017out:
1018 /* Ensure that the limit is not larger than FIO's internal limit */
a4807046 1019 if (zbd->max_write_zones > ZBD_MAX_WRITE_ZONES) {
b346af90 1020 td_verror(td, EINVAL, "'max_open_zones' value is too large");
139d8dc6 1021 log_err("'max_open_zones' value is larger than %u\n",
a4807046 1022 ZBD_MAX_WRITE_ZONES);
b346af90
NC
1023 return -EINVAL;
1024 }
1025
a4807046
SK
1026 dprint(FD_ZBD, "%s: using max write zones limit: %"PRIu32"\n",
1027 f->file_name, zbd->max_write_zones);
d2f442bc
NC
1028
1029 return 0;
1030}
1031
bfbdd35b
BVA
1032/*
1033 * Allocate zone information and store it into f->zbd_info if zonemode=zbd.
1034 *
1035 * Returns 0 upon success and a negative error code upon failure.
1036 */
379e5f09 1037static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
bfbdd35b 1038{
b7694961
DLM
1039 enum zbd_zoned_model zbd_model;
1040 int ret;
bfbdd35b
BVA
1041
1042 assert(td->o.zone_mode == ZONE_MODE_ZBD);
1043
b7694961
DLM
1044 ret = zbd_get_zoned_model(td, f, &zbd_model);
1045 if (ret)
1046 return ret;
1047
bfbdd35b 1048 switch (zbd_model) {
b7694961
DLM
1049 case ZBD_HOST_AWARE:
1050 case ZBD_HOST_MANAGED:
bfbdd35b 1051 ret = parse_zone_info(td, f);
d2f442bc
NC
1052 if (ret)
1053 return ret;
bfbdd35b 1054 break;
b7694961 1055 case ZBD_NONE:
bfbdd35b 1056 ret = init_zone_info(td, f);
d2f442bc
NC
1057 if (ret)
1058 return ret;
bfbdd35b 1059 break;
b7694961
DLM
1060 default:
1061 td_verror(td, EINVAL, "Unsupported zoned model");
1062 log_err("Unsupported zoned model\n");
1063 return -EINVAL;
bfbdd35b 1064 }
b7694961 1065
2c7dd23e 1066 assert(f->zbd_info);
d2f442bc
NC
1067 f->zbd_info->model = zbd_model;
1068
a4807046 1069 ret = zbd_set_max_write_zones(td, f);
d2f442bc
NC
1070 if (ret) {
1071 zbd_free_zone_info(f);
1072 return ret;
219c662d 1073 }
d2f442bc
NC
1074
1075 return 0;
bfbdd35b
BVA
1076}
1077
1078void zbd_free_zone_info(struct fio_file *f)
1079{
1080 uint32_t refcount;
1081
3c1dc34c 1082 assert(f->zbd_info);
bfbdd35b
BVA
1083
1084 pthread_mutex_lock(&f->zbd_info->mutex);
1085 refcount = --f->zbd_info->refcount;
1086 pthread_mutex_unlock(&f->zbd_info->mutex);
1087
1088 assert((int32_t)refcount >= 0);
1089 if (refcount == 0)
1090 sfree(f->zbd_info);
1091 f->zbd_info = NULL;
1092}
1093
1094/*
1095 * Initialize f->zbd_info.
1096 *
1097 * Returns 0 upon success and a negative error code upon failure.
1098 *
1099 * Note: this function can only work correctly if it is called before the first
1100 * fio fork() call.
1101 */
1102static int zbd_init_zone_info(struct thread_data *td, struct fio_file *file)
1103{
bfbdd35b 1104 struct fio_file *f2;
da8f124f 1105 int j, ret;
bfbdd35b 1106
da8f124f 1107 for_each_td(td2) {
bfbdd35b
BVA
1108 for_each_file(td2, f2, j) {
1109 if (td2 == td && f2 == file)
1110 continue;
1111 if (!f2->zbd_info ||
1112 strcmp(f2->file_name, file->file_name) != 0)
1113 continue;
1114 file->zbd_info = f2->zbd_info;
1115 file->zbd_info->refcount++;
1116 return 0;
1117 }
da8f124f 1118 } end_for_each();
bfbdd35b
BVA
1119
1120 ret = zbd_create_zone_info(td, file);
1121 if (ret < 0)
c5837eec 1122 td_verror(td, -ret, "zbd_create_zone_info() failed");
139d8dc6 1123
bfbdd35b
BVA
1124 return ret;
1125}
1126
8f39afa7 1127int zbd_init_files(struct thread_data *td)
bfbdd35b
BVA
1128{
1129 struct fio_file *f;
1130 int i;
1131
1132 for_each_file(td, f, i) {
a4b7f12b 1133 if (zbd_init_zone_info(td, f))
bfbdd35b 1134 return 1;
bfbdd35b 1135 }
139d8dc6 1136
8f39afa7
AD
1137 return 0;
1138}
1139
1140void zbd_recalc_options_with_zone_granularity(struct thread_data *td)
1141{
1142 struct fio_file *f;
1143 int i;
1144
1145 for_each_file(td, f, i) {
1146 struct zoned_block_device_info *zbd = f->zbd_info;
139d8dc6 1147 uint64_t zone_size;
8f39afa7 1148
139d8dc6
DLM
1149 /* zonemode=strided doesn't get per-file zone size. */
1150 zone_size = zbd ? zbd->zone_size : td->o.zone_size;
8f39afa7
AD
1151 if (zone_size == 0)
1152 continue;
1153
139d8dc6 1154 if (td->o.size_nz > 0)
8f39afa7 1155 td->o.size = td->o.size_nz * zone_size;
139d8dc6 1156 if (td->o.io_size_nz > 0)
8f39afa7 1157 td->o.io_size = td->o.io_size_nz * zone_size;
139d8dc6 1158 if (td->o.start_offset_nz > 0)
8f39afa7 1159 td->o.start_offset = td->o.start_offset_nz * zone_size;
139d8dc6
DLM
1160 if (td->o.offset_increment_nz > 0)
1161 td->o.offset_increment =
1162 td->o.offset_increment_nz * zone_size;
1163 if (td->o.zone_skip_nz > 0)
8f39afa7 1164 td->o.zone_skip = td->o.zone_skip_nz * zone_size;
8f39afa7
AD
1165 }
1166}
1167
9fb714da
SK
1168static uint64_t zbd_verify_and_set_vdb(struct thread_data *td,
1169 const struct fio_file *f)
1170{
1171 struct fio_zone_info *zb, *ze, *z;
1172 uint64_t wp_vdb = 0;
1173 struct zoned_block_device_info *zbdi = f->zbd_info;
1174
1175 assert(td->runstate < TD_RUNNING);
1176 assert(zbdi);
1177
1178 if (!accounting_vdb(td, f))
1179 return 0;
1180
1181 /*
1182 * Ensure that the I/O range includes one or more sequential zones so
1183 * that f->min_zone and f->max_zone have different values.
1184 */
1185 if (!zbd_is_seq_job(f))
1186 return 0;
1187
1188 if (zbdi->write_min_zone != zbdi->write_max_zone) {
1189 if (zbdi->write_min_zone != f->min_zone ||
1190 zbdi->write_max_zone != f->max_zone) {
1191 td_verror(td, EINVAL,
1192 "multi-jobs with different write ranges are "
1193 "not supported with zone_reset_threshold");
1194 log_err("multi-jobs with different write ranges are "
1195 "not supported with zone_reset_threshold\n");
1196 }
1197 return 0;
1198 }
1199
1200 zbdi->write_min_zone = f->min_zone;
1201 zbdi->write_max_zone = f->max_zone;
1202
1203 zb = zbd_get_zone(f, f->min_zone);
1204 ze = zbd_get_zone(f, f->max_zone);
1205 for (z = zb; z < ze; z++)
1206 if (z->has_wp)
1207 wp_vdb += z->wp - z->start;
1208
1209 zbdi->wp_valid_data_bytes = wp_vdb;
1210
1211 return wp_vdb;
1212}
1213
8f39afa7
AD
1214int zbd_setup_files(struct thread_data *td)
1215{
1216 struct fio_file *f;
1217 int i;
bfbdd35b
BVA
1218
1219 if (!zbd_using_direct_io()) {
1220 log_err("Using direct I/O is mandatory for writing to ZBD drives\n\n");
1221 return 1;
1222 }
1223
1224 if (!zbd_verify_sizes())
1225 return 1;
1226
1227 if (!zbd_verify_bs())
1228 return 1;
1229
6e2da06a
SK
1230 if (td->o.experimental_verify) {
1231 log_err("zonemode=zbd does not support experimental verify\n");
1232 return 1;
1233 }
1234
219c662d
AD
1235 for_each_file(td, f, i) {
1236 struct zoned_block_device_info *zbd = f->zbd_info;
954217b9
SK
1237 struct fio_zone_info *z;
1238 int zi;
9fb714da 1239 uint64_t vdb;
219c662d 1240
5ddf46d0 1241 assert(zbd);
219c662d 1242
dc8a3d62
DLM
1243 f->min_zone = zbd_offset_to_zone_idx(f, f->file_offset);
1244 f->max_zone =
1245 zbd_offset_to_zone_idx(f, f->file_offset + f->io_size);
f952800a 1246
9fb714da
SK
1247 vdb = zbd_verify_and_set_vdb(td, f);
1248
1249 dprint(FD_ZBD, "%s(%s): valid data bytes = %" PRIu64 "\n",
1250 __func__, f->file_name, vdb);
1251
f952800a
SK
1252 /*
1253 * When all zones in the I/O range are conventional, io_size
1254 * can be smaller than zone size, making min_zone the same
1255 * as max_zone. This is why the assert below needs to be made
1256 * conditional.
1257 */
1258 if (zbd_is_seq_job(f))
1259 assert(f->min_zone < f->max_zone);
1260
219c662d 1261 if (td->o.max_open_zones > 0 &&
a4807046 1262 zbd->max_write_zones != td->o.max_open_zones) {
219c662d
AD
1263 log_err("Different 'max_open_zones' values\n");
1264 return 1;
1265 }
b346af90
NC
1266
1267 /*
1268 * The per job max open zones limit cannot be used without a
1269 * global max open zones limit. (As the tracking of open zones
1270 * is disabled when there is no global max open zones limit.)
1271 */
a4807046 1272 if (td->o.job_max_open_zones && !zbd->max_write_zones) {
b346af90 1273 log_err("'job_max_open_zones' cannot be used without a global open zones limit\n");
219c662d
AD
1274 return 1;
1275 }
954217b9 1276
ea51055c 1277 /*
a4807046 1278 * zbd->max_write_zones is the global limit shared for all jobs
ea51055c
NC
1279 * that target the same zoned block device. Force sync the per
1280 * thread global limit with the actual global limit. (The real
1281 * per thread/job limit is stored in td->o.job_max_open_zones).
1282 */
a4807046 1283 td->o.max_open_zones = zbd->max_write_zones;
ea51055c 1284
954217b9
SK
1285 for (zi = f->min_zone; zi < f->max_zone; zi++) {
1286 z = &zbd->zone_info[zi];
1287 if (z->cond != ZBD_ZONE_COND_IMP_OPEN &&
bab838f8
SK
1288 z->cond != ZBD_ZONE_COND_EXP_OPEN &&
1289 z->cond != ZBD_ZONE_COND_CLOSED)
1290 continue;
1291 if (!zbd->max_active_zones &&
1292 z->cond == ZBD_ZONE_COND_CLOSED)
954217b9 1293 continue;
f539b98c 1294 if (__zbd_write_zone_get(td, f, z))
954217b9
SK
1295 continue;
1296 /*
1297 * If the number of open zones exceeds specified limits,
8ac76889 1298 * error out.
954217b9 1299 */
8ac76889
SK
1300 log_err("Number of open zones exceeds max_open_zones limit\n");
1301 return 1;
954217b9 1302 }
219c662d
AD
1303 }
1304
bfbdd35b
BVA
1305 return 0;
1306}
1307
a7c2b6fc
BVA
1308/*
1309 * Reset zbd_info.write_cnt, the counter that counts down towards the next
1310 * zone reset.
1311 */
1bb1bcad
AD
1312static void _zbd_reset_write_cnt(const struct thread_data *td,
1313 const struct fio_file *f)
a7c2b6fc
BVA
1314{
1315 assert(0 <= td->o.zrf.u.f && td->o.zrf.u.f <= 1);
1316
a7c2b6fc
BVA
1317 f->zbd_info->write_cnt = td->o.zrf.u.f ?
1318 min(1.0 / td->o.zrf.u.f, 0.0 + UINT_MAX) : UINT_MAX;
1bb1bcad
AD
1319}
1320
1321static void zbd_reset_write_cnt(const struct thread_data *td,
1322 const struct fio_file *f)
1323{
1324 pthread_mutex_lock(&f->zbd_info->mutex);
1325 _zbd_reset_write_cnt(td, f);
a7c2b6fc
BVA
1326 pthread_mutex_unlock(&f->zbd_info->mutex);
1327}
1328
1329static bool zbd_dec_and_reset_write_cnt(const struct thread_data *td,
1330 const struct fio_file *f)
1331{
1332 uint32_t write_cnt = 0;
1333
1334 pthread_mutex_lock(&f->zbd_info->mutex);
1335 assert(f->zbd_info->write_cnt);
1336 if (f->zbd_info->write_cnt)
1337 write_cnt = --f->zbd_info->write_cnt;
1338 if (write_cnt == 0)
1bb1bcad 1339 _zbd_reset_write_cnt(td, f);
a7c2b6fc
BVA
1340 pthread_mutex_unlock(&f->zbd_info->mutex);
1341
1342 return write_cnt == 0;
1343}
1344
bfbdd35b
BVA
1345void zbd_file_reset(struct thread_data *td, struct fio_file *f)
1346{
91d25131 1347 struct fio_zone_info *zb, *ze;
c5c8b92b 1348 bool verify_data_left = false;
bfbdd35b 1349
767d1372 1350 if (!f->zbd_info || !td_write(td))
bfbdd35b
BVA
1351 return;
1352
39e06ee7
DLM
1353 zb = zbd_get_zone(f, f->min_zone);
1354 ze = zbd_get_zone(f, f->max_zone);
139d8dc6 1355
bfbdd35b
BVA
1356 /*
1357 * If data verification is enabled reset the affected zones before
1358 * writing any data to avoid that a zone reset has to be issued while
1359 * writing data, which causes data loss.
1360 */
c5c8b92b
SK
1361 if (td->o.verify != VERIFY_NONE) {
1362 verify_data_left = td->runstate == TD_VERIFYING ||
1363 td->io_hist_len || td->verify_batch;
1364 if (td->io_hist_len && td->o.verify_backlog)
1365 verify_data_left =
1366 td->io_hist_len % td->o.verify_backlog;
1367 if (!verify_data_left)
1368 zbd_reset_zones(td, f, zb, ze);
1369 }
1370
a7c2b6fc 1371 zbd_reset_write_cnt(td, f);
bfbdd35b
BVA
1372}
1373
a4807046 1374/* Return random zone index for one of the write target zones. */
6463db6c
AD
1375static uint32_t pick_random_zone_idx(const struct fio_file *f,
1376 const struct io_u *io_u)
1377{
139d8dc6 1378 return (io_u->offset - f->file_offset) *
a4807046 1379 f->zbd_info->num_write_zones / f->io_size;
6463db6c
AD
1380}
1381
0f77c977
SK
1382static bool any_io_in_flight(void)
1383{
da8f124f 1384 for_each_td(td) {
0f77c977
SK
1385 if (td->io_u_in_flight)
1386 return true;
da8f124f 1387 } end_for_each();
0f77c977
SK
1388
1389 return false;
1390}
1391
59b07544 1392/*
a4807046
SK
1393 * Modify the offset of an I/O unit that does not refer to a zone such that
1394 * in write target zones array. Add a zone to or remove a zone from the lsit if
1395 * necessary. The write target zone is searched across sequential zones.
21c0c884 1396 * This algorithm can only work correctly if all write pointers are
59b07544
BVA
1397 * a multiple of the fio block size. The caller must neither hold z->mutex
1398 * nor f->zbd_info->mutex. Returns with z->mutex held upon success.
1399 */
a4807046
SK
1400static struct fio_zone_info *zbd_convert_to_write_zone(struct thread_data *td,
1401 struct io_u *io_u)
59b07544 1402{
07fc3f57 1403 const uint64_t min_bs = td->o.min_bs[io_u->ddir];
fae3b9a0 1404 struct fio_file *f = io_u->file;
af94a8c3 1405 struct zoned_block_device_info *zbdi = f->zbd_info;
59b07544 1406 struct fio_zone_info *z;
a4807046 1407 unsigned int write_zone_idx = -1;
59b07544
BVA
1408 uint32_t zone_idx, new_zone_idx;
1409 int i;
a4807046 1410 bool wait_zone_write;
0f77c977
SK
1411 bool in_flight;
1412 bool should_retry = true;
59b07544
BVA
1413
1414 assert(is_valid_offset(f, io_u->offset));
1415
a4807046 1416 if (zbdi->max_write_zones || td->o.job_max_open_zones) {
59b07544 1417 /*
a4807046 1418 * This statement accesses zbdi->write_zones[] on purpose
59b07544
BVA
1419 * without locking.
1420 */
a4807046 1421 zone_idx = zbdi->write_zones[pick_random_zone_idx(f, io_u)];
59b07544 1422 } else {
dc8a3d62 1423 zone_idx = zbd_offset_to_zone_idx(f, io_u->offset);
59b07544 1424 }
fae3b9a0
AD
1425 if (zone_idx < f->min_zone)
1426 zone_idx = f->min_zone;
1427 else if (zone_idx >= f->max_zone)
1428 zone_idx = f->max_zone - 1;
139d8dc6
DLM
1429
1430 dprint(FD_ZBD,
1431 "%s(%s): starting from zone %d (offset %lld, buflen %lld)\n",
59b07544
BVA
1432 __func__, f->file_name, zone_idx, io_u->offset, io_u->buflen);
1433
1434 /*
af94a8c3 1435 * Since z->mutex is the outer lock and zbdi->mutex the inner
59b07544 1436 * lock it can happen that the state of the zone with index zone_idx
af94a8c3 1437 * has changed after 'z' has been assigned and before zbdi->mutex
59b07544
BVA
1438 * has been obtained. Hence the loop.
1439 */
1440 for (;;) {
6463db6c
AD
1441 uint32_t tmp_idx;
1442
39e06ee7 1443 z = zbd_get_zone(f, zone_idx);
14351148
DF
1444 if (z->has_wp)
1445 zone_lock(td, f, z);
139d8dc6 1446
af94a8c3 1447 pthread_mutex_lock(&zbdi->mutex);
139d8dc6 1448
14351148
DF
1449 if (z->has_wp) {
1450 if (z->cond != ZBD_ZONE_COND_OFFLINE &&
a4807046 1451 zbdi->max_write_zones == 0 &&
139d8dc6 1452 td->o.job_max_open_zones == 0)
14351148 1453 goto examine_zone;
a4807046
SK
1454 if (zbdi->num_write_zones == 0) {
1455 dprint(FD_ZBD, "%s(%s): no zone is write target\n",
14351148 1456 __func__, f->file_name);
a4807046 1457 goto choose_other_zone;
14351148 1458 }
59b07544 1459 }
6463db6c
AD
1460
1461 /*
a4807046 1462 * Array of write target zones is per-device, shared across all
139d8dc6
DLM
1463 * threads. Start with quasi-random candidate zone. Ignore
1464 * zones which don't belong to thread's offset/size area.
6463db6c 1465 */
a4807046
SK
1466 write_zone_idx = pick_random_zone_idx(f, io_u);
1467 assert(!write_zone_idx ||
1468 write_zone_idx < zbdi->num_write_zones);
1469 tmp_idx = write_zone_idx;
139d8dc6 1470
a4807046 1471 for (i = 0; i < zbdi->num_write_zones; i++) {
6463db6c
AD
1472 uint32_t tmpz;
1473
a4807046 1474 if (tmp_idx >= zbdi->num_write_zones)
6463db6c 1475 tmp_idx = 0;
a4807046 1476 tmpz = zbdi->write_zones[tmp_idx];
fae3b9a0 1477 if (f->min_zone <= tmpz && tmpz < f->max_zone) {
a4807046 1478 write_zone_idx = tmp_idx;
6463db6c
AD
1479 goto found_candidate_zone;
1480 }
1481
1482 tmp_idx++;
1483 }
1484
1485 dprint(FD_ZBD, "%s(%s): no candidate zone\n",
1486 __func__, f->file_name);
139d8dc6 1487
af94a8c3 1488 pthread_mutex_unlock(&zbdi->mutex);
139d8dc6 1489
14351148
DF
1490 if (z->has_wp)
1491 zone_unlock(z);
139d8dc6 1492
6463db6c
AD
1493 return NULL;
1494
1495found_candidate_zone:
a4807046 1496 new_zone_idx = zbdi->write_zones[write_zone_idx];
59b07544
BVA
1497 if (new_zone_idx == zone_idx)
1498 break;
1499 zone_idx = new_zone_idx;
139d8dc6 1500
af94a8c3 1501 pthread_mutex_unlock(&zbdi->mutex);
139d8dc6 1502
14351148
DF
1503 if (z->has_wp)
1504 zone_unlock(z);
59b07544
BVA
1505 }
1506
af94a8c3 1507 /* Both z->mutex and zbdi->mutex are held. */
59b07544
BVA
1508
1509examine_zone:
df67bf1e 1510 if (zbd_zone_remainder(z) >= min_bs) {
af94a8c3 1511 pthread_mutex_unlock(&zbdi->mutex);
59b07544
BVA
1512 goto out;
1513 }
b2da58c4 1514
a4807046
SK
1515choose_other_zone:
1516 /* Check if number of write target zones reaches one of limits. */
1517 wait_zone_write =
1518 zbdi->num_write_zones == f->max_zone - f->min_zone ||
1519 (zbdi->max_write_zones &&
1520 zbdi->num_write_zones == zbdi->max_write_zones) ||
b2da58c4 1521 (td->o.job_max_open_zones &&
a4807046 1522 td->num_write_zones == td->o.job_max_open_zones);
b2da58c4 1523
af94a8c3 1524 pthread_mutex_unlock(&zbdi->mutex);
59b07544
BVA
1525
1526 /* Only z->mutex is held. */
1527
b2da58c4 1528 /*
a4807046
SK
1529 * When number of write target zones reaches to one of limits, wait for
1530 * zone write completion to one of them before trying a new zone.
b2da58c4 1531 */
a4807046 1532 if (wait_zone_write) {
139d8dc6 1533 dprint(FD_ZBD,
a4807046 1534 "%s(%s): quiesce to remove a zone from write target zones array\n",
b2da58c4
SK
1535 __func__, f->file_name);
1536 io_u_quiesce(td);
1537 }
1538
0f77c977 1539retry:
a4807046 1540 /* Zone 'z' is full, so try to choose a new zone. */
af94a8c3 1541 for (i = f->io_size / zbdi->zone_size; i > 0; i--) {
59b07544 1542 zone_idx++;
21c0c884
SK
1543 if (z->has_wp)
1544 zone_unlock(z);
59b07544 1545 z++;
ee3696bd 1546 if (!is_valid_offset(f, z->start)) {
59b07544 1547 /* Wrap-around. */
fae3b9a0 1548 zone_idx = f->min_zone;
39e06ee7 1549 z = zbd_get_zone(f, zone_idx);
59b07544 1550 }
ee3696bd 1551 assert(is_valid_offset(f, z->start));
21c0c884
SK
1552 if (!z->has_wp)
1553 continue;
fae3b9a0 1554 zone_lock(td, f, z);
a4807046 1555 if (z->write)
59b07544 1556 continue;
a4807046 1557 if (zbd_write_zone_get(td, f, z))
59b07544
BVA
1558 goto out;
1559 }
1560
1561 /* Only z->mutex is held. */
1562
a4807046 1563 /* Check whether the write fits in any of the write target zones. */
af94a8c3 1564 pthread_mutex_lock(&zbdi->mutex);
a4807046
SK
1565 for (i = 0; i < zbdi->num_write_zones; i++) {
1566 zone_idx = zbdi->write_zones[i];
fae3b9a0
AD
1567 if (zone_idx < f->min_zone || zone_idx >= f->max_zone)
1568 continue;
af94a8c3 1569 pthread_mutex_unlock(&zbdi->mutex);
4d4c71e6 1570 zone_unlock(z);
59b07544 1571
39e06ee7 1572 z = zbd_get_zone(f, zone_idx);
59b07544 1573
fae3b9a0 1574 zone_lock(td, f, z);
df67bf1e 1575 if (zbd_zone_remainder(z) >= min_bs)
59b07544 1576 goto out;
af94a8c3 1577 pthread_mutex_lock(&zbdi->mutex);
59b07544 1578 }
0f77c977
SK
1579
1580 /*
1581 * When any I/O is in-flight or when all I/Os in-flight get completed,
a4807046
SK
1582 * the I/Os might have removed zones from the write target array then
1583 * retry the steps to choose a zone. Before retry, call io_u_quiesce()
1584 * to complete in-flight writes.
0f77c977
SK
1585 */
1586 in_flight = any_io_in_flight();
1587 if (in_flight || should_retry) {
139d8dc6 1588 dprint(FD_ZBD,
a4807046 1589 "%s(%s): wait zone write and retry write target zone selection\n",
0f77c977 1590 __func__, f->file_name);
62ac6649 1591 should_retry = in_flight;
0f77c977
SK
1592 pthread_mutex_unlock(&zbdi->mutex);
1593 zone_unlock(z);
1594 io_u_quiesce(td);
1595 zone_lock(td, f, z);
0f77c977
SK
1596 goto retry;
1597 }
1598
af94a8c3 1599 pthread_mutex_unlock(&zbdi->mutex);
139d8dc6 1600
4d4c71e6 1601 zone_unlock(z);
139d8dc6 1602
a4807046 1603 dprint(FD_ZBD, "%s(%s): did not choose another write zone\n",
139d8dc6
DLM
1604 __func__, f->file_name);
1605
59b07544
BVA
1606 return NULL;
1607
1608out:
139d8dc6
DLM
1609 dprint(FD_ZBD, "%s(%s): returning zone %d\n",
1610 __func__, f->file_name, zone_idx);
1611
ee3696bd 1612 io_u->offset = z->start;
21c0c884 1613 assert(z->has_wp);
8a866de7 1614 assert(z->cond != ZBD_ZONE_COND_OFFLINE);
139d8dc6 1615
59b07544
BVA
1616 return z;
1617}
1618
bfbdd35b 1619/*
5c86fdf6
SK
1620 * Find another zone which has @min_bytes of readable data. Search in zones
1621 * @zb + 1 .. @zl. For random workload, also search in zones @zb - 1 .. @zf.
bfbdd35b 1622 *
21c0c884
SK
1623 * Either returns NULL or returns a zone pointer. When the zone has write
1624 * pointer, hold the mutex for the zone.
bfbdd35b
BVA
1625 */
1626static struct fio_zone_info *
07fc3f57 1627zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint64_t min_bytes,
bfbdd35b
BVA
1628 struct fio_zone_info *zb, struct fio_zone_info *zl)
1629{
fae3b9a0 1630 struct fio_file *f = io_u->file;
bfbdd35b 1631 struct fio_zone_info *z1, *z2;
39e06ee7 1632 const struct fio_zone_info *const zf = zbd_get_zone(f, f->min_zone);
bfbdd35b
BVA
1633
1634 /*
1635 * Skip to the next non-empty zone in case of sequential I/O and to
1636 * the nearest non-empty zone in case of random I/O.
1637 */
1638 for (z1 = zb + 1, z2 = zb - 1; z1 < zl || z2 >= zf; z1++, z2--) {
b7694961 1639 if (z1 < zl && z1->cond != ZBD_ZONE_COND_OFFLINE) {
21c0c884
SK
1640 if (z1->has_wp)
1641 zone_lock(td, f, z1);
5c86fdf6 1642 if (z1->start + min_bytes <= z1->wp)
bfbdd35b 1643 return z1;
21c0c884
SK
1644 if (z1->has_wp)
1645 zone_unlock(z1);
bfbdd35b
BVA
1646 } else if (!td_random(td)) {
1647 break;
1648 }
139d8dc6 1649
bfbdd35b 1650 if (td_random(td) && z2 >= zf &&
b7694961 1651 z2->cond != ZBD_ZONE_COND_OFFLINE) {
21c0c884
SK
1652 if (z2->has_wp)
1653 zone_lock(td, f, z2);
5c86fdf6 1654 if (z2->start + min_bytes <= z2->wp)
bfbdd35b 1655 return z2;
21c0c884
SK
1656 if (z2->has_wp)
1657 zone_unlock(z2);
bfbdd35b
BVA
1658 }
1659 }
139d8dc6
DLM
1660
1661 dprint(FD_ZBD,
1662 "%s: no zone has %"PRIu64" bytes of readable data\n",
5c86fdf6 1663 f->file_name, min_bytes);
139d8dc6 1664
bfbdd35b
BVA
1665 return NULL;
1666}
1667
b2da58c4
SK
1668/**
1669 * zbd_end_zone_io - update zone status at command completion
1670 * @io_u: I/O unit
1671 * @z: zone info pointer
1672 *
a4807046
SK
1673 * If the write command made the zone full, remove it from the write target
1674 * zones array.
b2da58c4
SK
1675 *
1676 * The caller must hold z->mutex.
1677 */
1678static void zbd_end_zone_io(struct thread_data *td, const struct io_u *io_u,
1679 struct fio_zone_info *z)
1680{
1681 const struct fio_file *f = io_u->file;
1682
1683 if (io_u->ddir == DDIR_WRITE &&
1684 io_u->offset + io_u->buflen >= zbd_zone_capacity_end(z)) {
1685 pthread_mutex_lock(&f->zbd_info->mutex);
a4807046 1686 zbd_write_zone_put(td, f, z);
b2da58c4
SK
1687 pthread_mutex_unlock(&f->zbd_info->mutex);
1688 }
1689}
1690
bfbdd35b 1691/**
d9ed3e63 1692 * zbd_queue_io - update the write pointer of a sequential zone
bfbdd35b 1693 * @io_u: I/O unit
d9ed3e63
DLM
1694 * @success: Whether or not the I/O unit has been queued successfully
1695 * @q: queueing status (busy, completed or queued).
bfbdd35b 1696 *
d9ed3e63
DLM
1697 * For write and trim operations, update the write pointer of the I/O unit
1698 * target zone.
bfbdd35b 1699 */
b2da58c4
SK
1700static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int q,
1701 bool success)
bfbdd35b 1702{
d9ed3e63
DLM
1703 const struct fio_file *f = io_u->file;
1704 struct zoned_block_device_info *zbd_info = f->zbd_info;
bfbdd35b 1705 struct fio_zone_info *z;
d9ed3e63 1706 uint64_t zone_end;
bfbdd35b 1707
5ddf46d0 1708 assert(zbd_info);
bfbdd35b 1709
53aa6171 1710 z = zbd_offset_to_zone(f, io_u->offset);
43bcbd5b 1711 assert(z->has_wp);
d9ed3e63 1712
bfbdd35b
BVA
1713 if (!success)
1714 goto unlock;
d9ed3e63
DLM
1715
1716 dprint(FD_ZBD,
1717 "%s: queued I/O (%lld, %llu) for zone %u\n",
53aa6171 1718 f->file_name, io_u->offset, io_u->buflen, zbd_zone_idx(f, z));
d9ed3e63 1719
bfbdd35b
BVA
1720 switch (io_u->ddir) {
1721 case DDIR_WRITE:
d9ed3e63 1722 zone_end = min((uint64_t)(io_u->offset + io_u->buflen),
236d23a8 1723 zbd_zone_capacity_end(z));
139d8dc6 1724
a7c2b6fc
BVA
1725 /*
1726 * z->wp > zone_end means that one or more I/O errors
1727 * have occurred.
1728 */
2fb29f27
SK
1729 if (accounting_vdb(td, f) && z->wp <= zone_end) {
1730 pthread_mutex_lock(&zbd_info->mutex);
d56a6df3 1731 zbd_info->wp_valid_data_bytes += zone_end - z->wp;
2fb29f27
SK
1732 pthread_mutex_unlock(&zbd_info->mutex);
1733 }
bfbdd35b
BVA
1734 z->wp = zone_end;
1735 break;
bfbdd35b
BVA
1736 default:
1737 break;
1738 }
d9ed3e63 1739
b2da58c4
SK
1740 if (q == FIO_Q_COMPLETED && !io_u->error)
1741 zbd_end_zone_io(td, io_u, z);
1742
bfbdd35b 1743unlock:
d9ed3e63
DLM
1744 if (!success || q != FIO_Q_QUEUED) {
1745 /* BUSY or COMPLETED: unlock the zone */
4d4c71e6 1746 zone_unlock(z);
d9ed3e63
DLM
1747 io_u->zbd_put_io = NULL;
1748 }
1749}
1750
1751/**
1752 * zbd_put_io - Unlock an I/O unit target zone lock
1753 * @io_u: I/O unit
1754 */
b2da58c4 1755static void zbd_put_io(struct thread_data *td, const struct io_u *io_u)
d9ed3e63
DLM
1756{
1757 const struct fio_file *f = io_u->file;
d9ed3e63 1758 struct fio_zone_info *z;
d9ed3e63 1759
83276370 1760 assert(f->zbd_info);
615555bb 1761
53aa6171 1762 z = zbd_offset_to_zone(f, io_u->offset);
43bcbd5b 1763 assert(z->has_wp);
d9ed3e63
DLM
1764
1765 dprint(FD_ZBD,
1766 "%s: terminate I/O (%lld, %llu) for zone %u\n",
53aa6171 1767 f->file_name, io_u->offset, io_u->buflen, zbd_zone_idx(f, z));
d9ed3e63 1768
b2da58c4
SK
1769 zbd_end_zone_io(td, io_u, z);
1770
4d4c71e6 1771 zone_unlock(z);
bfbdd35b
BVA
1772}
1773
9d87c646
DLM
/*
 * Windows and MacOS do not define this.
 */
#ifndef EREMOTEIO
#define EREMOTEIO 121 /* POSIX value */
#endif

/*
 * Return whether @error_code is one of the errno values zoned drives use to
 * report a write that was not aligned to the zone write pointer.
 */
bool zbd_unaligned_write(int error_code)
{
	return error_code == EIO || error_code == EREMOTEIO;
}
1790
4d37720a
DLM
1791/**
1792 * setup_zbd_zone_mode - handle zoneskip as necessary for ZBD drives
1793 * @td: FIO thread data.
1794 * @io_u: FIO I/O unit.
1795 *
1796 * For sequential workloads, change the file offset to skip zoneskip bytes when
1797 * no more IO can be performed in the current zone.
1798 * - For read workloads, zoneskip is applied when the io has reached the end of
1799 * the zone or the zone write position (when td->o.read_beyond_wp is false).
1800 * - For write workloads, zoneskip is applied when the zone is full.
1801 * This applies only to read and write operations.
1802 */
1803void setup_zbd_zone_mode(struct thread_data *td, struct io_u *io_u)
1804{
1805 struct fio_file *f = io_u->file;
1806 enum fio_ddir ddir = io_u->ddir;
1807 struct fio_zone_info *z;
4d37720a
DLM
1808
1809 assert(td->o.zone_mode == ZONE_MODE_ZBD);
1810 assert(td->o.zone_size);
5ddf46d0 1811 assert(f->zbd_info);
4d37720a 1812
53aa6171 1813 z = zbd_offset_to_zone(f, f->last_pos[ddir]);
236d23a8
SK
1814
1815 /*
1816 * When the zone capacity is smaller than the zone size and the I/O is
1817 * sequential write, skip to zone end if the latest position is at the
1818 * zone capacity limit.
1819 */
139d8dc6
DLM
1820 if (z->capacity < f->zbd_info->zone_size &&
1821 !td_random(td) && ddir == DDIR_WRITE &&
236d23a8
SK
1822 f->last_pos[ddir] >= zbd_zone_capacity_end(z)) {
1823 dprint(FD_ZBD,
1824 "%s: Jump from zone capacity limit to zone end:"
ee5e3436
SK
1825 " (%"PRIu64" -> %"PRIu64") for zone %u (%"PRIu64")\n",
1826 f->file_name, f->last_pos[ddir],
53aa6171 1827 zbd_zone_end(z), zbd_zone_idx(f, z), z->capacity);
236d23a8
SK
1828 td->io_skip_bytes += zbd_zone_end(z) - f->last_pos[ddir];
1829 f->last_pos[ddir] = zbd_zone_end(z);
1830 }
1831
4d37720a
DLM
1832 /*
1833 * zone_skip is valid only for sequential workloads.
1834 */
1835 if (td_random(td) || !td->o.zone_skip)
1836 return;
1837
1838 /*
1839 * It is time to switch to a new zone if:
1840 * - zone_bytes == zone_size bytes have already been accessed
1841 * - The last position reached the end of the current zone.
1842 * - For reads with td->o.read_beyond_wp == false, the last position
1843 * reached the zone write pointer.
1844 */
4d37720a 1845 if (td->zone_bytes >= td->o.zone_size ||
236d23a8 1846 f->last_pos[ddir] >= zbd_zone_end(z) ||
4d37720a
DLM
1847 (ddir == DDIR_READ &&
1848 (!td->o.read_beyond_wp) && f->last_pos[ddir] >= z->wp)) {
1849 /*
1850 * Skip zones.
1851 */
1852 td->zone_bytes = 0;
1853 f->file_offset += td->o.zone_size + td->o.zone_skip;
1854
1855 /*
1856 * Wrap from the beginning, if we exceed the file size
1857 */
1858 if (f->file_offset >= f->real_file_size)
1859 f->file_offset = get_start_offset(td, f);
1860
1861 f->last_pos[ddir] = f->file_offset;
1862 td->io_skip_bytes += td->o.zone_skip;
1863 }
1864}
1865
c65057f9 1866/**
c7d5e152 1867 * zbd_adjust_ddir - Adjust an I/O direction for zonemode=zbd.
c65057f9
SK
1868 *
1869 * @td: FIO thread data.
1870 * @io_u: FIO I/O unit.
1871 * @ddir: I/O direction before adjustment.
1872 *
1873 * Return adjusted I/O direction.
1874 */
1875enum fio_ddir zbd_adjust_ddir(struct thread_data *td, struct io_u *io_u,
1876 enum fio_ddir ddir)
1877{
1878 /*
1879 * In case read direction is chosen for the first random I/O, fio with
1880 * zonemode=zbd stops because no data can be read from zoned block
1881 * devices with all empty zones. Overwrite the first I/O direction as
1882 * write to make sure data to read exists.
1883 */
5ddf46d0 1884 assert(io_u->file->zbd_info);
731461cc 1885 if (ddir != DDIR_READ || !td_rw(td))
c65057f9
SK
1886 return ddir;
1887
cbbfe5a9
SK
1888 if (io_u->file->last_start[DDIR_WRITE] != -1ULL ||
1889 td->o.read_beyond_wp || td->o.rwmix[DDIR_WRITE] == 0)
c65057f9
SK
1890 return DDIR_READ;
1891
1892 return DDIR_WRITE;
1893}
1894
bfbdd35b
BVA
1895/**
1896 * zbd_adjust_block - adjust the offset and length as necessary for ZBD drives
1897 * @td: FIO thread data.
1898 * @io_u: FIO I/O unit.
1899 *
1900 * Locking strategy: returns with z->mutex locked if and only if z refers
1901 * to a sequential zone and if io_u_accept is returned. z is the zone that
1902 * corresponds to io_u->offset at the end of this function.
1903 */
enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
{
	struct fio_file *f = io_u->file;
	struct zoned_block_device_info *zbdi = f->zbd_info;
	struct fio_zone_info *zb, *zl, *orig_zb;
	uint32_t orig_len = io_u->buflen;
	uint64_t min_bs = td->o.min_bs[io_u->ddir];
	uint64_t new_len;
	int64_t range;

	assert(zbdi);
	assert(min_bs);
	assert(is_valid_offset(f, io_u->offset));
	assert(io_u->buflen);

	zb = zbd_offset_to_zone(f, io_u->offset);
	orig_zb = zb;

	if (!zb->has_wp) {
		/* Accept non-write I/Os for conventional zones. */
		if (io_u->ddir != DDIR_WRITE)
			return io_u_accept;

		/*
		 * Make sure that writes to conventional zones
		 * don't cross over to any sequential zones.
		 */
		if (!(zb + 1)->has_wp ||
		    io_u->offset + io_u->buflen <= (zb + 1)->start)
			return io_u_accept;

		if (io_u->offset + min_bs > (zb + 1)->start) {
			dprint(FD_IO,
			       "%s: off=%llu + min_bs=%"PRIu64" > next zone %"PRIu64"\n",
			       f->file_name, io_u->offset,
			       min_bs, (zb + 1)->start);
			/*
			 * Not even min_bs fits before the sequential zone:
			 * mirror the offset back into the conventional zone.
			 */
			io_u->offset =
				zb->start + (zb + 1)->start - io_u->offset;
			new_len = min(io_u->buflen,
				      (zb + 1)->start - io_u->offset);
		} else {
			new_len = (zb + 1)->start - io_u->offset;
		}

		/* Round the clamped length down to a multiple of min_bs. */
		io_u->buflen = new_len / min_bs * min_bs;

		return io_u_accept;
	}

	/*
	 * Accept the I/O offset for reads if reading beyond the write pointer
	 * is enabled.
	 */
	if (zb->cond != ZBD_ZONE_COND_OFFLINE &&
	    io_u->ddir == DDIR_READ && td->o.read_beyond_wp)
		return io_u_accept;

	/* Sequential zone: hold the zone lock from here until accept/eof. */
	zone_lock(td, f, zb);

	switch (io_u->ddir) {
	case DDIR_READ:
		if (td->runstate == TD_VERIFYING && td_write(td))
			goto accept;

		/*
		 * Check that there is enough written data in the zone to do an
		 * I/O of at least min_bs B. If there isn't, find a new zone for
		 * the I/O.
		 */
		range = zb->cond != ZBD_ZONE_COND_OFFLINE ?
			zb->wp - zb->start : 0;
		if (range < min_bs ||
		    ((!td_random(td)) && (io_u->offset + min_bs > zb->wp))) {
			zone_unlock(zb);
			zl = zbd_get_zone(f, f->max_zone);
			zb = zbd_find_zone(td, io_u, min_bs, zb, zl);
			if (!zb) {
				dprint(FD_ZBD,
				       "%s: zbd_find_zone(%lld, %llu) failed\n",
				       f->file_name, io_u->offset,
				       io_u->buflen);
				goto eof;
			}
			/*
			 * zbd_find_zone() returned a zone with a range of at
			 * least min_bs.
			 */
			range = zb->wp - zb->start;
			assert(range >= min_bs);

			if (!td_random(td))
				io_u->offset = zb->start;
		}

		/*
		 * Make sure the I/O is within the zone valid data range while
		 * maximizing the I/O size and preserving randomness.
		 */
		if (range <= io_u->buflen)
			io_u->offset = zb->start;
		else if (td_random(td))
			io_u->offset = zb->start +
				((io_u->offset - orig_zb->start) %
				 (range - io_u->buflen)) / min_bs * min_bs;

		/*
		 * When zbd_find_zone() returns a conventional zone,
		 * we can simply accept the new i/o offset here.
		 */
		if (!zb->has_wp)
			return io_u_accept;

		/*
		 * Make sure the I/O does not cross over the zone wp position.
		 */
		new_len = min((unsigned long long)io_u->buflen,
			      (unsigned long long)(zb->wp - io_u->offset));
		new_len = new_len / min_bs * min_bs;
		if (new_len < io_u->buflen) {
			io_u->buflen = new_len;
			dprint(FD_IO, "Changed length from %u into %llu\n",
			       orig_len, io_u->buflen);
		}

		assert(zb->start <= io_u->offset);
		assert(io_u->offset + io_u->buflen <= zb->wp);

		goto accept;

	case DDIR_WRITE:
		if (io_u->buflen > zbdi->zone_size) {
			td_verror(td, EINVAL, "I/O buflen exceeds zone size");
			dprint(FD_IO,
			       "%s: I/O buflen %llu exceeds zone size %"PRIu64"\n",
			       f->file_name, io_u->buflen, zbdi->zone_size);
			goto eof;
		}

retry:
		/*
		 * If the zone's remaining space is non-zero but smaller than
		 * min_bs, finish the zone and move on to the next write
		 * pointer zone.
		 */
		if (zbd_zone_remainder(zb) > 0 &&
		    zbd_zone_remainder(zb) < min_bs) {
			pthread_mutex_lock(&f->zbd_info->mutex);
			zbd_write_zone_put(td, f, zb);
			pthread_mutex_unlock(&f->zbd_info->mutex);
			dprint(FD_ZBD,
			       "%s: finish zone %d\n",
			       f->file_name, zbd_zone_idx(f, zb));
			io_u_quiesce(td);
			zbd_finish_zone(td, f, zb);
			if (zbd_zone_idx(f, zb) + 1 >= f->max_zone) {
				if (!td_random(td))
					goto eof;
			}
			zone_unlock(zb);

			/* Find the next write pointer zone */
			do {
				zb++;
				if (zbd_zone_idx(f, zb) >= f->max_zone)
					zb = zbd_get_zone(f, f->min_zone);
			} while (!zb->has_wp);

			zone_lock(td, f, zb);
		}

		if (!zbd_write_zone_get(td, f, zb)) {
			zone_unlock(zb);
			zb = zbd_convert_to_write_zone(td, io_u);
			if (!zb) {
				dprint(FD_IO, "%s: can't convert to write target zone",
				       f->file_name);
				goto eof;
			}
		}

		/* The converted zone may itself be nearly full; redo checks. */
		if (zbd_zone_remainder(zb) > 0 &&
		    zbd_zone_remainder(zb) < min_bs)
			goto retry;

		/* Check whether the zone reset threshold has been exceeded */
		if (td->o.zrf.u.f) {
			if (zbdi->wp_valid_data_bytes >=
			    f->io_size * td->o.zrt.u.f &&
			    zbd_dec_and_reset_write_cnt(td, f))
				zb->reset_zone = 1;
		}

		/* Reset the zone pointer if necessary */
		if (zb->reset_zone || zbd_zone_full(f, zb, min_bs)) {
			if (td->o.verify != VERIFY_NONE) {
				/*
				 * Unset io-u->file to tell get_next_verify()
				 * that this IO is not requeue.
				 */
				io_u->file = NULL;
				if (!get_next_verify(td, io_u)) {
					zone_unlock(zb);
					return io_u_accept;
				}
				io_u->file = f;
			}

			/*
			 * Since previous write requests may have been submitted
			 * asynchronously and since we will submit the zone
			 * reset synchronously, wait until previously submitted
			 * write requests have completed before issuing a
			 * zone reset.
			 */
			io_u_quiesce(td);
			zb->reset_zone = 0;
			if (__zbd_reset_zone(td, f, zb) < 0)
				goto eof;

			if (zb->capacity < min_bs) {
				td_verror(td, EINVAL, "ZCAP is less min_bs");
				log_err("zone capacity %"PRIu64" smaller than minimum block size %"PRIu64"\n",
					zb->capacity, min_bs);
				goto eof;
			}
		}

		/* Make writes occur at the write pointer */
		assert(!zbd_zone_full(f, zb, min_bs));
		io_u->offset = zb->wp;
		if (!is_valid_offset(f, io_u->offset)) {
			td_verror(td, EINVAL, "invalid WP value");
			dprint(FD_ZBD, "%s: dropped request with offset %llu\n",
			       f->file_name, io_u->offset);
			goto eof;
		}

		/*
		 * Make sure that the buflen is a multiple of the minimal
		 * block size. Give up if shrinking would make the request too
		 * small.
		 */
		new_len = min((unsigned long long)io_u->buflen,
			      zbd_zone_capacity_end(zb) - io_u->offset);
		new_len = new_len / min_bs * min_bs;
		if (new_len == io_u->buflen)
			goto accept;
		if (new_len >= min_bs) {
			io_u->buflen = new_len;
			dprint(FD_IO, "Changed length from %u into %llu\n",
			       orig_len, io_u->buflen);
			goto accept;
		}

		td_verror(td, EIO, "zone remainder too small");
		log_err("zone remainder %lld smaller than min block size %"PRIu64"\n",
			(zbd_zone_capacity_end(zb) - io_u->offset), min_bs);

		goto eof;

	case DDIR_TRIM:
		/* Check random trim targets a non-empty zone */
		if (!td_random(td) || zb->wp > zb->start)
			goto accept;

		/* Find out a non-empty zone to trim */
		zone_unlock(zb);
		zl = zbd_get_zone(f, f->max_zone);
		zb = zbd_find_zone(td, io_u, 1, zb, zl);
		if (zb) {
			io_u->offset = zb->start;
			dprint(FD_ZBD, "%s: found new zone(%lld) for trim\n",
			       f->file_name, io_u->offset);
			goto accept;
		}

		goto eof;

	case DDIR_SYNC:
		/* fall-through */
	case DDIR_DATASYNC:
	case DDIR_SYNC_FILE_RANGE:
	case DDIR_WAIT:
	case DDIR_LAST:
	case DDIR_INVAL:
	case DDIR_TIMEOUT:
		goto accept;
	}

	assert(false);

accept:
	assert(zb->has_wp);
	assert(zb->cond != ZBD_ZONE_COND_OFFLINE);
	assert(!io_u->zbd_queue_io);
	assert(!io_u->zbd_put_io);

	io_u->zbd_queue_io = zbd_queue_io;
	io_u->zbd_put_io = zbd_put_io;

	/*
	 * Since we return with the zone lock still held,
	 * add an annotation to let Coverity know that it
	 * is intentional.
	 */
	/* coverity[missing_unlock] */

	return io_u_accept;

eof:
	if (zb && zb->has_wp)
		zone_unlock(zb);

	return io_u_eof;
}
fd5d733f
BVA
2214
2215/* Return a string with ZBD statistics */
2216char *zbd_write_status(const struct thread_stat *ts)
2217{
2218 char *res;
2219
ee5e3436 2220 if (asprintf(&res, "; %"PRIu64" zone resets", ts->nr_zone_resets) < 0)
fd5d733f
BVA
2221 return NULL;
2222 return res;
2223}
e3be810b
SK
2224
2225/**
2226 * zbd_do_io_u_trim - If reset zone is applicable, do reset zone instead of trim
2227 *
2228 * @td: FIO thread data.
2229 * @io_u: FIO I/O unit.
2230 *
2231 * It is assumed that z->mutex is already locked.
2232 * Return io_u_completed when reset zone succeeds. Return 0 when the target zone
2233 * does not have write pointer. On error, return negative errno.
2234 */
67282020 2235int zbd_do_io_u_trim(struct thread_data *td, struct io_u *io_u)
e3be810b
SK
2236{
2237 struct fio_file *f = io_u->file;
2238 struct fio_zone_info *z;
e3be810b
SK
2239 int ret;
2240
53aa6171 2241 z = zbd_offset_to_zone(f, io_u->offset);
e3be810b
SK
2242 if (!z->has_wp)
2243 return 0;
2244
2245 if (io_u->offset != z->start) {
139d8dc6
DLM
2246 log_err("Trim offset not at zone start (%lld)\n",
2247 io_u->offset);
e3be810b
SK
2248 return -EINVAL;
2249 }
2250
2251 ret = zbd_reset_zone((struct thread_data *)td, f, z);
2252 if (ret < 0)
2253 return ret;
2254
2255 return io_u_completed;
2256}
8b403508
SK
2257
2258void zbd_log_err(const struct thread_data *td, const struct io_u *io_u)
2259{
2260 const struct fio_file *f = io_u->file;
2261
2262 if (td->o.zone_mode != ZONE_MODE_ZBD)
2263 return;
2264
2265 if (io_u->error == EOVERFLOW)
2266 log_err("%s: Exceeded max_active_zones limit. Check conditions of zones out of I/O ranges.\n",
2267 f->file_name);
2268}