Merge branch 'fix-randtrimwrite' of https://github.com/minwooim/fio
[fio.git] / zbd.c
CommitLineData
bfbdd35b
BVA
1/*
2 * Copyright (C) 2018 Western Digital Corporation or its affiliates.
3 *
4 * This file is released under the GPL.
5 */
6
7#include <errno.h>
8#include <string.h>
9#include <stdlib.h>
bfbdd35b 10#include <fcntl.h>
bfbdd35b
BVA
11#include <sys/stat.h>
12#include <unistd.h>
f5bff36e 13
83276370 14#include "compiler/compiler.h"
cf42d79e 15#include "os/os.h"
bfbdd35b
BVA
16#include "file.h"
17#include "fio.h"
18#include "lib/pow2.h"
19#include "log.h"
f5bff36e 20#include "oslib/asprintf.h"
bfbdd35b
BVA
21#include "smalloc.h"
22#include "verify.h"
44ec32cb 23#include "pshared.h"
bfbdd35b
BVA
24#include "zbd.h"
25
410a071c
DLM
26static bool is_valid_offset(const struct fio_file *f, uint64_t offset)
27{
28 return (uint64_t)(offset - f->file_offset) < f->io_size;
29}
30
dc8a3d62
DLM
31static inline unsigned int zbd_zone_idx(const struct fio_file *f,
32 struct fio_zone_info *zone)
410a071c
DLM
33{
34 return zone - f->zbd_info->zone_info;
35}
36
37/**
dc8a3d62 38 * zbd_offset_to_zone_idx - convert an offset into a zone number
410a071c
DLM
39 * @f: file pointer.
40 * @offset: offset in bytes. If this offset is in the first zone_size bytes
41 * past the disk size then the index of the sentinel is returned.
42 */
dc8a3d62
DLM
43static unsigned int zbd_offset_to_zone_idx(const struct fio_file *f,
44 uint64_t offset)
410a071c
DLM
45{
46 uint32_t zone_idx;
47
48 if (f->zbd_info->zone_size_log2 > 0)
49 zone_idx = offset >> f->zbd_info->zone_size_log2;
50 else
51 zone_idx = offset / f->zbd_info->zone_size;
52
53 return min(zone_idx, f->zbd_info->nr_zones);
54}
55
56/**
57 * zbd_zone_end - Return zone end location
58 * @z: zone info pointer.
59 */
60static inline uint64_t zbd_zone_end(const struct fio_zone_info *z)
61{
62 return (z+1)->start;
63}
64
65/**
66 * zbd_zone_capacity_end - Return zone capacity limit end location
67 * @z: zone info pointer.
68 */
69static inline uint64_t zbd_zone_capacity_end(const struct fio_zone_info *z)
70{
71 return z->start + z->capacity;
72}
73
df67bf1e
SK
74/**
75 * zbd_zone_remainder - Return the number of bytes that are still available for
76 * writing before the zone gets full
77 * @z: zone info pointer.
78 */
79static inline uint64_t zbd_zone_remainder(struct fio_zone_info *z)
80{
81 if (z->wp >= zbd_zone_capacity_end(z))
82 return 0;
83
84 return zbd_zone_capacity_end(z) - z->wp;
85}
86
410a071c
DLM
87/**
88 * zbd_zone_full - verify whether a minimum number of bytes remain in a zone
89 * @f: file pointer.
90 * @z: zone info pointer.
91 * @required: minimum number of bytes that must remain in a zone.
92 *
93 * The caller must hold z->mutex.
94 */
95static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z,
96 uint64_t required)
97{
98 assert((required & 511) == 0);
99
df67bf1e 100 return z->has_wp && required > zbd_zone_remainder(z);
410a071c
DLM
101}
102
103static void zone_lock(struct thread_data *td, const struct fio_file *f,
104 struct fio_zone_info *z)
105{
83276370 106#ifndef NDEBUG
69c53a63 107 unsigned int const nz = zbd_zone_idx(f, z);
410a071c
DLM
108 /* A thread should never lock zones outside its working area. */
109 assert(f->min_zone <= nz && nz < f->max_zone);
410a071c 110 assert(z->has_wp);
83276370 111#endif
410a071c
DLM
112
113 /*
114 * Lock the io_u target zone. The zone will be unlocked if io_u offset
115 * is changed or when io_u completes and zbd_put_io() executed.
116 * To avoid multiple jobs doing asynchronous I/Os from deadlocking each
117 * other waiting for zone locks when building an io_u batch, first
118 * only trylock the zone. If the zone is already locked by another job,
119 * process the currently queued I/Os so that I/O progress is made and
120 * zones unlocked.
121 */
122 if (pthread_mutex_trylock(&z->mutex) != 0) {
123 if (!td_ioengine_flagged(td, FIO_SYNCIO))
124 io_u_quiesce(td);
125 pthread_mutex_lock(&z->mutex);
126 }
127}
128
129static inline void zone_unlock(struct fio_zone_info *z)
130{
410a071c 131 assert(z->has_wp);
83276370 132 pthread_mutex_unlock(&z->mutex);
410a071c
DLM
133}
134
39e06ee7
DLM
135static inline struct fio_zone_info *zbd_get_zone(const struct fio_file *f,
136 unsigned int zone_idx)
410a071c 137{
39e06ee7 138 return &f->zbd_info->zone_info[zone_idx];
410a071c
DLM
139}
140
53aa6171
DLM
/*
 * Return the zone array entry of @f that covers byte offset @offset, as
 * determined by zbd_offset_to_zone_idx().
 */
static inline struct fio_zone_info *
zbd_offset_to_zone(const struct fio_file *f, uint64_t offset)
{
	const unsigned int idx = zbd_offset_to_zone_idx(f, offset);

	return zbd_get_zone(f, idx);
}
146
2fb29f27
SK
147static bool accounting_vdb(struct thread_data *td, const struct fio_file *f)
148{
149 return td->o.zrt.u.f && td_write(td);
150}
151
b7694961
DLM
152/**
153 * zbd_get_zoned_model - Get a device zoned model
154 * @td: FIO thread data
155 * @f: FIO file for which to get model information
156 */
38334c13
DLM
157static int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f,
158 enum zbd_zoned_model *model)
b7694961
DLM
159{
160 int ret;
161
50cc48d5
NC
162 if (f->filetype == FIO_TYPE_PIPE) {
163 log_err("zonemode=zbd does not support pipes\n");
164 return -EINVAL;
165 }
166
9db0cde8
NC
167 /* If regular file, always emulate zones inside the file. */
168 if (f->filetype == FIO_TYPE_FILE) {
169 *model = ZBD_NONE;
170 return 0;
171 }
172
6c5b11d3
DLM
173 if (td->io_ops && td->io_ops->get_zoned_model)
174 ret = td->io_ops->get_zoned_model(td, f, model);
175 else
176 ret = blkzoned_get_zoned_model(td, f, model);
b7694961
DLM
177 if (ret < 0) {
178 td_verror(td, errno, "get zoned model failed");
179 log_err("%s: get zoned model failed (%d).\n",
180 f->file_name, errno);
181 }
182
183 return ret;
184}
185
186/**
187 * zbd_report_zones - Get zone information
188 * @td: FIO thread data.
189 * @f: FIO file for which to get zone information
190 * @offset: offset from which to report zones
191 * @zones: Array of struct zbd_zone
192 * @nr_zones: Size of @zones array
193 *
194 * Get zone information into @zones starting from the zone at offset @offset
195 * for the device specified by @f.
196 *
197 * Returns the number of zones reported upon success and a negative error code
198 * upon failure. If the zone report is empty, always assume an error (device
199 * problem) and return -EIO.
200 */
38334c13
DLM
201static int zbd_report_zones(struct thread_data *td, struct fio_file *f,
202 uint64_t offset, struct zbd_zone *zones,
203 unsigned int nr_zones)
b7694961
DLM
204{
205 int ret;
206
6c5b11d3
DLM
207 if (td->io_ops && td->io_ops->report_zones)
208 ret = td->io_ops->report_zones(td, f, offset, zones, nr_zones);
209 else
210 ret = blkzoned_report_zones(td, f, offset, zones, nr_zones);
b7694961
DLM
211 if (ret < 0) {
212 td_verror(td, errno, "report zones failed");
362ce037
BVA
213 log_err("%s: report zones from sector %"PRIu64" failed (nr_zones=%d; errno=%d).\n",
214 f->file_name, offset >> 9, nr_zones, errno);
b7694961
DLM
215 } else if (ret == 0) {
216 td_verror(td, errno, "Empty zone report");
ee5e3436
SK
217 log_err("%s: report zones from sector %"PRIu64" is empty.\n",
218 f->file_name, offset >> 9);
b7694961
DLM
219 ret = -EIO;
220 }
221
222 return ret;
223}
224
225/**
226 * zbd_reset_wp - reset the write pointer of a range of zones
227 * @td: FIO thread data.
228 * @f: FIO file for which to reset zones
229 * @offset: Starting offset of the first zone to reset
230 * @length: Length of the range of zones to reset
231 *
232 * Reset the write pointer of all zones in the range @offset...@offset+@length.
233 * Returns 0 upon success and a negative error code upon failure.
234 */
38334c13
DLM
235static int zbd_reset_wp(struct thread_data *td, struct fio_file *f,
236 uint64_t offset, uint64_t length)
b7694961
DLM
237{
238 int ret;
239
6c5b11d3
DLM
240 if (td->io_ops && td->io_ops->reset_wp)
241 ret = td->io_ops->reset_wp(td, f, offset, length);
242 else
243 ret = blkzoned_reset_wp(td, f, offset, length);
b7694961
DLM
244 if (ret < 0) {
245 td_verror(td, errno, "resetting wp failed");
ee5e3436
SK
246 log_err("%s: resetting wp for %"PRIu64" sectors at sector %"PRIu64" failed (%d).\n",
247 f->file_name, length >> 9, offset >> 9, errno);
b7694961
DLM
248 }
249
250 return ret;
251}
252
410a071c 253/**
67282020 254 * __zbd_reset_zone - reset the write pointer of a single zone
410a071c
DLM
255 * @td: FIO thread data.
256 * @f: FIO file associated with the disk for which to reset a write pointer.
257 * @z: Zone to reset.
258 *
259 * Returns 0 upon success and a negative error code upon failure.
260 *
261 * The caller must hold z->mutex.
262 */
67282020
SK
263static int __zbd_reset_zone(struct thread_data *td, struct fio_file *f,
264 struct fio_zone_info *z)
410a071c
DLM
265{
266 uint64_t offset = z->start;
267 uint64_t length = (z+1)->start - offset;
268 uint64_t data_in_zone = z->wp - z->start;
269 int ret = 0;
270
271 if (!data_in_zone)
272 return 0;
273
274 assert(is_valid_offset(f, offset + length - 1));
275
139d8dc6 276 dprint(FD_ZBD, "%s: resetting wp of zone %u.\n",
dc8a3d62 277 f->file_name, zbd_zone_idx(f, z));
139d8dc6 278
410a071c
DLM
279 switch (f->zbd_info->model) {
280 case ZBD_HOST_AWARE:
281 case ZBD_HOST_MANAGED:
282 ret = zbd_reset_wp(td, f, offset, length);
283 if (ret < 0)
284 return ret;
285 break;
286 default:
287 break;
288 }
289
2fb29f27
SK
290 if (accounting_vdb(td, f)) {
291 pthread_mutex_lock(&f->zbd_info->mutex);
292 f->zbd_info->wp_valid_data_bytes -= data_in_zone;
293 pthread_mutex_unlock(&f->zbd_info->mutex);
294 }
139d8dc6 295
410a071c 296 z->wp = z->start;
410a071c
DLM
297
298 td->ts.nr_zone_resets++;
299
300 return ret;
301}
302
303/**
a4807046 304 * zbd_write_zone_put - Remove a zone from the write target zones array.
410a071c 305 * @td: FIO thread data.
a4807046 306 * @f: FIO file that has the write zones array to remove.
410a071c
DLM
307 * @zone_idx: Index of the zone to remove.
308 *
309 * The caller must hold f->zbd_info->mutex.
310 */
a4807046
SK
311static void zbd_write_zone_put(struct thread_data *td, const struct fio_file *f,
312 struct fio_zone_info *z)
410a071c 313{
a4807046 314 uint32_t zi;
410a071c 315
a4807046 316 if (!z->write)
a23411bb
DLM
317 return;
318
a4807046
SK
319 for (zi = 0; zi < f->zbd_info->num_write_zones; zi++) {
320 if (zbd_get_zone(f, f->zbd_info->write_zones[zi]) == z)
410a071c
DLM
321 break;
322 }
a4807046 323 if (zi == f->zbd_info->num_write_zones)
410a071c
DLM
324 return;
325
a4807046 326 dprint(FD_ZBD, "%s: removing zone %u from write zone array\n",
dc8a3d62 327 f->file_name, zbd_zone_idx(f, z));
139d8dc6 328
a4807046
SK
329 memmove(f->zbd_info->write_zones + zi,
330 f->zbd_info->write_zones + zi + 1,
331 (ZBD_MAX_WRITE_ZONES - (zi + 1)) *
332 sizeof(f->zbd_info->write_zones[0]));
139d8dc6 333
a4807046
SK
334 f->zbd_info->num_write_zones--;
335 td->num_write_zones--;
336 z->write = 0;
410a071c
DLM
337}
338
67282020
SK
339/**
340 * zbd_reset_zone - reset the write pointer of a single zone and remove the zone
341 * from the array of write zones.
342 * @td: FIO thread data.
343 * @f: FIO file associated with the disk for which to reset a write pointer.
344 * @z: Zone to reset.
345 *
346 * Returns 0 upon success and a negative error code upon failure.
347 *
348 * The caller must hold z->mutex.
349 */
350static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
351 struct fio_zone_info *z)
352{
353 int ret;
354
355 ret = __zbd_reset_zone(td, f, z);
356 if (ret)
357 return ret;
358
359 pthread_mutex_lock(&f->zbd_info->mutex);
360 zbd_write_zone_put(td, f, z);
361 pthread_mutex_unlock(&f->zbd_info->mutex);
362 return 0;
363}
364
e1a1b59b
SK
365/**
366 * zbd_finish_zone - finish the specified zone
367 * @td: FIO thread data.
368 * @f: FIO file for which to finish a zone
369 * @z: Zone to finish.
370 *
371 * Finish the zone at @offset with open or close status.
372 */
373static int zbd_finish_zone(struct thread_data *td, struct fio_file *f,
374 struct fio_zone_info *z)
375{
376 uint64_t offset = z->start;
377 uint64_t length = f->zbd_info->zone_size;
378 int ret = 0;
379
380 switch (f->zbd_info->model) {
381 case ZBD_HOST_AWARE:
382 case ZBD_HOST_MANAGED:
383 if (td->io_ops && td->io_ops->finish_zone)
384 ret = td->io_ops->finish_zone(td, f, offset, length);
385 else
386 ret = blkzoned_finish_zone(td, f, offset, length);
387 break;
388 default:
389 break;
390 }
391
392 if (ret < 0) {
393 td_verror(td, errno, "finish zone failed");
394 log_err("%s: finish zone at sector %"PRIu64" failed (%d).\n",
395 f->file_name, offset >> 9, errno);
396 } else {
397 z->wp = (z+1)->start;
398 }
399
400 return ret;
401}
402
410a071c
DLM
403/**
404 * zbd_reset_zones - Reset a range of zones.
405 * @td: fio thread data.
406 * @f: fio file for which to reset zones
407 * @zb: first zone to reset.
408 * @ze: first zone not to reset.
409 *
410 * Returns 0 upon success and 1 upon failure.
411 */
412static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
413 struct fio_zone_info *const zb,
414 struct fio_zone_info *const ze)
415{
416 struct fio_zone_info *z;
417 const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
418 int res = 0;
419
83276370
DP
420 if (fio_unlikely(0 == min_bs))
421 return 1;
410a071c 422
139d8dc6 423 dprint(FD_ZBD, "%s: examining zones %u .. %u\n",
dc8a3d62 424 f->file_name, zbd_zone_idx(f, zb), zbd_zone_idx(f, ze));
139d8dc6 425
410a071c 426 for (z = zb; z < ze; z++) {
410a071c
DLM
427 if (!z->has_wp)
428 continue;
139d8dc6 429
410a071c 430 zone_lock(td, f, z);
139d8dc6 431
410a071c
DLM
432 if (z->wp != z->start) {
433 dprint(FD_ZBD, "%s: resetting zone %u\n",
dc8a3d62 434 f->file_name, zbd_zone_idx(f, z));
410a071c
DLM
435 if (zbd_reset_zone(td, f, z) < 0)
436 res = 1;
437 }
139d8dc6 438
410a071c
DLM
439 zone_unlock(z);
440 }
441
442 return res;
443}
444
143aaff9
SK
445/**
446 * zbd_move_zone_wp - move the write pointer of a zone by writing the data in
447 * the specified buffer
448 * @td: FIO thread data.
449 * @f: FIO file for which to move write pointer
450 * @z: Target zone to move the write pointer
451 * @length: Length of the move
452 * @buf: Buffer which holds the data to write
453 *
454 * Move the write pointer at the specified offset by writing the data
455 * in the specified buffer.
456 * Returns 0 upon success and a negative error code upon failure.
457 */
458static int zbd_move_zone_wp(struct thread_data *td, struct fio_file *f,
459 struct zbd_zone *z, uint64_t length,
460 const char *buf)
461{
462 int ret = 0;
463
464 switch (f->zbd_info->model) {
465 case ZBD_HOST_AWARE:
466 case ZBD_HOST_MANAGED:
467 if (td->io_ops && td->io_ops->move_zone_wp)
468 ret = td->io_ops->move_zone_wp(td, f, z, length, buf);
469 else
470 ret = blkzoned_move_zone_wp(td, f, z, length, buf);
471 break;
472 default:
473 break;
474 }
475
476 if (ret < 0) {
477 td_verror(td, errno, "move wp failed");
478 log_err("%s: moving wp for %"PRIu64" sectors at sector %"PRIu64" failed (%d).\n",
479 f->file_name, length >> 9, z->wp >> 9, errno);
480 }
481
482 return ret;
483}
484
d2f442bc
NC
485/**
486 * zbd_get_max_open_zones - Get the maximum number of open zones
487 * @td: FIO thread data
488 * @f: FIO file for which to get max open zones
489 * @max_open_zones: Upon success, result will be stored here.
490 *
491 * A @max_open_zones value set to zero means no limit.
492 *
493 * Returns 0 upon success and a negative error code upon failure.
494 */
38334c13
DLM
495static int zbd_get_max_open_zones(struct thread_data *td, struct fio_file *f,
496 unsigned int *max_open_zones)
d2f442bc
NC
497{
498 int ret;
499
500 if (td->io_ops && td->io_ops->get_max_open_zones)
501 ret = td->io_ops->get_max_open_zones(td, f, max_open_zones);
502 else
503 ret = blkzoned_get_max_open_zones(td, f, max_open_zones);
504 if (ret < 0) {
505 td_verror(td, errno, "get max open zones failed");
506 log_err("%s: get max open zones failed (%d).\n",
507 f->file_name, errno);
508 }
509
510 return ret;
511}
512
9e523ef8
SK
513/**
514 * zbd_get_max_active_zones - Get the maximum number of active zones
515 * @td: FIO thread data
516 * @f: FIO file for which to get max active zones
517 *
518 * Returns max_active_zones limit value of the target file if it is available.
519 * Otherwise return zero, which means no limit.
520 */
521static unsigned int zbd_get_max_active_zones(struct thread_data *td,
522 struct fio_file *f)
523{
524 unsigned int max_active_zones;
525 int ret;
526
527 if (td->io_ops && td->io_ops->get_max_active_zones)
528 ret = td->io_ops->get_max_active_zones(td, f,
529 &max_active_zones);
530 else
531 ret = blkzoned_get_max_active_zones(td, f, &max_active_zones);
532 if (ret < 0) {
533 dprint(FD_ZBD, "%s: max_active_zones is not available\n",
534 f->file_name);
535 return 0;
536 }
537
538 return max_active_zones;
539}
540
bfbdd35b 541/**
f539b98c 542 * __zbd_write_zone_get - Add a zone to the array of write zones.
410a071c 543 * @td: fio thread data.
a4807046 544 * @f: fio file that has the write zones array to add.
410a071c 545 * @zone_idx: Index of the zone to add.
bfbdd35b 546 *
f539b98c
SK
547 * Do same operation as @zbd_write_zone_get, except it adds the zone at
548 * @zone_idx to write target zones array even when it does not have remainder
549 * space to write one block.
bfbdd35b 550 */
f539b98c
SK
551static bool __zbd_write_zone_get(struct thread_data *td,
552 const struct fio_file *f,
553 struct fio_zone_info *z)
1f57803b 554{
410a071c 555 struct zoned_block_device_info *zbdi = f->zbd_info;
dc8a3d62 556 uint32_t zone_idx = zbd_zone_idx(f, z);
410a071c 557 bool res = true;
fae3b9a0 558
410a071c
DLM
559 if (z->cond == ZBD_ZONE_COND_OFFLINE)
560 return false;
43bcbd5b 561
1f57803b 562 /*
410a071c
DLM
563 * Skip full zones with data verification enabled because resetting a
564 * zone causes data loss and hence causes verification to fail.
1f57803b 565 */
f539b98c 566 if (td->o.verify != VERIFY_NONE && zbd_zone_remainder(z) == 0)
410a071c 567 return false;
4d4c71e6 568
410a071c 569 /*
a4807046
SK
570 * zbdi->max_write_zones == 0 means that there is no limit on the
571 * maximum number of write target zones. In this case, do no track write
572 * target zones in zbdi->write_zones array.
410a071c 573 */
a4807046 574 if (!zbdi->max_write_zones)
410a071c 575 return true;
4d4c71e6 576
410a071c 577 pthread_mutex_lock(&zbdi->mutex);
b5a0f7ce 578
a4807046 579 if (z->write) {
410a071c 580 /*
b5a0f7ce 581 * If the zone is going to be completely filled by writes
a4807046
SK
582 * already in-flight, handle it as a full zone instead of a
583 * write target zone.
410a071c 584 */
df67bf1e 585 if (!zbd_zone_remainder(z))
410a071c
DLM
586 res = false;
587 goto out;
588 }
139d8dc6 589
410a071c
DLM
590 res = false;
591 /* Zero means no limit */
592 if (td->o.job_max_open_zones > 0 &&
a4807046 593 td->num_write_zones >= td->o.job_max_open_zones)
410a071c 594 goto out;
a4807046 595 if (zbdi->num_write_zones >= zbdi->max_write_zones)
410a071c 596 goto out;
139d8dc6 597
a4807046 598 dprint(FD_ZBD, "%s: adding zone %u to write zone array\n",
139d8dc6
DLM
599 f->file_name, zone_idx);
600
a4807046
SK
601 zbdi->write_zones[zbdi->num_write_zones++] = zone_idx;
602 td->num_write_zones++;
603 z->write = 1;
410a071c 604 res = true;
bfbdd35b 605
410a071c
DLM
606out:
607 pthread_mutex_unlock(&zbdi->mutex);
608 return res;
923f7c1e
DF
609}
610
f539b98c
SK
611/**
612 * zbd_write_zone_get - Add a zone to the array of write zones.
613 * @td: fio thread data.
614 * @f: fio file that has the open zones to add.
615 * @zone_idx: Index of the zone to add.
616 *
617 * Add a ZBD zone to write target zones array, if it is not yet added. Returns
618 * true if either the zone was already added or if the zone was successfully
619 * added to the array without exceeding the maximum number of write zones.
620 * Returns false if the zone was not already added and addition of the zone
621 * would cause the zone limit to be exceeded.
622 */
623static bool zbd_write_zone_get(struct thread_data *td, const struct fio_file *f,
624 struct fio_zone_info *z)
625{
626 const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
627
628 /*
629 * Skip full zones with data verification enabled because resetting a
630 * zone causes data loss and hence causes verification to fail.
631 */
632 if (td->o.verify != VERIFY_NONE && zbd_zone_full(f, z, min_bs))
633 return false;
634
635 return __zbd_write_zone_get(td, f, z);
636}
637
59c3200d 638/* Verify whether direct I/O is used for all host-managed zoned block drives. */
bfbdd35b
BVA
639static bool zbd_using_direct_io(void)
640{
bfbdd35b 641 struct fio_file *f;
da8f124f 642 int j;
bfbdd35b 643
da8f124f 644 for_each_td(td) {
bfbdd35b
BVA
645 if (td->o.odirect || !(td->o.td_ddir & TD_DDIR_WRITE))
646 continue;
647 for_each_file(td, f, j) {
59c3200d 648 if (f->zbd_info && f->filetype == FIO_TYPE_BLOCK &&
b7694961 649 f->zbd_info->model == ZBD_HOST_MANAGED)
bfbdd35b
BVA
650 return false;
651 }
da8f124f 652 } end_for_each();
bfbdd35b
BVA
653
654 return true;
655}
656
657/* Whether or not the I/O range for f includes one or more sequential zones */
b3e9bd03 658static bool zbd_is_seq_job(const struct fio_file *f)
bfbdd35b
BVA
659{
660 uint32_t zone_idx, zone_idx_b, zone_idx_e;
661
662 assert(f->zbd_info);
139d8dc6 663
bfbdd35b
BVA
664 if (f->io_size == 0)
665 return false;
139d8dc6 666
dc8a3d62
DLM
667 zone_idx_b = zbd_offset_to_zone_idx(f, f->file_offset);
668 zone_idx_e =
669 zbd_offset_to_zone_idx(f, f->file_offset + f->io_size - 1);
bfbdd35b 670 for (zone_idx = zone_idx_b; zone_idx <= zone_idx_e; zone_idx++)
39e06ee7 671 if (zbd_get_zone(f, zone_idx)->has_wp)
bfbdd35b
BVA
672 return true;
673
674 return false;
675}
676
0bf93a1a
DLM
677/*
678 * Verify whether the file offset and size parameters are aligned with zone
679 * boundaries. If the file offset is not aligned, align it down to the start of
680 * the zone containing the start offset and align up the file io_size parameter.
681 */
682static bool zbd_zone_align_file_sizes(struct thread_data *td,
683 struct fio_file *f)
684{
685 const struct fio_zone_info *z;
686 uint64_t new_offset, new_end;
0bf93a1a
DLM
687
688 if (!f->zbd_info)
689 return true;
690 if (f->file_offset >= f->real_file_size)
691 return true;
692 if (!zbd_is_seq_job(f))
693 return true;
694
695 if (!td->o.zone_size) {
696 td->o.zone_size = f->zbd_info->zone_size;
697 if (!td->o.zone_size) {
698 log_err("%s: invalid 0 zone size\n",
699 f->file_name);
700 return false;
701 }
702 } else if (td->o.zone_size != f->zbd_info->zone_size) {
703 log_err("%s: zonesize %llu does not match the device zone size %"PRIu64".\n",
704 f->file_name, td->o.zone_size,
705 f->zbd_info->zone_size);
706 return false;
707 }
708
709 if (td->o.zone_skip % td->o.zone_size) {
710 log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n",
711 f->file_name, td->o.zone_skip,
712 td->o.zone_size);
713 return false;
714 }
715
3f96645f
DF
716 if (td->o.td_ddir == TD_DDIR_READ) {
717 z = zbd_offset_to_zone(f, f->file_offset + f->io_size);
718 new_end = z->start;
719 if (f->file_offset + f->io_size > new_end) {
720 log_info("%s: rounded io_size from %"PRIu64" to %"PRIu64"\n",
721 f->file_name, f->io_size,
722 new_end - f->file_offset);
723 f->io_size = new_end - f->file_offset;
724 }
725 return true;
726 }
727
53aa6171 728 z = zbd_offset_to_zone(f, f->file_offset);
3f96645f 729 if (f->file_offset != z->start) {
0bf93a1a
DLM
730 new_offset = zbd_zone_end(z);
731 if (new_offset >= f->file_offset + f->io_size) {
732 log_info("%s: io_size must be at least one zone\n",
733 f->file_name);
734 return false;
735 }
736 log_info("%s: rounded up offset from %"PRIu64" to %"PRIu64"\n",
737 f->file_name, f->file_offset,
738 new_offset);
739 f->io_size -= (new_offset - f->file_offset);
740 f->file_offset = new_offset;
741 }
742
53aa6171 743 z = zbd_offset_to_zone(f, f->file_offset + f->io_size);
0bf93a1a 744 new_end = z->start;
3f96645f 745 if (f->file_offset + f->io_size != new_end) {
0bf93a1a
DLM
746 if (new_end <= f->file_offset) {
747 log_info("%s: io_size must be at least one zone\n",
748 f->file_name);
749 return false;
750 }
751 log_info("%s: rounded down io_size from %"PRIu64" to %"PRIu64"\n",
752 f->file_name, f->io_size,
753 new_end - f->file_offset);
754 f->io_size = new_end - f->file_offset;
755 }
756
757 return true;
758}
759
bfbdd35b
BVA
760/*
761 * Verify whether offset and size parameters are aligned with zone boundaries.
762 */
763static bool zbd_verify_sizes(void)
764{
bfbdd35b 765 struct fio_file *f;
da8f124f 766 int j;
bfbdd35b 767
da8f124f 768 for_each_td(td) {
bfbdd35b 769 for_each_file(td, f, j) {
0bf93a1a 770 if (!zbd_zone_align_file_sizes(td, f))
4d37720a 771 return false;
bfbdd35b 772 }
da8f124f 773 } end_for_each();
bfbdd35b
BVA
774
775 return true;
776}
777
778static bool zbd_verify_bs(void)
779{
bfbdd35b 780 struct fio_file *f;
da8f124f 781 int j;
bfbdd35b 782
da8f124f 783 for_each_td(td) {
e3be810b
SK
784 if (td_trim(td) &&
785 (td->o.min_bs[DDIR_TRIM] != td->o.max_bs[DDIR_TRIM] ||
786 td->o.bssplit_nr[DDIR_TRIM])) {
787 log_info("bsrange and bssplit are not allowed for trim with zonemode=zbd\n");
788 return false;
789 }
bfbdd35b 790 for_each_file(td, f, j) {
1ddd225e
AD
791 uint64_t zone_size;
792
bfbdd35b
BVA
793 if (!f->zbd_info)
794 continue;
139d8dc6 795
bfbdd35b 796 zone_size = f->zbd_info->zone_size;
e3be810b 797 if (td_trim(td) && td->o.bs[DDIR_TRIM] != zone_size) {
ee5e3436 798 log_info("%s: trim block size %llu is not the zone size %"PRIu64"\n",
e3be810b 799 f->file_name, td->o.bs[DDIR_TRIM],
ee5e3436 800 zone_size);
e3be810b
SK
801 return false;
802 }
bfbdd35b 803 }
da8f124f 804 } end_for_each();
bfbdd35b
BVA
805 return true;
806}
807
bfbdd35b
BVA
/*
 * Return the position of the highest set bit of @i, which is the base-2
 * logarithm rounded down. Returns -1 for an input of 0.
 */
static int ilog2(uint64_t i)
{
	int bits = 0;

	/* Count how many right shifts it takes to clear the value. */
	for (; i != 0; i >>= 1)
		bits++;

	return bits - 1;
}
818
819/*
820 * Initialize f->zbd_info for devices that are not zoned block devices. This
821 * allows to execute a ZBD workload against a non-ZBD device.
822 */
823static int init_zone_info(struct thread_data *td, struct fio_file *f)
824{
825 uint32_t nr_zones;
826 struct fio_zone_info *p;
a4b7f12b 827 uint64_t zone_size = td->o.zone_size;
b8dd9750 828 uint64_t zone_capacity = td->o.zone_capacity;
bfbdd35b 829 struct zoned_block_device_info *zbd_info = NULL;
bfbdd35b
BVA
830 int i;
831
a4b7f12b 832 if (zone_size == 0) {
9db0cde8 833 log_err("%s: Specifying the zone size is mandatory for regular file/block device with --zonemode=zbd\n\n",
a4b7f12b
DLM
834 f->file_name);
835 return 1;
836 }
837
838 if (zone_size < 512) {
839 log_err("%s: zone size must be at least 512 bytes for --zonemode=zbd\n\n",
840 f->file_name);
841 return 1;
842 }
843
b8dd9750
HH
844 if (zone_capacity == 0)
845 zone_capacity = zone_size;
846
847 if (zone_capacity > zone_size) {
848 log_err("%s: job parameter zonecapacity %llu is larger than zone size %llu\n",
ee5e3436 849 f->file_name, td->o.zone_capacity, td->o.zone_size);
b8dd9750
HH
850 return 1;
851 }
852
9db0cde8
NC
853 if (f->real_file_size < zone_size) {
854 log_err("%s: file/device size %"PRIu64" is smaller than zone size %"PRIu64"\n",
855 f->file_name, f->real_file_size, zone_size);
856 return -EINVAL;
857 }
858
ee3696bd 859 nr_zones = (f->real_file_size + zone_size - 1) / zone_size;
bfbdd35b
BVA
860 zbd_info = scalloc(1, sizeof(*zbd_info) +
861 (nr_zones + 1) * sizeof(zbd_info->zone_info[0]));
862 if (!zbd_info)
863 return -ENOMEM;
864
44ec32cb 865 mutex_init_pshared(&zbd_info->mutex);
bfbdd35b
BVA
866 zbd_info->refcount = 1;
867 p = &zbd_info->zone_info[0];
868 for (i = 0; i < nr_zones; i++, p++) {
44ec32cb
SK
869 mutex_init_pshared_with_type(&p->mutex,
870 PTHREAD_MUTEX_RECURSIVE);
bfbdd35b 871 p->start = i * zone_size;
b14651a2 872 p->wp = p->start;
b7694961
DLM
873 p->type = ZBD_ZONE_TYPE_SWR;
874 p->cond = ZBD_ZONE_COND_EMPTY;
b8dd9750 875 p->capacity = zone_capacity;
be7a6bae 876 p->has_wp = 1;
bfbdd35b
BVA
877 }
878 /* a sentinel */
879 p->start = nr_zones * zone_size;
880
881 f->zbd_info = zbd_info;
882 f->zbd_info->zone_size = zone_size;
883 f->zbd_info->zone_size_log2 = is_power_of_2(zone_size) ?
ebc403fe 884 ilog2(zone_size) : 0;
bfbdd35b 885 f->zbd_info->nr_zones = nr_zones;
bfbdd35b
BVA
886 return 0;
887}
888
/*
 * Upper bound on the number of zones requested from the device in a single
 * zone report operation.
 */
#define ZBD_REPORT_MAX_ZONES	8192U
893
894/*
895 * Parse the device zone report and store it in f->zbd_info. Must be called
896 * only for devices that are zoned, namely those with a model != ZBD_NONE.
bfbdd35b
BVA
897 */
898static int parse_zone_info(struct thread_data *td, struct fio_file *f)
899{
b7694961
DLM
900 int nr_zones, nrz;
901 struct zbd_zone *zones, *z;
bfbdd35b 902 struct fio_zone_info *p;
04f9090b
BVA
903 uint64_t zone_size, offset, capacity;
904 bool same_zone_cap = true;
bfbdd35b 905 struct zoned_block_device_info *zbd_info = NULL;
d060babc 906 int i, j, ret = -ENOMEM;
bfbdd35b 907
b7694961
DLM
908 zones = calloc(ZBD_REPORT_MAX_ZONES, sizeof(struct zbd_zone));
909 if (!zones)
bfbdd35b
BVA
910 goto out;
911
b7694961
DLM
912 nrz = zbd_report_zones(td, f, 0, zones, ZBD_REPORT_MAX_ZONES);
913 if (nrz < 0) {
914 ret = nrz;
915 log_info("fio: report zones (offset 0) failed for %s (%d).\n",
916 f->file_name, -ret);
917 goto out;
bfbdd35b
BVA
918 }
919
b7694961 920 zone_size = zones[0].len;
04f9090b 921 capacity = zones[0].capacity;
ee3696bd 922 nr_zones = (f->real_file_size + zone_size - 1) / zone_size;
bfbdd35b
BVA
923
924 if (td->o.zone_size == 0) {
ee3696bd
DLM
925 td->o.zone_size = zone_size;
926 } else if (td->o.zone_size != zone_size) {
ee5e3436
SK
927 log_err("fio: %s job parameter zonesize %llu does not match disk zone size %"PRIu64".\n",
928 f->file_name, td->o.zone_size, zone_size);
bfbdd35b 929 ret = -EINVAL;
b7694961 930 goto out;
bfbdd35b
BVA
931 }
932
9724b4f5
NC
933 dprint(FD_ZBD, "Device %s has %d zones of size %"PRIu64" KB\n",
934 f->file_name, nr_zones, zone_size / 1024);
bfbdd35b
BVA
935
936 zbd_info = scalloc(1, sizeof(*zbd_info) +
937 (nr_zones + 1) * sizeof(zbd_info->zone_info[0]));
bfbdd35b 938 if (!zbd_info)
b7694961 939 goto out;
44ec32cb 940 mutex_init_pshared(&zbd_info->mutex);
bfbdd35b
BVA
941 zbd_info->refcount = 1;
942 p = &zbd_info->zone_info[0];
b7694961
DLM
943 for (offset = 0, j = 0; j < nr_zones;) {
944 z = &zones[0];
945 for (i = 0; i < nrz; i++, j++, z++, p++) {
44ec32cb
SK
946 mutex_init_pshared_with_type(&p->mutex,
947 PTHREAD_MUTEX_RECURSIVE);
b7694961 948 p->start = z->start;
236d23a8 949 p->capacity = z->capacity;
04f9090b
BVA
950 if (capacity != z->capacity)
951 same_zone_cap = false;
139d8dc6 952
bfbdd35b 953 switch (z->cond) {
b7694961
DLM
954 case ZBD_ZONE_COND_NOT_WP:
955 case ZBD_ZONE_COND_FULL:
236d23a8 956 p->wp = p->start + p->capacity;
bfbdd35b
BVA
957 break;
958 default:
959 assert(z->start <= z->wp);
b7694961
DLM
960 assert(z->wp <= z->start + zone_size);
961 p->wp = z->wp;
bfbdd35b
BVA
962 break;
963 }
be7a6bae
DF
964
965 switch (z->type) {
966 case ZBD_ZONE_TYPE_SWR:
967 p->has_wp = 1;
968 break;
969 default:
970 p->has_wp = 0;
971 }
bfbdd35b
BVA
972 p->type = z->type;
973 p->cond = z->cond;
be7a6bae 974
bfbdd35b 975 if (j > 0 && p->start != p[-1].start + zone_size) {
adfa7b7c
BVA
976 log_info("%s: invalid zone data [%d:%d]: %"PRIu64" + %"PRIu64" != %"PRIu64"\n",
977 f->file_name, j, i,
978 p[-1].start, zone_size, p->start);
bfbdd35b 979 ret = -EINVAL;
b7694961 980 goto out;
bfbdd35b
BVA
981 }
982 }
983 z--;
b7694961 984 offset = z->start + z->len;
bfbdd35b
BVA
985 if (j >= nr_zones)
986 break;
139d8dc6 987
6c3f1cc1
DF
988 nrz = zbd_report_zones(td, f, offset, zones,
989 min((uint32_t)(nr_zones - j),
990 ZBD_REPORT_MAX_ZONES));
b7694961
DLM
991 if (nrz < 0) {
992 ret = nrz;
ee5e3436
SK
993 log_info("fio: report zones (offset %"PRIu64") failed for %s (%d).\n",
994 offset, f->file_name, -ret);
b7694961 995 goto out;
bfbdd35b
BVA
996 }
997 }
b7694961 998
bfbdd35b 999 /* a sentinel */
b7694961 1000 zbd_info->zone_info[nr_zones].start = offset;
bfbdd35b
BVA
1001
1002 f->zbd_info = zbd_info;
1003 f->zbd_info->zone_size = zone_size;
1004 f->zbd_info->zone_size_log2 = is_power_of_2(zone_size) ?
ebc403fe 1005 ilog2(zone_size) : 0;
bfbdd35b 1006 f->zbd_info->nr_zones = nr_zones;
9e523ef8 1007 f->zbd_info->max_active_zones = zbd_get_max_active_zones(td, f);
04f9090b
BVA
1008
1009 if (same_zone_cap)
1010 dprint(FD_ZBD, "Zone capacity = %"PRIu64" KB\n",
1011 capacity / 1024);
1012
bfbdd35b
BVA
1013 zbd_info = NULL;
1014 ret = 0;
1015
bfbdd35b 1016out:
b7694961
DLM
1017 sfree(zbd_info);
1018 free(zones);
bfbdd35b
BVA
1019 return ret;
1020}
1021
a4807046 1022static int zbd_set_max_write_zones(struct thread_data *td, struct fio_file *f)
d2f442bc
NC
1023{
1024 struct zoned_block_device_info *zbd = f->zbd_info;
1025 unsigned int max_open_zones;
1026 int ret;
1027
575686bb 1028 if (zbd->model != ZBD_HOST_MANAGED || td->o.ignore_zone_limits) {
d2f442bc 1029 /* Only host-managed devices have a max open limit */
a4807046 1030 zbd->max_write_zones = td->o.max_open_zones;
d2f442bc
NC
1031 goto out;
1032 }
1033
1034 /* If host-managed, get the max open limit */
1035 ret = zbd_get_max_open_zones(td, f, &max_open_zones);
1036 if (ret)
1037 return ret;
1038
1039 if (!max_open_zones) {
1040 /* No device limit */
a4807046 1041 zbd->max_write_zones = td->o.max_open_zones;
d2f442bc
NC
1042 } else if (!td->o.max_open_zones) {
1043 /* No user limit. Set limit to device limit */
a4807046 1044 zbd->max_write_zones = max_open_zones;
d2f442bc
NC
1045 } else if (td->o.max_open_zones <= max_open_zones) {
1046 /* Both user limit and dev limit. User limit not too large */
a4807046 1047 zbd->max_write_zones = td->o.max_open_zones;
d2f442bc
NC
1048 } else {
1049 /* Both user limit and dev limit. User limit too large */
1050 td_verror(td, EINVAL,
1051 "Specified --max_open_zones is too large");
1052 log_err("Specified --max_open_zones (%d) is larger than max (%u)\n",
1053 td->o.max_open_zones, max_open_zones);
1054 return -EINVAL;
1055 }
1056
1057out:
1058 /* Ensure that the limit is not larger than FIO's internal limit */
a4807046 1059 if (zbd->max_write_zones > ZBD_MAX_WRITE_ZONES) {
b346af90 1060 td_verror(td, EINVAL, "'max_open_zones' value is too large");
139d8dc6 1061 log_err("'max_open_zones' value is larger than %u\n",
a4807046 1062 ZBD_MAX_WRITE_ZONES);
b346af90
NC
1063 return -EINVAL;
1064 }
1065
a4807046
SK
1066 dprint(FD_ZBD, "%s: using max write zones limit: %"PRIu32"\n",
1067 f->file_name, zbd->max_write_zones);
d2f442bc
NC
1068
1069 return 0;
1070}
1071
bfbdd35b
BVA
1072/*
1073 * Allocate zone information and store it into f->zbd_info if zonemode=zbd.
1074 *
1075 * Returns 0 upon success and a negative error code upon failure.
1076 */
379e5f09 1077static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
bfbdd35b 1078{
b7694961
DLM
1079 enum zbd_zoned_model zbd_model;
1080 int ret;
bfbdd35b
BVA
1081
1082 assert(td->o.zone_mode == ZONE_MODE_ZBD);
1083
b7694961
DLM
1084 ret = zbd_get_zoned_model(td, f, &zbd_model);
1085 if (ret)
1086 return ret;
1087
bfbdd35b 1088 switch (zbd_model) {
b7694961
DLM
1089 case ZBD_HOST_AWARE:
1090 case ZBD_HOST_MANAGED:
bfbdd35b 1091 ret = parse_zone_info(td, f);
d2f442bc
NC
1092 if (ret)
1093 return ret;
bfbdd35b 1094 break;
b7694961 1095 case ZBD_NONE:
bfbdd35b 1096 ret = init_zone_info(td, f);
d2f442bc
NC
1097 if (ret)
1098 return ret;
bfbdd35b 1099 break;
b7694961
DLM
1100 default:
1101 td_verror(td, EINVAL, "Unsupported zoned model");
1102 log_err("Unsupported zoned model\n");
1103 return -EINVAL;
bfbdd35b 1104 }
b7694961 1105
2c7dd23e 1106 assert(f->zbd_info);
d2f442bc
NC
1107 f->zbd_info->model = zbd_model;
1108
a4807046 1109 ret = zbd_set_max_write_zones(td, f);
d2f442bc
NC
1110 if (ret) {
1111 zbd_free_zone_info(f);
1112 return ret;
219c662d 1113 }
d2f442bc
NC
1114
1115 return 0;
bfbdd35b
BVA
1116}
1117
1118void zbd_free_zone_info(struct fio_file *f)
1119{
1120 uint32_t refcount;
1121
3c1dc34c 1122 assert(f->zbd_info);
bfbdd35b
BVA
1123
1124 pthread_mutex_lock(&f->zbd_info->mutex);
1125 refcount = --f->zbd_info->refcount;
1126 pthread_mutex_unlock(&f->zbd_info->mutex);
1127
1128 assert((int32_t)refcount >= 0);
1129 if (refcount == 0)
1130 sfree(f->zbd_info);
1131 f->zbd_info = NULL;
1132}
1133
1134/*
1135 * Initialize f->zbd_info.
1136 *
1137 * Returns 0 upon success and a negative error code upon failure.
1138 *
1139 * Note: this function can only work correctly if it is called before the first
1140 * fio fork() call.
1141 */
1142static int zbd_init_zone_info(struct thread_data *td, struct fio_file *file)
1143{
bfbdd35b 1144 struct fio_file *f2;
da8f124f 1145 int j, ret;
bfbdd35b 1146
da8f124f 1147 for_each_td(td2) {
bfbdd35b
BVA
1148 for_each_file(td2, f2, j) {
1149 if (td2 == td && f2 == file)
1150 continue;
1151 if (!f2->zbd_info ||
1152 strcmp(f2->file_name, file->file_name) != 0)
1153 continue;
1154 file->zbd_info = f2->zbd_info;
1155 file->zbd_info->refcount++;
1156 return 0;
1157 }
da8f124f 1158 } end_for_each();
bfbdd35b
BVA
1159
1160 ret = zbd_create_zone_info(td, file);
1161 if (ret < 0)
c5837eec 1162 td_verror(td, -ret, "zbd_create_zone_info() failed");
139d8dc6 1163
bfbdd35b
BVA
1164 return ret;
1165}
1166
8f39afa7 1167int zbd_init_files(struct thread_data *td)
bfbdd35b
BVA
1168{
1169 struct fio_file *f;
1170 int i;
1171
1172 for_each_file(td, f, i) {
a4b7f12b 1173 if (zbd_init_zone_info(td, f))
bfbdd35b 1174 return 1;
bfbdd35b 1175 }
139d8dc6 1176
8f39afa7
AD
1177 return 0;
1178}
1179
1180void zbd_recalc_options_with_zone_granularity(struct thread_data *td)
1181{
1182 struct fio_file *f;
1183 int i;
1184
1185 for_each_file(td, f, i) {
1186 struct zoned_block_device_info *zbd = f->zbd_info;
139d8dc6 1187 uint64_t zone_size;
8f39afa7 1188
139d8dc6
DLM
1189 /* zonemode=strided doesn't get per-file zone size. */
1190 zone_size = zbd ? zbd->zone_size : td->o.zone_size;
8f39afa7
AD
1191 if (zone_size == 0)
1192 continue;
1193
139d8dc6 1194 if (td->o.size_nz > 0)
8f39afa7 1195 td->o.size = td->o.size_nz * zone_size;
139d8dc6 1196 if (td->o.io_size_nz > 0)
8f39afa7 1197 td->o.io_size = td->o.io_size_nz * zone_size;
139d8dc6 1198 if (td->o.start_offset_nz > 0)
8f39afa7 1199 td->o.start_offset = td->o.start_offset_nz * zone_size;
139d8dc6
DLM
1200 if (td->o.offset_increment_nz > 0)
1201 td->o.offset_increment =
1202 td->o.offset_increment_nz * zone_size;
1203 if (td->o.zone_skip_nz > 0)
8f39afa7 1204 td->o.zone_skip = td->o.zone_skip_nz * zone_size;
8f39afa7
AD
1205 }
1206}
1207
9fb714da
SK
1208static uint64_t zbd_verify_and_set_vdb(struct thread_data *td,
1209 const struct fio_file *f)
1210{
1211 struct fio_zone_info *zb, *ze, *z;
1212 uint64_t wp_vdb = 0;
1213 struct zoned_block_device_info *zbdi = f->zbd_info;
1214
1215 assert(td->runstate < TD_RUNNING);
1216 assert(zbdi);
1217
1218 if (!accounting_vdb(td, f))
1219 return 0;
1220
1221 /*
1222 * Ensure that the I/O range includes one or more sequential zones so
1223 * that f->min_zone and f->max_zone have different values.
1224 */
1225 if (!zbd_is_seq_job(f))
1226 return 0;
1227
1228 if (zbdi->write_min_zone != zbdi->write_max_zone) {
1229 if (zbdi->write_min_zone != f->min_zone ||
1230 zbdi->write_max_zone != f->max_zone) {
1231 td_verror(td, EINVAL,
1232 "multi-jobs with different write ranges are "
1233 "not supported with zone_reset_threshold");
1234 log_err("multi-jobs with different write ranges are "
1235 "not supported with zone_reset_threshold\n");
1236 }
1237 return 0;
1238 }
1239
1240 zbdi->write_min_zone = f->min_zone;
1241 zbdi->write_max_zone = f->max_zone;
1242
1243 zb = zbd_get_zone(f, f->min_zone);
1244 ze = zbd_get_zone(f, f->max_zone);
1245 for (z = zb; z < ze; z++)
1246 if (z->has_wp)
1247 wp_vdb += z->wp - z->start;
1248
1249 zbdi->wp_valid_data_bytes = wp_vdb;
1250
1251 return wp_vdb;
1252}
1253
8f39afa7
AD
1254int zbd_setup_files(struct thread_data *td)
1255{
1256 struct fio_file *f;
1257 int i;
bfbdd35b
BVA
1258
1259 if (!zbd_using_direct_io()) {
1260 log_err("Using direct I/O is mandatory for writing to ZBD drives\n\n");
1261 return 1;
1262 }
1263
1264 if (!zbd_verify_sizes())
1265 return 1;
1266
1267 if (!zbd_verify_bs())
1268 return 1;
1269
650c4ad3
SK
1270 if (td->o.recover_zbd_write_error && td_write(td)) {
1271 if (!td->o.continue_on_error) {
1272 log_err("recover_zbd_write_error works only when continue_on_error is set\n");
1273 return 1;
1274 }
1275 if (td->o.verify != VERIFY_NONE &&
1276 !td_ioengine_flagged(td, FIO_SYNCIO)) {
1277 log_err("recover_zbd_write_error for async IO engines does not support verify\n");
1278 return 1;
1279 }
1280 }
1281
6e2da06a
SK
1282 if (td->o.experimental_verify) {
1283 log_err("zonemode=zbd does not support experimental verify\n");
1284 return 1;
1285 }
1286
219c662d
AD
1287 for_each_file(td, f, i) {
1288 struct zoned_block_device_info *zbd = f->zbd_info;
954217b9
SK
1289 struct fio_zone_info *z;
1290 int zi;
9fb714da 1291 uint64_t vdb;
219c662d 1292
5ddf46d0 1293 assert(zbd);
219c662d 1294
dc8a3d62
DLM
1295 f->min_zone = zbd_offset_to_zone_idx(f, f->file_offset);
1296 f->max_zone =
1297 zbd_offset_to_zone_idx(f, f->file_offset + f->io_size);
f952800a 1298
9fb714da
SK
1299 vdb = zbd_verify_and_set_vdb(td, f);
1300
1301 dprint(FD_ZBD, "%s(%s): valid data bytes = %" PRIu64 "\n",
1302 __func__, f->file_name, vdb);
1303
f952800a
SK
1304 /*
1305 * When all zones in the I/O range are conventional, io_size
1306 * can be smaller than zone size, making min_zone the same
1307 * as max_zone. This is why the assert below needs to be made
1308 * conditional.
1309 */
1310 if (zbd_is_seq_job(f))
1311 assert(f->min_zone < f->max_zone);
1312
219c662d 1313 if (td->o.max_open_zones > 0 &&
a4807046 1314 zbd->max_write_zones != td->o.max_open_zones) {
219c662d
AD
1315 log_err("Different 'max_open_zones' values\n");
1316 return 1;
1317 }
b346af90 1318
f3abed70
SK
1319 /*
1320 * If this job does not do write operations, skip open zone
1321 * condition check.
1322 */
1323 if (!td_write(td)) {
1324 if (td->o.job_max_open_zones)
1325 log_info("'job_max_open_zones' is valid only for write jobs\n");
1326 continue;
1327 }
1328
b346af90
NC
1329 /*
1330 * The per job max open zones limit cannot be used without a
1331 * global max open zones limit. (As the tracking of open zones
1332 * is disabled when there is no global max open zones limit.)
1333 */
a4807046 1334 if (td->o.job_max_open_zones && !zbd->max_write_zones) {
b346af90 1335 log_err("'job_max_open_zones' cannot be used without a global open zones limit\n");
219c662d
AD
1336 return 1;
1337 }
954217b9 1338
ea51055c 1339 /*
a4807046 1340 * zbd->max_write_zones is the global limit shared for all jobs
ea51055c
NC
1341 * that target the same zoned block device. Force sync the per
1342 * thread global limit with the actual global limit. (The real
1343 * per thread/job limit is stored in td->o.job_max_open_zones).
1344 */
a4807046 1345 td->o.max_open_zones = zbd->max_write_zones;
ea51055c 1346
954217b9
SK
1347 for (zi = f->min_zone; zi < f->max_zone; zi++) {
1348 z = &zbd->zone_info[zi];
1349 if (z->cond != ZBD_ZONE_COND_IMP_OPEN &&
bab838f8
SK
1350 z->cond != ZBD_ZONE_COND_EXP_OPEN &&
1351 z->cond != ZBD_ZONE_COND_CLOSED)
1352 continue;
1353 if (!zbd->max_active_zones &&
1354 z->cond == ZBD_ZONE_COND_CLOSED)
954217b9 1355 continue;
f539b98c 1356 if (__zbd_write_zone_get(td, f, z))
954217b9
SK
1357 continue;
1358 /*
1359 * If the number of open zones exceeds specified limits,
8ac76889 1360 * error out.
954217b9 1361 */
8ac76889
SK
1362 log_err("Number of open zones exceeds max_open_zones limit\n");
1363 return 1;
954217b9 1364 }
219c662d
AD
1365 }
1366
bfbdd35b
BVA
1367 return 0;
1368}
1369
a7c2b6fc
BVA
1370/*
1371 * Reset zbd_info.write_cnt, the counter that counts down towards the next
1372 * zone reset.
1373 */
1bb1bcad
AD
1374static void _zbd_reset_write_cnt(const struct thread_data *td,
1375 const struct fio_file *f)
a7c2b6fc
BVA
1376{
1377 assert(0 <= td->o.zrf.u.f && td->o.zrf.u.f <= 1);
1378
a7c2b6fc
BVA
1379 f->zbd_info->write_cnt = td->o.zrf.u.f ?
1380 min(1.0 / td->o.zrf.u.f, 0.0 + UINT_MAX) : UINT_MAX;
1bb1bcad
AD
1381}
1382
1383static void zbd_reset_write_cnt(const struct thread_data *td,
1384 const struct fio_file *f)
1385{
1386 pthread_mutex_lock(&f->zbd_info->mutex);
1387 _zbd_reset_write_cnt(td, f);
a7c2b6fc
BVA
1388 pthread_mutex_unlock(&f->zbd_info->mutex);
1389}
1390
1391static bool zbd_dec_and_reset_write_cnt(const struct thread_data *td,
1392 const struct fio_file *f)
1393{
1394 uint32_t write_cnt = 0;
1395
1396 pthread_mutex_lock(&f->zbd_info->mutex);
1397 assert(f->zbd_info->write_cnt);
1398 if (f->zbd_info->write_cnt)
1399 write_cnt = --f->zbd_info->write_cnt;
1400 if (write_cnt == 0)
1bb1bcad 1401 _zbd_reset_write_cnt(td, f);
a7c2b6fc
BVA
1402 pthread_mutex_unlock(&f->zbd_info->mutex);
1403
1404 return write_cnt == 0;
1405}
1406
bfbdd35b
BVA
1407void zbd_file_reset(struct thread_data *td, struct fio_file *f)
1408{
91d25131 1409 struct fio_zone_info *zb, *ze;
c5c8b92b 1410 bool verify_data_left = false;
bfbdd35b 1411
767d1372 1412 if (!f->zbd_info || !td_write(td))
bfbdd35b
BVA
1413 return;
1414
39e06ee7
DLM
1415 zb = zbd_get_zone(f, f->min_zone);
1416 ze = zbd_get_zone(f, f->max_zone);
139d8dc6 1417
bfbdd35b
BVA
1418 /*
1419 * If data verification is enabled reset the affected zones before
1420 * writing any data to avoid that a zone reset has to be issued while
1421 * writing data, which causes data loss.
1422 */
c5c8b92b
SK
1423 if (td->o.verify != VERIFY_NONE) {
1424 verify_data_left = td->runstate == TD_VERIFYING ||
1425 td->io_hist_len || td->verify_batch;
c5c8b92b
SK
1426 if (!verify_data_left)
1427 zbd_reset_zones(td, f, zb, ze);
1428 }
1429
a7c2b6fc 1430 zbd_reset_write_cnt(td, f);
bfbdd35b
BVA
1431}
1432
a4807046 1433/* Return random zone index for one of the write target zones. */
6463db6c
AD
1434static uint32_t pick_random_zone_idx(const struct fio_file *f,
1435 const struct io_u *io_u)
1436{
139d8dc6 1437 return (io_u->offset - f->file_offset) *
a4807046 1438 f->zbd_info->num_write_zones / f->io_size;
6463db6c
AD
1439}
1440
fbac34f1
SK
1441/*
1442 * Randomly choose a zone in the array of write zones and in the range for the
1443 * file f. If such a zone is found, return its index in f->zbd_info->zone_info[]
1444 * using @zone_idx, and return true. Otherwise, return false.
1445 *
1446 * Caller must hold f->zbd_info->mutex.
1447 */
1448static bool zbd_pick_write_zone(const struct fio_file* f,
1449 const struct io_u *io_u, uint32_t *zone_idx)
1450{
1451 struct zoned_block_device_info *zbdi = f->zbd_info;
1452 uint32_t write_zone_idx;
1453 uint32_t cur_zone_idx;
1454 int i;
1455
1456 /*
1457 * An array of write target zones is per-device, shared across all jobs.
1458 * Start with quasi-random candidate zone. Ignore zones which do not
1459 * belong to offset/size range of the current job.
1460 */
1461 write_zone_idx = pick_random_zone_idx(f, io_u);
1462 assert(!write_zone_idx || write_zone_idx < zbdi->num_write_zones);
1463
1464 for (i = 0; i < zbdi->num_write_zones; i++) {
1465 if (write_zone_idx >= zbdi->num_write_zones)
1466 write_zone_idx = 0;
1467 cur_zone_idx = zbdi->write_zones[write_zone_idx];
1468 if (f->min_zone <= cur_zone_idx && cur_zone_idx < f->max_zone) {
1469 *zone_idx = cur_zone_idx;
1470 return true;
1471 }
1472 write_zone_idx++;
1473 }
1474
1475 return false;
1476}
1477
0f77c977
SK
1478static bool any_io_in_flight(void)
1479{
da8f124f 1480 for_each_td(td) {
0f77c977
SK
1481 if (td->io_u_in_flight)
1482 return true;
da8f124f 1483 } end_for_each();
0f77c977
SK
1484
1485 return false;
1486}
1487
2a87f269
SK
1488/**
1489 * zbd_convert_to_write_zone - Convert the target zone of an io_u to a writable zone
1490 * @td: The fio thread data
1491 * @io_u: The I/O unit that targets the zone to convert
1492 * @zb: The zone selected at the beginning of the function call. The caller must
1493 * hold zb->mutex.
1494 *
a4807046 1495 * Modify the offset of an I/O unit that does not refer to a zone such that
2a87f269 1496 * in write target zones array. Add a zone to or remove a zone from the array if
a4807046 1497 * necessary. The write target zone is searched across sequential zones.
21c0c884 1498 * This algorithm can only work correctly if all write pointers are
2a87f269
SK
1499 * a multiple of the fio block size. The caller must not hold
1500 * f->zbd_info->mutex. Returns with z->mutex held upon success.
59b07544 1501 */
a4807046 1502static struct fio_zone_info *zbd_convert_to_write_zone(struct thread_data *td,
2a87f269
SK
1503 struct io_u *io_u,
1504 struct fio_zone_info *zb)
59b07544 1505{
07fc3f57 1506 const uint64_t min_bs = td->o.min_bs[io_u->ddir];
fae3b9a0 1507 struct fio_file *f = io_u->file;
af94a8c3 1508 struct zoned_block_device_info *zbdi = f->zbd_info;
59b07544 1509 struct fio_zone_info *z;
59b07544
BVA
1510 uint32_t zone_idx, new_zone_idx;
1511 int i;
a4807046 1512 bool wait_zone_write;
0f77c977
SK
1513 bool in_flight;
1514 bool should_retry = true;
e2e29bf6 1515 bool need_zone_finish;
59b07544
BVA
1516
1517 assert(is_valid_offset(f, io_u->offset));
1518
2a87f269
SK
1519 if (zbd_zone_remainder(zb) > 0 && zbd_zone_remainder(zb) < min_bs) {
1520 pthread_mutex_lock(&f->zbd_info->mutex);
1521 zbd_write_zone_put(td, f, zb);
1522 pthread_mutex_unlock(&f->zbd_info->mutex);
1523 dprint(FD_ZBD, "%s: finish zone %d\n",
1524 f->file_name, zbd_zone_idx(f, zb));
1525 io_u_quiesce(td);
1526 zbd_finish_zone(td, f, zb);
1527 zone_unlock(zb);
1528
1529 if (zbd_zone_idx(f, zb) + 1 >= f->max_zone && !td_random(td))
1530 return NULL;
1531
1532 /* Find the next write pointer zone */
1533 do {
1534 zb++;
1535 if (zbd_zone_idx(f, zb) >= f->max_zone)
1536 zb = zbd_get_zone(f, f->min_zone);
1537 } while (!zb->has_wp);
1538
1539 zone_lock(td, f, zb);
1540 }
1541
1542 if (zbd_write_zone_get(td, f, zb))
1543 return zb;
1544
1545 zone_unlock(zb);
1546
a4807046 1547 if (zbdi->max_write_zones || td->o.job_max_open_zones) {
59b07544 1548 /*
a4807046 1549 * This statement accesses zbdi->write_zones[] on purpose
59b07544
BVA
1550 * without locking.
1551 */
a4807046 1552 zone_idx = zbdi->write_zones[pick_random_zone_idx(f, io_u)];
59b07544 1553 } else {
dc8a3d62 1554 zone_idx = zbd_offset_to_zone_idx(f, io_u->offset);
59b07544 1555 }
fae3b9a0
AD
1556 if (zone_idx < f->min_zone)
1557 zone_idx = f->min_zone;
1558 else if (zone_idx >= f->max_zone)
1559 zone_idx = f->max_zone - 1;
139d8dc6
DLM
1560
1561 dprint(FD_ZBD,
1562 "%s(%s): starting from zone %d (offset %lld, buflen %lld)\n",
59b07544
BVA
1563 __func__, f->file_name, zone_idx, io_u->offset, io_u->buflen);
1564
1565 /*
af94a8c3 1566 * Since z->mutex is the outer lock and zbdi->mutex the inner
59b07544 1567 * lock it can happen that the state of the zone with index zone_idx
af94a8c3 1568 * has changed after 'z' has been assigned and before zbdi->mutex
59b07544
BVA
1569 * has been obtained. Hence the loop.
1570 */
1571 for (;;) {
39e06ee7 1572 z = zbd_get_zone(f, zone_idx);
14351148
DF
1573 if (z->has_wp)
1574 zone_lock(td, f, z);
139d8dc6 1575
af94a8c3 1576 pthread_mutex_lock(&zbdi->mutex);
139d8dc6 1577
14351148
DF
1578 if (z->has_wp) {
1579 if (z->cond != ZBD_ZONE_COND_OFFLINE &&
a4807046 1580 zbdi->max_write_zones == 0 &&
139d8dc6 1581 td->o.job_max_open_zones == 0)
14351148 1582 goto examine_zone;
a4807046
SK
1583 if (zbdi->num_write_zones == 0) {
1584 dprint(FD_ZBD, "%s(%s): no zone is write target\n",
14351148 1585 __func__, f->file_name);
a4807046 1586 goto choose_other_zone;
14351148 1587 }
59b07544 1588 }
6463db6c 1589
fbac34f1
SK
1590 if (!zbd_pick_write_zone(f, io_u, &new_zone_idx)) {
1591 dprint(FD_ZBD, "%s(%s): no candidate zone\n",
1592 __func__, f->file_name);
1593 pthread_mutex_unlock(&zbdi->mutex);
1594 if (z->has_wp)
1595 zone_unlock(z);
1596 return NULL;
6463db6c
AD
1597 }
1598
59b07544
BVA
1599 if (new_zone_idx == zone_idx)
1600 break;
1601 zone_idx = new_zone_idx;
139d8dc6 1602
af94a8c3 1603 pthread_mutex_unlock(&zbdi->mutex);
139d8dc6 1604
14351148
DF
1605 if (z->has_wp)
1606 zone_unlock(z);
59b07544
BVA
1607 }
1608
af94a8c3 1609 /* Both z->mutex and zbdi->mutex are held. */
59b07544
BVA
1610
1611examine_zone:
df67bf1e 1612 if (zbd_zone_remainder(z) >= min_bs) {
af94a8c3 1613 pthread_mutex_unlock(&zbdi->mutex);
59b07544
BVA
1614 goto out;
1615 }
b2da58c4 1616
a4807046
SK
1617choose_other_zone:
1618 /* Check if number of write target zones reaches one of limits. */
1619 wait_zone_write =
1620 zbdi->num_write_zones == f->max_zone - f->min_zone ||
1621 (zbdi->max_write_zones &&
1622 zbdi->num_write_zones == zbdi->max_write_zones) ||
b2da58c4 1623 (td->o.job_max_open_zones &&
a4807046 1624 td->num_write_zones == td->o.job_max_open_zones);
b2da58c4 1625
af94a8c3 1626 pthread_mutex_unlock(&zbdi->mutex);
59b07544
BVA
1627
1628 /* Only z->mutex is held. */
1629
b2da58c4 1630 /*
a4807046
SK
1631 * When number of write target zones reaches to one of limits, wait for
1632 * zone write completion to one of them before trying a new zone.
b2da58c4 1633 */
a4807046 1634 if (wait_zone_write) {
139d8dc6 1635 dprint(FD_ZBD,
a4807046 1636 "%s(%s): quiesce to remove a zone from write target zones array\n",
b2da58c4
SK
1637 __func__, f->file_name);
1638 io_u_quiesce(td);
1639 }
1640
0f77c977 1641retry:
a4807046 1642 /* Zone 'z' is full, so try to choose a new zone. */
af94a8c3 1643 for (i = f->io_size / zbdi->zone_size; i > 0; i--) {
59b07544 1644 zone_idx++;
21c0c884
SK
1645 if (z->has_wp)
1646 zone_unlock(z);
59b07544 1647 z++;
ee3696bd 1648 if (!is_valid_offset(f, z->start)) {
59b07544 1649 /* Wrap-around. */
fae3b9a0 1650 zone_idx = f->min_zone;
39e06ee7 1651 z = zbd_get_zone(f, zone_idx);
59b07544 1652 }
ee3696bd 1653 assert(is_valid_offset(f, z->start));
21c0c884
SK
1654 if (!z->has_wp)
1655 continue;
fae3b9a0 1656 zone_lock(td, f, z);
a4807046 1657 if (z->write)
59b07544 1658 continue;
a4807046 1659 if (zbd_write_zone_get(td, f, z))
59b07544
BVA
1660 goto out;
1661 }
1662
1663 /* Only z->mutex is held. */
1664
a4807046 1665 /* Check whether the write fits in any of the write target zones. */
af94a8c3 1666 pthread_mutex_lock(&zbdi->mutex);
e2e29bf6 1667 need_zone_finish = true;
a4807046
SK
1668 for (i = 0; i < zbdi->num_write_zones; i++) {
1669 zone_idx = zbdi->write_zones[i];
fae3b9a0
AD
1670 if (zone_idx < f->min_zone || zone_idx >= f->max_zone)
1671 continue;
af94a8c3 1672 pthread_mutex_unlock(&zbdi->mutex);
4d4c71e6 1673 zone_unlock(z);
59b07544 1674
39e06ee7 1675 z = zbd_get_zone(f, zone_idx);
59b07544 1676
fae3b9a0 1677 zone_lock(td, f, z);
e2e29bf6
SK
1678 if (zbd_zone_remainder(z) >= min_bs) {
1679 need_zone_finish = false;
59b07544 1680 goto out;
e2e29bf6 1681 }
af94a8c3 1682 pthread_mutex_lock(&zbdi->mutex);
59b07544 1683 }
0f77c977
SK
1684
1685 /*
1686 * When any I/O is in-flight or when all I/Os in-flight get completed,
a4807046
SK
1687 * the I/Os might have removed zones from the write target array then
1688 * retry the steps to choose a zone. Before retry, call io_u_quiesce()
1689 * to complete in-flight writes.
0f77c977
SK
1690 */
1691 in_flight = any_io_in_flight();
1692 if (in_flight || should_retry) {
139d8dc6 1693 dprint(FD_ZBD,
a4807046 1694 "%s(%s): wait zone write and retry write target zone selection\n",
0f77c977 1695 __func__, f->file_name);
62ac6649 1696 should_retry = in_flight;
0f77c977
SK
1697 pthread_mutex_unlock(&zbdi->mutex);
1698 zone_unlock(z);
1699 io_u_quiesce(td);
1700 zone_lock(td, f, z);
0f77c977
SK
1701 goto retry;
1702 }
1703
e2e29bf6
SK
1704 if (td_random(td) && td->o.verify == VERIFY_NONE && need_zone_finish)
1705 /*
1706 * If all open zones have remainder smaller than the block size
1707 * for random write jobs, choose one of the write target zones
1708 * and finish it. When verify is enabled, skip this zone finish
1709 * operation to avoid verify data corruption by overwrite to the
1710 * zone.
1711 */
1712 if (zbd_pick_write_zone(f, io_u, &zone_idx)) {
1713 pthread_mutex_unlock(&zbdi->mutex);
1714 zone_unlock(z);
1715 z = zbd_get_zone(f, zone_idx);
1716 zone_lock(td, f, z);
1717 io_u_quiesce(td);
1718 dprint(FD_ZBD, "%s(%s): All write target zones have remainder smaller than block size. Choose zone %d and finish.\n",
1719 __func__, f->file_name, zone_idx);
1720 zbd_finish_zone(td, f, z);
1721 goto out;
1722 }
1723
af94a8c3 1724 pthread_mutex_unlock(&zbdi->mutex);
139d8dc6 1725
4d4c71e6 1726 zone_unlock(z);
139d8dc6 1727
a4807046 1728 dprint(FD_ZBD, "%s(%s): did not choose another write zone\n",
139d8dc6
DLM
1729 __func__, f->file_name);
1730
59b07544
BVA
1731 return NULL;
1732
1733out:
139d8dc6
DLM
1734 dprint(FD_ZBD, "%s(%s): returning zone %d\n",
1735 __func__, f->file_name, zone_idx);
1736
ee3696bd 1737 io_u->offset = z->start;
21c0c884 1738 assert(z->has_wp);
8a866de7 1739 assert(z->cond != ZBD_ZONE_COND_OFFLINE);
139d8dc6 1740
59b07544
BVA
1741 return z;
1742}
1743
bfbdd35b 1744/*
5c86fdf6
SK
1745 * Find another zone which has @min_bytes of readable data. Search in zones
1746 * @zb + 1 .. @zl. For random workload, also search in zones @zb - 1 .. @zf.
bfbdd35b 1747 *
21c0c884
SK
1748 * Either returns NULL or returns a zone pointer. When the zone has write
1749 * pointer, hold the mutex for the zone.
bfbdd35b
BVA
1750 */
1751static struct fio_zone_info *
07fc3f57 1752zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint64_t min_bytes,
bfbdd35b
BVA
1753 struct fio_zone_info *zb, struct fio_zone_info *zl)
1754{
fae3b9a0 1755 struct fio_file *f = io_u->file;
bfbdd35b 1756 struct fio_zone_info *z1, *z2;
39e06ee7 1757 const struct fio_zone_info *const zf = zbd_get_zone(f, f->min_zone);
bfbdd35b
BVA
1758
1759 /*
1760 * Skip to the next non-empty zone in case of sequential I/O and to
1761 * the nearest non-empty zone in case of random I/O.
1762 */
1763 for (z1 = zb + 1, z2 = zb - 1; z1 < zl || z2 >= zf; z1++, z2--) {
b7694961 1764 if (z1 < zl && z1->cond != ZBD_ZONE_COND_OFFLINE) {
21c0c884
SK
1765 if (z1->has_wp)
1766 zone_lock(td, f, z1);
5c86fdf6 1767 if (z1->start + min_bytes <= z1->wp)
bfbdd35b 1768 return z1;
21c0c884
SK
1769 if (z1->has_wp)
1770 zone_unlock(z1);
bfbdd35b
BVA
1771 } else if (!td_random(td)) {
1772 break;
1773 }
139d8dc6 1774
bfbdd35b 1775 if (td_random(td) && z2 >= zf &&
b7694961 1776 z2->cond != ZBD_ZONE_COND_OFFLINE) {
21c0c884
SK
1777 if (z2->has_wp)
1778 zone_lock(td, f, z2);
5c86fdf6 1779 if (z2->start + min_bytes <= z2->wp)
bfbdd35b 1780 return z2;
21c0c884
SK
1781 if (z2->has_wp)
1782 zone_unlock(z2);
bfbdd35b
BVA
1783 }
1784 }
139d8dc6
DLM
1785
1786 dprint(FD_ZBD,
1787 "%s: no zone has %"PRIu64" bytes of readable data\n",
5c86fdf6 1788 f->file_name, min_bytes);
139d8dc6 1789
bfbdd35b
BVA
1790 return NULL;
1791}
1792
b2da58c4
SK
1793/**
1794 * zbd_end_zone_io - update zone status at command completion
1795 * @io_u: I/O unit
1796 * @z: zone info pointer
1797 *
a4807046
SK
1798 * If the write command made the zone full, remove it from the write target
1799 * zones array.
b2da58c4
SK
1800 *
1801 * The caller must hold z->mutex.
1802 */
1803static void zbd_end_zone_io(struct thread_data *td, const struct io_u *io_u,
1804 struct fio_zone_info *z)
1805{
1806 const struct fio_file *f = io_u->file;
1807
1808 if (io_u->ddir == DDIR_WRITE &&
1809 io_u->offset + io_u->buflen >= zbd_zone_capacity_end(z)) {
1810 pthread_mutex_lock(&f->zbd_info->mutex);
a4807046 1811 zbd_write_zone_put(td, f, z);
b2da58c4
SK
1812 pthread_mutex_unlock(&f->zbd_info->mutex);
1813 }
1814}
1815
bfbdd35b 1816/**
d9ed3e63 1817 * zbd_queue_io - update the write pointer of a sequential zone
bfbdd35b 1818 * @io_u: I/O unit
d9ed3e63
DLM
1819 * @success: Whether or not the I/O unit has been queued successfully
1820 * @q: queueing status (busy, completed or queued).
bfbdd35b 1821 *
d9ed3e63
DLM
1822 * For write and trim operations, update the write pointer of the I/O unit
1823 * target zone.
bfbdd35b 1824 */
650c4ad3 1825static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int *q)
bfbdd35b 1826{
d9ed3e63
DLM
1827 const struct fio_file *f = io_u->file;
1828 struct zoned_block_device_info *zbd_info = f->zbd_info;
650c4ad3 1829 bool success = io_u->error == 0;
bfbdd35b 1830 struct fio_zone_info *z;
d9ed3e63 1831 uint64_t zone_end;
bfbdd35b 1832
5ddf46d0 1833 assert(zbd_info);
bfbdd35b 1834
53aa6171 1835 z = zbd_offset_to_zone(f, io_u->offset);
43bcbd5b 1836 assert(z->has_wp);
d9ed3e63 1837
650c4ad3
SK
1838 if (!success && td->o.recover_zbd_write_error &&
1839 io_u->ddir == DDIR_WRITE && td_ioengine_flagged(td, FIO_SYNCIO) &&
1840 *q == FIO_Q_COMPLETED) {
1841 zbd_recover_write_error(td, io_u);
1842 if (!io_u->error)
1843 success = true;
1844 }
1845
bfbdd35b
BVA
1846 if (!success)
1847 goto unlock;
d9ed3e63
DLM
1848
1849 dprint(FD_ZBD,
1850 "%s: queued I/O (%lld, %llu) for zone %u\n",
53aa6171 1851 f->file_name, io_u->offset, io_u->buflen, zbd_zone_idx(f, z));
d9ed3e63 1852
bfbdd35b
BVA
1853 switch (io_u->ddir) {
1854 case DDIR_WRITE:
d9ed3e63 1855 zone_end = min((uint64_t)(io_u->offset + io_u->buflen),
236d23a8 1856 zbd_zone_capacity_end(z));
139d8dc6 1857
a7c2b6fc
BVA
1858 /*
1859 * z->wp > zone_end means that one or more I/O errors
1860 * have occurred.
1861 */
2fb29f27
SK
1862 if (accounting_vdb(td, f) && z->wp <= zone_end) {
1863 pthread_mutex_lock(&zbd_info->mutex);
d56a6df3 1864 zbd_info->wp_valid_data_bytes += zone_end - z->wp;
2fb29f27
SK
1865 pthread_mutex_unlock(&zbd_info->mutex);
1866 }
bfbdd35b
BVA
1867 z->wp = zone_end;
1868 break;
bfbdd35b
BVA
1869 default:
1870 break;
1871 }
d9ed3e63 1872
650c4ad3 1873 if (*q == FIO_Q_COMPLETED && !io_u->error)
b2da58c4
SK
1874 zbd_end_zone_io(td, io_u, z);
1875
bfbdd35b 1876unlock:
650c4ad3
SK
1877 if (!success || *q != FIO_Q_QUEUED) {
1878 if (io_u->ddir == DDIR_WRITE) {
1879 z->writes_in_flight--;
1880 if (z->writes_in_flight == 0 && z->fixing_zone_wp) {
1881 dprint(FD_ZBD, "%s: Fixed write pointer of the zone %u\n",
1882 f->file_name, zbd_zone_idx(f, z));
1883 z->fixing_zone_wp = 0;
1884 }
1885 }
d9ed3e63 1886 /* BUSY or COMPLETED: unlock the zone */
4d4c71e6 1887 zone_unlock(z);
d9ed3e63
DLM
1888 io_u->zbd_put_io = NULL;
1889 }
1890}
1891
1892/**
1893 * zbd_put_io - Unlock an I/O unit target zone lock
1894 * @io_u: I/O unit
1895 */
b2da58c4 1896static void zbd_put_io(struct thread_data *td, const struct io_u *io_u)
d9ed3e63
DLM
1897{
1898 const struct fio_file *f = io_u->file;
d9ed3e63 1899 struct fio_zone_info *z;
d9ed3e63 1900
83276370 1901 assert(f->zbd_info);
615555bb 1902
53aa6171 1903 z = zbd_offset_to_zone(f, io_u->offset);
43bcbd5b 1904 assert(z->has_wp);
d9ed3e63
DLM
1905
1906 dprint(FD_ZBD,
1907 "%s: terminate I/O (%lld, %llu) for zone %u\n",
53aa6171 1908 f->file_name, io_u->offset, io_u->buflen, zbd_zone_idx(f, z));
d9ed3e63 1909
b2da58c4
SK
1910 zbd_end_zone_io(td, io_u, z);
1911
650c4ad3
SK
1912 if (io_u->ddir == DDIR_WRITE) {
1913 z->writes_in_flight--;
1914 if (z->writes_in_flight == 0 && z->fixing_zone_wp) {
1915 z->fixing_zone_wp = 0;
1916 dprint(FD_ZBD, "%s: Fixed write pointer of the zone %u\n",
1917 f->file_name, zbd_zone_idx(f, z));
1918 }
1919 }
1920
4d4c71e6 1921 zone_unlock(z);
bfbdd35b
BVA
1922}
1923
9d87c646
DLM
/*
 * Windows and MacOS do not define this.
 */
#ifndef EREMOTEIO
#define EREMOTEIO	121	/* POSIX value */
#endif

/*
 * Return true if @error_code is one of the error codes that this file treats
 * as the result of an unaligned write: EIO or EREMOTEIO.
 */
bool zbd_unaligned_write(int error_code)
{
	return error_code == EIO || error_code == EREMOTEIO;
}
1940
4d37720a
DLM
1941/**
1942 * setup_zbd_zone_mode - handle zoneskip as necessary for ZBD drives
1943 * @td: FIO thread data.
1944 * @io_u: FIO I/O unit.
1945 *
1946 * For sequential workloads, change the file offset to skip zoneskip bytes when
1947 * no more IO can be performed in the current zone.
1948 * - For read workloads, zoneskip is applied when the io has reached the end of
1949 * the zone or the zone write position (when td->o.read_beyond_wp is false).
1950 * - For write workloads, zoneskip is applied when the zone is full.
1951 * This applies only to read and write operations.
1952 */
1953void setup_zbd_zone_mode(struct thread_data *td, struct io_u *io_u)
1954{
1955 struct fio_file *f = io_u->file;
1956 enum fio_ddir ddir = io_u->ddir;
1957 struct fio_zone_info *z;
4d37720a
DLM
1958
1959 assert(td->o.zone_mode == ZONE_MODE_ZBD);
1960 assert(td->o.zone_size);
5ddf46d0 1961 assert(f->zbd_info);
4d37720a 1962
53aa6171 1963 z = zbd_offset_to_zone(f, f->last_pos[ddir]);
236d23a8
SK
1964
1965 /*
1966 * When the zone capacity is smaller than the zone size and the I/O is
1967 * sequential write, skip to zone end if the latest position is at the
1968 * zone capacity limit.
1969 */
139d8dc6
DLM
1970 if (z->capacity < f->zbd_info->zone_size &&
1971 !td_random(td) && ddir == DDIR_WRITE &&
236d23a8
SK
1972 f->last_pos[ddir] >= zbd_zone_capacity_end(z)) {
1973 dprint(FD_ZBD,
1974 "%s: Jump from zone capacity limit to zone end:"
ee5e3436
SK
1975 " (%"PRIu64" -> %"PRIu64") for zone %u (%"PRIu64")\n",
1976 f->file_name, f->last_pos[ddir],
53aa6171 1977 zbd_zone_end(z), zbd_zone_idx(f, z), z->capacity);
236d23a8
SK
1978 td->io_skip_bytes += zbd_zone_end(z) - f->last_pos[ddir];
1979 f->last_pos[ddir] = zbd_zone_end(z);
1980 }
1981
4d37720a
DLM
1982 /*
1983 * zone_skip is valid only for sequential workloads.
1984 */
1985 if (td_random(td) || !td->o.zone_skip)
1986 return;
1987
1988 /*
1989 * It is time to switch to a new zone if:
1990 * - zone_bytes == zone_size bytes have already been accessed
1991 * - The last position reached the end of the current zone.
1992 * - For reads with td->o.read_beyond_wp == false, the last position
1993 * reached the zone write pointer.
1994 */
4d37720a 1995 if (td->zone_bytes >= td->o.zone_size ||
236d23a8 1996 f->last_pos[ddir] >= zbd_zone_end(z) ||
4d37720a
DLM
1997 (ddir == DDIR_READ &&
1998 (!td->o.read_beyond_wp) && f->last_pos[ddir] >= z->wp)) {
1999 /*
2000 * Skip zones.
2001 */
2002 td->zone_bytes = 0;
2003 f->file_offset += td->o.zone_size + td->o.zone_skip;
2004
2005 /*
2006 * Wrap from the beginning, if we exceed the file size
2007 */
2008 if (f->file_offset >= f->real_file_size)
2009 f->file_offset = get_start_offset(td, f);
2010
2011 f->last_pos[ddir] = f->file_offset;
2012 td->io_skip_bytes += td->o.zone_skip;
2013 }
2014}
2015
c65057f9 2016/**
c7d5e152 2017 * zbd_adjust_ddir - Adjust an I/O direction for zonemode=zbd.
c65057f9
SK
2018 *
2019 * @td: FIO thread data.
2020 * @io_u: FIO I/O unit.
2021 * @ddir: I/O direction before adjustment.
2022 *
2023 * Return adjusted I/O direction.
2024 */
2025enum fio_ddir zbd_adjust_ddir(struct thread_data *td, struct io_u *io_u,
2026 enum fio_ddir ddir)
2027{
2028 /*
2029 * In case read direction is chosen for the first random I/O, fio with
2030 * zonemode=zbd stops because no data can be read from zoned block
2031 * devices with all empty zones. Overwrite the first I/O direction as
2032 * write to make sure data to read exists.
2033 */
5ddf46d0 2034 assert(io_u->file->zbd_info);
731461cc 2035 if (ddir != DDIR_READ || !td_rw(td))
c65057f9
SK
2036 return ddir;
2037
cbbfe5a9
SK
2038 if (io_u->file->last_start[DDIR_WRITE] != -1ULL ||
2039 td->o.read_beyond_wp || td->o.rwmix[DDIR_WRITE] == 0)
c65057f9
SK
2040 return DDIR_READ;
2041
2042 return DDIR_WRITE;
2043}
2044
bfbdd35b
BVA
2045/**
2046 * zbd_adjust_block - adjust the offset and length as necessary for ZBD drives
2047 * @td: FIO thread data.
2048 * @io_u: FIO I/O unit.
2049 *
2050 * Locking strategy: returns with z->mutex locked if and only if z refers
2051 * to a sequential zone and if io_u_accept is returned. z is the zone that
2052 * corresponds to io_u->offset at the end of this function.
2053 */
2054enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
2055{
b7694961 2056 struct fio_file *f = io_u->file;
af94a8c3 2057 struct zoned_block_device_info *zbdi = f->zbd_info;
de65f7b7 2058 struct fio_zone_info *zb, *zl, *orig_zb;
bfbdd35b 2059 uint32_t orig_len = io_u->buflen;
07fc3f57 2060 uint64_t min_bs = td->o.min_bs[io_u->ddir];
bfbdd35b
BVA
2061 uint64_t new_len;
2062 int64_t range;
2063
af94a8c3 2064 assert(zbdi);
adc6adcb 2065 assert(min_bs);
bfbdd35b
BVA
2066 assert(is_valid_offset(f, io_u->offset));
2067 assert(io_u->buflen);
139d8dc6 2068
53aa6171 2069 zb = zbd_offset_to_zone(f, io_u->offset);
de65f7b7 2070 orig_zb = zb;
bfbdd35b 2071
2efcf74b
SK
2072 if (!zb->has_wp) {
2073 /* Accept non-write I/Os for conventional zones. */
2074 if (io_u->ddir != DDIR_WRITE)
2075 return io_u_accept;
139d8dc6 2076
2efcf74b
SK
2077 /*
2078 * Make sure that writes to conventional zones
2079 * don't cross over to any sequential zones.
2080 */
2081 if (!(zb + 1)->has_wp ||
2082 io_u->offset + io_u->buflen <= (zb + 1)->start)
2083 return io_u_accept;
2084
2085 if (io_u->offset + min_bs > (zb + 1)->start) {
2086 dprint(FD_IO,
07fc3f57 2087 "%s: off=%llu + min_bs=%"PRIu64" > next zone %"PRIu64"\n",
1e3d6e03 2088 f->file_name, io_u->offset,
ee5e3436 2089 min_bs, (zb + 1)->start);
139d8dc6
DLM
2090 io_u->offset =
2091 zb->start + (zb + 1)->start - io_u->offset;
2092 new_len = min(io_u->buflen,
2093 (zb + 1)->start - io_u->offset);
2efcf74b
SK
2094 } else {
2095 new_len = (zb + 1)->start - io_u->offset;
2096 }
139d8dc6 2097
2efcf74b 2098 io_u->buflen = new_len / min_bs * min_bs;
139d8dc6 2099
bfbdd35b 2100 return io_u_accept;
2efcf74b 2101 }
bfbdd35b
BVA
2102
2103 /*
2104 * Accept the I/O offset for reads if reading beyond the write pointer
2105 * is enabled.
2106 */
b7694961 2107 if (zb->cond != ZBD_ZONE_COND_OFFLINE &&
bfbdd35b
BVA
2108 io_u->ddir == DDIR_READ && td->o.read_beyond_wp)
2109 return io_u_accept;
2110
650c4ad3 2111retry_lock:
fae3b9a0 2112 zone_lock(td, f, zb);
6f0c6085 2113
650c4ad3
SK
2114 if (!td_ioengine_flagged(td, FIO_SYNCIO) && zb->fixing_zone_wp) {
2115 zone_unlock(zb);
2116 io_u_quiesce(td);
2117 goto retry_lock;
2118 }
2119
bfbdd35b
BVA
2120 switch (io_u->ddir) {
2121 case DDIR_READ:
6e2da06a 2122 if (td->runstate == TD_VERIFYING && td_write(td))
bfbdd35b 2123 goto accept;
139d8dc6 2124
bfbdd35b 2125 /*
de65f7b7
DLM
2126 * Check that there is enough written data in the zone to do an
2127 * I/O of at least min_bs B. If there isn't, find a new zone for
2128 * the I/O.
bfbdd35b 2129 */
b7694961 2130 range = zb->cond != ZBD_ZONE_COND_OFFLINE ?
ee3696bd 2131 zb->wp - zb->start : 0;
de65f7b7 2132 if (range < min_bs ||
ee3696bd 2133 ((!td_random(td)) && (io_u->offset + min_bs > zb->wp))) {
4d4c71e6 2134 zone_unlock(zb);
39e06ee7 2135 zl = zbd_get_zone(f, f->max_zone);
5c86fdf6 2136 zb = zbd_find_zone(td, io_u, min_bs, zb, zl);
bfbdd35b
BVA
2137 if (!zb) {
2138 dprint(FD_ZBD,
2139 "%s: zbd_find_zone(%lld, %llu) failed\n",
2140 f->file_name, io_u->offset,
2141 io_u->buflen);
2142 goto eof;
2143 }
de65f7b7
DLM
2144 /*
2145 * zbd_find_zone() returned a zone with a range of at
2146 * least min_bs.
2147 */
ee3696bd 2148 range = zb->wp - zb->start;
de65f7b7
DLM
2149 assert(range >= min_bs);
2150
2151 if (!td_random(td))
ee3696bd 2152 io_u->offset = zb->start;
bfbdd35b 2153 }
139d8dc6 2154
de65f7b7
DLM
2155 /*
2156 * Make sure the I/O is within the zone valid data range while
2157 * maximizing the I/O size and preserving randomness.
2158 */
2159 if (range <= io_u->buflen)
ee3696bd 2160 io_u->offset = zb->start;
de65f7b7 2161 else if (td_random(td))
ee3696bd
DLM
2162 io_u->offset = zb->start +
2163 ((io_u->offset - orig_zb->start) %
de65f7b7 2164 (range - io_u->buflen)) / min_bs * min_bs;
139d8dc6 2165
43bcbd5b
SK
2166 /*
2167 * When zbd_find_zone() returns a conventional zone,
2168 * we can simply accept the new i/o offset here.
2169 */
2170 if (!zb->has_wp)
2171 return io_u_accept;
139d8dc6 2172
de65f7b7
DLM
2173 /*
2174 * Make sure the I/O does not cross over the zone wp position.
2175 */
2176 new_len = min((unsigned long long)io_u->buflen,
ee3696bd 2177 (unsigned long long)(zb->wp - io_u->offset));
de65f7b7
DLM
2178 new_len = new_len / min_bs * min_bs;
2179 if (new_len < io_u->buflen) {
2180 io_u->buflen = new_len;
2181 dprint(FD_IO, "Changed length from %u into %llu\n",
2182 orig_len, io_u->buflen);
bfbdd35b 2183 }
139d8dc6 2184
ee3696bd
DLM
2185 assert(zb->start <= io_u->offset);
2186 assert(io_u->offset + io_u->buflen <= zb->wp);
139d8dc6 2187
bfbdd35b 2188 goto accept;
139d8dc6 2189
bfbdd35b 2190 case DDIR_WRITE:
af94a8c3 2191 if (io_u->buflen > zbdi->zone_size) {
1c74aadc
DF
2192 td_verror(td, EINVAL, "I/O buflen exceeds zone size");
2193 dprint(FD_IO,
ee5e3436
SK
2194 "%s: I/O buflen %llu exceeds zone size %"PRIu64"\n",
2195 f->file_name, io_u->buflen, zbdi->zone_size);
bfbdd35b 2196 goto eof;
1c74aadc 2197 }
139d8dc6 2198
e1a1b59b 2199retry:
2a87f269
SK
2200 zb = zbd_convert_to_write_zone(td, io_u, zb);
2201 if (!zb) {
2202 dprint(FD_IO, "%s: can't convert to write target zone",
2203 f->file_name);
2204 goto eof;
59b07544 2205 }
139d8dc6 2206
e1a1b59b
SK
2207 if (zbd_zone_remainder(zb) > 0 &&
2208 zbd_zone_remainder(zb) < min_bs)
2209 goto retry;
2210
a7c2b6fc
BVA
2211 /* Check whether the zone reset threshold has been exceeded */
2212 if (td->o.zrf.u.f) {
d56a6df3
SK
2213 if (zbdi->wp_valid_data_bytes >=
2214 f->io_size * td->o.zrt.u.f &&
139d8dc6 2215 zbd_dec_and_reset_write_cnt(td, f))
a7c2b6fc 2216 zb->reset_zone = 1;
a7c2b6fc 2217 }
139d8dc6 2218
bfbdd35b
BVA
2219 /* Reset the zone pointer if necessary */
2220 if (zb->reset_zone || zbd_zone_full(f, zb, min_bs)) {
cef8006c
SK
2221 if (td->o.verify != VERIFY_NONE) {
2222 /*
2223 * Unset io-u->file to tell get_next_verify()
2224 * that this IO is not requeue.
2225 */
2226 io_u->file = NULL;
2227 if (!get_next_verify(td, io_u)) {
2228 zone_unlock(zb);
2229 return io_u_accept;
2230 }
2231 io_u->file = f;
2232 }
2233
bfbdd35b
BVA
2234 /*
2235 * Since previous write requests may have been submitted
2236 * asynchronously and since we will submit the zone
2237 * reset synchronously, wait until previously submitted
2238 * write requests have completed before issuing a
2239 * zone reset.
2240 */
2241 io_u_quiesce(td);
2242 zb->reset_zone = 0;
67282020 2243 if (__zbd_reset_zone(td, f, zb) < 0)
bfbdd35b 2244 goto eof;
236d23a8
SK
2245
2246 if (zb->capacity < min_bs) {
1c74aadc 2247 td_verror(td, EINVAL, "ZCAP is less min_bs");
07fc3f57 2248 log_err("zone capacity %"PRIu64" smaller than minimum block size %"PRIu64"\n",
ee5e3436 2249 zb->capacity, min_bs);
236d23a8
SK
2250 goto eof;
2251 }
bfbdd35b 2252 }
139d8dc6 2253
bfbdd35b
BVA
2254 /* Make writes occur at the write pointer */
2255 assert(!zbd_zone_full(f, zb, min_bs));
ee3696bd 2256 io_u->offset = zb->wp;
bfbdd35b 2257 if (!is_valid_offset(f, io_u->offset)) {
1c74aadc
DF
2258 td_verror(td, EINVAL, "invalid WP value");
2259 dprint(FD_ZBD, "%s: dropped request with offset %llu\n",
2260 f->file_name, io_u->offset);
bfbdd35b
BVA
2261 goto eof;
2262 }
139d8dc6 2263
bfbdd35b
BVA
2264 /*
2265 * Make sure that the buflen is a multiple of the minimal
2266 * block size. Give up if shrinking would make the request too
2267 * small.
2268 */
2269 new_len = min((unsigned long long)io_u->buflen,
236d23a8 2270 zbd_zone_capacity_end(zb) - io_u->offset);
bfbdd35b
BVA
2271 new_len = new_len / min_bs * min_bs;
2272 if (new_len == io_u->buflen)
2273 goto accept;
2274 if (new_len >= min_bs) {
2275 io_u->buflen = new_len;
2276 dprint(FD_IO, "Changed length from %u into %llu\n",
2277 orig_len, io_u->buflen);
2278 goto accept;
2279 }
139d8dc6 2280
1c74aadc 2281 td_verror(td, EIO, "zone remainder too small");
07fc3f57 2282 log_err("zone remainder %lld smaller than min block size %"PRIu64"\n",
1c74aadc 2283 (zbd_zone_capacity_end(zb) - io_u->offset), min_bs);
139d8dc6 2284
bfbdd35b 2285 goto eof;
139d8dc6 2286
bfbdd35b 2287 case DDIR_TRIM:
e3be810b
SK
2288 /* Check random trim targets a non-empty zone */
2289 if (!td_random(td) || zb->wp > zb->start)
2290 goto accept;
2291
2292 /* Find out a non-empty zone to trim */
2293 zone_unlock(zb);
39e06ee7 2294 zl = zbd_get_zone(f, f->max_zone);
e3be810b
SK
2295 zb = zbd_find_zone(td, io_u, 1, zb, zl);
2296 if (zb) {
2297 io_u->offset = zb->start;
2298 dprint(FD_ZBD, "%s: found new zone(%lld) for trim\n",
2299 f->file_name, io_u->offset);
2300 goto accept;
2301 }
139d8dc6 2302
e3be810b 2303 goto eof;
139d8dc6 2304
bfbdd35b 2305 case DDIR_SYNC:
e3be810b 2306 /* fall-through */
bfbdd35b
BVA
2307 case DDIR_DATASYNC:
2308 case DDIR_SYNC_FILE_RANGE:
2309 case DDIR_WAIT:
2310 case DDIR_LAST:
2311 case DDIR_INVAL:
e8a0b539 2312 case DDIR_TIMEOUT:
bfbdd35b
BVA
2313 goto accept;
2314 }
2315
2316 assert(false);
2317
2318accept:
43bcbd5b 2319 assert(zb->has_wp);
b7694961 2320 assert(zb->cond != ZBD_ZONE_COND_OFFLINE);
d9ed3e63
DLM
2321 assert(!io_u->zbd_queue_io);
2322 assert(!io_u->zbd_put_io);
139d8dc6 2323
d9ed3e63
DLM
2324 io_u->zbd_queue_io = zbd_queue_io;
2325 io_u->zbd_put_io = zbd_put_io;
650c4ad3
SK
2326 if (io_u->ddir == DDIR_WRITE)
2327 zb->writes_in_flight++;
139d8dc6 2328
2ef3c1b0
DF
2329 /*
2330 * Since we return with the zone lock still held,
2331 * add an annotation to let Coverity know that it
2332 * is intentional.
2333 */
2334 /* coverity[missing_unlock] */
139d8dc6 2335
bfbdd35b
BVA
2336 return io_u_accept;
2337
2338eof:
43bcbd5b 2339 if (zb && zb->has_wp)
4d4c71e6 2340 zone_unlock(zb);
139d8dc6 2341
bfbdd35b
BVA
2342 return io_u_eof;
2343}
fd5d733f
BVA
2344
2345/* Return a string with ZBD statistics */
2346char *zbd_write_status(const struct thread_stat *ts)
2347{
2348 char *res;
2349
ee5e3436 2350 if (asprintf(&res, "; %"PRIu64" zone resets", ts->nr_zone_resets) < 0)
fd5d733f
BVA
2351 return NULL;
2352 return res;
2353}
e3be810b
SK
2354
2355/**
2356 * zbd_do_io_u_trim - If reset zone is applicable, do reset zone instead of trim
2357 *
2358 * @td: FIO thread data.
2359 * @io_u: FIO I/O unit.
2360 *
2361 * It is assumed that z->mutex is already locked.
2362 * Return io_u_completed when reset zone succeeds. Return 0 when the target zone
2363 * does not have write pointer. On error, return negative errno.
2364 */
67282020 2365int zbd_do_io_u_trim(struct thread_data *td, struct io_u *io_u)
e3be810b
SK
2366{
2367 struct fio_file *f = io_u->file;
2368 struct fio_zone_info *z;
e3be810b
SK
2369 int ret;
2370
53aa6171 2371 z = zbd_offset_to_zone(f, io_u->offset);
e3be810b
SK
2372 if (!z->has_wp)
2373 return 0;
2374
2375 if (io_u->offset != z->start) {
139d8dc6
DLM
2376 log_err("Trim offset not at zone start (%lld)\n",
2377 io_u->offset);
e3be810b
SK
2378 return -EINVAL;
2379 }
2380
2381 ret = zbd_reset_zone((struct thread_data *)td, f, z);
2382 if (ret < 0)
2383 return ret;
2384
2385 return io_u_completed;
2386}
8b403508
SK
2387
2388void zbd_log_err(const struct thread_data *td, const struct io_u *io_u)
2389{
2390 const struct fio_file *f = io_u->file;
2391
2392 if (td->o.zone_mode != ZONE_MODE_ZBD)
2393 return;
2394
2395 if (io_u->error == EOVERFLOW)
2396 log_err("%s: Exceeded max_active_zones limit. Check conditions of zones out of I/O ranges.\n",
2397 f->file_name);
2398}
650c4ad3
SK
2399
2400void zbd_recover_write_error(struct thread_data *td, struct io_u *io_u)
2401{
2402 struct fio_file *f = io_u->file;
2403 struct fio_zone_info *z;
2404 struct zbd_zone zrep;
2405 unsigned long long retry_offset;
2406 unsigned long long retry_len;
2407 char *retry_buf;
2408 uint64_t write_end_offset;
2409 int ret;
2410
2411 z = zbd_offset_to_zone(f, io_u->offset);
2412 if (!z->has_wp)
2413 return;
2414 write_end_offset = io_u->offset + io_u->buflen - z->start;
2415
2416 assert(z->writes_in_flight);
2417
2418 if (!z->fixing_zone_wp) {
2419 z->fixing_zone_wp = 1;
2420 dprint(FD_ZBD, "%s: Start fixing %u write pointer\n",
2421 f->file_name, zbd_zone_idx(f, z));
2422 }
2423
2424 if (z->max_write_error_offset < write_end_offset)
2425 z->max_write_error_offset = write_end_offset;
2426
2427 if (z->writes_in_flight > 1)
2428 return;
2429
2430 /*
2431 * This is the last write to the zone since the write error to recover.
2432 * Get the zone current write pointer and recover the write pointer
2433 * position so that next write can continue.
2434 */
2435 ret = zbd_report_zones(td, f, z->start, &zrep, 1);
2436 if (ret != 1) {
2437 log_info("fio: Report zone for write recovery failed for %s\n",
2438 f->file_name);
2439 return;
2440 }
2441
2442 if (zrep.wp < z->start ||
2443 z->start + z->max_write_error_offset < zrep.wp ) {
2444 log_info("fio: unexpected write pointer position on error for %s: wp=%"PRIu64"\n",
2445 f->file_name, zrep.wp);
2446 return;
2447 }
2448
2449 retry_offset = zrep.wp;
2450 retry_len = z->start + z->max_write_error_offset - retry_offset;
2451 retry_buf = NULL;
2452 if (retry_offset >= io_u->offset)
2453 retry_buf = (char *)io_u->buf + (retry_offset - io_u->offset);
2454
2455 ret = zbd_move_zone_wp(td, io_u->file, &zrep, retry_len, retry_buf);
2456 if (ret) {
2457 log_info("fio: Failed to recover write pointer for %s\n",
2458 f->file_name);
2459 return;
2460 }
2461
2462 z->wp = retry_offset + retry_len;
2463
2464 dprint(FD_ZBD, "%s: Write pointer move succeeded for error=%d\n",
2465 f->file_name, io_u->error);
2466}