When the continue_on_error option is specified, the workload is expected
to continue running when non-critical errors happen. However, write
workloads with the zonemode=zbd option cannot continue after errors if
the failed writes leave partial data on the target device. Such a
partial write creates a write pointer gap between the device and fio:
for example, if fio issues a 128 KiB write and the device persists only
the first 64 KiB, the device write pointer ends up 64 KiB behind the
position fio tracks, so the next write requests issued by fio fail with
unaligned write command errors. This restriction causes undesirable test
stops during long runs on SMR drives, which can recover defect sectors.

To allow write workloads with zonemode=zbd to continue after write
failures that leave partial data on the device, introduce the new option
recover_zbd_write_error. When this option is specified together with the
continue_on_error option, fio checks the write pointer positions of the
write target zones in the error handling step, then fixes each write
pointer by moving it to the position the failed writes would have
reached. Bump FIO_SERVER_VER to note that the new option is added.

For that purpose, add a new function zbd_recover_write_error(). Call it
from zbd_queue_io() for sync IO engines, and from io_completed() for
async IO engines. Modify zbd_queue_io() to take a pointer to the queue
status so that zbd_recover_write_error() can modify the status to ignore
the errors. Add three fields to struct fio_zone_info. The two new fields
writes_in_flight and max_write_error_offset track the status of
in-flight writes at the time of the write error, so that the write
pointer positions can be fixed after the in-flight writes complete. The
field fixing_zone_wp records that a write pointer fix is in progress and
prohibits new writes from being issued to the zone.

When the failed write is synchronous, the write pointer is fixed by
writing the remaining data of the failed write. This keeps the verify
patterns written to the device, so verify works together with the
recover_zbd_write_error option. When the failed write is asynchronous,
other in-flight writes fail together with it. In this case, fio waits
for all in-flight writes to complete, then fixes the write pointer. The
verify data of the failed writes are lost, so verify does not work.
Check that the recover_zbd_write_error option is not specified together
with a verify workload and an asynchronous IO engine.

Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Link: https://lore.kernel.org/r/20250425052148.126788-6-shinichiro.kawasaki@wdc.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
requests. This and the previous parameter can be used to simulate
garbage collection activity.
+.. option:: recover_zbd_write_error=bool
+
+	If this option is specified together with the option
+	:option:`continue_on_error`, fio checks the write pointer positions of
+	the zones targeted by failed writes to sequential write required
+	zones, then moves the write pointers so that the next writes do not
+	fail due to partial writes and unexpected write pointer positions. If
+	:option:`continue_on_error` is not specified, fio errors out. When the
+	failed writes are asynchronous, the write pointer move fills blocks
+	with zeroes and breaks the verify data; hence, if an asynchronous IO
+	engine and a :option:`verify` workload are specified, fio errors out.
+	Default: false.
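
As a minimal illustration of the intended use (hypothetical device path,
block size, and engine choice, not taken from the patch), a job file
combining the two options could look like this:

    [global]
    ; hypothetical host-managed SMR drive
    filename=/dev/sdz
    direct=1
    zonemode=zbd
    continue_on_error=io
    recover_zbd_write_error=1

    [writer]
    ; a sync IO engine, so verify patterns survive recovery
    ioengine=psync
    rw=write
    bs=128k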
I/O type
~~~~~~~~
o->zone_mode = le32_to_cpu(top->zone_mode);
o->max_open_zones = __le32_to_cpu(top->max_open_zones);
o->ignore_zone_limits = le32_to_cpu(top->ignore_zone_limits);
+ o->recover_zbd_write_error = le32_to_cpu(top->recover_zbd_write_error);
o->lockmem = le64_to_cpu(top->lockmem);
o->offset_increment_percent = le32_to_cpu(top->offset_increment_percent);
o->offset_increment = le64_to_cpu(top->offset_increment);
top->zone_mode = __cpu_to_le32(o->zone_mode);
top->max_open_zones = __cpu_to_le32(o->max_open_zones);
top->ignore_zone_limits = cpu_to_le32(o->ignore_zone_limits);
+ top->recover_zbd_write_error = cpu_to_le32(o->recover_zbd_write_error);
top->lockmem = __cpu_to_le64(o->lockmem);
top->ddir_seq_add = __cpu_to_le64(o->ddir_seq_add);
top->file_size_low = __cpu_to_le64(o->file_size_low);
issued if the zone reset threshold has been exceeded. A zone reset is
submitted after each (1 / zone_reset_frequency) write requests. This and the
previous parameter can be used to simulate garbage collection activity.
+.BI recover_zbd_write_error \fR=\fPbool
+If this option is specified together with the option \fBcontinue_on_error\fR,
+fio checks the write pointer positions of the zones targeted by failed writes
+to sequential write required zones, then moves the write pointers so that the
+next writes do not fail due to partial writes and unexpected write pointer
+positions. If \fBcontinue_on_error\fR is not specified, fio errors out. When
+the failed writes are asynchronous, the write pointer move fills blocks with
+zeroes and breaks the verify data; hence, if an asynchronous IO engine and a
+\fBverify\fR workload are specified, fio errors out. Default: false.
.SS "I/O type"
.TP
assert(io_u->flags & IO_U_F_FLIGHT);
io_u_clear(td, io_u, IO_U_F_FLIGHT | IO_U_F_BUSY_OK | IO_U_F_PATTERN_DONE);
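+	/*
+	 * For async IO engines, a failed zoned write is observed here at
+	 * completion time, so attempt the zone write pointer recovery now.
+	 */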
+ if (td->o.zone_mode == ZONE_MODE_ZBD && td->o.recover_zbd_write_error &&
+ io_u->error && io_u->ddir == DDIR_WRITE &&
+ !td_ioengine_flagged(td, FIO_SYNCIO))
+ zbd_recover_write_error(td, io_u);
+
/*
* Mark IO ok to verify
*/
* @success == true means that the I/O operation has been queued or
* completed successfully.
*/
- void (*zbd_queue_io)(struct thread_data *td, struct io_u *, int q,
- bool success);
+ void (*zbd_queue_io)(struct thread_data *td, struct io_u *, int *q);
/*
* ZBD mode zbd_put_io callback: called in after completion of an I/O
}
ret = td->io_ops->queue(td, io_u);
- zbd_queue_io_u(td, io_u, ret);
+ zbd_queue_io_u(td, io_u, &ret);
unlock_file(td, io_u->file);
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_ZONE,
},
+ {
+ .name = "recover_zbd_write_error",
+ .lname = "Recover write errors when zonemode=zbd is set",
+ .type = FIO_OPT_BOOL,
+ .off1 = offsetof(struct thread_options, recover_zbd_write_error),
+ .def = 0,
+	.help	= "Fix the zone write pointer after a partially failed write and continue writes to sequential write required zones",
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_ZONE,
+ },
{
.name = "fdp",
.lname = "Flexible data placement",
};
enum {
- FIO_SERVER_VER = 109,
+ FIO_SERVER_VER = 110,
FIO_SERVER_MAX_FRAGMENT_PDU = 1024,
FIO_SERVER_MAX_CMD_MB = 2048,
int max_open_zones;
unsigned int job_max_open_zones;
unsigned int ignore_zone_limits;
+ unsigned int recover_zbd_write_error;
fio_fp64_t zrt;
fio_fp64_t zrf;
uint32_t zone_mode;
int32_t max_open_zones;
uint32_t ignore_zone_limits;
+ uint32_t recover_zbd_write_error;
uint32_t log_entries;
uint32_t log_prio;
if (!zbd_verify_bs())
return 1;
+ if (td->o.recover_zbd_write_error && td_write(td)) {
+ if (!td->o.continue_on_error) {
+ log_err("recover_zbd_write_error works only when continue_on_error is set\n");
+ return 1;
+ }
+ if (td->o.verify != VERIFY_NONE &&
+ !td_ioengine_flagged(td, FIO_SYNCIO)) {
+ log_err("recover_zbd_write_error for async IO engines does not support verify\n");
+ return 1;
+ }
+ }
+
if (td->o.experimental_verify) {
log_err("zonemode=zbd does not support experimental verify\n");
return 1;
* For write and trim operations, update the write pointer of the I/O unit
* target zone.
*/
-static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int q,
- bool success)
+static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int *q)
{
const struct fio_file *f = io_u->file;
struct zoned_block_device_info *zbd_info = f->zbd_info;
+ bool success = io_u->error == 0;
struct fio_zone_info *z;
uint64_t zone_end;
z = zbd_offset_to_zone(f, io_u->offset);
assert(z->has_wp);
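+	/*
+	 * For sync IO engines, a failed write has already completed when it
+	 * gets here, so try to fix the zone write pointer right away so that
+	 * subsequent writes to the zone can continue.
+	 */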
+ if (!success && td->o.recover_zbd_write_error &&
+ io_u->ddir == DDIR_WRITE && td_ioengine_flagged(td, FIO_SYNCIO) &&
+ *q == FIO_Q_COMPLETED) {
+ zbd_recover_write_error(td, io_u);
+ if (!io_u->error)
+ success = true;
+ }
+
if (!success)
goto unlock;
break;
}
- if (q == FIO_Q_COMPLETED && !io_u->error)
+ if (*q == FIO_Q_COMPLETED && !io_u->error)
zbd_end_zone_io(td, io_u, z);
unlock:
- if (!success || q != FIO_Q_QUEUED) {
+ if (!success || *q != FIO_Q_QUEUED) {
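+		/*
+		 * This write is no longer in flight. If it was the last one
+		 * for the zone, any ongoing write pointer fix is complete.
+		 */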
+ if (io_u->ddir == DDIR_WRITE) {
+ z->writes_in_flight--;
+ if (z->writes_in_flight == 0 && z->fixing_zone_wp) {
+ dprint(FD_ZBD, "%s: Fixed write pointer of the zone %u\n",
+ f->file_name, zbd_zone_idx(f, z));
+ z->fixing_zone_wp = 0;
+ }
+ }
/* BUSY or COMPLETED: unlock the zone */
zone_unlock(z);
io_u->zbd_put_io = NULL;
zbd_end_zone_io(td, io_u, z);
+ if (io_u->ddir == DDIR_WRITE) {
+ z->writes_in_flight--;
+ if (z->writes_in_flight == 0 && z->fixing_zone_wp) {
+ z->fixing_zone_wp = 0;
+ dprint(FD_ZBD, "%s: Fixed write pointer of the zone %u\n",
+ f->file_name, zbd_zone_idx(f, z));
+ }
+ }
+
zone_unlock(z);
}
io_u->ddir == DDIR_READ && td->o.read_beyond_wp)
return io_u_accept;
+retry_lock:
zone_lock(td, f, zb);
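+	/*
+	 * For async IO engines, wait for any ongoing write pointer fix of
+	 * this zone to finish before issuing new I/O to it: quiesce
+	 * in-flight I/O and retake the zone lock.
+	 */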
+ if (!td_ioengine_flagged(td, FIO_SYNCIO) && zb->fixing_zone_wp) {
+ zone_unlock(zb);
+ io_u_quiesce(td);
+ goto retry_lock;
+ }
+
switch (io_u->ddir) {
case DDIR_READ:
if (td->runstate == TD_VERIFYING && td_write(td))
io_u->zbd_queue_io = zbd_queue_io;
io_u->zbd_put_io = zbd_put_io;
+ if (io_u->ddir == DDIR_WRITE)
+ zb->writes_in_flight++;
/*
* Since we return with the zone lock still held,
log_err("%s: Exceeded max_active_zones limit. Check conditions of zones out of I/O ranges.\n",
f->file_name);
}
+
+void zbd_recover_write_error(struct thread_data *td, struct io_u *io_u)
+{
+ struct fio_file *f = io_u->file;
+ struct fio_zone_info *z;
+ struct zbd_zone zrep;
+ unsigned long long retry_offset;
+ unsigned long long retry_len;
+ char *retry_buf;
+ uint64_t write_end_offset;
+ int ret;
+
+ z = zbd_offset_to_zone(f, io_u->offset);
+ if (!z->has_wp)
+ return;
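+	/* End offset of this failed write, relative to the zone start. */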
+ write_end_offset = io_u->offset + io_u->buflen - z->start;
+
+ assert(z->writes_in_flight);
+
+ if (!z->fixing_zone_wp) {
+ z->fixing_zone_wp = 1;
+		dprint(FD_ZBD, "%s: Start fixing write pointer of zone %u\n",
+		       f->file_name, zbd_zone_idx(f, z));
+ }
+
+ if (z->max_write_error_offset < write_end_offset)
+ z->max_write_error_offset = write_end_offset;
+
+ if (z->writes_in_flight > 1)
+ return;
+
+ /*
+ * This is the last write to the zone since the write error to recover.
+ * Get the zone current write pointer and recover the write pointer
+ * position so that next write can continue.
+ */
+ ret = zbd_report_zones(td, f, z->start, &zrep, 1);
+ if (ret != 1) {
+ log_info("fio: Report zone for write recovery failed for %s\n",
+ f->file_name);
+ return;
+ }
+
+ if (zrep.wp < z->start ||
+	    z->start + z->max_write_error_offset < zrep.wp) {
+ log_info("fio: unexpected write pointer position on error for %s: wp=%"PRIu64"\n",
+ f->file_name, zrep.wp);
+ return;
+ }
+
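+	/*
+	 * Rewrite the gap between the device write pointer and the highest
+	 * end offset of the failed writes. If the gap starts within this
+	 * io_u, reuse its buffer so that verify patterns are preserved;
+	 * otherwise pass NULL, in which case zbd_move_zone_wp() fills the
+	 * gap with zeroes and the verify data is lost.
+	 */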
+ retry_offset = zrep.wp;
+ retry_len = z->start + z->max_write_error_offset - retry_offset;
+ retry_buf = NULL;
+ if (retry_offset >= io_u->offset)
+ retry_buf = (char *)io_u->buf + (retry_offset - io_u->offset);
+
+ ret = zbd_move_zone_wp(td, io_u->file, &zrep, retry_len, retry_buf);
+ if (ret) {
+ log_info("fio: Failed to recover write pointer for %s\n",
+ f->file_name);
+ return;
+ }
+
+ z->wp = retry_offset + retry_len;
+
+ dprint(FD_ZBD, "%s: Write pointer move succeeded for error=%d\n",
+ f->file_name, io_u->error);
+}
* @start: zone start location (bytes)
* @wp: zone write pointer location (bytes)
* @capacity: maximum size usable from the start of a zone (bytes)
+ * @writes_in_flight: number of writes in flight for the zone
+ * @max_write_error_offset: maximum end offset from the zone start among the
+ *	failed writes to the zone
* @mutex: protects the modifiable members in this structure
* @type: zone type (BLK_ZONE_TYPE_*)
* @cond: zone state (BLK_ZONE_COND_*)
* @write: whether or not this zone is the write target at this moment. Only
* relevant if zbd->max_open_zones > 0.
* @reset_zone: whether or not this zone should be reset before writing to it
+ * @fixing_zone_wp: whether or not the write pointer of this zone is under fix
*/
struct fio_zone_info {
pthread_mutex_t mutex;
uint64_t start;
uint64_t wp;
uint64_t capacity;
+ uint32_t writes_in_flight;
+ uint32_t max_write_error_offset;
enum zbd_zone_type type:2;
enum zbd_zone_cond cond:4;
unsigned int has_wp:1;
unsigned int write:1;
unsigned int reset_zone:1;
+ unsigned int fixing_zone_wp:1;
};
/**
char *zbd_write_status(const struct thread_stat *ts);
int zbd_do_io_u_trim(struct thread_data *td, struct io_u *io_u);
void zbd_log_err(const struct thread_data *td, const struct io_u *io_u);
+void zbd_recover_write_error(struct thread_data *td, struct io_u *io_u);
static inline void zbd_close_file(struct fio_file *f)
{
}
static inline void zbd_queue_io_u(struct thread_data *td, struct io_u *io_u,
- enum fio_q_status status)
+ enum fio_q_status *status)
{
if (io_u->zbd_queue_io) {
- io_u->zbd_queue_io(td, io_u, status, io_u->error == 0);
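+		/*
+		 * Pass the queue status by pointer so that the ZBD code can
+		 * adjust it when a failed write is recovered.
+		 */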
+ io_u->zbd_queue_io(td, io_u, (int *)status);
io_u->zbd_queue_io = NULL;
}
}