When the continue_on_error option is specified, the workload is expected
to continue running when non-critical errors happen. However, write
workloads with the zonemode=zbd option cannot continue after errors if
the failed writes leave partial data on the target device. Such a
partial write creates a write pointer gap between the device and fio:
for example, if fio issues a 128 KiB write and the device persists only
the first 64 KiB, the device write pointer ends up 64 KiB behind the
position fio tracks, so the next write requests issued by fio fail with
unaligned write command errors. This restriction causes undesirable test
stops during long runs on SMR drives, which can recover defect sectors.

To allow write workloads with zonemode=zbd to continue after write
failures that leave partial data on the device, introduce the new option
recover_zbd_write_error. When this option is specified together with the
continue_on_error option, fio checks the write pointer positions of the
write target zones in the error handling step, then fixes each write
pointer by moving it to the position the failed writes would have
reached. Bump FIO_SERVER_VER to note that the new option is added.

For that purpose, add a new function zbd_recover_write_error(). Call it
from zbd_queue_io() for sync IO engines, and from io_completed() for
async IO engines. Modify zbd_queue_io() to take a pointer to the queue
status so that zbd_recover_write_error() can modify the status to ignore
the errors. Add three fields to struct fio_zone_info. The two new fields
writes_in_flight and max_write_error_offset track the status of
in-flight writes at the time of the write error, so that the write
pointer positions can be fixed after the in-flight writes complete. The
field fixing_zone_wp records that a write pointer fix is in progress and
prohibits new writes from being issued to the zone.

When the failed write is synchronous, the write pointer is fixed by
writing the remaining data of the failed write. This keeps the verify
patterns written to the device, so verify works together with the
recover_zbd_write_error option. When the failed write is asynchronous,
other in-flight writes fail together with it. In this case, fio waits
for all in-flight writes to complete, then fixes the write pointer. The
verify data of the failed writes are lost, so verify does not work.
Check that the recover_zbd_write_error option is not specified together
with a verify workload and an asynchronous IO engine.

Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Link: https://lore.kernel.org/r/20250425052148.126788-6-shinichiro.kawasaki@wdc.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
requests. This and the previous parameter can be used to simulate
garbage collection activity.
+.. option:: recover_zbd_write_error=bool
+
+	If this option is specified together with the option
+	:option:`continue_on_error`, fio checks the write pointer positions of
+	the zones targeted by failed writes to sequential write required
+	zones, then moves the write pointers so that the next writes do not
+	fail due to partial writes and unexpected write pointer positions. If
+	:option:`continue_on_error` is not specified, fio errors out. When the
+	failed writes are asynchronous, the write pointer move fills blocks
+	with zeroes and breaks the verify data; hence, if an asynchronous IO
+	engine and a :option:`verify` workload are specified, fio errors out.
+	Default: false.
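
As a minimal illustration of the intended use (hypothetical device path,
block size, and engine choice, not taken from the patch), a job file
combining the two options could look like this:

    [global]
    ; hypothetical host-managed SMR drive
    filename=/dev/sdz
    direct=1
    zonemode=zbd
    continue_on_error=io
    recover_zbd_write_error=1

    [writer]
    ; a sync IO engine, so verify patterns survive recovery
    ioengine=psync
    rw=write
    bs=128k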
I/O type
~~~~~~~~
o->zone_mode = le32_to_cpu(top->zone_mode);
o->max_open_zones = __le32_to_cpu(top->max_open_zones);
o->ignore_zone_limits = le32_to_cpu(top->ignore_zone_limits);
+ o->recover_zbd_write_error = le32_to_cpu(top->recover_zbd_write_error);
o->lockmem = le64_to_cpu(top->lockmem);
o->offset_increment_percent = le32_to_cpu(top->offset_increment_percent);
o->offset_increment = le64_to_cpu(top->offset_increment);
top->zone_mode = __cpu_to_le32(o->zone_mode);
top->max_open_zones = __cpu_to_le32(o->max_open_zones);
top->ignore_zone_limits = cpu_to_le32(o->ignore_zone_limits);
+ top->recover_zbd_write_error = cpu_to_le32(o->recover_zbd_write_error);
top->lockmem = __cpu_to_le64(o->lockmem);
top->ddir_seq_add = __cpu_to_le64(o->ddir_seq_add);
top->file_size_low = __cpu_to_le64(o->file_size_low);
issued if the zone reset threshold has been exceeded. A zone reset is
submitted after each (1 / zone_reset_frequency) write requests. This and the
previous parameter can be used to simulate garbage collection activity.
+.BI recover_zbd_write_error \fR=\fPbool
+If this option is specified together with the option \fBcontinue_on_error\fR,
+fio checks the write pointer positions of the zones targeted by failed writes
+to sequential write required zones, then moves the write pointers so that the
+next writes do not fail due to partial writes and unexpected write pointer
+positions. If \fBcontinue_on_error\fR is not specified, fio errors out. When
+the failed writes are asynchronous, the write pointer move fills blocks with
+zeroes and breaks the verify data; hence, if an asynchronous IO engine and a
+\fBverify\fR workload are specified, fio errors out. Default: false.
.SS "I/O type"
.TP
assert(io_u->flags & IO_U_F_FLIGHT);
io_u_clear(td, io_u, IO_U_F_FLIGHT | IO_U_F_BUSY_OK | IO_U_F_PATTERN_DONE);
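+	/*
+	 * For async IO engines, a failed zoned write is observed here at
+	 * completion time, so attempt the zone write pointer recovery now.
+	 */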
+ if (td->o.zone_mode == ZONE_MODE_ZBD && td->o.recover_zbd_write_error &&
+ io_u->error && io_u->ddir == DDIR_WRITE &&
+ !td_ioengine_flagged(td, FIO_SYNCIO))
+ zbd_recover_write_error(td, io_u);
+
/*
* Mark IO ok to verify
*/
* @success == true means that the I/O operation has been queued or
* completed successfully.
*/
- void (*zbd_queue_io)(struct thread_data *td, struct io_u *, int q,
- bool success);
+ void (*zbd_queue_io)(struct thread_data *td, struct io_u *, int *q);
/*
* ZBD mode zbd_put_io callback: called in after completion of an I/O
}
ret = td->io_ops->queue(td, io_u);
- zbd_queue_io_u(td, io_u, ret);
+ zbd_queue_io_u(td, io_u, &ret);
unlock_file(td, io_u->file);
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_ZONE,
},
+ {
+ .name = "recover_zbd_write_error",
+ .lname = "Recover write errors when zonemode=zbd is set",
+ .type = FIO_OPT_BOOL,
+ .off1 = offsetof(struct thread_options, recover_zbd_write_error),
+ .def = 0,
+	.help	= "Fix the zone write pointer after a partially failed write and continue writes to sequential write required zones",
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_ZONE,
+ },
{
.name = "fdp",
.lname = "Flexible data placement",
};
enum {
- FIO_SERVER_VER = 109,
+ FIO_SERVER_VER = 110,
FIO_SERVER_MAX_FRAGMENT_PDU = 1024,
FIO_SERVER_MAX_CMD_MB = 2048,
int max_open_zones;
unsigned int job_max_open_zones;
unsigned int ignore_zone_limits;
+ unsigned int recover_zbd_write_error;
fio_fp64_t zrt;
fio_fp64_t zrf;
uint32_t zone_mode;
int32_t max_open_zones;
uint32_t ignore_zone_limits;
+ uint32_t recover_zbd_write_error;
uint32_t log_entries;
uint32_t log_prio;
if (!zbd_verify_bs())
return 1;
+ if (td->o.recover_zbd_write_error && td_write(td)) {
+ if (!td->o.continue_on_error) {
+ log_err("recover_zbd_write_error works only when continue_on_error is set\n");
+ return 1;
+ }
+ if (td->o.verify != VERIFY_NONE &&
+ !td_ioengine_flagged(td, FIO_SYNCIO)) {
+ log_err("recover_zbd_write_error for async IO engines does not support verify\n");
+ return 1;
+ }
+ }
+
if (td->o.experimental_verify) {
log_err("zonemode=zbd does not support experimental verify\n");
return 1;
* For write and trim operations, update the write pointer of the I/O unit
* target zone.
*/
-static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int q,
- bool success)
+static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int *q)
{
const struct fio_file *f = io_u->file;
struct zoned_block_device_info *zbd_info = f->zbd_info;
+ bool success = io_u->error == 0;
struct fio_zone_info *z;
uint64_t zone_end;
z = zbd_offset_to_zone(f, io_u->offset);
assert(z->has_wp);
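+	/*
+	 * For sync IO engines, a failed write has already completed when it
+	 * gets here, so try to fix the zone write pointer right away so that
+	 * subsequent writes to the zone can continue.
+	 */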
+ if (!success && td->o.recover_zbd_write_error &&
+ io_u->ddir == DDIR_WRITE && td_ioengine_flagged(td, FIO_SYNCIO) &&
+ *q == FIO_Q_COMPLETED) {
+ zbd_recover_write_error(td, io_u);
+ if (!io_u->error)
+ success = true;
+ }
+
if (!success)
goto unlock;
break;
}
- if (q == FIO_Q_COMPLETED && !io_u->error)
+ if (*q == FIO_Q_COMPLETED && !io_u->error)
zbd_end_zone_io(td, io_u, z);
unlock:
- if (!success || q != FIO_Q_QUEUED) {
+ if (!success || *q != FIO_Q_QUEUED) {
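+		/*
+		 * This write is no longer in flight. If it was the last one
+		 * for the zone, any ongoing write pointer fix is complete.
+		 */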
+ if (io_u->ddir == DDIR_WRITE) {
+ z->writes_in_flight--;
+ if (z->writes_in_flight == 0 && z->fixing_zone_wp) {
+ dprint(FD_ZBD, "%s: Fixed write pointer of the zone %u\n",
+ f->file_name, zbd_zone_idx(f, z));
+ z->fixing_zone_wp = 0;
+ }
+ }
/* BUSY or COMPLETED: unlock the zone */
zone_unlock(z);
io_u->zbd_put_io = NULL;
zbd_end_zone_io(td, io_u, z);
+ if (io_u->ddir == DDIR_WRITE) {
+ z->writes_in_flight--;
+ if (z->writes_in_flight == 0 && z->fixing_zone_wp) {
+ z->fixing_zone_wp = 0;
+ dprint(FD_ZBD, "%s: Fixed write pointer of the zone %u\n",
+ f->file_name, zbd_zone_idx(f, z));
+ }
+ }
+
zone_unlock(z);
}
io_u->ddir == DDIR_READ && td->o.read_beyond_wp)
return io_u_accept;
+retry_lock:
zone_lock(td, f, zb);
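+	/*
+	 * For async IO engines, wait for any ongoing write pointer fix of
+	 * this zone to finish before issuing new I/O to it: quiesce
+	 * in-flight I/O and retake the zone lock.
+	 */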
+ if (!td_ioengine_flagged(td, FIO_SYNCIO) && zb->fixing_zone_wp) {
+ zone_unlock(zb);
+ io_u_quiesce(td);
+ goto retry_lock;
+ }
+
switch (io_u->ddir) {
case DDIR_READ:
if (td->runstate == TD_VERIFYING && td_write(td))
io_u->zbd_queue_io = zbd_queue_io;
io_u->zbd_put_io = zbd_put_io;
+ if (io_u->ddir == DDIR_WRITE)
+ zb->writes_in_flight++;
/*
* Since we return with the zone lock still held,
log_err("%s: Exceeded max_active_zones limit. Check conditions of zones out of I/O ranges.\n",
f->file_name);
}
+
+void zbd_recover_write_error(struct thread_data *td, struct io_u *io_u)
+{
+ struct fio_file *f = io_u->file;
+ struct fio_zone_info *z;
+ struct zbd_zone zrep;
+ unsigned long long retry_offset;
+ unsigned long long retry_len;
+ char *retry_buf;
+ uint64_t write_end_offset;
+ int ret;
+
+ z = zbd_offset_to_zone(f, io_u->offset);
+ if (!z->has_wp)
+ return;
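+	/* End offset of this failed write, relative to the zone start. */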
+ write_end_offset = io_u->offset + io_u->buflen - z->start;
+
+ assert(z->writes_in_flight);
+
+ if (!z->fixing_zone_wp) {
+ z->fixing_zone_wp = 1;
+		dprint(FD_ZBD, "%s: Start fixing write pointer of zone %u\n",
+		       f->file_name, zbd_zone_idx(f, z));
+ }
+
+ if (z->max_write_error_offset < write_end_offset)
+ z->max_write_error_offset = write_end_offset;
+
+ if (z->writes_in_flight > 1)
+ return;
+
+ /*
+ * This is the last write to the zone since the write error to recover.
+ * Get the zone current write pointer and recover the write pointer
+ * position so that next write can continue.
+ */
+ ret = zbd_report_zones(td, f, z->start, &zrep, 1);
+ if (ret != 1) {
+ log_info("fio: Report zone for write recovery failed for %s\n",
+ f->file_name);
+ return;
+ }
+
+ if (zrep.wp < z->start ||
+	    z->start + z->max_write_error_offset < zrep.wp) {
+ log_info("fio: unexpected write pointer position on error for %s: wp=%"PRIu64"\n",
+ f->file_name, zrep.wp);
+ return;
+ }
+
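+	/*
+	 * Rewrite the gap between the device write pointer and the highest
+	 * end offset of the failed writes. If the gap starts within this
+	 * io_u, reuse its buffer so that verify patterns are preserved;
+	 * otherwise pass NULL, in which case zbd_move_zone_wp() fills the
+	 * gap with zeroes and the verify data is lost.
+	 */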
+ retry_offset = zrep.wp;
+ retry_len = z->start + z->max_write_error_offset - retry_offset;
+ retry_buf = NULL;
+ if (retry_offset >= io_u->offset)
+ retry_buf = (char *)io_u->buf + (retry_offset - io_u->offset);
+
+ ret = zbd_move_zone_wp(td, io_u->file, &zrep, retry_len, retry_buf);
+ if (ret) {
+ log_info("fio: Failed to recover write pointer for %s\n",
+ f->file_name);
+ return;
+ }
+
+ z->wp = retry_offset + retry_len;
+
+ dprint(FD_ZBD, "%s: Write pointer move succeeded for error=%d\n",
+ f->file_name, io_u->error);
+}
* @start: zone start location (bytes)
* @wp: zone write pointer location (bytes)
* @capacity: maximum size usable from the start of a zone (bytes)
+ * @writes_in_flight: number of writes in flight for the zone
+ * @max_write_error_offset: maximum end offset from the zone start among the
+ *	failed writes to the zone
* @mutex: protects the modifiable members in this structure
* @type: zone type (BLK_ZONE_TYPE_*)
* @cond: zone state (BLK_ZONE_COND_*)
* @write: whether or not this zone is the write target at this moment. Only
* relevant if zbd->max_open_zones > 0.
* @reset_zone: whether or not this zone should be reset before writing to it
+ * @fixing_zone_wp: whether or not the write pointer of this zone is under fix
*/
struct fio_zone_info {
pthread_mutex_t mutex;
uint64_t start;
uint64_t wp;
uint64_t capacity;
+ uint32_t writes_in_flight;
+ uint32_t max_write_error_offset;
enum zbd_zone_type type:2;
enum zbd_zone_cond cond:4;
unsigned int has_wp:1;
unsigned int write:1;
unsigned int reset_zone:1;
+ unsigned int fixing_zone_wp:1;
};
/**
char *zbd_write_status(const struct thread_stat *ts);
int zbd_do_io_u_trim(struct thread_data *td, struct io_u *io_u);
void zbd_log_err(const struct thread_data *td, const struct io_u *io_u);
+void zbd_recover_write_error(struct thread_data *td, struct io_u *io_u);
static inline void zbd_close_file(struct fio_file *f)
{
}
static inline void zbd_queue_io_u(struct thread_data *td, struct io_u *io_u,
- enum fio_q_status status)
+ enum fio_q_status *status)
{
if (io_u->zbd_queue_io) {
- io_u->zbd_queue_io(td, io_u, status, io_u->error == 0);
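+		/*
+		 * Pass the queue status by pointer so that the ZBD code can
+		 * adjust it when a failed write is recovered.
+		 */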
+ io_u->zbd_queue_io(td, io_u, (int *)status);
io_u->zbd_queue_io = NULL;
}
}