zbd: Support zone capacity smaller than zone size
authorShin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Fri, 17 Jul 2020 09:36:32 +0000 (18:36 +0900)
committerJens Axboe <axboe@kernel.dk>
Tue, 21 Jul 2020 15:17:29 +0000 (09:17 -0600)
NVMe ZNS specification defines zone capacity. The report zone interface
of Linux kernel supports it. This patch adds support for this new
interface.

Modify configure to check availability of BLK_ZONE_REP_CAPACITY flag as
well as blkzoned.h header only when target OS is Linux. If this flag is
defined and set in the report zone, all sectors from the zone capacity up
to the zone size are not accessible. Prevent read and write operations to
these sectors by modifying zbd_adjust_block(). Of note is that this skips
the region between zone capacity and zone size then reduces total I/O
bytes of sequential workloads.

Introduce helper functions zbd_zone_end() and zbd_zone_capacity_end() to
improve readability of zone limit check code.

Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
configure
engines/libzbc.c
oslib/linux-blkzoned.c
zbd.c
zbd.h
zbd_types.h

index 6991393bbf74bb7c8f36ca396845d58ef334107e..b079a2a50e282c44f627e68329da4af3ff5471b6 100755 (executable)
--- a/configure
+++ b/configure
@@ -2390,6 +2390,7 @@ if compile_prog "" "" "valgrind_dev"; then
 fi
 print_config "Valgrind headers" "$valgrind_dev"
 
+if test "$targetos" = "Linux" ; then
 ##########################################
 # <linux/blkzoned.h> probe
 if test "$linux_blkzoned" != "yes" ; then
@@ -2407,6 +2408,24 @@ if compile_prog "" "" "linux_blkzoned"; then
 fi
 print_config "Zoned block device support" "$linux_blkzoned"
 
+##########################################
+# Check BLK_ZONE_REP_CAPACITY
+cat > $TMPC << EOF
+#include <linux/blkzoned.h>
+int main(void)
+{
+  return BLK_ZONE_REP_CAPACITY;
+}
+EOF
+if compile_prog "" "" "blkzoned report capacity"; then
+  output_sym "CONFIG_HAVE_REP_CAPACITY"
+  rep_capacity="yes"
+else
+  rep_capacity="no"
+fi
+print_config "Zoned block device capacity" "$rep_capacity"
+fi
+
 ##########################################
 # libzbc probe
 if test "$libzbc" != "yes" ; then
index fdde8ca65fff242ac7f405aba8a91536db89a5d7..4b9002330dcfd66d106087d1dc9275c6402fe1d3 100644 (file)
@@ -235,6 +235,11 @@ static int libzbc_report_zones(struct thread_data *td, struct fio_file *f,
                zbdz->start = zones[i].zbz_start << 9;
                zbdz->len = zones[i].zbz_length << 9;
                zbdz->wp = zones[i].zbz_write_pointer << 9;
+               /*
+                * ZBC/ZAC do not define zone capacity, so use the zone size as
+                * the zone capacity.
+                */
+               zbdz->capacity = zbdz->len;
 
                switch (zones[i].zbz_type) {
                case ZBC_ZT_CONVENTIONAL:
index 1cf06363d1a2b2b9a8dc9220878b00b64439a44a..6fe78b9ce79dc24abe24af38b7784f34e1029648 100644 (file)
@@ -113,6 +113,16 @@ out:
        return 0;
 }
 
+static uint64_t zone_capacity(struct blk_zone_report *hdr,
+                             struct blk_zone *blkz)
+{
+#ifdef CONFIG_HAVE_REP_CAPACITY
+       if (hdr->flags & BLK_ZONE_REP_CAPACITY)
+               return blkz->capacity << 9;
+#endif
+       return blkz->len << 9;
+}
+
 int blkzoned_report_zones(struct thread_data *td, struct fio_file *f,
                          uint64_t offset, struct zbd_zone *zones,
                          unsigned int nr_zones)
@@ -149,6 +159,7 @@ int blkzoned_report_zones(struct thread_data *td, struct fio_file *f,
                z->start = blkz->start << 9;
                z->wp = blkz->wp << 9;
                z->len = blkz->len << 9;
+               z->capacity = zone_capacity(hdr, blkz);
 
                switch (blkz->type) {
                case BLK_ZONE_TYPE_CONVENTIONAL:
diff --git a/zbd.c b/zbd.c
index cf2cded93cff4371263106d3e7c0c9c02a6640c0..c738a58b206b4ee6a29f2837108f9a2c4ae06b21 100644 (file)
--- a/zbd.c
+++ b/zbd.c
@@ -140,6 +140,24 @@ static inline bool zbd_zone_swr(struct fio_zone_info *z)
        return z->type == ZBD_ZONE_TYPE_SWR;
 }
 
+/**
+ * zbd_zone_end - Return zone end location
+ * @z: zone info pointer.
+ */
+static inline uint64_t zbd_zone_end(const struct fio_zone_info *z)
+{
+       return (z+1)->start;
+}
+
+/**
+ * zbd_zone_capacity_end - Return zone capacity limit end location
+ * @z: zone info pointer.
+ */
+static inline uint64_t zbd_zone_capacity_end(const struct fio_zone_info *z)
+{
+       return z->start + z->capacity;
+}
+
 /**
  * zbd_zone_full - verify whether a minimum number of bytes remain in a zone
  * @f: file pointer.
@@ -154,7 +172,7 @@ static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z,
        assert((required & 511) == 0);
 
        return zbd_zone_swr(z) &&
-               z->wp + required > z->start + f->zbd_info->zone_size;
+               z->wp + required > zbd_zone_capacity_end(z);
 }
 
 static void zone_lock(struct thread_data *td, struct fio_file *f, struct fio_zone_info *z)
@@ -271,7 +289,7 @@ static bool zbd_verify_sizes(void)
                        z = &f->zbd_info->zone_info[zone_idx];
                        if ((f->file_offset != z->start) &&
                            (td->o.td_ddir != TD_DDIR_READ)) {
-                               new_offset = (z+1)->start;
+                               new_offset = zbd_zone_end(z);
                                if (new_offset >= f->file_offset + f->io_size) {
                                        log_info("%s: io_size must be at least one zone\n",
                                                 f->file_name);
@@ -384,6 +402,7 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f)
                p->wp = p->start;
                p->type = ZBD_ZONE_TYPE_SWR;
                p->cond = ZBD_ZONE_COND_EMPTY;
+               p->capacity = zone_size;
        }
        /* a sentinel */
        p->start = nr_zones * zone_size;
@@ -456,10 +475,11 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
                        mutex_init_pshared_with_type(&p->mutex,
                                                     PTHREAD_MUTEX_RECURSIVE);
                        p->start = z->start;
+                       p->capacity = z->capacity;
                        switch (z->cond) {
                        case ZBD_ZONE_COND_NOT_WP:
                        case ZBD_ZONE_COND_FULL:
-                               p->wp = p->start + zone_size;
+                               p->wp = p->start + p->capacity;
                                break;
                        default:
                                assert(z->start <= z->wp);
@@ -707,7 +727,7 @@ static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
        dprint(FD_ZBD, "%s: resetting wp of zone %u.\n", f->file_name,
                zbd_zone_nr(f->zbd_info, z));
 
-       return zbd_reset_range(td, f, z->start, (z+1)->start - z->start);
+       return zbd_reset_range(td, f, z->start, zbd_zone_end(z) - z->start);
 }
 
 /* The caller must hold f->zbd_info->mutex */
@@ -1068,7 +1088,7 @@ found_candidate_zone:
        /* Both z->mutex and f->zbd_info->mutex are held. */
 
 examine_zone:
-       if (z->wp + min_bs <= (z+1)->start) {
+       if (z->wp + min_bs <= zbd_zone_capacity_end(z)) {
                pthread_mutex_unlock(&f->zbd_info->mutex);
                goto out;
        }
@@ -1112,7 +1132,7 @@ examine_zone:
                z = &f->zbd_info->zone_info[zone_idx];
 
                zone_lock(td, f, z);
-               if (z->wp + min_bs <= (z+1)->start)
+               if (z->wp + min_bs <= zbd_zone_capacity_end(z))
                        goto out;
                pthread_mutex_lock(&f->zbd_info->mutex);
        }
@@ -1143,9 +1163,9 @@ static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td,
                assert(z);
        }
 
-       if (z->verify_block * min_bs >= f->zbd_info->zone_size)
+       if (z->verify_block * min_bs >= z->capacity)
                log_err("%s: %d * %d >= %llu\n", f->file_name, z->verify_block,
-                       min_bs, (unsigned long long) f->zbd_info->zone_size);
+                       min_bs, (unsigned long long)z->capacity);
        io_u->offset = z->start + z->verify_block++ * min_bs;
        return z;
 }
@@ -1231,7 +1251,7 @@ static void zbd_queue_io(struct io_u *io_u, int q, bool success)
        switch (io_u->ddir) {
        case DDIR_WRITE:
                zone_end = min((uint64_t)(io_u->offset + io_u->buflen),
-                              (z + 1)->start);
+                              zbd_zone_capacity_end(z));
                pthread_mutex_lock(&zbd_info->mutex);
                /*
                 * z->wp > zone_end means that one or more I/O errors
@@ -1327,6 +1347,26 @@ void setup_zbd_zone_mode(struct thread_data *td, struct io_u *io_u)
        assert(td->o.zone_mode == ZONE_MODE_ZBD);
        assert(td->o.zone_size);
 
+       zone_idx = zbd_zone_idx(f, f->last_pos[ddir]);
+       z = &f->zbd_info->zone_info[zone_idx];
+
+       /*
+        * When the zone capacity is smaller than the zone size and the I/O is
+        * sequential write, skip to zone end if the latest position is at the
+        * zone capacity limit.
+        */
+       if (z->capacity < f->zbd_info->zone_size && !td_random(td) &&
+           ddir == DDIR_WRITE &&
+           f->last_pos[ddir] >= zbd_zone_capacity_end(z)) {
+               dprint(FD_ZBD,
+                      "%s: Jump from zone capacity limit to zone end:"
+                      " (%lu -> %lu) for zone %u (%ld)\n",
+                      f->file_name, f->last_pos[ddir], zbd_zone_end(z),
+                      zbd_zone_nr(f->zbd_info, z), z->capacity);
+               td->io_skip_bytes += zbd_zone_end(z) - f->last_pos[ddir];
+               f->last_pos[ddir] = zbd_zone_end(z);
+       }
+
        /*
         * zone_skip is valid only for sequential workloads.
         */
@@ -1340,11 +1380,8 @@ void setup_zbd_zone_mode(struct thread_data *td, struct io_u *io_u)
         * - For reads with td->o.read_beyond_wp == false, the last position
         *   reached the zone write pointer.
         */
-       zone_idx = zbd_zone_idx(f, f->last_pos[ddir]);
-       z = &f->zbd_info->zone_info[zone_idx];
-
        if (td->zone_bytes >= td->o.zone_size ||
-           f->last_pos[ddir] >= (z+1)->start ||
+           f->last_pos[ddir] >= zbd_zone_end(z) ||
            (ddir == DDIR_READ &&
             (!td->o.read_beyond_wp) && f->last_pos[ddir] >= z->wp)) {
                /*
@@ -1530,6 +1567,13 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                        zb->reset_zone = 0;
                        if (zbd_reset_zone(td, f, zb) < 0)
                                goto eof;
+
+                       if (zb->capacity < min_bs) {
+                               log_err("zone capacity %llu smaller than minimum block size %d\n",
+                                       (unsigned long long)zb->capacity,
+                                       min_bs);
+                               goto eof;
+                       }
                }
                /* Make writes occur at the write pointer */
                assert(!zbd_zone_full(f, zb, min_bs));
@@ -1545,7 +1589,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                 * small.
                 */
                new_len = min((unsigned long long)io_u->buflen,
-                             (zb + 1)->start - io_u->offset);
+                             zbd_zone_capacity_end(zb) - io_u->offset);
                new_len = new_len / min_bs * min_bs;
                if (new_len == io_u->buflen)
                        goto accept;
@@ -1556,7 +1600,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                        goto accept;
                }
                log_err("Zone remainder %lld smaller than minimum block size %d\n",
-                       ((zb + 1)->start - io_u->offset),
+                       (zbd_zone_capacity_end(zb) - io_u->offset),
                        min_bs);
                goto eof;
        case DDIR_TRIM:
diff --git a/zbd.h b/zbd.h
index e942a7f61b0ce21e39e14c77ecafccd9f35ce712..021174c110ed459fea763338be666619dfd1965a 100644 (file)
--- a/zbd.h
+++ b/zbd.h
@@ -23,6 +23,7 @@ enum io_u_action {
  * struct fio_zone_info - information about a single ZBD zone
  * @start: zone start location (bytes)
  * @wp: zone write pointer location (bytes)
+ * @capacity: maximum size usable from the start of a zone (bytes)
  * @verify_block: number of blocks that have been verified for this zone
  * @mutex: protects the modifiable members in this structure
  * @type: zone type (BLK_ZONE_TYPE_*)
@@ -35,6 +36,7 @@ struct fio_zone_info {
        pthread_mutex_t         mutex;
        uint64_t                start;
        uint64_t                wp;
+       uint64_t                capacity;
        uint32_t                verify_block;
        enum zbd_zone_type      type:2;
        enum zbd_zone_cond      cond:4;
index d63c0d0a7262978db87315a1e7daea1461603c5a..5ed41aa06c8ac371e264b9fd62af3a33a848bf9f 100644 (file)
@@ -50,6 +50,7 @@ struct zbd_zone {
        uint64_t                start;
        uint64_t                wp;
        uint64_t                len;
+       uint64_t                capacity;
        enum zbd_zone_type      type;
        enum zbd_zone_cond      cond;
 };