engines/io_uring: Enable zone device support for io_uring_cmd I/O engine
author Ankit Kumar <ankit.kumar@samsung.com>
Tue, 31 May 2022 13:31:54 +0000 (19:01 +0530)
committer Jens Axboe <axboe@kernel.dk>
Thu, 2 Jun 2022 08:19:41 +0000 (02:19 -0600)
Add zone device specific ioengine_ops for io_uring_cmd.
* get_zoned_model
* report_zones
* reset_wp
* get_max_open_zones

Add the necessary NVMe ZNS specification opcodes and structures. Add
helper functions to submit admin and I/O passthrough commands for these
new NVMe ZNS specific commands.

For write workloads, iodepth must be set to 1 as there is no IO scheduler.

Tested-by: Vincent Fu <vincent.fu@samsung.com>
Signed-off-by: Ankit Kumar <ankit.kumar@samsung.com>
Link: https://lore.kernel.org/r/20220531133155.17493-9-ankit.kumar@samsung.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
engines/io_uring.c
engines/nvme.c
engines/nvme.h

index a7b7b1663c70cd7e661c6f59040915dd589b05ac..5a5406d4bf4ded1d1f7a4b6d3b04aa47d019d018 100644 (file)
@@ -1164,6 +1164,34 @@ static int fio_ioring_cmd_get_file_size(struct thread_data *td,
        return generic_get_file_size(td, f);
 }
 
+/*
+ * get_zoned_model hook of the io_uring_cmd engine: the query is handed to
+ * fio_nvme_get_zoned_model() and its result is passed back unchanged.
+ */
+static int fio_ioring_cmd_get_zoned_model(struct thread_data *td,
+                                         struct fio_file *f,
+                                         enum zbd_zoned_model *model)
+{
+       int rc = fio_nvme_get_zoned_model(td, f, model);
+
+       return rc;
+}
+
+/*
+ * report_zones hook of the io_uring_cmd engine: all arguments are forwarded
+ * to fio_nvme_report_zones() and its result is passed back unchanged.
+ */
+static int fio_ioring_cmd_report_zones(struct thread_data *td,
+                                      struct fio_file *f, uint64_t offset,
+                                      struct zbd_zone *zbdz,
+                                      unsigned int nr_zones)
+{
+       int rc = fio_nvme_report_zones(td, f, offset, zbdz, nr_zones);
+
+       return rc;
+}
+
+/*
+ * reset_wp hook of the io_uring_cmd engine: the byte range is forwarded to
+ * fio_nvme_reset_wp() and its result is passed back unchanged.
+ */
+static int fio_ioring_cmd_reset_wp(struct thread_data *td, struct fio_file *f,
+                                  uint64_t offset, uint64_t length)
+{
+       int rc = fio_nvme_reset_wp(td, f, offset, length);
+
+       return rc;
+}
+
+/*
+ * get_max_open_zones hook of the io_uring_cmd engine: the query is handed
+ * to fio_nvme_get_max_open_zones() and its result is passed back unchanged.
+ */
+static int fio_ioring_cmd_get_max_open_zones(struct thread_data *td,
+                                            struct fio_file *f,
+                                            unsigned int *max_open_zones)
+{
+       int rc = fio_nvme_get_max_open_zones(td, f, max_open_zones);
+
+       return rc;
+}
+
 static struct ioengine_ops ioengine_uring = {
        .name                   = "io_uring",
        .version                = FIO_IOOPS_VERSION,
@@ -1200,6 +1228,10 @@ static struct ioengine_ops ioengine_uring_cmd = {
        .open_file              = fio_ioring_cmd_open_file,
        .close_file             = fio_ioring_cmd_close_file,
        .get_file_size          = fio_ioring_cmd_get_file_size,
+       .get_zoned_model        = fio_ioring_cmd_get_zoned_model,
+       .report_zones           = fio_ioring_cmd_report_zones,
+       .reset_wp               = fio_ioring_cmd_reset_wp,
+       .get_max_open_zones     = fio_ioring_cmd_get_max_open_zones,
        .options                = options,
        .option_struct_size     = sizeof(struct ioring_options),
 };
index 6fecf0ba7892f2aa87c129717839482f7311e057..59550deff7677f0f5418f52681ac89f60d0afd4f 100644 (file)
@@ -101,3 +101,245 @@ int fio_nvme_get_info(struct fio_file *f, __u32 *nsid, __u32 *lba_sz,
        close(fd);
        return 0;
 }
+
+/*
+ * Determine the zoned model of the NVMe character device behind @f.
+ *
+ * The device node is opened read-only for the duration of the call. Two
+ * Identify commands for the ZNS command set are issued (controller, then
+ * namespace). If either one fails, *model is set to ZBD_NONE; if both
+ * succeed, *model is set to ZBD_HOST_MANAGED.
+ *
+ * Returns -EINVAL if @f is not a character device and -errno if the device
+ * cannot be opened. Otherwise returns 0: a failing Identify is not an
+ * error, it only selects ZBD_NONE.
+ */
+int fio_nvme_get_zoned_model(struct thread_data *td, struct fio_file *f,
+                            enum zbd_zoned_model *model)
+{
+       struct nvme_data *data = FILE_ENG_DATA(f);
+       struct nvme_id_ns ns;
+       int fd;
+
+       if (f->filetype != FIO_TYPE_CHAR)
+               return -EINVAL;
+
+       /* File is not yet opened */
+       fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
+       if (fd < 0)
+               return -errno;
+
+       /* Pessimistic default, upgraded only if both Identify calls succeed. */
+       *model = ZBD_NONE;
+
+       /*
+        * Using nvme_id_ns for data as sizes are same. Only the command
+        * status is examined here, the returned payload is not.
+        */
+       if (nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_CTRL,
+                         NVME_CSI_ZNS, &ns))
+               goto out;
+
+       if (nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
+                         NVME_CSI_ZNS, &ns))
+               goto out;
+
+       *model = ZBD_HOST_MANAGED;
+out:
+       close(fd);
+       return 0;
+}
+
+/*
+ * Issue one zone management receive command (report-zones action) through
+ * the NVMe I/O passthrough ioctl on @fd.
+ *
+ * @slba is split across cdw10 (low 32 bits) and cdw11 (high 32 bits), the
+ * buffer length goes into cdw12 as a dword count minus one, and @zras_feat
+ * is OR-ed into cdw13 next to the report-zones action. The ioctl() return
+ * value is handed back as is.
+ */
+static int nvme_report_zones(int fd, __u32 nsid, __u64 slba, __u32 zras_feat,
+                            __u32 data_len, void *data)
+{
+       struct nvme_passthru_cmd cmd;
+       __u32 ndw = (data_len >> 2) - 1;
+
+       memset(&cmd, 0, sizeof(cmd));
+       cmd.opcode = nvme_zns_cmd_mgmt_recv;
+       cmd.nsid = nsid;
+       cmd.addr = (__u64)(uintptr_t)data;
+       cmd.data_len = data_len;
+       cmd.cdw10 = slba & 0xffffffff;
+       cmd.cdw11 = slba >> 32;
+       cmd.cdw12 = ndw;
+       cmd.cdw13 = NVME_ZNS_ZRA_REPORT_ZONES | zras_feat;
+       cmd.timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT;
+
+       return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
+}
+
+/*
+ * Fill @zbdz with descriptors for up to @nr_zones zones, starting with the
+ * zone at byte @offset of the NVMe ZNS namespace behind @f.
+ *
+ * The device node is opened read-only for the duration of the call. The
+ * zone size comes from the ZNS Identify Namespace data, indexed by the low
+ * four bits of flbas from the NVM Identify Namespace data. Zones are
+ * fetched in chunks of at most 1024 descriptors per command and translated
+ * into fio's zbd_zone representation.
+ *
+ * Returns the number of entries written to @zbdz, or a negative value on
+ * failure. A positive status from a failed command is reported as -EIO so
+ * that it cannot be mistaken for a zone count.
+ */
+int fio_nvme_report_zones(struct thread_data *td, struct fio_file *f,
+                         uint64_t offset, struct zbd_zone *zbdz,
+                         unsigned int nr_zones)
+{
+       struct nvme_data *data = FILE_ENG_DATA(f);
+       struct nvme_zone_report *zr;
+       struct nvme_zns_id_ns zns_ns;
+       struct nvme_id_ns ns;
+       unsigned int i = 0, j, zones_fetched = 0;
+       unsigned int max_zones, zones_chunks = 1024;
+       int fd, ret = 0;
+       __u32 zr_len;
+       __u64 zlen, nr_reported;
+
+       /* File is not yet opened */
+       fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
+       if (fd < 0)
+               return -errno;
+
+       zr_len = sizeof(*zr) + (zones_chunks * sizeof(struct nvme_zns_desc));
+       zr = calloc(1, zr_len);
+       if (!zr) {
+               /* Leave through the cleanup path so that fd is not leaked. */
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_NS,
+                               NVME_CSI_NVM, &ns);
+       if (ret) {
+               log_err("%s: nvme_identify_ns failed, err=%d\n", f->file_name,
+                       ret);
+               goto err;
+       }
+
+       ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
+                               NVME_CSI_ZNS, &zns_ns);
+       if (ret) {
+               log_err("%s: nvme_zns_identify_ns failed, err=%d\n",
+                       f->file_name, ret);
+               goto err;
+       }
+       zlen = zns_ns.lbafe[ns.flbas & 0x0f].zsze << data->lba_shift;
+       if (!zlen) {
+               /* A zero zone size would divide by zero below. */
+               ret = -EIO;
+               goto out;
+       }
+
+       max_zones = (f->real_file_size - offset) / zlen;
+       if (max_zones < nr_zones)
+               nr_zones = max_zones;
+
+       if (nr_zones < zones_chunks)
+               zones_chunks = nr_zones;
+
+       while (zones_fetched < nr_zones) {
+               /* Shrink the last chunk so that no more than nr_zones are read. */
+               if (zones_fetched + zones_chunks >= nr_zones) {
+                       zones_chunks = nr_zones - zones_fetched;
+                       zr_len = sizeof(*zr) + (zones_chunks * sizeof(struct nvme_zns_desc));
+               }
+               ret = nvme_report_zones(fd, data->nsid, offset >> data->lba_shift,
+                                       NVME_ZNS_ZRAS_FEAT_ERZ, zr_len, (void *)zr);
+               if (ret) {
+                       log_err("%s: nvme_zns_report_zones failed, err=%d\n",
+                               f->file_name, ret);
+                       goto err;
+               }
+
+               /*
+                * Never trust the reported count beyond what was asked for:
+                * both the report buffer and zbdz only have room for
+                * zones_chunks more entries.
+                */
+               nr_reported = zr->nr_zones;
+               if (nr_reported > zones_chunks)
+                       nr_reported = zones_chunks;
+
+               /* No progress is possible; stop instead of looping forever. */
+               if (!nr_reported)
+                       break;
+
+               /* Transform the zone-report */
+               for (j = 0; j < nr_reported; j++, i++) {
+                       struct nvme_zns_desc *desc = (struct nvme_zns_desc *)&(zr->entries[j]);
+
+                       zbdz[i].start = desc->zslba << data->lba_shift;
+                       zbdz[i].len = zlen;
+                       zbdz[i].wp = desc->wp << data->lba_shift;
+                       zbdz[i].capacity = desc->zcap << data->lba_shift;
+
+                       /* Zone Type is stored in first 4 bits. */
+                       switch (desc->zt & 0x0f) {
+                       case NVME_ZONE_TYPE_SEQWRITE_REQ:
+                               zbdz[i].type = ZBD_ZONE_TYPE_SWR;
+                               break;
+                       default:
+                               log_err("%s: invalid type for zone at offset %llu.\n",
+                                       f->file_name, desc->zslba);
+                               ret = -EIO;
+                               goto out;
+                       }
+
+                       /* Zone State is stored in last 4 bits. */
+                       switch (desc->zs >> 4) {
+                       case NVME_ZNS_ZS_EMPTY:
+                               zbdz[i].cond = ZBD_ZONE_COND_EMPTY;
+                               break;
+                       case NVME_ZNS_ZS_IMPL_OPEN:
+                               zbdz[i].cond = ZBD_ZONE_COND_IMP_OPEN;
+                               break;
+                       case NVME_ZNS_ZS_EXPL_OPEN:
+                               zbdz[i].cond = ZBD_ZONE_COND_EXP_OPEN;
+                               break;
+                       case NVME_ZNS_ZS_CLOSED:
+                               zbdz[i].cond = ZBD_ZONE_COND_CLOSED;
+                               break;
+                       case NVME_ZNS_ZS_FULL:
+                               zbdz[i].cond = ZBD_ZONE_COND_FULL;
+                               break;
+                       case NVME_ZNS_ZS_READ_ONLY:
+                       case NVME_ZNS_ZS_OFFLINE:
+                       default:
+                               /* Treat all these conditions as offline (don't use!) */
+                               zbdz[i].cond = ZBD_ZONE_COND_OFFLINE;
+                               zbdz[i].wp = zbdz[i].start;
+                               break;
+                       }
+               }
+               zones_fetched += nr_reported;
+               offset += nr_reported * zlen;
+       }
+
+       ret = zones_fetched;
+       goto out;
+err:
+       /*
+        * A positive value is a command status, not a zone count; callers
+        * of this function read positive returns as "zones reported".
+        */
+       if (ret > 0)
+               ret = -EIO;
+out:
+       free(zr);
+       close(fd);
+
+       return ret;
+}
+
+/*
+ * Reset the write pointer of every zone in the byte range that starts at
+ * @offset and spans @length bytes.
+ *
+ * One zone management send command with the reset action is issued per
+ * zone; consecutive zones are taken to be td->o.zone_size bytes apart. If
+ * @f is not open yet, the device is opened read-write for the duration of
+ * the call and closed again before returning.
+ *
+ * Returns 0 on success and a negative value on failure: -EINVAL when
+ * td->o.zone_size is zero, -errno when open() or ioctl() fails, or the
+ * negated command status when the device rejects a reset. The first
+ * failing zone ends the loop.
+ */
+int fio_nvme_reset_wp(struct thread_data *td, struct fio_file *f,
+                     uint64_t offset, uint64_t length)
+{
+       struct nvme_data *data = FILE_ENG_DATA(f);
+       unsigned long long zslba, zone_lbas;
+       unsigned int i, nr_zones;
+       int fd, ret = 0;
+
+       /* zone_size is used as a divisor below. */
+       if (!td->o.zone_size)
+               return -EINVAL;
+
+       /* If the file is not yet opened, open it for this function. */
+       fd = f->fd;
+       if (fd < 0) {
+               fd = open(f->file_name, O_RDWR | O_LARGEFILE);
+               if (fd < 0)
+                       return -errno;
+       }
+
+       zslba = offset >> data->lba_shift;
+       zone_lbas = td->o.zone_size >> data->lba_shift;
+       nr_zones = (length + td->o.zone_size - 1) / td->o.zone_size;
+
+       for (i = 0; i < nr_zones; i++, zslba += zone_lbas) {
+               struct nvme_passthru_cmd cmd = {
+                       .opcode         = nvme_zns_cmd_mgmt_send,
+                       .nsid           = data->nsid,
+                       .cdw10          = zslba & 0xffffffff,
+                       .cdw11          = zslba >> 32,
+                       .cdw13          = NVME_ZNS_ZSA_RESET,
+                       .addr           = (__u64)(uintptr_t)NULL,
+                       .data_len       = 0,
+                       .timeout_ms     = NVME_DEFAULT_IOCTL_TIMEOUT,
+               };
+
+               ret = ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
+               if (ret < 0) {
+                       /* ioctl() itself failed: report errno, not +1. */
+                       ret = -errno;
+                       break;
+               }
+               if (ret > 0) {
+                       /* Positive command status: hand it back negated. */
+                       ret = -ret;
+                       break;
+               }
+       }
+
+       if (f->fd < 0)
+               close(fd);
+       return ret;
+}
+
+/*
+ * Query the open-zone limit of the namespace behind @f.
+ *
+ * The device node is opened read-only, the ZNS Identify Namespace data is
+ * fetched, and on success *max_open_zones is set to the mor field plus one.
+ * On an Identify failure the error is logged and *max_open_zones is left
+ * untouched.
+ *
+ * Returns -errno if the device cannot be opened, otherwise the value
+ * returned by nvme_identify().
+ */
+int fio_nvme_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+                               unsigned int *max_open_zones)
+{
+       struct nvme_data *data = FILE_ENG_DATA(f);
+       struct nvme_zns_id_ns zns_ns;
+       int fd, ret;
+
+       fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
+       if (fd < 0)
+               return -errno;
+
+       ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
+                           NVME_CSI_ZNS, &zns_ns);
+       if (!ret)
+               *max_open_zones = zns_ns.mor + 1;
+       else
+               log_err("%s: nvme_zns_identify_ns failed, err=%d\n",
+                       f->file_name, ret);
+
+       close(fd);
+       return ret;
+}
index 8e626bb23791bc8c374f33f8f6fe11ca92177c22..70a89b7406df391b00105a04a85ccdff1a8dec17 100644 (file)
@@ -43,8 +43,15 @@ struct nvme_uring_cmd {
 #define NVME_IDENTIFY_DATA_SIZE 4096
 #define NVME_IDENTIFY_CSI_SHIFT 24
 
+#define NVME_ZNS_ZRA_REPORT_ZONES 0 /* action OR-ed into cdw13 by nvme_report_zones() */
+#define NVME_ZNS_ZRAS_FEAT_ERZ (1 << 16) /* passed as zras_feat by fio_nvme_report_zones(), OR-ed into cdw13 */
+#define NVME_ZNS_ZSA_RESET 0x4 /* cdw13 of the zone management send issued by fio_nvme_reset_wp() */
+#define NVME_ZONE_TYPE_SEQWRITE_REQ 0x2 /* matched against the low 4 bits of nvme_zns_desc.zt */
+
 enum nvme_identify_cns {
-       NVME_IDENTIFY_CNS_NS = 0x00,
+       NVME_IDENTIFY_CNS_NS            = 0x00,
+       NVME_IDENTIFY_CNS_CSI_NS        = 0x05,
+       NVME_IDENTIFY_CNS_CSI_CTRL      = 0x06,
 };
 
 enum nvme_csi {
@@ -60,6 +67,18 @@ enum nvme_admin_opcode {
 enum nvme_io_opcode {
        nvme_cmd_write                  = 0x01,
        nvme_cmd_read                   = 0x02,
+       nvme_zns_cmd_mgmt_send          = 0x79,
+       nvme_zns_cmd_mgmt_recv          = 0x7a,
+};
+
+enum nvme_zns_zs { /* zone states, matched against nvme_zns_desc.zs >> 4 */
+       NVME_ZNS_ZS_EMPTY               = 0x1,
+       NVME_ZNS_ZS_IMPL_OPEN           = 0x2,
+       NVME_ZNS_ZS_EXPL_OPEN           = 0x3,
+       NVME_ZNS_ZS_CLOSED              = 0x4,
+       NVME_ZNS_ZS_READ_ONLY           = 0xd, /* reported to fio as ZBD_ZONE_COND_OFFLINE */
+       NVME_ZNS_ZS_FULL                = 0xe,
+       NVME_ZNS_ZS_OFFLINE             = 0xf, /* reported to fio as ZBD_ZONE_COND_OFFLINE */
 };
 
 struct nvme_data {
@@ -127,10 +146,69 @@ static inline int ilog2(uint32_t i)
        return log;
 }
 
+struct nvme_zns_lbafe { /* one element of nvme_zns_id_ns.lbafe[] */
+       __le64  zsze; /* zone size; fio_nvme_report_zones() shifts it left by lba_shift to get bytes */
+       __u8    zdes;
+       __u8    rsvd9[7];
+};
+
+struct nvme_zns_id_ns { /* buffer filled by nvme_identify() with NVME_IDENTIFY_CNS_CSI_NS / NVME_CSI_ZNS */
+       __le16                  zoc;
+       __le16                  ozcs;
+       __le32                  mar;
+       __le32                  mor; /* fio_nvme_get_max_open_zones() reports mor + 1 */
+       __le32                  rrl;
+       __le32                  frl;
+       __le32                  rrl1;
+       __le32                  rrl2;
+       __le32                  rrl3;
+       __le32                  frl1;
+       __le32                  frl2;
+       __le32                  frl3;
+       __le32                  numzrwa;
+       __le16                  zrwafg;
+       __le16                  zrwasz;
+       __u8                    zrwacap;
+       __u8                    rsvd53[2763]; /* padding: places lbafe[] at byte offset 2816 */
+       struct nvme_zns_lbafe   lbafe[64]; /* indexed with (flbas & 0x0f) in fio_nvme_report_zones() */
+       __u8                    vs[256];
+};
+
+struct nvme_zns_desc { /* one zone entry of a zone report */
+       __u8    zt; /* zone type in the low 4 bits */
+       __u8    zs; /* zone state in the high 4 bits, see enum nvme_zns_zs */
+       __u8    za;
+       __u8    zai;
+       __u8    rsvd4[4];
+       __le64  zcap; /* zone capacity; shifted left by lba_shift to get bytes */
+       __le64  zslba; /* zone start; shifted left by lba_shift to get bytes */
+       __le64  wp; /* write pointer; shifted left by lba_shift to get bytes */
+       __u8    rsvd32[32];
+};
+
+struct nvme_zone_report { /* buffer handed to nvme_report_zones() */
+       __le64                  nr_zones; /* count of entries[] walked by fio_nvme_report_zones() */
+       __u8                    rsvd8[56];
+       struct nvme_zns_desc    entries[]; /* flexible array; the caller allocates room for the descriptors */
+};
+
 int fio_nvme_get_info(struct fio_file *f, __u32 *nsid, __u32 *lba_sz,
                      __u64 *nlba);
 
 int fio_nvme_uring_cmd_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u,
                            struct iovec *iov);
 
+int fio_nvme_get_zoned_model(struct thread_data *td, struct fio_file *f,
+                            enum zbd_zoned_model *model); /* sets *model to ZBD_NONE or ZBD_HOST_MANAGED */
+
+int fio_nvme_report_zones(struct thread_data *td, struct fio_file *f,
+                         uint64_t offset, struct zbd_zone *zbdz,
+                         unsigned int nr_zones); /* on success returns the number of zbdz[] entries filled */
+
+int fio_nvme_reset_wp(struct thread_data *td, struct fio_file *f,
+                     uint64_t offset, uint64_t length); /* issues one zone reset per zone in the byte range */
+
+int fio_nvme_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+                               unsigned int *max_open_zones); /* on success stores mor + 1 in *max_open_zones */
+
 #endif