engines/io_uring: eliminate FDP memory corruption risk
[fio.git] / engines / io_uring.c
index 38c36fdca26063fcbb8aacbda7a9b1ec71fb655b..7e0830102c625c868b8355c8a845662f281dad5a 100644 (file)
@@ -34,6 +34,13 @@ enum uring_cmd_type {
        FIO_URING_CMD_NVME = 1,
 };
 
+enum uring_cmd_write_mode { /* values accepted by the "write_mode" engine option */
+       FIO_URING_CMD_WMODE_WRITE = 1, /* "write": regular Write commands (the option's default) */
+       FIO_URING_CMD_WMODE_UNCOR, /* "uncor": Write Uncorrectable commands */
+       FIO_URING_CMD_WMODE_ZEROES, /* "zeroes": Write Zeroes commands */
+       FIO_URING_CMD_WMODE_VERIFY, /* "verify": Verify commands */
+};
+
 struct io_sq_ring {
        unsigned *head;
        unsigned *tail;
@@ -81,12 +88,17 @@ struct ioring_data {
 
        struct cmdprio cmdprio;
 
-       struct nvme_dsm_range *dsm;
+       struct nvme_dsm *dsm;
+       uint32_t cdw12_flags[DDIR_RWDIR_CNT];
+       uint8_t write_opcode;
 };
 
 struct ioring_options {
        struct thread_data *td;
        unsigned int hipri;
+       unsigned int readfua;
+       unsigned int writefua;
+       unsigned int write_mode;
        struct cmdprio_options cmdprio_options;
        unsigned int fixedbufs;
        unsigned int registerfiles;
@@ -135,6 +147,54 @@ static struct fio_option options[] = {
                .category = FIO_OPT_C_ENGINE,
                .group  = FIO_OPT_G_IOURING,
        },
+       { /* readfua: when enabled on io_uring_cmd, init sets bit 30 (1 << 30) in cdw12_flags[DDIR_READ] */
+               .name   = "readfua",
+               .lname  = "Read fua flag support",
+               .type   = FIO_OPT_BOOL,
+               .off1   = offsetof(struct ioring_options, readfua),
+               .help   = "Set FUA flag (force unit access) for all Read operations",
+               .def    = "0",
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_IOURING,
+       },
+       { /* writefua: when enabled on io_uring_cmd, init sets bit 30 (1 << 30) in cdw12_flags[DDIR_WRITE] */
+               .name   = "writefua",
+               .lname  = "Write fua flag support",
+               .type   = FIO_OPT_BOOL,
+               .off1   = offsetof(struct ioring_options, writefua),
+               .help   = "Set FUA flag (force unit access) for all Write operations",
+               .def    = "0",
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_IOURING,
+       },
+       {
+               .name   = "write_mode",
+               .lname  = "Additional Write commands support (Write Uncorrectable, Write Zeores)",
+               .type   = FIO_OPT_STR,
+               .off1   = offsetof(struct ioring_options, write_mode),
+               .help   = "Issue Write Uncorrectable or Zeroes command instaed of Write command",
+               .def    = "write",
+               .posval = {
+                         { .ival = "write",
+                           .oval = FIO_URING_CMD_WMODE_WRITE,
+                           .help = "Issue Write commands for write operations"
+                         },
+                         { .ival = "uncor",
+                           .oval = FIO_URING_CMD_WMODE_UNCOR,
+                           .help = "Issue Write Uncorrectable commands for write operations"
+                         },
+                         { .ival = "zeroes",
+                           .oval = FIO_URING_CMD_WMODE_ZEROES,
+                           .help = "Issue Write Zeroes commands for write operations"
+                         },
+                         { .ival = "verify",
+                           .oval = FIO_URING_CMD_WMODE_VERIFY,
+                           .help = "Issue Verify commands for write operations"
+                         },
+               },
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_IOURING,
+       },
        {
                .name   = "fixedbufs",
                .lname  = "Fixed (pre-mapped) IO buffers",
@@ -385,6 +445,9 @@ static int fio_ioring_cmd_prep(struct thread_data *td, struct io_u *io_u)
        struct fio_file *f = io_u->file;
        struct nvme_uring_cmd *cmd;
        struct io_uring_sqe *sqe;
+       struct nvme_dsm *dsm;
+       void *ptr = ld->dsm;
+       unsigned int dsm_size;
 
        /* only supports nvme_uring_cmd */
        if (o->cmd_type != FIO_URING_CMD_NVME)
@@ -423,9 +486,13 @@ static int fio_ioring_cmd_prep(struct thread_data *td, struct io_u *io_u)
        }
 
        cmd = (struct nvme_uring_cmd *)sqe->cmd;
+       dsm_size = sizeof(*ld->dsm) + td->o.num_range * sizeof(struct nvme_dsm_range);
+       ptr += io_u->index * dsm_size;
+       dsm = (struct nvme_dsm *)ptr;
+
        return fio_nvme_uring_cmd_prep(cmd, io_u,
                        o->nonvectored ? NULL : &ld->iovecs[io_u->index],
-                       &ld->dsm[io_u->index]);
+                       dsm, ld->write_opcode, ld->cdw12_flags[io_u->ddir]);
 }
 
 static struct io_u *fio_ioring_event(struct thread_data *td, int event)
@@ -468,10 +535,12 @@ static struct io_u *fio_ioring_cmd_event(struct thread_data *td, int event)
        cqe = &ld->cq_ring.cqes[index];
        io_u = (struct io_u *) (uintptr_t) cqe->user_data;
 
-       if (cqe->res != 0)
-               io_u->error = -cqe->res;
-       else
+       if (cqe->res != 0) {
+               io_u->error = abs(cqe->res);
+               return io_u;
+       } else {
                io_u->error = 0;
+       }
 
        if (o->cmd_type == FIO_URING_CMD_NVME) {
                data = FILE_ENG_DATA(io_u->file);
@@ -1131,8 +1200,11 @@ static int fio_ioring_init(struct thread_data *td)
 {
        struct ioring_options *o = td->eo;
        struct ioring_data *ld;
+       struct nvme_dsm *dsm;
+       void *ptr;
+       unsigned int dsm_size;
        unsigned long long md_size;
-       int ret;
+       int ret, i;
 
        /* sqthread submission requires registered files */
        if (o->sqpoll_thread)
@@ -1193,10 +1265,43 @@ static int fio_ioring_init(struct thread_data *td)
         * in zbd mode where trim means zone reset.
         */
        if (!strcmp(td->io_ops->name, "io_uring_cmd") && td_trim(td) &&
-           td->o.zone_mode == ZONE_MODE_ZBD)
+           td->o.zone_mode == ZONE_MODE_ZBD) {
                td->io_ops->flags |= FIO_ASYNCIO_SYNC_TRIM;
-       else
-               ld->dsm = calloc(td->o.iodepth, sizeof(*ld->dsm));
+       } else {
+               dsm_size = sizeof(*ld->dsm) +
+                       td->o.num_range * sizeof(struct nvme_dsm_range);
+               ld->dsm = calloc(td->o.iodepth, dsm_size);
+               ptr = ld->dsm;
+               for (i = 0; i < td->o.iodepth; i++) {
+                       dsm = (struct nvme_dsm *)ptr;
+                       dsm->nr_ranges = td->o.num_range;
+                       ptr += dsm_size;
+               }
+       }
+
+       if (!strcmp(td->io_ops->name, "io_uring_cmd")) {
+               if (td_write(td)) {
+                       switch (o->write_mode) {
+                       case FIO_URING_CMD_WMODE_UNCOR:
+                               ld->write_opcode = nvme_cmd_write_uncor;
+                               break;
+                       case FIO_URING_CMD_WMODE_ZEROES:
+                               ld->write_opcode = nvme_cmd_write_zeroes;
+                               break;
+                       case FIO_URING_CMD_WMODE_VERIFY:
+                               ld->write_opcode = nvme_cmd_verify;
+                               break;
+                       default:
+                               ld->write_opcode = nvme_cmd_write;
+                               break;
+                       }
+               }
+
+               if (o->readfua)
+                       ld->cdw12_flags[DDIR_READ] = 1 << 30;
+               if (o->writefua)
+                       ld->cdw12_flags[DDIR_WRITE] = 1 << 30;
+       }
 
        return 0;
 }
@@ -1279,14 +1384,21 @@ static int fio_ioring_cmd_open_file(struct thread_data *td, struct fio_file *f)
                lba_size = data->lba_ext ? data->lba_ext : data->lba_size;
 
                for_each_rw_ddir(ddir) {
-                       if (td->o.min_bs[ddir] % lba_size ||
-                               td->o.max_bs[ddir] % lba_size) {
-                               if (data->lba_ext)
-                                       log_err("%s: block size must be a multiple of (LBA data size + Metadata size)\n",
-                                               f->file_name);
-                               else
+                       if (td->o.min_bs[ddir] % lba_size || td->o.max_bs[ddir] % lba_size) {
+                               if (data->lba_ext) {
+                                       log_err("%s: block size must be a multiple of %u "
+                                               "(LBA data size + Metadata size)\n", f->file_name, lba_size);
+                                       if (td->o.min_bs[ddir] == td->o.max_bs[ddir] &&
+                                           !(td->o.min_bs[ddir] % data->lba_size)) {
+                                               /* fixed block size is actually a multiple of LBA data size */
+                                               unsigned long long suggestion = lba_size *
+                                                       (td->o.min_bs[ddir] / data->lba_size);
+                                               log_err("Did you mean to use a block size of %llu?\n", suggestion);
+                                       }
+                               } else {
                                        log_err("%s: block size must be a multiple of LBA data size\n",
                                                f->file_name);
+                               }
                                td_verror(td, EINVAL, "fio_ioring_cmd_open_file");
                                return 1;
                        }
@@ -1313,6 +1425,14 @@ static int fio_ioring_cmd_open_file(struct thread_data *td, struct fio_file *f)
                        td_verror(td, EINVAL, "fio_ioring_cmd_open_file");
                        return 1;
                }
+
+               if (o->write_mode != FIO_URING_CMD_WMODE_WRITE &&
+                   !td_write(td)) {
+                       log_err("%s: 'readwrite=|rw=' has no write\n",
+                                       f->file_name);
+                       td_verror(td, EINVAL, "fio_ioring_cmd_open_file");
+                       return 1;
+               }
        }
        if (!ld || !o->registerfiles)
                return generic_open_file(td, f);
@@ -1425,6 +1545,8 @@ static int fio_ioring_cmd_fetch_ruhs(struct thread_data *td, struct fio_file *f,
                goto free;
 
        fruhs_info->nr_ruhs = le16_to_cpu(ruhs->nruhsd);
+       if (fruhs_info->nr_ruhs > FDP_MAX_RUHS)
+               fruhs_info->nr_ruhs = FDP_MAX_RUHS;
        for (i = 0; i < fruhs_info->nr_ruhs; i++)
                fruhs_info->plis[i] = le16_to_cpu(ruhs->ruhss[i].pid);
 free:
@@ -1457,7 +1579,8 @@ static struct ioengine_ops ioengine_uring_cmd = {
        .name                   = "io_uring_cmd",
        .version                = FIO_IOOPS_VERSION,
        .flags                  = FIO_NO_OFFLOAD | FIO_MEMALIGN | FIO_RAWIO |
-                                       FIO_ASYNCIO_SETS_ISSUE_TIME,
+                                       FIO_ASYNCIO_SETS_ISSUE_TIME |
+                                       FIO_MULTI_RANGE_TRIM,
        .init                   = fio_ioring_init,
        .post_init              = fio_ioring_cmd_post_init,
        .io_u_init              = fio_ioring_io_u_init,