08503b33998edeea72639d92fb35a798222f3514
[fio.git] / engines / nvme.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * nvme structure declarations and helper functions for the
4  * io_uring_cmd engine.
5  */
6
7 #include "nvme.h"
8 #include "../crc/crc-t10dif.h"
9 #include "../crc/crc64.h"
10
11 static inline __u64 get_slba(struct nvme_data *data, struct io_u *io_u)
12 {
13         if (data->lba_ext)
14                 return io_u->offset / data->lba_ext;
15         else
16                 return io_u->offset >> data->lba_shift;
17 }
18
19 static inline __u32 get_nlb(struct nvme_data *data, struct io_u *io_u)
20 {
21         if (data->lba_ext)
22                 return io_u->xfer_buflen / data->lba_ext - 1;
23         else
24                 return (io_u->xfer_buflen >> data->lba_shift) - 1;
25 }
26
27 static void fio_nvme_generate_pi_16b_guard(struct nvme_data *data,
28                                            struct io_u *io_u,
29                                            struct nvme_cmd_ext_io_opts *opts)
30 {
31         struct nvme_pi_data *pi_data = io_u->engine_data;
32         struct nvme_16b_guard_pif *pi;
33         unsigned char *buf = io_u->xfer_buf;
34         unsigned char *md_buf = io_u->mmap_data;
35         __u64 slba = get_slba(data, io_u);
36         __u32 nlb = get_nlb(data, io_u) + 1;
37         __u32 lba_num = 0;
38         __u16 guard = 0;
39
40         if (data->pi_loc) {
41                 if (data->lba_ext)
42                         pi_data->interval = data->lba_ext - data->ms;
43                 else
44                         pi_data->interval = 0;
45         } else {
46                 if (data->lba_ext)
47                         pi_data->interval = data->lba_ext - sizeof(struct nvme_16b_guard_pif);
48                 else
49                         pi_data->interval = data->ms - sizeof(struct nvme_16b_guard_pif);
50         }
51
52         if (io_u->ddir != DDIR_WRITE)
53                 return;
54
55         while (lba_num < nlb) {
56                 if (data->lba_ext)
57                         pi = (struct nvme_16b_guard_pif *)(buf + pi_data->interval);
58                 else
59                         pi = (struct nvme_16b_guard_pif *)(md_buf + pi_data->interval);
60
61                 if (opts->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) {
62                         if (data->lba_ext) {
63                                 guard = fio_crc_t10dif(0, buf, pi_data->interval);
64                         } else {
65                                 guard = fio_crc_t10dif(0, buf, data->lba_size);
66                                 guard = fio_crc_t10dif(guard, md_buf, pi_data->interval);
67                         }
68                         pi->guard = cpu_to_be16(guard);
69                 }
70
71                 if (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP)
72                         pi->apptag = cpu_to_be16(pi_data->apptag);
73
74                 if (opts->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
75                         switch (data->pi_type) {
76                         case NVME_NS_DPS_PI_TYPE1:
77                         case NVME_NS_DPS_PI_TYPE2:
78                                 pi->srtag = cpu_to_be32((__u32)slba + lba_num);
79                                 break;
80                         case NVME_NS_DPS_PI_TYPE3:
81                                 break;
82                         }
83                 }
84                 if (data->lba_ext) {
85                         buf += data->lba_ext;
86                 } else {
87                         buf += data->lba_size;
88                         md_buf += data->ms;
89                 }
90                 lba_num++;
91         }
92 }
93
94 static int fio_nvme_verify_pi_16b_guard(struct nvme_data *data,
95                                         struct io_u *io_u)
96 {
97         struct nvme_pi_data *pi_data = io_u->engine_data;
98         struct nvme_16b_guard_pif *pi;
99         struct fio_file *f = io_u->file;
100         unsigned char *buf = io_u->xfer_buf;
101         unsigned char *md_buf = io_u->mmap_data;
102         __u64 slba = get_slba(data, io_u);
103         __u32 nlb = get_nlb(data, io_u) + 1;
104         __u32 lba_num = 0;
105         __u16 unmask_app, unmask_app_exp, guard = 0;
106
107         while (lba_num < nlb) {
108                 if (data->lba_ext)
109                         pi = (struct nvme_16b_guard_pif *)(buf + pi_data->interval);
110                 else
111                         pi = (struct nvme_16b_guard_pif *)(md_buf + pi_data->interval);
112
113                 if (data->pi_type == NVME_NS_DPS_PI_TYPE3) {
114                         if (pi->apptag == NVME_PI_APP_DISABLE &&
115                             pi->srtag == NVME_PI_REF_DISABLE)
116                                 goto next;
117                 } else if (data->pi_type == NVME_NS_DPS_PI_TYPE1 ||
118                            data->pi_type == NVME_NS_DPS_PI_TYPE2) {
119                         if (pi->apptag == NVME_PI_APP_DISABLE)
120                                 goto next;
121                 }
122
123                 if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) {
124                         if (data->lba_ext) {
125                                 guard = fio_crc_t10dif(0, buf, pi_data->interval);
126                         } else {
127                                 guard = fio_crc_t10dif(0, buf, data->lba_size);
128                                 guard = fio_crc_t10dif(guard, md_buf, pi_data->interval);
129                         }
130                         if (be16_to_cpu(pi->guard) != guard) {
131                                 log_err("%s: Guard compare error: LBA: %llu Expected=%x, Actual=%x\n",
132                                         f->file_name, (unsigned long long)slba,
133                                         guard, be16_to_cpu(pi->guard));
134                                 return -EIO;
135                         }
136                 }
137
138                 if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_APP) {
139                         unmask_app = be16_to_cpu(pi->apptag) & pi_data->apptag_mask;
140                         unmask_app_exp = pi_data->apptag & pi_data->apptag_mask;
141                         if (unmask_app != unmask_app_exp) {
142                                 log_err("%s: APPTAG compare error: LBA: %llu Expected=%x, Actual=%x\n",
143                                         f->file_name, (unsigned long long)slba,
144                                         unmask_app_exp, unmask_app);
145                                 return -EIO;
146                         }
147                 }
148
149                 if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
150                         switch (data->pi_type) {
151                         case NVME_NS_DPS_PI_TYPE1:
152                         case NVME_NS_DPS_PI_TYPE2:
153                                 if (be32_to_cpu(pi->srtag) !=
154                                     ((__u32)slba + lba_num)) {
155                                         log_err("%s: REFTAG compare error: LBA: %llu Expected=%x, Actual=%x\n",
156                                                 f->file_name, (unsigned long long)slba,
157                                                 (__u32)slba + lba_num,
158                                                 be32_to_cpu(pi->srtag));
159                                         return -EIO;
160                                 }
161                                 break;
162                         case NVME_NS_DPS_PI_TYPE3:
163                                 break;
164                         }
165                 }
166 next:
167                 if (data->lba_ext) {
168                         buf += data->lba_ext;
169                 } else {
170                         buf += data->lba_size;
171                         md_buf += data->ms;
172                 }
173                 lba_num++;
174         }
175
176         return 0;
177 }
178
179 static void fio_nvme_generate_pi_64b_guard(struct nvme_data *data,
180                                            struct io_u *io_u,
181                                            struct nvme_cmd_ext_io_opts *opts)
182 {
183         struct nvme_pi_data *pi_data = io_u->engine_data;
184         struct nvme_64b_guard_pif *pi;
185         unsigned char *buf = io_u->xfer_buf;
186         unsigned char *md_buf = io_u->mmap_data;
187         uint64_t guard = 0;
188         __u64 slba = get_slba(data, io_u);
189         __u32 nlb = get_nlb(data, io_u) + 1;
190         __u32 lba_num = 0;
191
192         if (data->pi_loc) {
193                 if (data->lba_ext)
194                         pi_data->interval = data->lba_ext - data->ms;
195                 else
196                         pi_data->interval = 0;
197         } else {
198                 if (data->lba_ext)
199                         pi_data->interval = data->lba_ext - sizeof(struct nvme_64b_guard_pif);
200                 else
201                         pi_data->interval = data->ms - sizeof(struct nvme_64b_guard_pif);
202         }
203
204         if (io_u->ddir != DDIR_WRITE)
205                 return;
206
207         while (lba_num < nlb) {
208                 if (data->lba_ext)
209                         pi = (struct nvme_64b_guard_pif *)(buf + pi_data->interval);
210                 else
211                         pi = (struct nvme_64b_guard_pif *)(md_buf + pi_data->interval);
212
213                 if (opts->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) {
214                         if (data->lba_ext) {
215                                 guard = fio_crc64_nvme(0, buf, pi_data->interval);
216                         } else {
217                                 guard = fio_crc64_nvme(0, buf, data->lba_size);
218                                 guard = fio_crc64_nvme(guard, md_buf, pi_data->interval);
219                         }
220                         pi->guard = cpu_to_be64(guard);
221                 }
222
223                 if (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP)
224                         pi->apptag = cpu_to_be16(pi_data->apptag);
225
226                 if (opts->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
227                         switch (data->pi_type) {
228                         case NVME_NS_DPS_PI_TYPE1:
229                         case NVME_NS_DPS_PI_TYPE2:
230                                 put_unaligned_be48(slba + lba_num, pi->srtag);
231                                 break;
232                         case NVME_NS_DPS_PI_TYPE3:
233                                 break;
234                         }
235                 }
236                 if (data->lba_ext) {
237                         buf += data->lba_ext;
238                 } else {
239                         buf += data->lba_size;
240                         md_buf += data->ms;
241                 }
242                 lba_num++;
243         }
244 }
245
246 static int fio_nvme_verify_pi_64b_guard(struct nvme_data *data,
247                                         struct io_u *io_u)
248 {
249         struct nvme_pi_data *pi_data = io_u->engine_data;
250         struct nvme_64b_guard_pif *pi;
251         struct fio_file *f = io_u->file;
252         unsigned char *buf = io_u->xfer_buf;
253         unsigned char *md_buf = io_u->mmap_data;
254         __u64 slba = get_slba(data, io_u);
255         __u64 ref, ref_exp, guard = 0;
256         __u32 nlb = get_nlb(data, io_u) + 1;
257         __u32 lba_num = 0;
258         __u16 unmask_app, unmask_app_exp;
259
260         while (lba_num < nlb) {
261                 if (data->lba_ext)
262                         pi = (struct nvme_64b_guard_pif *)(buf + pi_data->interval);
263                 else
264                         pi = (struct nvme_64b_guard_pif *)(md_buf + pi_data->interval);
265
266                 if (data->pi_type == NVME_NS_DPS_PI_TYPE3) {
267                         if (pi->apptag == NVME_PI_APP_DISABLE &&
268                             fio_nvme_pi_ref_escape(pi->srtag))
269                                 goto next;
270                 } else if (data->pi_type == NVME_NS_DPS_PI_TYPE1 ||
271                            data->pi_type == NVME_NS_DPS_PI_TYPE2) {
272                         if (pi->apptag == NVME_PI_APP_DISABLE)
273                                 goto next;
274                 }
275
276                 if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) {
277                         if (data->lba_ext) {
278                                 guard = fio_crc64_nvme(0, buf, pi_data->interval);
279                         } else {
280                                 guard = fio_crc64_nvme(0, buf, data->lba_size);
281                                 guard = fio_crc64_nvme(guard, md_buf, pi_data->interval);
282                         }
283                         if (be64_to_cpu((uint64_t)pi->guard) != guard) {
284                                 log_err("%s: Guard compare error: LBA: %llu Expected=%llx, Actual=%llx\n",
285                                         f->file_name, (unsigned long long)slba,
286                                         guard, be64_to_cpu((uint64_t)pi->guard));
287                                 return -EIO;
288                         }
289                 }
290
291                 if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_APP) {
292                         unmask_app = be16_to_cpu(pi->apptag) & pi_data->apptag_mask;
293                         unmask_app_exp = pi_data->apptag & pi_data->apptag_mask;
294                         if (unmask_app != unmask_app_exp) {
295                                 log_err("%s: APPTAG compare error: LBA: %llu Expected=%x, Actual=%x\n",
296                                         f->file_name, (unsigned long long)slba,
297                                         unmask_app_exp, unmask_app);
298                                 return -EIO;
299                         }
300                 }
301
302                 if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
303                         switch (data->pi_type) {
304                         case NVME_NS_DPS_PI_TYPE1:
305                         case NVME_NS_DPS_PI_TYPE2:
306                                 ref = get_unaligned_be48(pi->srtag);
307                                 ref_exp = (slba + lba_num) & ((1ULL << 48) - 1);
308                                 if (ref != ref_exp) {
309                                         log_err("%s: REFTAG compare error: LBA: %llu Expected=%llx, Actual=%llx\n",
310                                                 f->file_name, (unsigned long long)slba,
311                                                 ref_exp, ref);
312                                         return -EIO;
313                                 }
314                                 break;
315                         case NVME_NS_DPS_PI_TYPE3:
316                                 break;
317                         }
318                 }
319 next:
320                 if (data->lba_ext) {
321                         buf += data->lba_ext;
322                 } else {
323                         buf += data->lba_size;
324                         md_buf += data->ms;
325                 }
326                 lba_num++;
327         }
328
329         return 0;
330 }
331 void fio_nvme_uring_cmd_trim_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u,
332                                   struct nvme_dsm_range *dsm)
333 {
334         struct nvme_data *data = FILE_ENG_DATA(io_u->file);
335
336         cmd->opcode = nvme_cmd_dsm;
337         cmd->nsid = data->nsid;
338         cmd->cdw10 = 0;
339         cmd->cdw11 = NVME_ATTRIBUTE_DEALLOCATE;
340         cmd->addr = (__u64) (uintptr_t) dsm;
341         cmd->data_len = sizeof(*dsm);
342
343         dsm->slba = get_slba(data, io_u);
344         /* nlb is a 1-based value for deallocate */
345         dsm->nlb = get_nlb(data, io_u) + 1;
346 }
347
348 int fio_nvme_uring_cmd_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u,
349                             struct iovec *iov, struct nvme_dsm_range *dsm)
350 {
351         struct nvme_data *data = FILE_ENG_DATA(io_u->file);
352         __u64 slba;
353         __u32 nlb;
354
355         memset(cmd, 0, sizeof(struct nvme_uring_cmd));
356
357         switch (io_u->ddir) {
358         case DDIR_READ:
359                 cmd->opcode = nvme_cmd_read;
360                 break;
361         case DDIR_WRITE:
362                 cmd->opcode = nvme_cmd_write;
363                 break;
364         case DDIR_TRIM:
365                 fio_nvme_uring_cmd_trim_prep(cmd, io_u, dsm);
366                 return 0;
367         default:
368                 return -ENOTSUP;
369         }
370
371         slba = get_slba(data, io_u);
372         nlb = get_nlb(data, io_u);
373
374         /* cdw10 and cdw11 represent starting lba */
375         cmd->cdw10 = slba & 0xffffffff;
376         cmd->cdw11 = slba >> 32;
377         /* cdw12 represent number of lba's for read/write */
378         cmd->cdw12 = nlb | (io_u->dtype << 20);
379         cmd->cdw13 = io_u->dspec << 16;
380         if (iov) {
381                 iov->iov_base = io_u->xfer_buf;
382                 iov->iov_len = io_u->xfer_buflen;
383                 cmd->addr = (__u64)(uintptr_t)iov;
384                 cmd->data_len = 1;
385         } else {
386                 cmd->addr = (__u64)(uintptr_t)io_u->xfer_buf;
387                 cmd->data_len = io_u->xfer_buflen;
388         }
389         if (data->lba_shift && data->ms) {
390                 cmd->metadata = (__u64)(uintptr_t)io_u->mmap_data;
391                 cmd->metadata_len = (nlb + 1) * data->ms;
392         }
393         cmd->nsid = data->nsid;
394         return 0;
395 }
396
397 void fio_nvme_pi_fill(struct nvme_uring_cmd *cmd, struct io_u *io_u,
398                       struct nvme_cmd_ext_io_opts *opts)
399 {
400         struct nvme_data *data = FILE_ENG_DATA(io_u->file);
401         __u64 slba;
402
403         slba = get_slba(data, io_u);
404         cmd->cdw12 |= opts->io_flags;
405
406         if (data->pi_type && !(opts->io_flags & NVME_IO_PRINFO_PRACT)) {
407                 if (data->guard_type == NVME_NVM_NS_16B_GUARD)
408                         fio_nvme_generate_pi_16b_guard(data, io_u, opts);
409                 else if (data->guard_type == NVME_NVM_NS_64B_GUARD)
410                         fio_nvme_generate_pi_64b_guard(data, io_u, opts);
411         }
412
413         switch (data->pi_type) {
414         case NVME_NS_DPS_PI_TYPE1:
415         case NVME_NS_DPS_PI_TYPE2:
416                 switch (data->guard_type) {
417                 case NVME_NVM_NS_16B_GUARD:
418                         cmd->cdw14 = (__u32)slba;
419                         break;
420                 case NVME_NVM_NS_64B_GUARD:
421                         cmd->cdw14 = (__u32)slba;
422                         cmd->cdw3 = ((slba >> 32) & 0xffff);
423                         break;
424                 default:
425                         break;
426                 }
427                 cmd->cdw15 = (opts->apptag_mask << 16 | opts->apptag);
428                 break;
429         case NVME_NS_DPS_PI_TYPE3:
430                 cmd->cdw15 = (opts->apptag_mask << 16 | opts->apptag);
431                 break;
432         case NVME_NS_DPS_PI_NONE:
433                 break;
434         }
435 }
436
437 int fio_nvme_pi_verify(struct nvme_data *data, struct io_u *io_u)
438 {
439         int ret = 0;
440
441         switch (data->guard_type) {
442         case NVME_NVM_NS_16B_GUARD:
443                 ret = fio_nvme_verify_pi_16b_guard(data, io_u);
444                 break;
445         case NVME_NVM_NS_64B_GUARD:
446                 ret = fio_nvme_verify_pi_64b_guard(data, io_u);
447                 break;
448         default:
449                 break;
450         }
451
452         return ret;
453 }
454
455 static int nvme_identify(int fd, __u32 nsid, enum nvme_identify_cns cns,
456                          enum nvme_csi csi, void *data)
457 {
458         struct nvme_passthru_cmd cmd = {
459                 .opcode         = nvme_admin_identify,
460                 .nsid           = nsid,
461                 .addr           = (__u64)(uintptr_t)data,
462                 .data_len       = NVME_IDENTIFY_DATA_SIZE,
463                 .cdw10          = cns,
464                 .cdw11          = csi << NVME_IDENTIFY_CSI_SHIFT,
465                 .timeout_ms     = NVME_DEFAULT_IOCTL_TIMEOUT,
466         };
467
468         return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
469 }
470
471 int fio_nvme_get_info(struct fio_file *f, __u64 *nlba, __u32 pi_act,
472                       struct nvme_data *data)
473 {
474         struct nvme_id_ns ns;
475         struct nvme_id_ctrl ctrl;
476         struct nvme_nvm_id_ns nvm_ns;
477         int namespace_id;
478         int fd, err;
479         __u32 format_idx, elbaf;
480
481         if (f->filetype != FIO_TYPE_CHAR) {
482                 log_err("ioengine io_uring_cmd only works with nvme ns "
483                         "generic char devices (/dev/ngXnY)\n");
484                 return 1;
485         }
486
487         fd = open(f->file_name, O_RDONLY);
488         if (fd < 0)
489                 return -errno;
490
491         namespace_id = ioctl(fd, NVME_IOCTL_ID);
492         if (namespace_id < 0) {
493                 err = -errno;
494                 log_err("%s: failed to fetch namespace-id\n", f->file_name);
495                 goto out;
496         }
497
498         err = nvme_identify(fd, 0, NVME_IDENTIFY_CNS_CTRL, NVME_CSI_NVM, &ctrl);
499         if (err) {
500                 log_err("%s: failed to fetch identify ctrl\n", f->file_name);
501                 goto out;
502         }
503
504         /*
505          * Identify namespace to get namespace-id, namespace size in LBA's
506          * and LBA data size.
507          */
508         err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_NS,
509                                 NVME_CSI_NVM, &ns);
510         if (err) {
511                 log_err("%s: failed to fetch identify namespace\n",
512                         f->file_name);
513                 goto out;
514         }
515
516         data->nsid = namespace_id;
517
518         /*
519          * 16 or 64 as maximum number of supported LBA formats.
520          * From flbas bit 0-3 indicates lsb and bit 5-6 indicates msb
521          * of the format index used to format the namespace.
522          */
523         if (ns.nlbaf < 16)
524                 format_idx = ns.flbas & 0xf;
525         else
526                 format_idx = (ns.flbas & 0xf) + (((ns.flbas >> 5) & 0x3) << 4);
527
528         data->lba_size = 1 << ns.lbaf[format_idx].ds;
529         data->ms = le16_to_cpu(ns.lbaf[format_idx].ms);
530
531         /* Check for end to end data protection support */
532         if (data->ms && (ns.dps & NVME_NS_DPS_PI_MASK))
533                 data->pi_type = (ns.dps & NVME_NS_DPS_PI_MASK);
534
535         if (!data->pi_type)
536                 goto check_elba;
537
538         if (ctrl.ctratt & NVME_CTRL_CTRATT_ELBAS) {
539                 err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_CSI_NS,
540                                         NVME_CSI_NVM, &nvm_ns);
541                 if (err) {
542                         log_err("%s: failed to fetch identify nvm namespace\n",
543                                 f->file_name);
544                         goto out;
545                 }
546
547                 elbaf = le32_to_cpu(nvm_ns.elbaf[format_idx]);
548
549                 /* Currently we don't support storage tags */
550                 if (elbaf & NVME_ID_NS_NVM_STS_MASK) {
551                         log_err("%s: Storage tag not supported\n",
552                                 f->file_name);
553                         err = -ENOTSUP;
554                         goto out;
555                 }
556
557                 data->guard_type = (elbaf >> NVME_ID_NS_NVM_GUARD_SHIFT) &
558                                 NVME_ID_NS_NVM_GUARD_MASK;
559
560                 /* No 32 bit guard, as storage tag is mandatory for it */
561                 switch (data->guard_type) {
562                 case NVME_NVM_NS_16B_GUARD:
563                         data->pi_size = sizeof(struct nvme_16b_guard_pif);
564                         break;
565                 case NVME_NVM_NS_64B_GUARD:
566                         data->pi_size = sizeof(struct nvme_64b_guard_pif);
567                         break;
568                 default:
569                         break;
570                 }
571         } else {
572                 data->guard_type = NVME_NVM_NS_16B_GUARD;
573                 data->pi_size = sizeof(struct nvme_16b_guard_pif);
574         }
575
576         /*
577          * when PRACT bit is set to 1, and metadata size is equal to protection
578          * information size, controller inserts and removes PI for write and
579          * read commands respectively.
580          */
581         if (pi_act && data->ms == data->pi_size)
582                 data->ms = 0;
583
584         data->pi_loc = (ns.dps & NVME_NS_DPS_PI_FIRST);
585
586 check_elba:
587         /*
588          * Bit 4 for flbas indicates if metadata is transferred at the end of
589          * logical block creating an extended LBA.
590          */
591         if (data->ms && ((ns.flbas >> 4) & 0x1))
592                 data->lba_ext = data->lba_size + data->ms;
593         else
594                 data->lba_shift = ilog2(data->lba_size);
595
596         *nlba = ns.nsze;
597
598 out:
599         close(fd);
600         return err;
601 }
602
603 int fio_nvme_get_zoned_model(struct thread_data *td, struct fio_file *f,
604                              enum zbd_zoned_model *model)
605 {
606         struct nvme_data *data = FILE_ENG_DATA(f);
607         struct nvme_id_ns ns;
608         struct nvme_passthru_cmd cmd;
609         int fd, ret = 0;
610
611         if (f->filetype != FIO_TYPE_CHAR)
612                 return -EINVAL;
613
614         /* File is not yet opened */
615         fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
616         if (fd < 0)
617                 return -errno;
618
619         /* Using nvme_id_ns for data as sizes are same */
620         ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_CTRL,
621                                 NVME_CSI_ZNS, &ns);
622         if (ret) {
623                 *model = ZBD_NONE;
624                 goto out;
625         }
626
627         memset(&cmd, 0, sizeof(struct nvme_passthru_cmd));
628
629         /* Using nvme_id_ns for data as sizes are same */
630         ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
631                                 NVME_CSI_ZNS, &ns);
632         if (ret) {
633                 *model = ZBD_NONE;
634                 goto out;
635         }
636
637         *model = ZBD_HOST_MANAGED;
638 out:
639         close(fd);
640         return 0;
641 }
642
643 static int nvme_report_zones(int fd, __u32 nsid, __u64 slba, __u32 zras_feat,
644                              __u32 data_len, void *data)
645 {
646         struct nvme_passthru_cmd cmd = {
647                 .opcode         = nvme_zns_cmd_mgmt_recv,
648                 .nsid           = nsid,
649                 .addr           = (__u64)(uintptr_t)data,
650                 .data_len       = data_len,
651                 .cdw10          = slba & 0xffffffff,
652                 .cdw11          = slba >> 32,
653                 .cdw12          = (data_len >> 2) - 1,
654                 .cdw13          = NVME_ZNS_ZRA_REPORT_ZONES | zras_feat,
655                 .timeout_ms     = NVME_DEFAULT_IOCTL_TIMEOUT,
656         };
657
658         return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
659 }
660
661 int fio_nvme_report_zones(struct thread_data *td, struct fio_file *f,
662                           uint64_t offset, struct zbd_zone *zbdz,
663                           unsigned int nr_zones)
664 {
665         struct nvme_data *data = FILE_ENG_DATA(f);
666         struct nvme_zone_report *zr;
667         struct nvme_zns_id_ns zns_ns;
668         struct nvme_id_ns ns;
669         unsigned int i = 0, j, zones_fetched = 0;
670         unsigned int max_zones, zones_chunks = 1024;
671         int fd, ret = 0;
672         __u32 zr_len;
673         __u64 zlen;
674
675         /* File is not yet opened */
676         fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
677         if (fd < 0)
678                 return -errno;
679
680         zones_fetched = 0;
681         zr_len = sizeof(*zr) + (zones_chunks * sizeof(struct nvme_zns_desc));
682         zr = calloc(1, zr_len);
683         if (!zr) {
684                 close(fd);
685                 return -ENOMEM;
686         }
687
688         ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_NS,
689                                 NVME_CSI_NVM, &ns);
690         if (ret) {
691                 log_err("%s: nvme_identify_ns failed, err=%d\n", f->file_name,
692                         ret);
693                 goto out;
694         }
695
696         ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
697                                 NVME_CSI_ZNS, &zns_ns);
698         if (ret) {
699                 log_err("%s: nvme_zns_identify_ns failed, err=%d\n",
700                         f->file_name, ret);
701                 goto out;
702         }
703         zlen = zns_ns.lbafe[ns.flbas & 0x0f].zsze << data->lba_shift;
704
705         max_zones = (f->real_file_size - offset) / zlen;
706         if (max_zones < nr_zones)
707                 nr_zones = max_zones;
708
709         if (nr_zones < zones_chunks)
710                 zones_chunks = nr_zones;
711
712         while (zones_fetched < nr_zones) {
713                 if (zones_fetched + zones_chunks >= nr_zones) {
714                         zones_chunks = nr_zones - zones_fetched;
715                         zr_len = sizeof(*zr) + (zones_chunks * sizeof(struct nvme_zns_desc));
716                 }
717                 ret = nvme_report_zones(fd, data->nsid, offset >> data->lba_shift,
718                                         NVME_ZNS_ZRAS_FEAT_ERZ, zr_len, (void *)zr);
719                 if (ret) {
720                         log_err("%s: nvme_zns_report_zones failed, err=%d\n",
721                                 f->file_name, ret);
722                         goto out;
723                 }
724
725                 /* Transform the zone-report */
726                 for (j = 0; j < zr->nr_zones; j++, i++) {
727                         struct nvme_zns_desc *desc = (struct nvme_zns_desc *)&(zr->entries[j]);
728
729                         zbdz[i].start = desc->zslba << data->lba_shift;
730                         zbdz[i].len = zlen;
731                         zbdz[i].wp = desc->wp << data->lba_shift;
732                         zbdz[i].capacity = desc->zcap << data->lba_shift;
733
734                         /* Zone Type is stored in first 4 bits. */
735                         switch (desc->zt & 0x0f) {
736                         case NVME_ZONE_TYPE_SEQWRITE_REQ:
737                                 zbdz[i].type = ZBD_ZONE_TYPE_SWR;
738                                 break;
739                         default:
740                                 log_err("%s: invalid type for zone at offset %llu.\n",
741                                         f->file_name, (unsigned long long) desc->zslba);
742                                 ret = -EIO;
743                                 goto out;
744                         }
745
746                         /* Zone State is stored in last 4 bits. */
747                         switch (desc->zs >> 4) {
748                         case NVME_ZNS_ZS_EMPTY:
749                                 zbdz[i].cond = ZBD_ZONE_COND_EMPTY;
750                                 break;
751                         case NVME_ZNS_ZS_IMPL_OPEN:
752                                 zbdz[i].cond = ZBD_ZONE_COND_IMP_OPEN;
753                                 break;
754                         case NVME_ZNS_ZS_EXPL_OPEN:
755                                 zbdz[i].cond = ZBD_ZONE_COND_EXP_OPEN;
756                                 break;
757                         case NVME_ZNS_ZS_CLOSED:
758                                 zbdz[i].cond = ZBD_ZONE_COND_CLOSED;
759                                 break;
760                         case NVME_ZNS_ZS_FULL:
761                                 zbdz[i].cond = ZBD_ZONE_COND_FULL;
762                                 break;
763                         case NVME_ZNS_ZS_READ_ONLY:
764                         case NVME_ZNS_ZS_OFFLINE:
765                         default:
766                                 /* Treat all these conditions as offline (don't use!) */
767                                 zbdz[i].cond = ZBD_ZONE_COND_OFFLINE;
768                                 zbdz[i].wp = zbdz[i].start;
769                         }
770                 }
771                 zones_fetched += zr->nr_zones;
772                 offset += zr->nr_zones * zlen;
773         }
774
775         ret = zones_fetched;
776 out:
777         free(zr);
778         close(fd);
779
780         return ret;
781 }
782
783 int fio_nvme_reset_wp(struct thread_data *td, struct fio_file *f,
784                       uint64_t offset, uint64_t length)
785 {
786         struct nvme_data *data = FILE_ENG_DATA(f);
787         unsigned int nr_zones;
788         unsigned long long zslba;
789         int i, fd, ret = 0;
790
791         /* If the file is not yet opened, open it for this function. */
792         fd = f->fd;
793         if (fd < 0) {
794                 fd = open(f->file_name, O_RDWR | O_LARGEFILE);
795                 if (fd < 0)
796                         return -errno;
797         }
798
799         zslba = offset >> data->lba_shift;
800         nr_zones = (length + td->o.zone_size - 1) / td->o.zone_size;
801
802         for (i = 0; i < nr_zones; i++, zslba += (td->o.zone_size >> data->lba_shift)) {
803                 struct nvme_passthru_cmd cmd = {
804                         .opcode         = nvme_zns_cmd_mgmt_send,
805                         .nsid           = data->nsid,
806                         .cdw10          = zslba & 0xffffffff,
807                         .cdw11          = zslba >> 32,
808                         .cdw13          = NVME_ZNS_ZSA_RESET,
809                         .addr           = (__u64)(uintptr_t)NULL,
810                         .data_len       = 0,
811                         .timeout_ms     = NVME_DEFAULT_IOCTL_TIMEOUT,
812                 };
813
814                 ret = ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
815         }
816
817         if (f->fd < 0)
818                 close(fd);
819         return -ret;
820 }
821
822 int fio_nvme_get_max_open_zones(struct thread_data *td, struct fio_file *f,
823                                 unsigned int *max_open_zones)
824 {
825         struct nvme_data *data = FILE_ENG_DATA(f);
826         struct nvme_zns_id_ns zns_ns;
827         int fd, ret = 0;
828
829         fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
830         if (fd < 0)
831                 return -errno;
832
833         ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
834                                 NVME_CSI_ZNS, &zns_ns);
835         if (ret) {
836                 log_err("%s: nvme_zns_identify_ns failed, err=%d\n",
837                         f->file_name, ret);
838                 goto out;
839         }
840
841         *max_open_zones = zns_ns.mor + 1;
842 out:
843         close(fd);
844         return ret;
845 }
846
847 static inline int nvme_fdp_reclaim_unit_handle_status(int fd, __u32 nsid,
848                                                       __u32 data_len, void *data)
849 {
850         struct nvme_passthru_cmd cmd = {
851                 .opcode         = nvme_cmd_io_mgmt_recv,
852                 .nsid           = nsid,
853                 .addr           = (__u64)(uintptr_t)data,
854                 .data_len       = data_len,
855                 .cdw10          = 1,
856                 .cdw11          = (data_len >> 2) - 1,
857         };
858
859         return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
860 }
861
862 int fio_nvme_iomgmt_ruhs(struct thread_data *td, struct fio_file *f,
863                          struct nvme_fdp_ruh_status *ruhs, __u32 bytes)
864 {
865         struct nvme_data *data = FILE_ENG_DATA(f);
866         int fd, ret;
867
868         fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
869         if (fd < 0)
870                 return -errno;
871
872         ret = nvme_fdp_reclaim_unit_handle_status(fd, data->nsid, bytes, ruhs);
873         if (ret) {
874                 log_err("%s: nvme_fdp_reclaim_unit_handle_status failed, err=%d\n",
875                         f->file_name, ret);
876                 errno = ENOTSUP;
877         } else
878                 errno = 0;
879
880         ret = -errno;
881         close(fd);
882         return ret;
883 }