// SPDX-License-Identifier: GPL-2.0
/*
 * nvme structure declarations and helper functions for the
 * io_uring_cmd engine.
 */

#include "nvme.h"
#include "../crc/crc-t10dif.h"
#include "../crc/crc64.h"

static void fio_nvme_generate_pi_16b_guard(struct nvme_data *data,
                                           struct io_u *io_u,
                                           struct nvme_cmd_ext_io_opts *opts)
{
        struct nvme_pi_data *pi_data = io_u->engine_data;
        struct nvme_16b_guard_pif *pi;
        unsigned char *buf = io_u->xfer_buf;
        unsigned char *md_buf = io_u->mmap_data;
        __u64 slba = get_slba(data, io_u->offset);
        __u32 nlb = get_nlb(data, io_u->xfer_buflen) + 1;
        __u32 lba_num = 0;
        __u16 guard = 0;

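        /*
         * interval is the byte offset of the PI field within each extended
         * LBA (or within each metadata element when metadata sits in a
         * separate buffer). Compute it for every direction, since the
         * verify path reuses pi_data->interval.
         */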
        if (data->pi_loc) {
                if (data->lba_ext)
                        pi_data->interval = data->lba_ext - data->ms;
                else
                        pi_data->interval = 0;
        } else {
                if (data->lba_ext)
                        pi_data->interval = data->lba_ext - sizeof(struct nvme_16b_guard_pif);
                else
                        pi_data->interval = data->ms - sizeof(struct nvme_16b_guard_pif);
        }

        if (io_u->ddir != DDIR_WRITE)
                return;

        while (lba_num < nlb) {
                if (data->lba_ext)
                        pi = (struct nvme_16b_guard_pif *)(buf + pi_data->interval);
                else
                        pi = (struct nvme_16b_guard_pif *)(md_buf + pi_data->interval);

                if (opts->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) {
                        if (data->lba_ext) {
                                guard = fio_crc_t10dif(0, buf, pi_data->interval);
                        } else {
                                guard = fio_crc_t10dif(0, buf, data->lba_size);
                                guard = fio_crc_t10dif(guard, md_buf, pi_data->interval);
                        }
                        pi->guard = cpu_to_be16(guard);
                }

                if (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP)
                        pi->apptag = cpu_to_be16(pi_data->apptag);

                if (opts->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
                        switch (data->pi_type) {
                        case NVME_NS_DPS_PI_TYPE1:
                        case NVME_NS_DPS_PI_TYPE2:
                                pi->srtag = cpu_to_be32((__u32)slba + lba_num);
                                break;
                        case NVME_NS_DPS_PI_TYPE3:
                                break;
                        }
                }
                if (data->lba_ext) {
                        buf += data->lba_ext;
                } else {
                        buf += data->lba_size;
                        md_buf += data->ms;
                }
                lba_num++;
        }
}

static int fio_nvme_verify_pi_16b_guard(struct nvme_data *data,
                                        struct io_u *io_u)
{
        struct nvme_pi_data *pi_data = io_u->engine_data;
        struct nvme_16b_guard_pif *pi;
        struct fio_file *f = io_u->file;
        unsigned char *buf = io_u->xfer_buf;
        unsigned char *md_buf = io_u->mmap_data;
        __u64 slba = get_slba(data, io_u->offset);
        __u32 nlb = get_nlb(data, io_u->xfer_buflen) + 1;
        __u32 lba_num = 0;
        __u16 unmask_app, unmask_app_exp, guard = 0;

        while (lba_num < nlb) {
                if (data->lba_ext)
                        pi = (struct nvme_16b_guard_pif *)(buf + pi_data->interval);
                else
                        pi = (struct nvme_16b_guard_pif *)(md_buf + pi_data->interval);

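                /*
                 * Honour the PI escape values: an application tag of all
                 * ones (and, for Type 3, a reference tag of all ones)
                 * disables protection checks for that block.
                 */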
                if (data->pi_type == NVME_NS_DPS_PI_TYPE3) {
                        if (pi->apptag == NVME_PI_APP_DISABLE &&
                            pi->srtag == NVME_PI_REF_DISABLE)
                                goto next;
                } else if (data->pi_type == NVME_NS_DPS_PI_TYPE1 ||
                           data->pi_type == NVME_NS_DPS_PI_TYPE2) {
                        if (pi->apptag == NVME_PI_APP_DISABLE)
                                goto next;
                }

                if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) {
                        if (data->lba_ext) {
                                guard = fio_crc_t10dif(0, buf, pi_data->interval);
                        } else {
                                guard = fio_crc_t10dif(0, buf, data->lba_size);
                                guard = fio_crc_t10dif(guard, md_buf, pi_data->interval);
                        }
                        if (be16_to_cpu(pi->guard) != guard) {
                                log_err("%s: Guard compare error: LBA: %llu Expected=%x, Actual=%x\n",
                                        f->file_name, (unsigned long long)slba,
                                        guard, be16_to_cpu(pi->guard));
                                return -EIO;
                        }
                }

                if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_APP) {
                        unmask_app = be16_to_cpu(pi->apptag) & pi_data->apptag_mask;
                        unmask_app_exp = pi_data->apptag & pi_data->apptag_mask;
                        if (unmask_app != unmask_app_exp) {
                                log_err("%s: APPTAG compare error: LBA: %llu Expected=%x, Actual=%x\n",
                                        f->file_name, (unsigned long long)slba,
                                        unmask_app_exp, unmask_app);
                                return -EIO;
                        }
                }

                if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
                        switch (data->pi_type) {
                        case NVME_NS_DPS_PI_TYPE1:
                        case NVME_NS_DPS_PI_TYPE2:
                                if (be32_to_cpu(pi->srtag) !=
                                    ((__u32)slba + lba_num)) {
                                        log_err("%s: REFTAG compare error: LBA: %llu Expected=%x, Actual=%x\n",
                                                f->file_name, (unsigned long long)slba,
                                                (__u32)slba + lba_num,
                                                be32_to_cpu(pi->srtag));
                                        return -EIO;
                                }
                                break;
                        case NVME_NS_DPS_PI_TYPE3:
                                break;
                        }
                }
next:
                if (data->lba_ext) {
                        buf += data->lba_ext;
                } else {
                        buf += data->lba_size;
                        md_buf += data->ms;
                }
                lba_num++;
        }

        return 0;
}

static void fio_nvme_generate_pi_64b_guard(struct nvme_data *data,
                                           struct io_u *io_u,
                                           struct nvme_cmd_ext_io_opts *opts)
{
        struct nvme_pi_data *pi_data = io_u->engine_data;
        struct nvme_64b_guard_pif *pi;
        unsigned char *buf = io_u->xfer_buf;
        unsigned char *md_buf = io_u->mmap_data;
        uint64_t guard = 0;
        __u64 slba = get_slba(data, io_u->offset);
        __u32 nlb = get_nlb(data, io_u->xfer_buflen) + 1;
        __u32 lba_num = 0;

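        /* PI offset within each block, computed as in the 16-bit guard case */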
        if (data->pi_loc) {
                if (data->lba_ext)
                        pi_data->interval = data->lba_ext - data->ms;
                else
                        pi_data->interval = 0;
        } else {
                if (data->lba_ext)
                        pi_data->interval = data->lba_ext - sizeof(struct nvme_64b_guard_pif);
                else
                        pi_data->interval = data->ms - sizeof(struct nvme_64b_guard_pif);
        }

        if (io_u->ddir != DDIR_WRITE)
                return;

        while (lba_num < nlb) {
                if (data->lba_ext)
                        pi = (struct nvme_64b_guard_pif *)(buf + pi_data->interval);
                else
                        pi = (struct nvme_64b_guard_pif *)(md_buf + pi_data->interval);

                if (opts->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) {
                        if (data->lba_ext) {
                                guard = fio_crc64_nvme(0, buf, pi_data->interval);
                        } else {
                                guard = fio_crc64_nvme(0, buf, data->lba_size);
                                guard = fio_crc64_nvme(guard, md_buf, pi_data->interval);
                        }
                        pi->guard = cpu_to_be64(guard);
                }

                if (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP)
                        pi->apptag = cpu_to_be16(pi_data->apptag);

                if (opts->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
                        switch (data->pi_type) {
                        case NVME_NS_DPS_PI_TYPE1:
                        case NVME_NS_DPS_PI_TYPE2:
                                put_unaligned_be48(slba + lba_num, pi->srtag);
                                break;
                        case NVME_NS_DPS_PI_TYPE3:
                                break;
                        }
                }
                if (data->lba_ext) {
                        buf += data->lba_ext;
                } else {
                        buf += data->lba_size;
                        md_buf += data->ms;
                }
                lba_num++;
        }
}

static int fio_nvme_verify_pi_64b_guard(struct nvme_data *data,
                                        struct io_u *io_u)
{
        struct nvme_pi_data *pi_data = io_u->engine_data;
        struct nvme_64b_guard_pif *pi;
        struct fio_file *f = io_u->file;
        unsigned char *buf = io_u->xfer_buf;
        unsigned char *md_buf = io_u->mmap_data;
        __u64 slba = get_slba(data, io_u->offset);
        __u64 ref, ref_exp, guard = 0;
        __u32 nlb = get_nlb(data, io_u->xfer_buflen) + 1;
        __u32 lba_num = 0;
        __u16 unmask_app, unmask_app_exp;

        while (lba_num < nlb) {
                if (data->lba_ext)
                        pi = (struct nvme_64b_guard_pif *)(buf + pi_data->interval);
                else
                        pi = (struct nvme_64b_guard_pif *)(md_buf + pi_data->interval);

                if (data->pi_type == NVME_NS_DPS_PI_TYPE3) {
                        if (pi->apptag == NVME_PI_APP_DISABLE &&
                            fio_nvme_pi_ref_escape(pi->srtag))
                                goto next;
                } else if (data->pi_type == NVME_NS_DPS_PI_TYPE1 ||
                           data->pi_type == NVME_NS_DPS_PI_TYPE2) {
                        if (pi->apptag == NVME_PI_APP_DISABLE)
                                goto next;
                }

                if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) {
                        if (data->lba_ext) {
                                guard = fio_crc64_nvme(0, buf, pi_data->interval);
                        } else {
                                guard = fio_crc64_nvme(0, buf, data->lba_size);
                                guard = fio_crc64_nvme(guard, md_buf, pi_data->interval);
                        }
                        if (be64_to_cpu((uint64_t)pi->guard) != guard) {
                                log_err("%s: Guard compare error: LBA: %llu Expected=%llx, Actual=%llx\n",
                                        f->file_name, (unsigned long long)slba,
                                        guard, be64_to_cpu((uint64_t)pi->guard));
                                return -EIO;
                        }
                }

                if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_APP) {
                        unmask_app = be16_to_cpu(pi->apptag) & pi_data->apptag_mask;
                        unmask_app_exp = pi_data->apptag & pi_data->apptag_mask;
                        if (unmask_app != unmask_app_exp) {
                                log_err("%s: APPTAG compare error: LBA: %llu Expected=%x, Actual=%x\n",
                                        f->file_name, (unsigned long long)slba,
                                        unmask_app_exp, unmask_app);
                                return -EIO;
                        }
                }

                if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
                        switch (data->pi_type) {
                        case NVME_NS_DPS_PI_TYPE1:
                        case NVME_NS_DPS_PI_TYPE2:
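                                /*
                                 * The 64-bit guard format carries a 48-bit
                                 * reference tag, so compare against the low
                                 * 48 bits of the expected value.
                                 */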
                                ref = get_unaligned_be48(pi->srtag);
                                ref_exp = (slba + lba_num) & ((1ULL << 48) - 1);
                                if (ref != ref_exp) {
                                        log_err("%s: REFTAG compare error: LBA: %llu Expected=%llx, Actual=%llx\n",
                                                f->file_name, (unsigned long long)slba,
                                                ref_exp, ref);
                                        return -EIO;
                                }
                                break;
                        case NVME_NS_DPS_PI_TYPE3:
                                break;
                        }
                }
next:
                if (data->lba_ext) {
                        buf += data->lba_ext;
                } else {
                        buf += data->lba_size;
                        md_buf += data->ms;
                }
                lba_num++;
        }

        return 0;
}

static void fio_nvme_uring_cmd_trim_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u,
                                         struct nvme_dsm *dsm)
{
        struct nvme_data *data = FILE_ENG_DATA(io_u->file);
        struct trim_range *range;
        uint8_t *buf_point;
        int i;

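        /*
         * Dataset Management (deallocate): cdw11 selects the deallocate
         * attribute and cdw10 holds the 0-based count of ranges.
         */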
        cmd->opcode = nvme_cmd_dsm;
        cmd->nsid = data->nsid;
        cmd->cdw11 = NVME_ATTRIBUTE_DEALLOCATE;
        cmd->addr = (__u64) (uintptr_t) (&dsm->range[0]);

        if (dsm->nr_ranges == 1) {
                dsm->range[0].slba = get_slba(data, io_u->offset);
                /* nlb is a 1-based value for deallocate */
                dsm->range[0].nlb = get_nlb(data, io_u->xfer_buflen) + 1;
                cmd->cdw10 = 0;
                cmd->data_len = sizeof(struct nvme_dsm_range);
        } else {
                buf_point = io_u->xfer_buf;
                for (i = 0; i < io_u->number_trim; i++) {
                        range = (struct trim_range *)buf_point;
                        dsm->range[i].slba = get_slba(data, range->start);
                        /* nlb is a 1-based value for deallocate */
                        dsm->range[i].nlb = get_nlb(data, range->len) + 1;
                        buf_point += sizeof(struct trim_range);
                }
                cmd->cdw10 = io_u->number_trim - 1;
                cmd->data_len = io_u->number_trim * sizeof(struct nvme_dsm_range);
        }
}

int fio_nvme_uring_cmd_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u,
                            struct iovec *iov, struct nvme_dsm *dsm,
                            uint8_t read_opcode, uint8_t write_opcode,
                            unsigned int cdw12_flags)
{
        struct nvme_data *data = FILE_ENG_DATA(io_u->file);
        __u64 slba;
        __u32 nlb;

        memset(cmd, 0, sizeof(struct nvme_uring_cmd));

        switch (io_u->ddir) {
        case DDIR_READ:
                cmd->opcode = read_opcode;
                break;
        case DDIR_WRITE:
                cmd->opcode = write_opcode;
                break;
        case DDIR_TRIM:
                fio_nvme_uring_cmd_trim_prep(cmd, io_u, dsm);
                return 0;
        case DDIR_SYNC:
        case DDIR_DATASYNC:
                cmd->opcode = nvme_cmd_flush;
                cmd->nsid = data->nsid;
                return 0;
        default:
                return -ENOTSUP;
        }

        slba = get_slba(data, io_u->offset);
        nlb = get_nlb(data, io_u->xfer_buflen);

        /* cdw10 and cdw11 hold the starting LBA */
        cmd->cdw10 = slba & 0xffffffff;
        cmd->cdw11 = slba >> 32;
        /* cdw12 holds the 0-based number of LBAs for read/write */
        cmd->cdw12 = nlb | (io_u->dtype << 20) | cdw12_flags;
        cmd->cdw13 = io_u->dspec << 16;
        if (iov) {
                iov->iov_base = io_u->xfer_buf;
                iov->iov_len = io_u->xfer_buflen;
                cmd->addr = (__u64)(uintptr_t)iov;
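                /* Vectored passthrough: data_len carries the iovec count */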
                cmd->data_len = 1;
        } else {
                /* No data buffer for Write Zeroes */
                if (cmd->opcode != nvme_cmd_write_zeroes)
                        cmd->addr = (__u64)(uintptr_t)io_u->xfer_buf;
                else
                        cmd->addr = (__u64)(uintptr_t)NULL;
                cmd->data_len = io_u->xfer_buflen;
        }
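        /* Separate (non-extended LBA) metadata buffer and its length */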
        if (data->lba_shift && data->ms) {
                cmd->metadata = (__u64)(uintptr_t)io_u->mmap_data;
                cmd->metadata_len = (nlb + 1) * data->ms;
        }
        cmd->nsid = data->nsid;
        return 0;
}

void fio_nvme_generate_guard(struct io_u *io_u, struct nvme_cmd_ext_io_opts *opts)
{
        struct nvme_data *data = FILE_ENG_DATA(io_u->file);

        if (data->pi_type && !(opts->io_flags & NVME_IO_PRINFO_PRACT)) {
                if (data->guard_type == NVME_NVM_NS_16B_GUARD)
                        fio_nvme_generate_pi_16b_guard(data, io_u, opts);
                else if (data->guard_type == NVME_NVM_NS_64B_GUARD)
                        fio_nvme_generate_pi_64b_guard(data, io_u, opts);
        }
}

void fio_nvme_pi_fill(struct nvme_uring_cmd *cmd, struct io_u *io_u,
                      struct nvme_cmd_ext_io_opts *opts)
{
        struct nvme_data *data = FILE_ENG_DATA(io_u->file);
        __u64 slba;

        slba = get_slba(data, io_u->offset);
        cmd->cdw12 |= opts->io_flags;

        fio_nvme_generate_guard(io_u, opts);

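        /*
         * For Type 1/2 protection, cdw14 carries the expected initial
         * reference tag; with the 64-bit guard format the 48-bit reference
         * tag extends into the low 16 bits of cdw3.
         */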
        switch (data->pi_type) {
        case NVME_NS_DPS_PI_TYPE1:
        case NVME_NS_DPS_PI_TYPE2:
                switch (data->guard_type) {
                case NVME_NVM_NS_16B_GUARD:
                        if (opts->io_flags & NVME_IO_PRINFO_PRCHK_REF)
                                cmd->cdw14 = (__u32)slba;
                        break;
                case NVME_NVM_NS_64B_GUARD:
                        if (opts->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
                                cmd->cdw14 = (__u32)slba;
                                cmd->cdw3 = ((slba >> 32) & 0xffff);
                        }
                        break;
                default:
                        break;
                }
                if (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP)
                        cmd->cdw15 = (opts->apptag_mask << 16 | opts->apptag);
                break;
        case NVME_NS_DPS_PI_TYPE3:
                if (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP)
                        cmd->cdw15 = (opts->apptag_mask << 16 | opts->apptag);
                break;
        case NVME_NS_DPS_PI_NONE:
                break;
        }
}

int fio_nvme_pi_verify(struct nvme_data *data, struct io_u *io_u)
{
        int ret = 0;

        switch (data->guard_type) {
        case NVME_NVM_NS_16B_GUARD:
                ret = fio_nvme_verify_pi_16b_guard(data, io_u);
                break;
        case NVME_NVM_NS_64B_GUARD:
                ret = fio_nvme_verify_pi_64b_guard(data, io_u);
                break;
        default:
                break;
        }

        return ret;
}

static int nvme_identify(int fd, __u32 nsid, enum nvme_identify_cns cns,
                         enum nvme_csi csi, void *data)
{
        struct nvme_passthru_cmd cmd = {
                .opcode         = nvme_admin_identify,
                .nsid           = nsid,
                .addr           = (__u64)(uintptr_t)data,
                .data_len       = NVME_IDENTIFY_DATA_SIZE,
                .cdw10          = cns,
                .cdw11          = csi << NVME_IDENTIFY_CSI_SHIFT,
                .timeout_ms     = NVME_DEFAULT_IOCTL_TIMEOUT,
        };

        return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
}

int fio_nvme_get_info(struct fio_file *f, __u64 *nlba, __u32 pi_act,
                      struct nvme_data *data)
{
        struct nvme_id_ns ns;
        struct nvme_id_ctrl ctrl;
        struct nvme_nvm_id_ns nvm_ns;
        int namespace_id;
        int fd, err;
        __u32 format_idx, elbaf;

        if (f->filetype != FIO_TYPE_CHAR) {
                log_err("ioengine io_uring_cmd only works with nvme ns "
                        "generic char devices (/dev/ngXnY)\n");
                return 1;
        }

        fd = open(f->file_name, O_RDONLY);
        if (fd < 0)
                return -errno;

        namespace_id = ioctl(fd, NVME_IOCTL_ID);
        if (namespace_id < 0) {
                err = -errno;
                log_err("%s: failed to fetch namespace-id\n", f->file_name);
                goto out;
        }

        err = nvme_identify(fd, 0, NVME_IDENTIFY_CNS_CTRL, NVME_CSI_NVM, &ctrl);
        if (err) {
                log_err("%s: failed to fetch identify ctrl\n", f->file_name);
                goto out;
        }

        /*
         * Identify the namespace to get the namespace size in LBAs and the
         * LBA data size.
         */
        err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_NS,
                                NVME_CSI_NVM, &ns);
        if (err) {
                log_err("%s: failed to fetch identify namespace\n",
                        f->file_name);
                goto out;
        }

        data->nsid = namespace_id;

        /*
         * A namespace supports a maximum of 16 or 64 LBA formats. In flbas,
         * bits 0-3 hold the lsb and bits 5-6 the msb of the format index
         * used to format the namespace.
         */
        if (ns.nlbaf < 16)
                format_idx = ns.flbas & 0xf;
        else
                format_idx = (ns.flbas & 0xf) + (((ns.flbas >> 5) & 0x3) << 4);

        data->lba_size = 1 << ns.lbaf[format_idx].ds;
        data->ms = le16_to_cpu(ns.lbaf[format_idx].ms);

        /* Check for end-to-end data protection support */
        if (data->ms && (ns.dps & NVME_NS_DPS_PI_MASK))
                data->pi_type = (ns.dps & NVME_NS_DPS_PI_MASK);

        if (!data->pi_type)
                goto check_elba;

        if (ctrl.ctratt & NVME_CTRL_CTRATT_ELBAS) {
                err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_CSI_NS,
                                        NVME_CSI_NVM, &nvm_ns);
                if (err) {
                        log_err("%s: failed to fetch identify nvm namespace\n",
                                f->file_name);
                        goto out;
                }

                elbaf = le32_to_cpu(nvm_ns.elbaf[format_idx]);

                /* Currently we don't support storage tags */
                if (elbaf & NVME_ID_NS_NVM_STS_MASK) {
                        log_err("%s: Storage tag not supported\n",
                                f->file_name);
                        err = -ENOTSUP;
                        goto out;
                }

                data->guard_type = (elbaf >> NVME_ID_NS_NVM_GUARD_SHIFT) &
                                NVME_ID_NS_NVM_GUARD_MASK;

                /* No 32-bit guard support, as the storage tag is mandatory for it */
                switch (data->guard_type) {
                case NVME_NVM_NS_16B_GUARD:
                        data->pi_size = sizeof(struct nvme_16b_guard_pif);
                        break;
                case NVME_NVM_NS_64B_GUARD:
                        data->pi_size = sizeof(struct nvme_64b_guard_pif);
                        break;
                default:
                        break;
                }
        } else {
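                /*
                 * Without extended LBA format support (ELBAS), only the
                 * original 16-bit guard protection format applies.
                 */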
                data->guard_type = NVME_NVM_NS_16B_GUARD;
                data->pi_size = sizeof(struct nvme_16b_guard_pif);
        }

        /*
         * When the PRACT bit is set to 1 and the metadata size equals the
         * protection information size, the controller inserts PI on writes
         * and strips it on reads.
         */
        if (pi_act && data->ms == data->pi_size)
                data->ms = 0;

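        /* Non-zero when PI is placed in the first bytes of the metadata */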
        data->pi_loc = (ns.dps & NVME_NS_DPS_PI_FIRST);

check_elba:
        /*
         * Bit 4 of flbas indicates whether metadata is transferred at the
         * end of the logical block, creating an extended LBA.
         */
        if (data->ms && ((ns.flbas >> 4) & 0x1))
                data->lba_ext = data->lba_size + data->ms;
        else
                data->lba_shift = ilog2(data->lba_size);

        *nlba = ns.nsze;

out:
        close(fd);
        return err;
}

int fio_nvme_get_zoned_model(struct thread_data *td, struct fio_file *f,
                             enum zbd_zoned_model *model)
{
        struct nvme_data *data = FILE_ENG_DATA(f);
        struct nvme_id_ns ns;
        struct nvme_passthru_cmd cmd;
        int fd, ret = 0;

        if (f->filetype != FIO_TYPE_CHAR)
                return -EINVAL;

        /* File is not yet opened */
        fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
        if (fd < 0)
                return -errno;

        /* Use nvme_id_ns for the data buffer as the structure sizes match */
        ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_CTRL,
                                NVME_CSI_ZNS, &ns);
        if (ret) {
                *model = ZBD_NONE;
                goto out;
        }

        memset(&cmd, 0, sizeof(struct nvme_passthru_cmd));

        /* Use nvme_id_ns for the data buffer as the structure sizes match */
        ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
                                NVME_CSI_ZNS, &ns);
        if (ret) {
                *model = ZBD_NONE;
                goto out;
        }

        *model = ZBD_HOST_MANAGED;
out:
        close(fd);
        return 0;
}

static int nvme_report_zones(int fd, __u32 nsid, __u64 slba, __u32 zras_feat,
                             __u32 data_len, void *data)
{
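        /*
         * Zone Management Receive: cdw12 is the 0-based dword count of the
         * receive buffer and cdw13 selects the Report Zones action plus the
         * requested reporting options.
         */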
        struct nvme_passthru_cmd cmd = {
                .opcode         = nvme_zns_cmd_mgmt_recv,
                .nsid           = nsid,
                .addr           = (__u64)(uintptr_t)data,
                .data_len       = data_len,
                .cdw10          = slba & 0xffffffff,
                .cdw11          = slba >> 32,
                .cdw12          = (data_len >> 2) - 1,
                .cdw13          = NVME_ZNS_ZRA_REPORT_ZONES | zras_feat,
                .timeout_ms     = NVME_DEFAULT_IOCTL_TIMEOUT,
        };

        return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
}

int fio_nvme_report_zones(struct thread_data *td, struct fio_file *f,
                          uint64_t offset, struct zbd_zone *zbdz,
                          unsigned int nr_zones)
{
        struct nvme_data *data = FILE_ENG_DATA(f);
        struct nvme_zone_report *zr;
        struct nvme_zns_id_ns zns_ns;
        struct nvme_id_ns ns;
        unsigned int i = 0, j, zones_fetched = 0;
        unsigned int max_zones, zones_chunks = 1024;
        int fd, ret = 0;
        __u32 zr_len;
        __u64 zlen;

        /* File is not yet opened */
        fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
        if (fd < 0)
                return -errno;

        zones_fetched = 0;
        zr_len = sizeof(*zr) + (zones_chunks * sizeof(struct nvme_zns_desc));
        zr = calloc(1, zr_len);
        if (!zr) {
                close(fd);
                return -ENOMEM;
        }

        ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_NS,
                                NVME_CSI_NVM, &ns);
        if (ret) {
                log_err("%s: nvme_identify_ns failed, err=%d\n", f->file_name,
                        ret);
                goto out;
        }

        ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
                                NVME_CSI_ZNS, &zns_ns);
        if (ret) {
                log_err("%s: nvme_zns_identify_ns failed, err=%d\n",
                        f->file_name, ret);
                goto out;
        }
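        /* Zone size in bytes; zsze is in logical blocks of the active LBA format */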
        zlen = zns_ns.lbafe[ns.flbas & 0x0f].zsze << data->lba_shift;

        max_zones = (f->real_file_size - offset) / zlen;
        if (max_zones < nr_zones)
                nr_zones = max_zones;

        if (nr_zones < zones_chunks)
                zones_chunks = nr_zones;

        while (zones_fetched < nr_zones) {
                if (zones_fetched + zones_chunks >= nr_zones) {
                        zones_chunks = nr_zones - zones_fetched;
                        zr_len = sizeof(*zr) + (zones_chunks * sizeof(struct nvme_zns_desc));
                }
                ret = nvme_report_zones(fd, data->nsid, offset >> data->lba_shift,
                                        NVME_ZNS_ZRAS_FEAT_ERZ, zr_len, (void *)zr);
                if (ret) {
                        log_err("%s: nvme_zns_report_zones failed, err=%d\n",
                                f->file_name, ret);
                        goto out;
                }

                /* Transform the zone report */
                for (j = 0; j < zr->nr_zones; j++, i++) {
                        struct nvme_zns_desc *desc = (struct nvme_zns_desc *)&(zr->entries[j]);

                        zbdz[i].start = desc->zslba << data->lba_shift;
                        zbdz[i].len = zlen;
                        zbdz[i].wp = desc->wp << data->lba_shift;
                        zbdz[i].capacity = desc->zcap << data->lba_shift;

                        /* Zone type is stored in the lower 4 bits */
                        switch (desc->zt & 0x0f) {
                        case NVME_ZONE_TYPE_SEQWRITE_REQ:
                                zbdz[i].type = ZBD_ZONE_TYPE_SWR;
                                break;
                        default:
                                log_err("%s: invalid type for zone at offset %llu.\n",
                                        f->file_name, (unsigned long long) desc->zslba);
                                ret = -EIO;
                                goto out;
                        }

                        /* Zone state is stored in the upper 4 bits */
                        switch (desc->zs >> 4) {
                        case NVME_ZNS_ZS_EMPTY:
                                zbdz[i].cond = ZBD_ZONE_COND_EMPTY;
                                break;
                        case NVME_ZNS_ZS_IMPL_OPEN:
                                zbdz[i].cond = ZBD_ZONE_COND_IMP_OPEN;
                                break;
                        case NVME_ZNS_ZS_EXPL_OPEN:
                                zbdz[i].cond = ZBD_ZONE_COND_EXP_OPEN;
                                break;
                        case NVME_ZNS_ZS_CLOSED:
                                zbdz[i].cond = ZBD_ZONE_COND_CLOSED;
                                break;
                        case NVME_ZNS_ZS_FULL:
                                zbdz[i].cond = ZBD_ZONE_COND_FULL;
                                break;
                        case NVME_ZNS_ZS_READ_ONLY:
                        case NVME_ZNS_ZS_OFFLINE:
                        default:
                                /* Treat all these conditions as offline (don't use!) */
                                zbdz[i].cond = ZBD_ZONE_COND_OFFLINE;
                                zbdz[i].wp = zbdz[i].start;
                        }
                }
                zones_fetched += zr->nr_zones;
                offset += zr->nr_zones * zlen;
        }

        ret = zones_fetched;
out:
        free(zr);
        close(fd);

        return ret;
}

int fio_nvme_reset_wp(struct thread_data *td, struct fio_file *f,
                      uint64_t offset, uint64_t length)
{
        struct nvme_data *data = FILE_ENG_DATA(f);
        unsigned int nr_zones;
        unsigned long long zslba;
        int i, fd, ret = 0;

        /* If the file is not yet opened, open it for this function. */
        fd = f->fd;
        if (fd < 0) {
                fd = open(f->file_name, O_RDWR | O_LARGEFILE);
                if (fd < 0)
                        return -errno;
        }

        zslba = offset >> data->lba_shift;
        nr_zones = (length + td->o.zone_size - 1) / td->o.zone_size;

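        /* Issue one Zone Management Send (Reset Zone) per zone in the range */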
        for (i = 0; i < nr_zones; i++, zslba += (td->o.zone_size >> data->lba_shift)) {
                struct nvme_passthru_cmd cmd = {
                        .opcode         = nvme_zns_cmd_mgmt_send,
                        .nsid           = data->nsid,
                        .cdw10          = zslba & 0xffffffff,
                        .cdw11          = zslba >> 32,
                        .cdw13          = NVME_ZNS_ZSA_RESET,
                        .addr           = (__u64)(uintptr_t)NULL,
                        .data_len       = 0,
                        .timeout_ms     = NVME_DEFAULT_IOCTL_TIMEOUT,
                };

                ret = ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
        }

        if (f->fd < 0)
                close(fd);
        return -ret;
}

int fio_nvme_get_max_open_zones(struct thread_data *td, struct fio_file *f,
                                unsigned int *max_open_zones)
{
        struct nvme_data *data = FILE_ENG_DATA(f);
        struct nvme_zns_id_ns zns_ns;
        int fd, ret = 0;

        fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
        if (fd < 0)
                return -errno;

        ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
                                NVME_CSI_ZNS, &zns_ns);
        if (ret) {
                log_err("%s: nvme_zns_identify_ns failed, err=%d\n",
                        f->file_name, ret);
                goto out;
        }

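        /* mor (maximum open resources) is a 0-based value */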
        *max_open_zones = zns_ns.mor + 1;
out:
        close(fd);
        return ret;
}

static inline int nvme_fdp_reclaim_unit_handle_status(int fd, __u32 nsid,
                                                      __u32 data_len, void *data)
{
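        /*
         * I/O Management Receive with management operation 1 (Reclaim Unit
         * Handle Status); cdw11 is the 0-based dword count of the buffer.
         */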
        struct nvme_passthru_cmd cmd = {
                .opcode         = nvme_cmd_io_mgmt_recv,
                .nsid           = nsid,
                .addr           = (__u64)(uintptr_t)data,
                .data_len       = data_len,
                .cdw10          = 1,
                .cdw11          = (data_len >> 2) - 1,
        };

        return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
}

int fio_nvme_iomgmt_ruhs(struct thread_data *td, struct fio_file *f,
                         struct nvme_fdp_ruh_status *ruhs, __u32 bytes)
{
        struct nvme_data *data = FILE_ENG_DATA(f);
        int fd, ret;

        fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
        if (fd < 0)
                return -errno;

        ret = nvme_fdp_reclaim_unit_handle_status(fd, data->nsid, bytes, ruhs);
        if (ret) {
                log_err("%s: nvme_fdp_reclaim_unit_handle_status failed, err=%d\n",
                        f->file_name, ret);
                errno = ENOTSUP;
        } else
                errno = 0;

        ret = -errno;
        close(fd);
        return ret;
}