engines/io_uring: add multi range dsm support
[fio.git] / engines / nvme.c
// SPDX-License-Identifier: GPL-2.0
/*
 * nvme structure declarations and helper functions for the
 * io_uring_cmd engine.
 */

#include "nvme.h"
#include "../crc/crc-t10dif.h"
#include "../crc/crc64.h"

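/*
 * Convert a byte offset into the starting LBA, accounting for extended LBA
 * (metadata interleaved) formats.
 */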
static inline __u64 get_slba(struct nvme_data *data, __u64 offset)
{
        if (data->lba_ext)
                return offset / data->lba_ext;

        return offset >> data->lba_shift;
}

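/*
 * Convert a transfer length in bytes into a 0-based number of logical
 * blocks, matching the NVMe NLB field convention.
 */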
static inline __u32 get_nlb(struct nvme_data *data, __u64 len)
{
        if (data->lba_ext)
                return len / data->lba_ext - 1;

        return (len >> data->lba_shift) - 1;
}

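/*
 * Compute the offset of the PI field within each block and, for writes,
 * fill in 16-bit guard protection information (guard CRC, application tag
 * and reference tag) for every logical block in the buffer.
 */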
static void fio_nvme_generate_pi_16b_guard(struct nvme_data *data,
                                           struct io_u *io_u,
                                           struct nvme_cmd_ext_io_opts *opts)
{
        struct nvme_pi_data *pi_data = io_u->engine_data;
        struct nvme_16b_guard_pif *pi;
        unsigned char *buf = io_u->xfer_buf;
        unsigned char *md_buf = io_u->mmap_data;
        __u64 slba = get_slba(data, io_u->offset);
        __u32 nlb = get_nlb(data, io_u->xfer_buflen) + 1;
        __u32 lba_num = 0;
        __u16 guard = 0;

        if (data->pi_loc) {
                if (data->lba_ext)
                        pi_data->interval = data->lba_ext - data->ms;
                else
                        pi_data->interval = 0;
        } else {
                if (data->lba_ext)
                        pi_data->interval = data->lba_ext - sizeof(struct nvme_16b_guard_pif);
                else
                        pi_data->interval = data->ms - sizeof(struct nvme_16b_guard_pif);
        }

        if (io_u->ddir != DDIR_WRITE)
                return;

        while (lba_num < nlb) {
                if (data->lba_ext)
                        pi = (struct nvme_16b_guard_pif *)(buf + pi_data->interval);
                else
                        pi = (struct nvme_16b_guard_pif *)(md_buf + pi_data->interval);

                if (opts->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) {
                        if (data->lba_ext) {
                                guard = fio_crc_t10dif(0, buf, pi_data->interval);
                        } else {
                                guard = fio_crc_t10dif(0, buf, data->lba_size);
                                guard = fio_crc_t10dif(guard, md_buf, pi_data->interval);
                        }
                        pi->guard = cpu_to_be16(guard);
                }

                if (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP)
                        pi->apptag = cpu_to_be16(pi_data->apptag);

                if (opts->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
                        switch (data->pi_type) {
                        case NVME_NS_DPS_PI_TYPE1:
                        case NVME_NS_DPS_PI_TYPE2:
                                pi->srtag = cpu_to_be32((__u32)slba + lba_num);
                                break;
                        case NVME_NS_DPS_PI_TYPE3:
                                break;
                        }
                }
                if (data->lba_ext) {
                        buf += data->lba_ext;
                } else {
                        buf += data->lba_size;
                        md_buf += data->ms;
                }
                lba_num++;
        }
}

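/*
 * Verify 16-bit guard protection information on a completed read: recompute
 * the guard CRC and compare the guard, application and reference tags
 * against the expected values. Blocks carrying escape tag values are
 * skipped.
 */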
static int fio_nvme_verify_pi_16b_guard(struct nvme_data *data,
                                        struct io_u *io_u)
{
        struct nvme_pi_data *pi_data = io_u->engine_data;
        struct nvme_16b_guard_pif *pi;
        struct fio_file *f = io_u->file;
        unsigned char *buf = io_u->xfer_buf;
        unsigned char *md_buf = io_u->mmap_data;
        __u64 slba = get_slba(data, io_u->offset);
        __u32 nlb = get_nlb(data, io_u->xfer_buflen) + 1;
        __u32 lba_num = 0;
        __u16 unmask_app, unmask_app_exp, guard = 0;

        while (lba_num < nlb) {
                if (data->lba_ext)
                        pi = (struct nvme_16b_guard_pif *)(buf + pi_data->interval);
                else
                        pi = (struct nvme_16b_guard_pif *)(md_buf + pi_data->interval);

                if (data->pi_type == NVME_NS_DPS_PI_TYPE3) {
                        if (pi->apptag == NVME_PI_APP_DISABLE &&
                            pi->srtag == NVME_PI_REF_DISABLE)
                                goto next;
                } else if (data->pi_type == NVME_NS_DPS_PI_TYPE1 ||
                           data->pi_type == NVME_NS_DPS_PI_TYPE2) {
                        if (pi->apptag == NVME_PI_APP_DISABLE)
                                goto next;
                }

                if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) {
                        if (data->lba_ext) {
                                guard = fio_crc_t10dif(0, buf, pi_data->interval);
                        } else {
                                guard = fio_crc_t10dif(0, buf, data->lba_size);
                                guard = fio_crc_t10dif(guard, md_buf, pi_data->interval);
                        }
                        if (be16_to_cpu(pi->guard) != guard) {
                                log_err("%s: Guard compare error: LBA: %llu Expected=%x, Actual=%x\n",
                                        f->file_name, (unsigned long long)slba,
                                        guard, be16_to_cpu(pi->guard));
                                return -EIO;
                        }
                }

                if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_APP) {
                        unmask_app = be16_to_cpu(pi->apptag) & pi_data->apptag_mask;
                        unmask_app_exp = pi_data->apptag & pi_data->apptag_mask;
                        if (unmask_app != unmask_app_exp) {
                                log_err("%s: APPTAG compare error: LBA: %llu Expected=%x, Actual=%x\n",
                                        f->file_name, (unsigned long long)slba,
                                        unmask_app_exp, unmask_app);
                                return -EIO;
                        }
                }

                if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
                        switch (data->pi_type) {
                        case NVME_NS_DPS_PI_TYPE1:
                        case NVME_NS_DPS_PI_TYPE2:
                                if (be32_to_cpu(pi->srtag) !=
                                    ((__u32)slba + lba_num)) {
                                        log_err("%s: REFTAG compare error: LBA: %llu Expected=%x, Actual=%x\n",
                                                f->file_name, (unsigned long long)slba,
                                                (__u32)slba + lba_num,
                                                be32_to_cpu(pi->srtag));
                                        return -EIO;
                                }
                                break;
                        case NVME_NS_DPS_PI_TYPE3:
                                break;
                        }
                }
next:
                if (data->lba_ext) {
                        buf += data->lba_ext;
                } else {
                        buf += data->lba_size;
                        md_buf += data->ms;
                }
                lba_num++;
        }

        return 0;
}

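/*
 * Compute the offset of the PI field within each block and, for writes,
 * fill in 64-bit guard protection information (CRC64 guard, application tag
 * and 48-bit reference tag) for every logical block in the buffer.
 */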
static void fio_nvme_generate_pi_64b_guard(struct nvme_data *data,
                                           struct io_u *io_u,
                                           struct nvme_cmd_ext_io_opts *opts)
{
        struct nvme_pi_data *pi_data = io_u->engine_data;
        struct nvme_64b_guard_pif *pi;
        unsigned char *buf = io_u->xfer_buf;
        unsigned char *md_buf = io_u->mmap_data;
        uint64_t guard = 0;
        __u64 slba = get_slba(data, io_u->offset);
        __u32 nlb = get_nlb(data, io_u->xfer_buflen) + 1;
        __u32 lba_num = 0;

        if (data->pi_loc) {
                if (data->lba_ext)
                        pi_data->interval = data->lba_ext - data->ms;
                else
                        pi_data->interval = 0;
        } else {
                if (data->lba_ext)
                        pi_data->interval = data->lba_ext - sizeof(struct nvme_64b_guard_pif);
                else
                        pi_data->interval = data->ms - sizeof(struct nvme_64b_guard_pif);
        }

        if (io_u->ddir != DDIR_WRITE)
                return;

        while (lba_num < nlb) {
                if (data->lba_ext)
                        pi = (struct nvme_64b_guard_pif *)(buf + pi_data->interval);
                else
                        pi = (struct nvme_64b_guard_pif *)(md_buf + pi_data->interval);

                if (opts->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) {
                        if (data->lba_ext) {
                                guard = fio_crc64_nvme(0, buf, pi_data->interval);
                        } else {
                                guard = fio_crc64_nvme(0, buf, data->lba_size);
                                guard = fio_crc64_nvme(guard, md_buf, pi_data->interval);
                        }
                        pi->guard = cpu_to_be64(guard);
                }

                if (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP)
                        pi->apptag = cpu_to_be16(pi_data->apptag);

                if (opts->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
                        switch (data->pi_type) {
                        case NVME_NS_DPS_PI_TYPE1:
                        case NVME_NS_DPS_PI_TYPE2:
                                put_unaligned_be48(slba + lba_num, pi->srtag);
                                break;
                        case NVME_NS_DPS_PI_TYPE3:
                                break;
                        }
                }
                if (data->lba_ext) {
                        buf += data->lba_ext;
                } else {
                        buf += data->lba_size;
                        md_buf += data->ms;
                }
                lba_num++;
        }
}

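/*
 * Verify 64-bit guard protection information on a completed read: recompute
 * the CRC64 guard and compare the guard, application and 48-bit reference
 * tags against the expected values. Blocks carrying escape tag values are
 * skipped.
 */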
static int fio_nvme_verify_pi_64b_guard(struct nvme_data *data,
                                        struct io_u *io_u)
{
        struct nvme_pi_data *pi_data = io_u->engine_data;
        struct nvme_64b_guard_pif *pi;
        struct fio_file *f = io_u->file;
        unsigned char *buf = io_u->xfer_buf;
        unsigned char *md_buf = io_u->mmap_data;
        __u64 slba = get_slba(data, io_u->offset);
        __u64 ref, ref_exp, guard = 0;
        __u32 nlb = get_nlb(data, io_u->xfer_buflen) + 1;
        __u32 lba_num = 0;
        __u16 unmask_app, unmask_app_exp;

        while (lba_num < nlb) {
                if (data->lba_ext)
                        pi = (struct nvme_64b_guard_pif *)(buf + pi_data->interval);
                else
                        pi = (struct nvme_64b_guard_pif *)(md_buf + pi_data->interval);

                if (data->pi_type == NVME_NS_DPS_PI_TYPE3) {
                        if (pi->apptag == NVME_PI_APP_DISABLE &&
                            fio_nvme_pi_ref_escape(pi->srtag))
                                goto next;
                } else if (data->pi_type == NVME_NS_DPS_PI_TYPE1 ||
                           data->pi_type == NVME_NS_DPS_PI_TYPE2) {
                        if (pi->apptag == NVME_PI_APP_DISABLE)
                                goto next;
                }

                if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) {
                        if (data->lba_ext) {
                                guard = fio_crc64_nvme(0, buf, pi_data->interval);
                        } else {
                                guard = fio_crc64_nvme(0, buf, data->lba_size);
                                guard = fio_crc64_nvme(guard, md_buf, pi_data->interval);
                        }
                        if (be64_to_cpu((uint64_t)pi->guard) != guard) {
                                log_err("%s: Guard compare error: LBA: %llu Expected=%llx, Actual=%llx\n",
                                        f->file_name, (unsigned long long)slba,
                                        guard, be64_to_cpu((uint64_t)pi->guard));
                                return -EIO;
                        }
                }

                if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_APP) {
                        unmask_app = be16_to_cpu(pi->apptag) & pi_data->apptag_mask;
                        unmask_app_exp = pi_data->apptag & pi_data->apptag_mask;
                        if (unmask_app != unmask_app_exp) {
                                log_err("%s: APPTAG compare error: LBA: %llu Expected=%x, Actual=%x\n",
                                        f->file_name, (unsigned long long)slba,
                                        unmask_app_exp, unmask_app);
                                return -EIO;
                        }
                }

                if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
                        switch (data->pi_type) {
                        case NVME_NS_DPS_PI_TYPE1:
                        case NVME_NS_DPS_PI_TYPE2:
                                ref = get_unaligned_be48(pi->srtag);
                                ref_exp = (slba + lba_num) & ((1ULL << 48) - 1);
                                if (ref != ref_exp) {
                                        log_err("%s: REFTAG compare error: LBA: %llu Expected=%llx, Actual=%llx\n",
                                                f->file_name, (unsigned long long)slba,
                                                ref_exp, ref);
                                        return -EIO;
                                }
                                break;
                        case NVME_NS_DPS_PI_TYPE3:
                                break;
                        }
                }
next:
                if (data->lba_ext) {
                        buf += data->lba_ext;
                } else {
                        buf += data->lba_size;
                        md_buf += data->ms;
                }
                lba_num++;
        }

        return 0;
}

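/*
 * Build a Dataset Management (deallocate) command covering either a single
 * range derived from the io_u, or the multiple trim ranges packed into the
 * io_u buffer.
 */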
void fio_nvme_uring_cmd_trim_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u,
                                  struct nvme_dsm *dsm)
{
        struct nvme_data *data = FILE_ENG_DATA(io_u->file);
        struct trim_range *range;
        uint8_t *buf_point;
        int i;

        cmd->opcode = nvme_cmd_dsm;
        cmd->nsid = data->nsid;
        cmd->cdw11 = NVME_ATTRIBUTE_DEALLOCATE;
        cmd->addr = (__u64) (uintptr_t) (&dsm->range[0]);

        if (dsm->nr_ranges == 1) {
                dsm->range[0].slba = get_slba(data, io_u->offset);
                /* nlb is a 1-based value for deallocate */
                dsm->range[0].nlb = get_nlb(data, io_u->xfer_buflen) + 1;
                cmd->cdw10 = 0;
                cmd->data_len = sizeof(struct nvme_dsm_range);
        } else {
                buf_point = io_u->xfer_buf;
                for (i = 0; i < io_u->number_trim; i++) {
                        range = (struct trim_range *)buf_point;
                        dsm->range[i].slba = get_slba(data, range->start);
                        /* nlb is a 1-based value for deallocate */
                        dsm->range[i].nlb = get_nlb(data, range->len) + 1;
                        buf_point += sizeof(struct trim_range);
                }
                cmd->cdw10 = io_u->number_trim - 1;
                cmd->data_len = io_u->number_trim * sizeof(struct nvme_dsm_range);
        }
}

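/*
 * Translate an io_u into an NVMe uring command: set the opcode, starting
 * LBA, number of blocks, directive fields, and data/metadata buffers.
 */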
int fio_nvme_uring_cmd_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u,
                            struct iovec *iov, struct nvme_dsm *dsm)
{
        struct nvme_data *data = FILE_ENG_DATA(io_u->file);
        __u64 slba;
        __u32 nlb;

        memset(cmd, 0, sizeof(struct nvme_uring_cmd));

        switch (io_u->ddir) {
        case DDIR_READ:
                cmd->opcode = nvme_cmd_read;
                break;
        case DDIR_WRITE:
                cmd->opcode = nvme_cmd_write;
                break;
        case DDIR_TRIM:
                fio_nvme_uring_cmd_trim_prep(cmd, io_u, dsm);
                return 0;
        default:
                return -ENOTSUP;
        }

        slba = get_slba(data, io_u->offset);
        nlb = get_nlb(data, io_u->xfer_buflen);

        /* cdw10 and cdw11 hold the starting LBA */
        cmd->cdw10 = slba & 0xffffffff;
        cmd->cdw11 = slba >> 32;
        /* cdw12 holds the 0-based number of LBAs for read/write */
        cmd->cdw12 = nlb | (io_u->dtype << 20);
        cmd->cdw13 = io_u->dspec << 16;
        if (iov) {
                iov->iov_base = io_u->xfer_buf;
                iov->iov_len = io_u->xfer_buflen;
                cmd->addr = (__u64)(uintptr_t)iov;
                cmd->data_len = 1;
        } else {
                cmd->addr = (__u64)(uintptr_t)io_u->xfer_buf;
                cmd->data_len = io_u->xfer_buflen;
        }
        if (data->lba_shift && data->ms) {
                cmd->metadata = (__u64)(uintptr_t)io_u->mmap_data;
                cmd->metadata_len = (nlb + 1) * data->ms;
        }
        cmd->nsid = data->nsid;
        return 0;
}

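/*
 * Apply the protection-information options to the command (PRINFO flags,
 * initial reference tag, application tag and mask) and, when the controller
 * does not insert PI itself (PRACT not set), generate PI in the buffer.
 */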
void fio_nvme_pi_fill(struct nvme_uring_cmd *cmd, struct io_u *io_u,
                      struct nvme_cmd_ext_io_opts *opts)
{
        struct nvme_data *data = FILE_ENG_DATA(io_u->file);
        __u64 slba;

        slba = get_slba(data, io_u->offset);
        cmd->cdw12 |= opts->io_flags;

        if (data->pi_type && !(opts->io_flags & NVME_IO_PRINFO_PRACT)) {
                if (data->guard_type == NVME_NVM_NS_16B_GUARD)
                        fio_nvme_generate_pi_16b_guard(data, io_u, opts);
                else if (data->guard_type == NVME_NVM_NS_64B_GUARD)
                        fio_nvme_generate_pi_64b_guard(data, io_u, opts);
        }

        switch (data->pi_type) {
        case NVME_NS_DPS_PI_TYPE1:
        case NVME_NS_DPS_PI_TYPE2:
                switch (data->guard_type) {
                case NVME_NVM_NS_16B_GUARD:
                        if (opts->io_flags & NVME_IO_PRINFO_PRCHK_REF)
                                cmd->cdw14 = (__u32)slba;
                        break;
                case NVME_NVM_NS_64B_GUARD:
                        if (opts->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
                                cmd->cdw14 = (__u32)slba;
                                cmd->cdw3 = ((slba >> 32) & 0xffff);
                        }
                        break;
                default:
                        break;
                }
                if (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP)
                        cmd->cdw15 = (opts->apptag_mask << 16 | opts->apptag);
                break;
        case NVME_NS_DPS_PI_TYPE3:
                if (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP)
                        cmd->cdw15 = (opts->apptag_mask << 16 | opts->apptag);
                break;
        case NVME_NS_DPS_PI_NONE:
                break;
        }
}

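/*
 * Verify protection information on a completed read, dispatching to the
 * 16-bit or 64-bit guard helper based on the namespace guard type.
 */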
int fio_nvme_pi_verify(struct nvme_data *data, struct io_u *io_u)
{
        int ret = 0;

        switch (data->guard_type) {
        case NVME_NVM_NS_16B_GUARD:
                ret = fio_nvme_verify_pi_16b_guard(data, io_u);
                break;
        case NVME_NVM_NS_64B_GUARD:
                ret = fio_nvme_verify_pi_64b_guard(data, io_u);
                break;
        default:
                break;
        }

        return ret;
}

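/*
 * Issue an Identify admin command for the given CNS/CSI combination through
 * the NVMe admin passthrough ioctl.
 */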
static int nvme_identify(int fd, __u32 nsid, enum nvme_identify_cns cns,
                         enum nvme_csi csi, void *data)
{
        struct nvme_passthru_cmd cmd = {
                .opcode         = nvme_admin_identify,
                .nsid           = nsid,
                .addr           = (__u64)(uintptr_t)data,
                .data_len       = NVME_IDENTIFY_DATA_SIZE,
                .cdw10          = cns,
                .cdw11          = csi << NVME_IDENTIFY_CSI_SHIFT,
                .timeout_ms     = NVME_DEFAULT_IOCTL_TIMEOUT,
        };

        return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
}

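/*
 * Probe the namespace backing the file: LBA data size, metadata size,
 * protection information type, guard type and extended LBA layout, and
 * report the namespace capacity in logical blocks through *nlba.
 */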
int fio_nvme_get_info(struct fio_file *f, __u64 *nlba, __u32 pi_act,
                      struct nvme_data *data)
{
        struct nvme_id_ns ns;
        struct nvme_id_ctrl ctrl;
        struct nvme_nvm_id_ns nvm_ns;
        int namespace_id;
        int fd, err;
        __u32 format_idx, elbaf;

        if (f->filetype != FIO_TYPE_CHAR) {
                log_err("ioengine io_uring_cmd only works with nvme ns "
                        "generic char devices (/dev/ngXnY)\n");
                return 1;
        }

        fd = open(f->file_name, O_RDONLY);
        if (fd < 0)
                return -errno;

        namespace_id = ioctl(fd, NVME_IOCTL_ID);
        if (namespace_id < 0) {
                err = -errno;
                log_err("%s: failed to fetch namespace-id\n", f->file_name);
                goto out;
        }

        err = nvme_identify(fd, 0, NVME_IDENTIFY_CNS_CTRL, NVME_CSI_NVM, &ctrl);
        if (err) {
                log_err("%s: failed to fetch identify ctrl\n", f->file_name);
                goto out;
        }

        /*
         * Identify the namespace to get the namespace size in LBAs and the
         * LBA data size.
         */
        err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_NS,
                                NVME_CSI_NVM, &ns);
        if (err) {
                log_err("%s: failed to fetch identify namespace\n",
                        f->file_name);
                goto out;
        }

        data->nsid = namespace_id;

        /*
         * A namespace supports at most 16 or 64 LBA formats. In flbas,
         * bits 0-3 hold the lsb and bits 5-6 the msb of the index of the
         * format used to format the namespace.
         */
        if (ns.nlbaf < 16)
                format_idx = ns.flbas & 0xf;
        else
                format_idx = (ns.flbas & 0xf) + (((ns.flbas >> 5) & 0x3) << 4);

        data->lba_size = 1 << ns.lbaf[format_idx].ds;
        data->ms = le16_to_cpu(ns.lbaf[format_idx].ms);

        /* Check for end-to-end data protection support */
        if (data->ms && (ns.dps & NVME_NS_DPS_PI_MASK))
                data->pi_type = (ns.dps & NVME_NS_DPS_PI_MASK);

        if (!data->pi_type)
                goto check_elba;

        if (ctrl.ctratt & NVME_CTRL_CTRATT_ELBAS) {
                err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_CSI_NS,
                                        NVME_CSI_NVM, &nvm_ns);
                if (err) {
                        log_err("%s: failed to fetch identify nvm namespace\n",
                                f->file_name);
                        goto out;
                }

                elbaf = le32_to_cpu(nvm_ns.elbaf[format_idx]);

                /* Currently we don't support storage tags */
                if (elbaf & NVME_ID_NS_NVM_STS_MASK) {
                        log_err("%s: Storage tag not supported\n",
                                f->file_name);
                        err = -ENOTSUP;
                        goto out;
                }

                data->guard_type = (elbaf >> NVME_ID_NS_NVM_GUARD_SHIFT) &
                                NVME_ID_NS_NVM_GUARD_MASK;

                /* No 32 bit guard, as storage tag is mandatory for it */
                switch (data->guard_type) {
                case NVME_NVM_NS_16B_GUARD:
                        data->pi_size = sizeof(struct nvme_16b_guard_pif);
                        break;
                case NVME_NVM_NS_64B_GUARD:
                        data->pi_size = sizeof(struct nvme_64b_guard_pif);
                        break;
                default:
                        break;
                }
        } else {
                data->guard_type = NVME_NVM_NS_16B_GUARD;
                data->pi_size = sizeof(struct nvme_16b_guard_pif);
        }

        /*
         * When the PRACT bit is set to 1 and the metadata size equals the
         * protection information size, the controller inserts PI on writes
         * and strips it on reads.
         */
        if (pi_act && data->ms == data->pi_size)
                data->ms = 0;

        data->pi_loc = (ns.dps & NVME_NS_DPS_PI_FIRST);

check_elba:
        /*
         * Bit 4 of flbas indicates whether metadata is transferred at the
         * end of the logical block, creating an extended LBA.
         */
        if (data->ms && ((ns.flbas >> 4) & 0x1))
                data->lba_ext = data->lba_size + data->ms;
        else
                data->lba_shift = ilog2(data->lba_size);

        *nlba = ns.nsze;

out:
        close(fd);
        return err;
}

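/*
 * Report whether the namespace is a host-managed ZNS namespace; if the ZNS
 * identify commands fail, the device is treated as not zoned.
 */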
int fio_nvme_get_zoned_model(struct thread_data *td, struct fio_file *f,
                             enum zbd_zoned_model *model)
{
        struct nvme_data *data = FILE_ENG_DATA(f);
        struct nvme_id_ns ns;
        struct nvme_passthru_cmd cmd;
        int fd, ret = 0;

        if (f->filetype != FIO_TYPE_CHAR)
                return -EINVAL;

        /* File is not yet opened */
        fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
        if (fd < 0)
                return -errno;

        /* Using nvme_id_ns for data as sizes are same */
        ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_CTRL,
                                NVME_CSI_ZNS, &ns);
        if (ret) {
                *model = ZBD_NONE;
                goto out;
        }

        memset(&cmd, 0, sizeof(struct nvme_passthru_cmd));

        /* Using nvme_id_ns for data as sizes are same */
        ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
                                NVME_CSI_ZNS, &ns);
        if (ret) {
                *model = ZBD_NONE;
                goto out;
        }

        *model = ZBD_HOST_MANAGED;
out:
        close(fd);
        return 0;
}

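/*
 * Issue a ZNS Zone Management Receive (report zones) command through the
 * NVMe I/O passthrough ioctl.
 */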
static int nvme_report_zones(int fd, __u32 nsid, __u64 slba, __u32 zras_feat,
                             __u32 data_len, void *data)
{
        struct nvme_passthru_cmd cmd = {
                .opcode         = nvme_zns_cmd_mgmt_recv,
                .nsid           = nsid,
                .addr           = (__u64)(uintptr_t)data,
                .data_len       = data_len,
                .cdw10          = slba & 0xffffffff,
                .cdw11          = slba >> 32,
                .cdw12          = (data_len >> 2) - 1,
                .cdw13          = NVME_ZNS_ZRA_REPORT_ZONES | zras_feat,
                .timeout_ms     = NVME_DEFAULT_IOCTL_TIMEOUT,
        };

        return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
}

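/*
 * Fetch zone descriptors in chunks and translate them into fio's zbd_zone
 * representation (start, length, write pointer, capacity, type, condition).
 */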
int fio_nvme_report_zones(struct thread_data *td, struct fio_file *f,
                          uint64_t offset, struct zbd_zone *zbdz,
                          unsigned int nr_zones)
{
        struct nvme_data *data = FILE_ENG_DATA(f);
        struct nvme_zone_report *zr;
        struct nvme_zns_id_ns zns_ns;
        struct nvme_id_ns ns;
        unsigned int i = 0, j, zones_fetched = 0;
        unsigned int max_zones, zones_chunks = 1024;
        int fd, ret = 0;
        __u32 zr_len;
        __u64 zlen;

        /* File is not yet opened */
        fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
        if (fd < 0)
                return -errno;

        zones_fetched = 0;
        zr_len = sizeof(*zr) + (zones_chunks * sizeof(struct nvme_zns_desc));
        zr = calloc(1, zr_len);
        if (!zr) {
                close(fd);
                return -ENOMEM;
        }

        ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_NS,
                                NVME_CSI_NVM, &ns);
        if (ret) {
                log_err("%s: nvme_identify_ns failed, err=%d\n", f->file_name,
                        ret);
                goto out;
        }

        ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
                                NVME_CSI_ZNS, &zns_ns);
        if (ret) {
                log_err("%s: nvme_zns_identify_ns failed, err=%d\n",
                        f->file_name, ret);
                goto out;
        }
        zlen = zns_ns.lbafe[ns.flbas & 0x0f].zsze << data->lba_shift;

        max_zones = (f->real_file_size - offset) / zlen;
        if (max_zones < nr_zones)
                nr_zones = max_zones;

        if (nr_zones < zones_chunks)
                zones_chunks = nr_zones;

        while (zones_fetched < nr_zones) {
                if (zones_fetched + zones_chunks >= nr_zones) {
                        zones_chunks = nr_zones - zones_fetched;
                        zr_len = sizeof(*zr) + (zones_chunks * sizeof(struct nvme_zns_desc));
                }
                ret = nvme_report_zones(fd, data->nsid, offset >> data->lba_shift,
                                        NVME_ZNS_ZRAS_FEAT_ERZ, zr_len, (void *)zr);
                if (ret) {
                        log_err("%s: nvme_zns_report_zones failed, err=%d\n",
                                f->file_name, ret);
                        goto out;
                }

                /* Transform the zone-report */
                for (j = 0; j < zr->nr_zones; j++, i++) {
                        struct nvme_zns_desc *desc = (struct nvme_zns_desc *)&(zr->entries[j]);

                        zbdz[i].start = desc->zslba << data->lba_shift;
                        zbdz[i].len = zlen;
                        zbdz[i].wp = desc->wp << data->lba_shift;
                        zbdz[i].capacity = desc->zcap << data->lba_shift;

                        /* Zone Type is stored in first 4 bits. */
                        switch (desc->zt & 0x0f) {
                        case NVME_ZONE_TYPE_SEQWRITE_REQ:
                                zbdz[i].type = ZBD_ZONE_TYPE_SWR;
                                break;
                        default:
                                log_err("%s: invalid type for zone at offset %llu.\n",
                                        f->file_name, (unsigned long long) desc->zslba);
                                ret = -EIO;
                                goto out;
                        }

                        /* Zone State is stored in last 4 bits. */
                        switch (desc->zs >> 4) {
                        case NVME_ZNS_ZS_EMPTY:
                                zbdz[i].cond = ZBD_ZONE_COND_EMPTY;
                                break;
                        case NVME_ZNS_ZS_IMPL_OPEN:
                                zbdz[i].cond = ZBD_ZONE_COND_IMP_OPEN;
                                break;
                        case NVME_ZNS_ZS_EXPL_OPEN:
                                zbdz[i].cond = ZBD_ZONE_COND_EXP_OPEN;
                                break;
                        case NVME_ZNS_ZS_CLOSED:
                                zbdz[i].cond = ZBD_ZONE_COND_CLOSED;
                                break;
                        case NVME_ZNS_ZS_FULL:
                                zbdz[i].cond = ZBD_ZONE_COND_FULL;
                                break;
                        case NVME_ZNS_ZS_READ_ONLY:
                        case NVME_ZNS_ZS_OFFLINE:
                        default:
                                /* Treat all these conditions as offline (don't use!) */
                                zbdz[i].cond = ZBD_ZONE_COND_OFFLINE;
                                zbdz[i].wp = zbdz[i].start;
                        }
                }
                zones_fetched += zr->nr_zones;
                offset += zr->nr_zones * zlen;
        }

        ret = zones_fetched;
out:
        free(zr);
        close(fd);

        return ret;
}

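/*
 * Reset the write pointer of every zone covered by the given offset/length
 * using Zone Management Send (reset) commands.
 */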
int fio_nvme_reset_wp(struct thread_data *td, struct fio_file *f,
                      uint64_t offset, uint64_t length)
{
        struct nvme_data *data = FILE_ENG_DATA(f);
        unsigned int nr_zones;
        unsigned long long zslba;
        int i, fd, ret = 0;

        /* If the file is not yet opened, open it for this function. */
        fd = f->fd;
        if (fd < 0) {
                fd = open(f->file_name, O_RDWR | O_LARGEFILE);
                if (fd < 0)
                        return -errno;
        }

        zslba = offset >> data->lba_shift;
        nr_zones = (length + td->o.zone_size - 1) / td->o.zone_size;

        for (i = 0; i < nr_zones; i++, zslba += (td->o.zone_size >> data->lba_shift)) {
                struct nvme_passthru_cmd cmd = {
                        .opcode         = nvme_zns_cmd_mgmt_send,
                        .nsid           = data->nsid,
                        .cdw10          = zslba & 0xffffffff,
                        .cdw11          = zslba >> 32,
                        .cdw13          = NVME_ZNS_ZSA_RESET,
                        .addr           = (__u64)(uintptr_t)NULL,
                        .data_len       = 0,
                        .timeout_ms     = NVME_DEFAULT_IOCTL_TIMEOUT,
                };

                ret = ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
        }

        if (f->fd < 0)
                close(fd);
        return -ret;
}

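/*
 * Read the Maximum Open Resources (MOR) field from the ZNS identify
 * namespace data; the reported value is 0-based, so add one.
 */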
int fio_nvme_get_max_open_zones(struct thread_data *td, struct fio_file *f,
                                unsigned int *max_open_zones)
{
        struct nvme_data *data = FILE_ENG_DATA(f);
        struct nvme_zns_id_ns zns_ns;
        int fd, ret = 0;

        fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
        if (fd < 0)
                return -errno;

        ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
                                NVME_CSI_ZNS, &zns_ns);
        if (ret) {
                log_err("%s: nvme_zns_identify_ns failed, err=%d\n",
                        f->file_name, ret);
                goto out;
        }

        *max_open_zones = zns_ns.mor + 1;
out:
        close(fd);
        return ret;
}

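/*
 * Issue an I/O Management Receive command to fetch the FDP reclaim unit
 * handle status for the namespace.
 */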
static inline int nvme_fdp_reclaim_unit_handle_status(int fd, __u32 nsid,
                                                      __u32 data_len, void *data)
{
        struct nvme_passthru_cmd cmd = {
                .opcode         = nvme_cmd_io_mgmt_recv,
                .nsid           = nsid,
                .addr           = (__u64)(uintptr_t)data,
                .data_len       = data_len,
                .cdw10          = 1,
                .cdw11          = (data_len >> 2) - 1,
        };

        return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
}

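/*
 * Fetch the reclaim unit handle status into the caller-provided buffer;
 * any failure is reported as -ENOTSUP.
 */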
int fio_nvme_iomgmt_ruhs(struct thread_data *td, struct fio_file *f,
                         struct nvme_fdp_ruh_status *ruhs, __u32 bytes)
{
        struct nvme_data *data = FILE_ENG_DATA(f);
        int fd, ret;

        fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
        if (fd < 0)
                return -errno;

        ret = nvme_fdp_reclaim_unit_handle_status(fd, data->nsid, bytes, ruhs);
        if (ret) {
                log_err("%s: nvme_fdp_reclaim_unit_handle_status failed, err=%d\n",
                        f->file_name, ret);
                errno = ENOTSUP;
        } else
                errno = 0;

        ret = -errno;
        close(fd);
        return ret;
}