diff --git a/zbd.c b/zbd.c
index 9c525875..99310c49 100644
--- a/zbd.c
+++ b/zbd.c
@@ -119,6 +119,30 @@ static bool zbd_verify_sizes(void)
 			continue;
 		if (!zbd_is_seq_job(f))
 			continue;
+
+		if (!td->o.zone_size) {
+			td->o.zone_size = f->zbd_info->zone_size;
+			if (!td->o.zone_size) {
+				log_err("%s: invalid 0 zone size\n",
+					f->file_name);
+				return false;
+			}
+		} else if (td->o.zone_size != f->zbd_info->zone_size) {
+			log_err("%s: job parameter zonesize %llu does not match disk zone size %llu.\n",
+				f->file_name, (unsigned long long) td->o.zone_size,
+				(unsigned long long) f->zbd_info->zone_size);
+			return false;
+		}
+
+		if (td->o.zone_skip &&
+		    (td->o.zone_skip < td->o.zone_size ||
+		     td->o.zone_skip % td->o.zone_size)) {
+			log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n",
+				f->file_name, (unsigned long long) td->o.zone_skip,
+				(unsigned long long) td->o.zone_size);
+			return false;
+		}
+
 		zone_idx = zbd_zone_idx(f, f->file_offset);
 		z = &f->zbd_info->zone_info[zone_idx];
 		if (f->file_offset != z->start) {
@@ -186,11 +210,14 @@ static bool zbd_verify_bs(void)
  * size of @buf.
  *
  * Returns 0 upon success and a negative error code upon failure.
+ * If the zone report is empty, always assume an error (device problem) and
+ * return -EIO.
  */
 static int read_zone_info(int fd, uint64_t start_sector,
 			  void *buf, unsigned int bufsz)
 {
 	struct blk_zone_report *hdr = buf;
+	int ret;
 
 	if (bufsz < sizeof(*hdr))
 		return -EINVAL;
@@ -199,7 +226,12 @@ static int read_zone_info(int fd, uint64_t start_sector,
 	hdr->nr_zones = (bufsz - sizeof(*hdr)) / sizeof(struct blk_zone);
 	hdr->sector = start_sector;
-	return ioctl(fd, BLKREPORTZONE, hdr) >= 0 ? 0 : -errno;
+	ret = ioctl(fd, BLKREPORTZONE, hdr);
+	if (ret)
+		return -errno;
+	if (!hdr->nr_zones)
+		return -EIO;
+	return 0;
 }
 
 /*
@@ -228,12 +260,45 @@ static enum blk_zoned_model get_zbd_model(const char *file_name)
 	char *zoned_attr_path = NULL;
 	char *model_str = NULL;
 	struct stat statbuf;
+	char *sys_devno_path = NULL;
+	char *part_attr_path = NULL;
+	char *part_str = NULL;
+	char sys_path[PATH_MAX];
+	ssize_t sz;
+	char *delim = NULL;
 
 	if (stat(file_name, &statbuf) < 0)
 		goto out;
-	if (asprintf(&zoned_attr_path, "/sys/dev/block/%d:%d/queue/zoned",
+
+	if (asprintf(&sys_devno_path, "/sys/dev/block/%d:%d",
 		     major(statbuf.st_rdev), minor(statbuf.st_rdev)) < 0)
 		goto out;
+
+	sz = readlink(sys_devno_path, sys_path, sizeof(sys_path) - 1);
+	if (sz < 0)
+		goto out;
+	sys_path[sz] = '\0';
+
+	/*
+	 * If the device is a partition device, cut the device name in the
+	 * canonical sysfs path to obtain the sysfs path of the holder device.
+ * e.g.: /sys/devices/.../sda/sda1 -> /sys/devices/.../sda + */ + if (asprintf(&part_attr_path, "/sys/dev/block/%s/partition", + sys_path) < 0) + goto out; + part_str = read_file(part_attr_path); + if (part_str && *part_str == '1') { + delim = strrchr(sys_path, '/'); + if (!delim) + goto out; + *delim = '\0'; + } + + if (asprintf(&zoned_attr_path, + "/sys/dev/block/%s/queue/zoned", sys_path) < 0) + goto out; + model_str = read_file(zoned_attr_path); if (!model_str) goto out; @@ -246,6 +311,9 @@ static enum blk_zoned_model get_zbd_model(const char *file_name) out: free(model_str); free(zoned_attr_path); + free(part_str); + free(part_attr_path); + free(sys_devno_path); return model; } @@ -268,13 +336,23 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f) { uint32_t nr_zones; struct fio_zone_info *p; - uint64_t zone_size; + uint64_t zone_size = td->o.zone_size; struct zoned_block_device_info *zbd_info = NULL; pthread_mutexattr_t attr; int i; - zone_size = td->o.zone_size; - assert(zone_size); + if (zone_size == 0) { + log_err("%s: Specifying the zone size is mandatory for regular block devices with --zonemode=zbd\n\n", + f->file_name); + return 1; + } + + if (zone_size < 512) { + log_err("%s: zone size must be at least 512 bytes for --zonemode=zbd\n\n", + f->file_name); + return 1; + } + nr_zones = (f->real_file_size + zone_size - 1) / zone_size; zbd_info = scalloc(1, sizeof(*zbd_info) + (nr_zones + 1) * sizeof(zbd_info->zone_info[0])); @@ -357,8 +435,8 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f) if (td->o.zone_size == 0) { td->o.zone_size = zone_size; } else if (td->o.zone_size != zone_size) { - log_info("fio: %s job parameter zonesize %llu does not match disk zone size %llu.\n", - f->file_name, (unsigned long long) td->o.zone_size, + log_err("fio: %s job parameter zonesize %llu does not match disk zone size %llu.\n", + f->file_name, (unsigned long long) td->o.zone_size, (unsigned long long) zone_size); ret = -EINVAL; goto close; @@ -382,8 +460,6 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f) p->start = z->start << 9; switch (z->cond) { case BLK_ZONE_COND_NOT_WP: - p->wp = p->start; - break; case BLK_ZONE_COND_FULL: p->wp = p->start + zone_size; break; @@ -439,7 +515,7 @@ out: * * Returns 0 upon success and a negative error code upon failure. 
*/ -int zbd_create_zone_info(struct thread_data *td, struct fio_file *f) +static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f) { enum blk_zoned_model zbd_model; int ret = 0; @@ -507,7 +583,7 @@ static int zbd_init_zone_info(struct thread_data *td, struct fio_file *file) ret = zbd_create_zone_info(td, file); if (ret < 0) - td_verror(td, -ret, "BLKREPORTZONE failed"); + td_verror(td, -ret, "zbd_create_zone_info() failed"); return ret; } @@ -519,18 +595,8 @@ int zbd_init(struct thread_data *td) for_each_file(td, f, i) { if (f->filetype != FIO_TYPE_BLOCK) continue; - if (td->o.zone_size && td->o.zone_size < 512) { - log_err("%s: zone size must be at least 512 bytes for --zonemode=zbd\n\n", - f->file_name); - return 1; - } - if (td->o.zone_size == 0 && - get_zbd_model(f->file_name) == ZBD_DM_NONE) { - log_err("%s: Specifying the zone size is mandatory for regular block devices with --zonemode=zbd\n\n", - f->file_name); + if (zbd_init_zone_info(td, f)) return 1; - } - zbd_init_zone_info(td, f); } if (!zbd_using_direct_io()) { @@ -606,7 +672,7 @@ static int zbd_reset_range(struct thread_data *td, const struct fio_file *f, static unsigned int zbd_zone_nr(struct zoned_block_device_info *zbd_info, struct fio_zone_info *zone) { - return (uintptr_t) zone - (uintptr_t) zbd_info->zone_info; + return zone - zbd_info->zone_info; } /** @@ -726,29 +792,76 @@ static bool zbd_dec_and_reset_write_cnt(const struct thread_data *td, return write_cnt == 0; } -void zbd_file_reset(struct thread_data *td, struct fio_file *f) +enum swd_action { + CHECK_SWD, + SET_SWD, +}; + +/* Calculate the number of sectors with data (swd) and perform action 'a' */ +static uint64_t zbd_process_swd(const struct fio_file *f, enum swd_action a) { struct fio_zone_info *zb, *ze, *z; - uint32_t zone_idx_e; uint64_t swd = 0; - if (!f->zbd_info) - return; - zb = &f->zbd_info->zone_info[zbd_zone_idx(f, f->file_offset)]; - zone_idx_e = zbd_zone_idx(f, f->file_offset + f->io_size); - ze = &f->zbd_info->zone_info[zone_idx_e]; - for (z = zb ; z < ze; z++) { + ze = &f->zbd_info->zone_info[zbd_zone_idx(f, f->file_offset + + f->io_size)]; + for (z = zb; z < ze; z++) { pthread_mutex_lock(&z->mutex); swd += z->wp - z->start; } pthread_mutex_lock(&f->zbd_info->mutex); - f->zbd_info->sectors_with_data = swd; + switch (a) { + case CHECK_SWD: + assert(f->zbd_info->sectors_with_data == swd); + break; + case SET_SWD: + f->zbd_info->sectors_with_data = swd; + break; + } pthread_mutex_unlock(&f->zbd_info->mutex); - for (z = zb ; z < ze; z++) + for (z = zb; z < ze; z++) pthread_mutex_unlock(&z->mutex); - dprint(FD_ZBD, "%s(%s): swd = %llu\n", __func__, f->file_name, - (unsigned long long) swd); + + return swd; +} + +/* + * The swd check is useful for debugging but takes too much time to leave + * it enabled all the time. Hence it is disabled by default. + */ +static const bool enable_check_swd = false; + +/* Check whether the value of zbd_info.sectors_with_data is correct. 
*/ +static void zbd_check_swd(const struct fio_file *f) +{ + if (!enable_check_swd) + return; + + zbd_process_swd(f, CHECK_SWD); +} + +static void zbd_init_swd(struct fio_file *f) +{ + uint64_t swd; + + swd = zbd_process_swd(f, SET_SWD); + dprint(FD_ZBD, "%s(%s): swd = %" PRIu64 "\n", __func__, f->file_name, + swd); +} + +void zbd_file_reset(struct thread_data *td, struct fio_file *f) +{ + struct fio_zone_info *zb, *ze; + uint32_t zone_idx_e; + + if (!f->zbd_info) + return; + + zb = &f->zbd_info->zone_info[zbd_zone_idx(f, f->file_offset)]; + zone_idx_e = zbd_zone_idx(f, f->file_offset + f->io_size); + ze = &f->zbd_info->zone_info[zone_idx_e]; + zbd_init_swd(f); /* * If data verification is enabled reset the affected zones before * writing any data to avoid that a zone reset has to be issued while @@ -844,8 +957,8 @@ static void zbd_close_zone(struct thread_data *td, const struct fio_file *f, * a multiple of the fio block size. The caller must neither hold z->mutex * nor f->zbd_info->mutex. Returns with z->mutex held upon success. */ -struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td, - struct io_u *io_u) +static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td, + struct io_u *io_u) { const uint32_t min_bs = td->o.min_bs[io_u->ddir]; const struct fio_file *f = io_u->file; @@ -1028,37 +1141,44 @@ zbd_find_zone(struct thread_data *td, struct io_u *io_u, return NULL; } - /** - * zbd_post_submit - update the write pointer and unlock the zone lock + * zbd_queue_io - update the write pointer of a sequential zone * @io_u: I/O unit - * @success: Whether or not the I/O unit has been executed successfully + * @success: Whether or not the I/O unit has been queued successfully + * @q: queueing status (busy, completed or queued). * - * For write and trim operations, update the write pointer of all affected - * zones. + * For write and trim operations, update the write pointer of the I/O unit + * target zone. 
*/ -static void zbd_post_submit(const struct io_u *io_u, bool success) +static void zbd_queue_io(struct io_u *io_u, int q, bool success) { - struct zoned_block_device_info *zbd_info; + const struct fio_file *f = io_u->file; + struct zoned_block_device_info *zbd_info = f->zbd_info; struct fio_zone_info *z; uint32_t zone_idx; - uint64_t end, zone_end; + uint64_t zone_end; - zbd_info = io_u->file->zbd_info; if (!zbd_info) return; - zone_idx = zbd_zone_idx(io_u->file, io_u->offset); - end = io_u->offset + io_u->buflen; - z = &zbd_info->zone_info[zone_idx]; + zone_idx = zbd_zone_idx(f, io_u->offset); assert(zone_idx < zbd_info->nr_zones); + z = &zbd_info->zone_info[zone_idx]; + if (z->type != BLK_ZONE_TYPE_SEQWRITE_REQ) return; + if (!success) goto unlock; + + dprint(FD_ZBD, + "%s: queued I/O (%lld, %llu) for zone %u\n", + f->file_name, io_u->offset, io_u->buflen, zone_idx); + switch (io_u->ddir) { case DDIR_WRITE: - zone_end = min(end, (z + 1)->start); + zone_end = min((uint64_t)(io_u->offset + io_u->buflen), + (z + 1)->start); pthread_mutex_lock(&zbd_info->mutex); /* * z->wp > zone_end means that one or more I/O errors @@ -1075,8 +1195,42 @@ static void zbd_post_submit(const struct io_u *io_u, bool success) default: break; } + unlock: - pthread_mutex_unlock(&z->mutex); + if (!success || q != FIO_Q_QUEUED) { + /* BUSY or COMPLETED: unlock the zone */ + pthread_mutex_unlock(&z->mutex); + io_u->zbd_put_io = NULL; + } +} + +/** + * zbd_put_io - Unlock an I/O unit target zone lock + * @io_u: I/O unit + */ +static void zbd_put_io(const struct io_u *io_u) +{ + const struct fio_file *f = io_u->file; + struct zoned_block_device_info *zbd_info = f->zbd_info; + struct fio_zone_info *z; + uint32_t zone_idx; + + if (!zbd_info) + return; + + zone_idx = zbd_zone_idx(f, io_u->offset); + assert(zone_idx < zbd_info->nr_zones); + z = &zbd_info->zone_info[zone_idx]; + + if (z->type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return; + + dprint(FD_ZBD, + "%s: terminate I/O (%lld, %llu) for zone %u\n", + f->file_name, io_u->offset, io_u->buflen, zone_idx); + + assert(pthread_mutex_unlock(&z->mutex) == 0); + zbd_check_swd(f); } bool zbd_unaligned_write(int error_code) @@ -1089,6 +1243,65 @@ bool zbd_unaligned_write(int error_code) return false; } +/** + * setup_zbd_zone_mode - handle zoneskip as necessary for ZBD drives + * @td: FIO thread data. + * @io_u: FIO I/O unit. + * + * For sequential workloads, change the file offset to skip zoneskip bytes when + * no more IO can be performed in the current zone. + * - For read workloads, zoneskip is applied when the io has reached the end of + * the zone or the zone write position (when td->o.read_beyond_wp is false). + * - For write workloads, zoneskip is applied when the zone is full. + * This applies only to read and write operations. + */ +void setup_zbd_zone_mode(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + enum fio_ddir ddir = io_u->ddir; + struct fio_zone_info *z; + uint32_t zone_idx; + + assert(td->o.zone_mode == ZONE_MODE_ZBD); + assert(td->o.zone_size); + + /* + * zone_skip is valid only for sequential workloads. + */ + if (td_random(td) || !td->o.zone_skip) + return; + + /* + * It is time to switch to a new zone if: + * - zone_bytes == zone_size bytes have already been accessed + * - The last position reached the end of the current zone. + * - For reads with td->o.read_beyond_wp == false, the last position + * reached the zone write pointer. 
+ */ + zone_idx = zbd_zone_idx(f, f->last_pos[ddir]); + z = &f->zbd_info->zone_info[zone_idx]; + + if (td->zone_bytes >= td->o.zone_size || + f->last_pos[ddir] >= (z+1)->start || + (ddir == DDIR_READ && + (!td->o.read_beyond_wp) && f->last_pos[ddir] >= z->wp)) { + /* + * Skip zones. + */ + td->zone_bytes = 0; + f->file_offset += td->o.zone_size + td->o.zone_skip; + + /* + * Wrap from the beginning, if we exceed the file size + */ + if (f->file_offset >= f->real_file_size) + f->file_offset = get_start_offset(td, f); + + f->last_pos[ddir] = f->file_offset; + td->io_skip_bytes += td->o.zone_skip; + } +} + /** * zbd_adjust_block - adjust the offset and length as necessary for ZBD drives * @td: FIO thread data. @@ -1129,7 +1342,23 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) io_u->ddir == DDIR_READ && td->o.read_beyond_wp) return io_u_accept; - pthread_mutex_lock(&zb->mutex); + zbd_check_swd(f); + + /* + * Lock the io_u target zone. The zone will be unlocked if io_u offset + * is changed or when io_u completes and zbd_put_io() executed. + * To avoid multiple jobs doing asynchronous I/Os from deadlocking each + * other waiting for zone locks when building an io_u batch, first + * only trylock the zone. If the zone is already locked by another job, + * process the currently queued I/Os so that I/O progress is made and + * zones unlocked. + */ + if (pthread_mutex_trylock(&zb->mutex) != 0) { + if (!td_ioengine_flagged(td, FIO_SYNCIO)) + io_u_quiesce(td); + pthread_mutex_lock(&zb->mutex); + } + switch (io_u->ddir) { case DDIR_READ: if (td->runstate == TD_VERIFYING) { @@ -1267,8 +1496,10 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) accept: assert(zb); assert(zb->cond != BLK_ZONE_COND_OFFLINE); - assert(!io_u->post_submit); - io_u->post_submit = zbd_post_submit; + assert(!io_u->zbd_queue_io); + assert(!io_u->zbd_put_io); + io_u->zbd_queue_io = zbd_queue_io; + io_u->zbd_put_io = zbd_put_io; return io_u_accept; eof: