From 268975a87c7b6f6b0ceb62df236c1e1b08b89379 Mon Sep 17 00:00:00 2001
From: Mike Christie
Date: Wed, 23 Apr 2025 22:26:33 -0500
Subject: [PATCH] scsi: target: Move delayed/ordered tracking to per CPU

The atomic use in the delayed/ordered tracking causes perf issues when
using higher performance backend devices and multiple queues. This
moves the values to a per CPU counter. Combined with the per CPU stats
patch, this improves IOPS by up to 33% for 8K I/Os when using 4 or more
queues from the initiator.

Signed-off-by: Mike Christie
Link: https://lore.kernel.org/r/20250424032741.16216-3-michael.christie@oracle.com
Reviewed-by: Hannes Reinecke
Signed-off-by: Martin K. Petersen
---
 drivers/target/target_core_device.c    |  20 +++++
 drivers/target/target_core_transport.c | 119 +++++++++++++------------
 include/target/target_core_base.h      |   4 +-
 3 files changed, 83 insertions(+), 60 deletions(-)

diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index 39aad464c0bf..7bb711b24c0d 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -700,6 +700,18 @@ static void scsi_dump_inquiry(struct se_device *dev)
 	pr_debug("  Type:   %s ", scsi_device_type(device_type));
 }
 
+static void target_non_ordered_release(struct percpu_ref *ref)
+{
+	struct se_device *dev = container_of(ref, struct se_device,
+					     non_ordered);
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->delayed_cmd_lock, flags);
+	if (!list_empty(&dev->delayed_cmd_list))
+		schedule_work(&dev->delayed_cmd_work);
+	spin_unlock_irqrestore(&dev->delayed_cmd_lock, flags);
+}
+
 struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
 {
 	struct se_device *dev;
@@ -730,6 +742,9 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
 		INIT_WORK(&q->sq.work, target_queued_submit_work);
 	}
 
+	if (percpu_ref_init(&dev->non_ordered, target_non_ordered_release,
+			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
+		goto free_queues;
 
 	dev->se_hba = hba;
 	dev->transport = hba->backend->ops;
@@ -816,6 +831,8 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
 
 	return dev;
 
+free_queues:
+	kfree(dev->queues);
 free_stats:
 	free_percpu(dev->stats);
 free_device:
@@ -1010,6 +1027,9 @@ void target_free_device(struct se_device *dev)
 
 	WARN_ON(!list_empty(&dev->dev_sep_list));
 
+	percpu_ref_exit(&dev->non_ordered);
+	cancel_work_sync(&dev->delayed_cmd_work);
+
 	if (target_dev_configured(dev)) {
 		dev->transport->destroy_device(dev);
 
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 05d29201b730..0a76bdfe5528 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -2213,6 +2213,7 @@ static int target_write_prot_action(struct se_cmd *cmd)
 static bool target_handle_task_attr(struct se_cmd *cmd)
 {
 	struct se_device *dev = cmd->se_dev;
+	unsigned long flags;
 
 	if (dev->transport_flags & TRANSPORT_FLAG_PASSTHROUGH)
 		return false;
@@ -2225,13 +2226,10 @@ static bool target_handle_task_attr(struct se_cmd *cmd)
 	 */
 	switch (cmd->sam_task_attr) {
 	case TCM_HEAD_TAG:
-		atomic_inc_mb(&dev->non_ordered);
 		pr_debug("Added HEAD_OF_QUEUE for CDB: 0x%02x\n",
 			 cmd->t_task_cdb[0]);
 		return false;
 	case TCM_ORDERED_TAG:
-		atomic_inc_mb(&dev->delayed_cmd_count);
-
 		pr_debug("Added ORDERED for CDB: 0x%02x to ordered list\n",
 			 cmd->t_task_cdb[0]);
 		break;
@@ -2239,29 +2237,29 @@ static bool target_handle_task_attr(struct se_cmd *cmd)
 	/*
 	 * For SIMPLE and UNTAGGED Task Attribute commands
 	 */
-		atomic_inc_mb(&dev->non_ordered);
-
-		if (atomic_read(&dev->delayed_cmd_count) == 0)
+retry:
+		if (percpu_ref_tryget_live(&dev->non_ordered))
 			return false;
+		break;
 	}
 
-	if (cmd->sam_task_attr != TCM_ORDERED_TAG) {
-		atomic_inc_mb(&dev->delayed_cmd_count);
-		/*
-		 * We will account for this when we dequeue from the delayed
-		 * list.
-		 */
-		atomic_dec_mb(&dev->non_ordered);
+	spin_lock_irqsave(&dev->delayed_cmd_lock, flags);
+	if (cmd->sam_task_attr == TCM_SIMPLE_TAG &&
+	    !percpu_ref_is_dying(&dev->non_ordered)) {
+		spin_unlock_irqrestore(&dev->delayed_cmd_lock, flags);
+		/* We raced with the last ordered completion so retry. */
+		goto retry;
+	} else if (!percpu_ref_is_dying(&dev->non_ordered)) {
+		percpu_ref_kill(&dev->non_ordered);
 	}
 
-	spin_lock_irq(&cmd->t_state_lock);
+	spin_lock(&cmd->t_state_lock);
 	cmd->transport_state &= ~CMD_T_SENT;
-	spin_unlock_irq(&cmd->t_state_lock);
+	spin_unlock(&cmd->t_state_lock);
 
-	spin_lock(&dev->delayed_cmd_lock);
 	list_add_tail(&cmd->se_delayed_node, &dev->delayed_cmd_list);
-	spin_unlock(&dev->delayed_cmd_lock);
+	spin_unlock_irqrestore(&dev->delayed_cmd_lock, flags);
 
 	pr_debug("Added CDB: 0x%02x Task Attr: 0x%02x to delayed CMD list\n",
 		 cmd->t_task_cdb[0], cmd->sam_task_attr);
@@ -2313,41 +2311,52 @@ void target_do_delayed_work(struct work_struct *work)
 	while (!dev->ordered_sync_in_progress) {
 		struct se_cmd *cmd;
 
-		if (list_empty(&dev->delayed_cmd_list))
+		/*
+		 * We can be woken up early/late due to races or the
+		 * extra wake up we do when adding commands to the list.
+		 * We check for both cases here.
+		 */
+		if (list_empty(&dev->delayed_cmd_list) ||
+		    !percpu_ref_is_zero(&dev->non_ordered))
 			break;
 
 		cmd = list_entry(dev->delayed_cmd_list.next,
 				 struct se_cmd, se_delayed_node);
+		cmd->se_cmd_flags |= SCF_TASK_ORDERED_SYNC;
+		cmd->transport_state |= CMD_T_SENT;
 
-		if (cmd->sam_task_attr == TCM_ORDERED_TAG) {
-			/*
-			 * Check if we started with:
-			 * [ordered] [simple] [ordered]
-			 * and we are now at the last ordered so we have to wait
-			 * for the simple cmd.
-			 */
-			if (atomic_read(&dev->non_ordered) > 0)
-				break;
-
-			dev->ordered_sync_in_progress = true;
-		}
+		dev->ordered_sync_in_progress = true;
 
 		list_del(&cmd->se_delayed_node);
-		atomic_dec_mb(&dev->delayed_cmd_count);
 		spin_unlock(&dev->delayed_cmd_lock);
 
-		if (cmd->sam_task_attr != TCM_ORDERED_TAG)
-			atomic_inc_mb(&dev->non_ordered);
-
-		cmd->transport_state |= CMD_T_SENT;
-
 		__target_execute_cmd(cmd, true);
 
 		spin_lock(&dev->delayed_cmd_lock);
 	}
 	spin_unlock(&dev->delayed_cmd_lock);
 }
 
+static void transport_complete_ordered_sync(struct se_cmd *cmd)
+{
+	struct se_device *dev = cmd->se_dev;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->delayed_cmd_lock, flags);
+	dev->dev_cur_ordered_id++;
+
+	pr_debug("Incremented dev_cur_ordered_id: %u for type %d\n",
+		 dev->dev_cur_ordered_id, cmd->sam_task_attr);
+
+	dev->ordered_sync_in_progress = false;
+
+	if (list_empty(&dev->delayed_cmd_list))
+		percpu_ref_resurrect(&dev->non_ordered);
+	else
+		schedule_work(&dev->delayed_cmd_work);
+
+	spin_unlock_irqrestore(&dev->delayed_cmd_lock, flags);
+}
+
 /*
  * Called from I/O completion to determine which dormant/delayed
  * and ordered cmds need to have their tasks added to the execution queue.
@@ -2360,30 +2369,24 @@ static void transport_complete_task_attr(struct se_cmd *cmd)
 		return;
 
 	if (!(cmd->se_cmd_flags & SCF_TASK_ATTR_SET))
-		goto restart;
-
-	if (cmd->sam_task_attr == TCM_SIMPLE_TAG) {
-		atomic_dec_mb(&dev->non_ordered);
-		dev->dev_cur_ordered_id++;
-	} else if (cmd->sam_task_attr == TCM_HEAD_TAG) {
-		atomic_dec_mb(&dev->non_ordered);
-		dev->dev_cur_ordered_id++;
-		pr_debug("Incremented dev_cur_ordered_id: %u for HEAD_OF_QUEUE\n",
-			 dev->dev_cur_ordered_id);
-	} else if (cmd->sam_task_attr == TCM_ORDERED_TAG) {
-		spin_lock(&dev->delayed_cmd_lock);
-		dev->ordered_sync_in_progress = false;
-		spin_unlock(&dev->delayed_cmd_lock);
+		return;
 
-		dev->dev_cur_ordered_id++;
-		pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED\n",
-			 dev->dev_cur_ordered_id);
-	}
 	cmd->se_cmd_flags &= ~SCF_TASK_ATTR_SET;
 
-restart:
-	if (atomic_read(&dev->delayed_cmd_count) > 0)
-		schedule_work(&dev->delayed_cmd_work);
+	if (cmd->se_cmd_flags & SCF_TASK_ORDERED_SYNC) {
+		transport_complete_ordered_sync(cmd);
+		return;
+	}
+
+	switch (cmd->sam_task_attr) {
+	case TCM_SIMPLE_TAG:
+		percpu_ref_put(&dev->non_ordered);
+		break;
+	case TCM_ORDERED_TAG:
+		/* All ordered should have been executed as sync */
+		WARN_ON(1);
+		break;
+	}
 }
 
 static void transport_complete_qf(struct se_cmd *cmd)
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 05e3673607b8..a52d4967c0d3 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -157,6 +157,7 @@ enum se_cmd_flags_table {
 	SCF_USE_CPUID				= (1 << 16),
 	SCF_TASK_ATTR_SET			= (1 << 17),
 	SCF_TREAT_READ_AS_NORMAL		= (1 << 18),
+	SCF_TASK_ORDERED_SYNC			= (1 << 19),
 };
 
 /*
@@ -833,9 +834,8 @@ struct se_device {
 	atomic_long_t		aborts_no_task;
 	struct se_dev_io_stats __percpu *stats;
 	/* Active commands on this virtual SE device */
-	atomic_t		non_ordered;
+	struct percpu_ref	non_ordered;
 	bool			ordered_sync_in_progress;
-	atomic_t		delayed_cmd_count;
 	atomic_t		dev_qf_count;
 	u32			export_count;
 	spinlock_t		delayed_cmd_lock;
-- 
2.25.1
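
A note on the percpu_ref pattern the patch relies on, for readers who
have not used it: simple/untagged commands take a reference on the fast
path with percpu_ref_tryget_live(), a per CPU increment with no shared
cacheline contention. When an ordered command must be queued,
target_handle_task_attr() kills the ref so new trygets fail; once every
outstanding reference has been put, the target_non_ordered_release()
callback fires and schedules the delayed work. When the ordered sync
completes and the delayed list drains, transport_complete_ordered_sync()
resurrects the ref, reopening the fast path. Below is a minimal,
illustrative-only sketch of that lifecycle, not part of the patch. The
example_release() and example_gate() names are hypothetical; only the
percpu_ref_* calls are the real <linux/percpu-refcount.h> API.

#include <linux/gfp.h>
#include <linux/percpu-refcount.h>

static void example_release(struct percpu_ref *ref)
{
	/*
	 * Runs once the ref has been killed and every outstanding
	 * tryget has been put. The patch schedules the delayed
	 * (ordered) work from here.
	 */
}

static int example_gate(void)
{
	struct percpu_ref ref;
	int ret;

	/* PERCPU_REF_ALLOW_REINIT permits resurrect after a kill. */
	ret = percpu_ref_init(&ref, example_release,
			      PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
	if (ret)
		return ret;

	/* Fast path: per CPU increment; fails once the ref is dying. */
	if (percpu_ref_tryget_live(&ref))
		percpu_ref_put(&ref);

	/*
	 * Slow path: refuse new trygets and drop the initial ref.
	 * example_release() runs when the count reaches zero.
	 */
	percpu_ref_kill(&ref);

	/* Reopen the fast path once the quiesced work is done. */
	percpu_ref_resurrect(&ref);

	percpu_ref_exit(&ref);
	return 0;
}

The tradeoff is the one the log message points at: gets and puts stay
CPU local while the workload is all simple commands, and the expensive
kill/resurrect transitions only happen around the rare ordered commands.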