scsi: target: Move delayed/ordered tracking to per CPU
author    Mike Christie <michael.christie@oracle.com>
          Thu, 24 Apr 2025 03:26:33 +0000 (22:26 -0500)
committer Martin K. Petersen <martin.petersen@oracle.com>
          Tue, 29 Apr 2025 01:47:55 +0000 (21:47 -0400)
The atomic use in the delayed/ordered tracking is causing perf issues
when using higher performance backend devices and multiple queues.  This
moves the tracking to a per CPU counter. Combined with the per CPU stats
patch, this improves IOPS by up to 33% for 8K IOs when using 4 or more
queues from the initiator.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Link: https://lore.kernel.org/r/20250424032741.16216-3-michael.christie@oracle.com
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
drivers/target/target_core_device.c
drivers/target/target_core_transport.c
include/target/target_core_base.h

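For context before the diff: the patch replaces a shared atomic_t with the
percpu_ref gate pattern, where the common case is a cheap per-CPU increment
and an "ordered" barrier drains the gate. The following is a minimal,
hypothetical sketch of that pattern, not code from the patch; names like
gate, barrier_work, fast_path_enter() and barrier_begin() are mine, and only
the percpu_ref/workqueue calls are the real kernel API.

#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>

static struct percpu_ref gate;			/* plays dev->non_ordered */
static struct work_struct barrier_work;		/* plays dev->delayed_cmd_work */

/* Runs once the ref has been killed AND every fast-path user has left.
 * May be called from the context of the final put, so only defer work. */
static void gate_release(struct percpu_ref *ref)
{
	schedule_work(&barrier_work);
}

static void barrier_fn(struct work_struct *work)
{
	/* ... do the "ordered" work alone here ... */
	percpu_ref_resurrect(&gate);	/* reopen; needs PERCPU_REF_ALLOW_REINIT */
}

/* Fast path: a per-CPU increment, no shared cache line bouncing. */
static bool fast_path_enter(void)
{
	return percpu_ref_tryget_live(&gate);	/* fails once the ref is killed */
}

static void fast_path_exit(void)
{
	percpu_ref_put(&gate);		/* final put after a kill -> gate_release() */
}

/* Slow path: stop new fast-path entries; in-flight ones drain first. */
static void barrier_begin(void)
{
	if (!percpu_ref_is_dying(&gate))
		percpu_ref_kill(&gate);
}

static int gate_setup(void)
{
	INIT_WORK(&barrier_work, barrier_fn);
	return percpu_ref_init(&gate, gate_release,
			       PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
}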
drivers/target/target_core_device.c
index 39aad464c0bfcecb3809d2007ef077276f6fd0ba..7bb711b24c0d72d487600fde0b25addaf1d41dfb 100644
@@ -700,6 +700,18 @@ static void scsi_dump_inquiry(struct se_device *dev)
        pr_debug("  Type:   %s ", scsi_device_type(device_type));
 }
 
+static void target_non_ordered_release(struct percpu_ref *ref)
+{
+       struct se_device *dev = container_of(ref, struct se_device,
+                                            non_ordered);
+       unsigned long flags;
+
+       spin_lock_irqsave(&dev->delayed_cmd_lock, flags);
+       if (!list_empty(&dev->delayed_cmd_list))
+               schedule_work(&dev->delayed_cmd_work);
+       spin_unlock_irqrestore(&dev->delayed_cmd_lock, flags);
+}
+
 struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
 {
        struct se_device *dev;
@@ -730,6 +742,9 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
                INIT_WORK(&q->sq.work, target_queued_submit_work);
        }
 
+       if (percpu_ref_init(&dev->non_ordered, target_non_ordered_release,
+                           PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
+               goto free_queues;
 
        dev->se_hba = hba;
        dev->transport = hba->backend->ops;
@@ -816,6 +831,8 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
 
        return dev;
 
+free_queues:
+       kfree(dev->queues);
 free_stats:
        free_percpu(dev->stats);
 free_device:
@@ -1010,6 +1027,9 @@ void target_free_device(struct se_device *dev)
 
        WARN_ON(!list_empty(&dev->dev_sep_list));
 
+       percpu_ref_exit(&dev->non_ordered);
+       cancel_work_sync(&dev->delayed_cmd_work);
+
        if (target_dev_configured(dev)) {
                dev->transport->destroy_device(dev);
 
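A note on the device.c hunks above: target_non_ordered_release() can be
invoked from the context of the final percpu_ref_put(), which for this
driver is I/O completion, so it takes delayed_cmd_lock with irqsave and
only schedules work rather than draining the list inline. On teardown, the
ordering of percpu_ref_exit() before cancel_work_sync() appears deliberate:
once the ref is torn down no new release callback can fire, and the cancel
then reaps any wakeup the callback had already queued.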
drivers/target/target_core_transport.c
index 05d29201b730f0ef970f6c3962e9c02d17492a9b..0a76bdfe55282073c6cb423051d279d7ff1693b7 100644
@@ -2213,6 +2213,7 @@ static int target_write_prot_action(struct se_cmd *cmd)
 static bool target_handle_task_attr(struct se_cmd *cmd)
 {
        struct se_device *dev = cmd->se_dev;
+       unsigned long flags;
 
        if (dev->transport_flags & TRANSPORT_FLAG_PASSTHROUGH)
                return false;
@@ -2225,13 +2226,10 @@ static bool target_handle_task_attr(struct se_cmd *cmd)
         */
        switch (cmd->sam_task_attr) {
        case TCM_HEAD_TAG:
-               atomic_inc_mb(&dev->non_ordered);
                pr_debug("Added HEAD_OF_QUEUE for CDB: 0x%02x\n",
                         cmd->t_task_cdb[0]);
                return false;
        case TCM_ORDERED_TAG:
-               atomic_inc_mb(&dev->delayed_cmd_count);
-
                pr_debug("Added ORDERED for CDB: 0x%02x to ordered list\n",
                         cmd->t_task_cdb[0]);
                break;
@@ -2239,29 +2237,29 @@ static bool target_handle_task_attr(struct se_cmd *cmd)
                /*
                 * For SIMPLE and UNTAGGED Task Attribute commands
                 */
-               atomic_inc_mb(&dev->non_ordered);
-
-               if (atomic_read(&dev->delayed_cmd_count) == 0)
+retry:
+               if (percpu_ref_tryget_live(&dev->non_ordered))
                        return false;
+
                break;
        }
 
-       if (cmd->sam_task_attr != TCM_ORDERED_TAG) {
-               atomic_inc_mb(&dev->delayed_cmd_count);
-               /*
-                * We will account for this when we dequeue from the delayed
-                * list.
-                */
-               atomic_dec_mb(&dev->non_ordered);
+       spin_lock_irqsave(&dev->delayed_cmd_lock, flags);
+       if (cmd->sam_task_attr == TCM_SIMPLE_TAG &&
+           !percpu_ref_is_dying(&dev->non_ordered)) {
+               spin_unlock_irqrestore(&dev->delayed_cmd_lock, flags);
+               /* We raced with the last ordered completion so retry. */
+               goto retry;
+       } else if (!percpu_ref_is_dying(&dev->non_ordered)) {
+               percpu_ref_kill(&dev->non_ordered);
        }
 
-       spin_lock_irq(&cmd->t_state_lock);
+       spin_lock(&cmd->t_state_lock);
        cmd->transport_state &= ~CMD_T_SENT;
-       spin_unlock_irq(&cmd->t_state_lock);
+       spin_unlock(&cmd->t_state_lock);
 
-       spin_lock(&dev->delayed_cmd_lock);
        list_add_tail(&cmd->se_delayed_node, &dev->delayed_cmd_list);
-       spin_unlock(&dev->delayed_cmd_lock);
+       spin_unlock_irqrestore(&dev->delayed_cmd_lock, flags);
 
        pr_debug("Added CDB: 0x%02x Task Attr: 0x%02x to delayed CMD listn",
                cmd->t_task_cdb[0], cmd->sam_task_attr);
@@ -2313,41 +2311,52 @@ void target_do_delayed_work(struct work_struct *work)
        while (!dev->ordered_sync_in_progress) {
                struct se_cmd *cmd;
 
-               if (list_empty(&dev->delayed_cmd_list))
+               /*
+                * We can be woken up early/late due to races or the
+                * extra wake up we do when adding commands to the list.
+                * We check for both cases here.
+                */
+               if (list_empty(&dev->delayed_cmd_list) ||
+                   !percpu_ref_is_zero(&dev->non_ordered))
                        break;
 
                cmd = list_entry(dev->delayed_cmd_list.next,
                                 struct se_cmd, se_delayed_node);
+               cmd->se_cmd_flags |= SCF_TASK_ORDERED_SYNC;
+               cmd->transport_state |= CMD_T_SENT;
 
-               if (cmd->sam_task_attr == TCM_ORDERED_TAG) {
-                       /*
-                        * Check if we started with:
-                        * [ordered] [simple] [ordered]
-                        * and we are now at the last ordered so we have to wait
-                        * for the simple cmd.
-                        */
-                       if (atomic_read(&dev->non_ordered) > 0)
-                               break;
-
-                       dev->ordered_sync_in_progress = true;
-               }
+               dev->ordered_sync_in_progress = true;
 
                list_del(&cmd->se_delayed_node);
-               atomic_dec_mb(&dev->delayed_cmd_count);
                spin_unlock(&dev->delayed_cmd_lock);
 
-               if (cmd->sam_task_attr != TCM_ORDERED_TAG)
-                       atomic_inc_mb(&dev->non_ordered);
-
-               cmd->transport_state |= CMD_T_SENT;
-
                __target_execute_cmd(cmd, true);
-
                spin_lock(&dev->delayed_cmd_lock);
        }
        spin_unlock(&dev->delayed_cmd_lock);
 }
 
+static void transport_complete_ordered_sync(struct se_cmd *cmd)
+{
+       struct se_device *dev = cmd->se_dev;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dev->delayed_cmd_lock, flags);
+       dev->dev_cur_ordered_id++;
+
+       pr_debug("Incremented dev_cur_ordered_id: %u for type %d\n",
+                dev->dev_cur_ordered_id, cmd->sam_task_attr);
+
+       dev->ordered_sync_in_progress = false;
+
+       if (list_empty(&dev->delayed_cmd_list))
+               percpu_ref_resurrect(&dev->non_ordered);
+       else
+               schedule_work(&dev->delayed_cmd_work);
+
+       spin_unlock_irqrestore(&dev->delayed_cmd_lock, flags);
+}
+
 /*
  * Called from I/O completion to determine which dormant/delayed
  * and ordered cmds need to have their tasks added to the execution queue.
@@ -2360,30 +2369,24 @@ static void transport_complete_task_attr(struct se_cmd *cmd)
                return;
 
        if (!(cmd->se_cmd_flags & SCF_TASK_ATTR_SET))
-               goto restart;
-
-       if (cmd->sam_task_attr == TCM_SIMPLE_TAG) {
-               atomic_dec_mb(&dev->non_ordered);
-               dev->dev_cur_ordered_id++;
-       } else if (cmd->sam_task_attr == TCM_HEAD_TAG) {
-               atomic_dec_mb(&dev->non_ordered);
-               dev->dev_cur_ordered_id++;
-               pr_debug("Incremented dev_cur_ordered_id: %u for HEAD_OF_QUEUE\n",
-                        dev->dev_cur_ordered_id);
-       } else if (cmd->sam_task_attr == TCM_ORDERED_TAG) {
-               spin_lock(&dev->delayed_cmd_lock);
-               dev->ordered_sync_in_progress = false;
-               spin_unlock(&dev->delayed_cmd_lock);
+               return;
 
-               dev->dev_cur_ordered_id++;
-               pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED\n",
-                        dev->dev_cur_ordered_id);
-       }
        cmd->se_cmd_flags &= ~SCF_TASK_ATTR_SET;
 
-restart:
-       if (atomic_read(&dev->delayed_cmd_count) > 0)
-               schedule_work(&dev->delayed_cmd_work);
+       if (cmd->se_cmd_flags & SCF_TASK_ORDERED_SYNC) {
+               transport_complete_ordered_sync(cmd);
+               return;
+       }
+
+       switch (cmd->sam_task_attr) {
+       case TCM_SIMPLE_TAG:
+               percpu_ref_put(&dev->non_ordered);
+               break;
+       case TCM_ORDERED_TAG:
+               /* All ordered should have been executed as sync */
+               WARN_ON(1);
+               break;
+       }
 }
 
 static void transport_complete_qf(struct se_cmd *cmd)
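
Read together, the target_core_transport.c hunks above form a small drain
state machine. Here is a condensed walkthrough of one ORDERED command under
the new scheme; the step numbering is mine, not from the patch:

/*
 * 1. target_handle_task_attr(): the ORDERED cmd is appended to
 *    dev->delayed_cmd_list and dev->non_ordered is percpu_ref_kill()ed
 *    (unless already dying), so new SIMPLE cmds fail
 *    percpu_ref_tryget_live() and queue up behind it.
 * 2. Each in-flight SIMPLE cmd drops its reference via percpu_ref_put()
 *    on completion; the final put invokes target_non_ordered_release(),
 *    which (when delayed cmds are waiting) schedules
 *    dev->delayed_cmd_work.
 * 3. target_do_delayed_work(): with the ref now at zero, the ORDERED cmd
 *    is flagged SCF_TASK_ORDERED_SYNC, marked CMD_T_SENT, and executed
 *    on its own. SIMPLE cmds that lost the race and landed on the
 *    delayed list take this same sync path, one at a time.
 * 4. transport_complete_ordered_sync(): if no more delayed cmds are
 *    queued, percpu_ref_resurrect() reopens the SIMPLE fast path;
 *    otherwise the work is rescheduled for the next delayed cmd.
 */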
include/target/target_core_base.h
index 05e3673607b8c069ffd4b66d25536d7e6ad420ec..a52d4967c0d3129b384bd2496583667c8c802c10 100644
@@ -157,6 +157,7 @@ enum se_cmd_flags_table {
        SCF_USE_CPUID                           = (1 << 16),
        SCF_TASK_ATTR_SET                       = (1 << 17),
        SCF_TREAT_READ_AS_NORMAL                = (1 << 18),
+       SCF_TASK_ORDERED_SYNC                   = (1 << 19),
 };
 
 /*
@@ -833,9 +834,8 @@ struct se_device {
        atomic_long_t           aborts_no_task;
        struct se_dev_io_stats __percpu *stats;
        /* Active commands on this virtual SE device */
-       atomic_t                non_ordered;
+       struct percpu_ref       non_ordered;
        bool                    ordered_sync_in_progress;
-       atomic_t                delayed_cmd_count;
        atomic_t                dev_qf_count;
        u32                     export_count;
        spinlock_t              delayed_cmd_lock;