scsi: target: Move I/O path stats to per CPU
author     Mike Christie <michael.christie@oracle.com>
           Thu, 24 Apr 2025 03:26:32 +0000 (22:26 -0500)
committer  Martin K. Petersen <martin.petersen@oracle.com>
           Tue, 29 Apr 2025 01:47:55 +0000 (21:47 -0400)
The atomic counters used in the main I/O path are causing performance
issues when using higher performance backend devices and multiple
queues. This moves the stats to per-CPU counters. Combined with the
next patch, which moves the non_ordered/delayed_cmd_count to per CPU,
this increases IOPS by up to 33% for 8K I/Os when using 4 or more
queues.
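
In outline, the pattern applied below (a condensed sketch built from
the struct and helper names this patch adds, not a complete listing):

    /* Allocation: one instance of the stats struct per possible CPU. */
    dev->stats = alloc_percpu(struct se_dev_io_stats);

    /* I/O path: lockless update of the local CPU's counters, so CPUs
     * no longer contend on a shared cache line for every command.
     */
    this_cpu_inc(dev->stats->total_cmds);
    this_cpu_add(dev->stats->write_bytes, se_cmd->data_length);

    /* configfs read side: sum the counters over all possible CPUs. */
    u32 cmds = 0;
    unsigned int cpu;

    for_each_possible_cpu(cpu)
            cmds += per_cpu_ptr(dev->stats, cpu)->total_cmds;

Because the se_dev_entry stats are now a separate percpu allocation,
the entry can no longer be freed with kfree_rcu(); the patch switches
to call_rcu() with a callback that frees both the percpu stats and
the entry itself.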

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Link: https://lore.kernel.org/r/20250424032741.16216-2-michael.christie@oracle.com
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
drivers/target/target_core_device.c
drivers/target/target_core_stat.c
include/target/target_core_base.h

drivers/target/target_core_device.c
index cc2da086f96e2f5a4245644641bcfaabefb1bb79..39aad464c0bfcecb3809d2007ef077276f6fd0ba 100644
@@ -55,14 +55,14 @@ transport_lookup_cmd_lun(struct se_cmd *se_cmd)
        rcu_read_lock();
        deve = target_nacl_find_deve(nacl, se_cmd->orig_fe_lun);
        if (deve) {
-               atomic_long_inc(&deve->total_cmds);
+               this_cpu_inc(deve->stats->total_cmds);
 
                if (se_cmd->data_direction == DMA_TO_DEVICE)
-                       atomic_long_add(se_cmd->data_length,
-                                       &deve->write_bytes);
+                       this_cpu_add(deve->stats->write_bytes,
+                                    se_cmd->data_length);
                else if (se_cmd->data_direction == DMA_FROM_DEVICE)
-                       atomic_long_add(se_cmd->data_length,
-                                       &deve->read_bytes);
+                       this_cpu_add(deve->stats->read_bytes,
+                                    se_cmd->data_length);
 
                if ((se_cmd->data_direction == DMA_TO_DEVICE) &&
                    deve->lun_access_ro) {
@@ -126,14 +126,14 @@ out_unlock:
         * target_core_fabric_configfs.c:target_fabric_port_release
         */
        se_cmd->se_dev = rcu_dereference_raw(se_lun->lun_se_dev);
-       atomic_long_inc(&se_cmd->se_dev->num_cmds);
+       this_cpu_inc(se_cmd->se_dev->stats->total_cmds);
 
        if (se_cmd->data_direction == DMA_TO_DEVICE)
-               atomic_long_add(se_cmd->data_length,
-                               &se_cmd->se_dev->write_bytes);
+               this_cpu_add(se_cmd->se_dev->stats->write_bytes,
+                            se_cmd->data_length);
        else if (se_cmd->data_direction == DMA_FROM_DEVICE)
-               atomic_long_add(se_cmd->data_length,
-                               &se_cmd->se_dev->read_bytes);
+               this_cpu_add(se_cmd->se_dev->stats->read_bytes,
+                            se_cmd->data_length);
 
        return ret;
 }
@@ -322,6 +322,7 @@ int core_enable_device_list_for_node(
        struct se_portal_group *tpg)
 {
        struct se_dev_entry *orig, *new;
+       int ret = 0;
 
        new = kzalloc(sizeof(*new), GFP_KERNEL);
        if (!new) {
@@ -329,6 +330,12 @@ int core_enable_device_list_for_node(
                return -ENOMEM;
        }
 
+       new->stats = alloc_percpu(struct se_dev_entry_io_stats);
+       if (!new->stats) {
+               ret = -ENOMEM;
+               goto free_deve;
+       }
+
        spin_lock_init(&new->ua_lock);
        INIT_LIST_HEAD(&new->ua_list);
        INIT_LIST_HEAD(&new->lun_link);
@@ -351,8 +358,8 @@ int core_enable_device_list_for_node(
                               " for dynamic -> explicit NodeACL conversion:"
                                " %s\n", nacl->initiatorname);
                        mutex_unlock(&nacl->lun_entry_mutex);
-                       kfree(new);
-                       return -EINVAL;
+                       ret = -EINVAL;
+                       goto free_stats;
                }
                if (orig->se_lun_acl != NULL) {
                        pr_warn_ratelimited("Detected existing explicit"
@@ -360,8 +367,8 @@ int core_enable_device_list_for_node(
                                " mapped_lun: %llu, failing\n",
                                 nacl->initiatorname, mapped_lun);
                        mutex_unlock(&nacl->lun_entry_mutex);
-                       kfree(new);
-                       return -EINVAL;
+                       ret = -EINVAL;
+                       goto free_stats;
                }
 
                new->se_lun = lun;
@@ -394,6 +401,20 @@ int core_enable_device_list_for_node(
 
        target_luns_data_has_changed(nacl, new, true);
        return 0;
+
+free_stats:
+       free_percpu(new->stats);
+free_deve:
+       kfree(new);
+       return ret;
+}
+
+static void target_free_dev_entry(struct rcu_head *head)
+{
+       struct se_dev_entry *deve = container_of(head, struct se_dev_entry,
+                                                rcu_head);
+       free_percpu(deve->stats);
+       kfree(deve);
 }
 
 void core_disable_device_list_for_node(
@@ -443,7 +464,7 @@ void core_disable_device_list_for_node(
        kref_put(&orig->pr_kref, target_pr_kref_release);
        wait_for_completion(&orig->pr_comp);
 
-       kfree_rcu(orig, rcu_head);
+       call_rcu(&orig->rcu_head, target_free_dev_entry);
 
        core_scsi3_free_pr_reg_from_nacl(dev, nacl);
        target_luns_data_has_changed(nacl, NULL, false);
@@ -689,11 +710,13 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
        if (!dev)
                return NULL;
 
+       dev->stats = alloc_percpu(struct se_dev_io_stats);
+       if (!dev->stats)
+               goto free_device;
+
        dev->queues = kcalloc(nr_cpu_ids, sizeof(*dev->queues), GFP_KERNEL);
-       if (!dev->queues) {
-               hba->backend->ops->free_device(dev);
-               return NULL;
-       }
+       if (!dev->queues)
+               goto free_stats;
 
        dev->queue_cnt = nr_cpu_ids;
        for (i = 0; i < dev->queue_cnt; i++) {
@@ -707,6 +730,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
                INIT_WORK(&q->sq.work, target_queued_submit_work);
        }
 
+
        dev->se_hba = hba;
        dev->transport = hba->backend->ops;
        dev->transport_flags = dev->transport->transport_flags_default;
@@ -791,6 +815,12 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
                sizeof(dev->t10_wwn.revision));
 
        return dev;
+
+free_stats:
+       free_percpu(dev->stats);
+free_device:
+       hba->backend->ops->free_device(dev);
+       return NULL;
 }
 
 /*
@@ -1001,6 +1031,7 @@ void target_free_device(struct se_device *dev)
                dev->transport->free_prot(dev);
 
        kfree(dev->queues);
+       free_percpu(dev->stats);
        dev->transport->free_device(dev);
 }
 
drivers/target/target_core_stat.c
index 210648a0092e2703c551b4a6a128f83932a8a7ce..6bdf2d8bd6942269df376a4c188d174bf89b8d27 100644
@@ -280,30 +280,51 @@ static ssize_t target_stat_lu_num_cmds_show(struct config_item *item,
                char *page)
 {
        struct se_device *dev = to_stat_lu_dev(item);
+       struct se_dev_io_stats *stats;
+       unsigned int cpu;
+       u32 cmds = 0;
+
+       for_each_possible_cpu(cpu) {
+               stats = per_cpu_ptr(dev->stats, cpu);
+               cmds += stats->total_cmds;
+       }
 
        /* scsiLuNumCommands */
-       return snprintf(page, PAGE_SIZE, "%lu\n",
-                       atomic_long_read(&dev->num_cmds));
+       return snprintf(page, PAGE_SIZE, "%u\n", cmds);
 }
 
 static ssize_t target_stat_lu_read_mbytes_show(struct config_item *item,
                char *page)
 {
        struct se_device *dev = to_stat_lu_dev(item);
+       struct se_dev_io_stats *stats;
+       unsigned int cpu;
+       u32 bytes = 0;
+
+       for_each_possible_cpu(cpu) {
+               stats = per_cpu_ptr(dev->stats, cpu);
+               bytes += stats->read_bytes;
+       }
 
        /* scsiLuReadMegaBytes */
-       return snprintf(page, PAGE_SIZE, "%lu\n",
-                       atomic_long_read(&dev->read_bytes) >> 20);
+       return snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
 }
 
 static ssize_t target_stat_lu_write_mbytes_show(struct config_item *item,
                char *page)
 {
        struct se_device *dev = to_stat_lu_dev(item);
+       struct se_dev_io_stats *stats;
+       unsigned int cpu;
+       u32 bytes = 0;
+
+       for_each_possible_cpu(cpu) {
+               stats = per_cpu_ptr(dev->stats, cpu);
+               bytes += stats->write_bytes;
+       }
 
        /* scsiLuWrittenMegaBytes */
-       return snprintf(page, PAGE_SIZE, "%lu\n",
-                       atomic_long_read(&dev->write_bytes) >> 20);
+       return snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
 }
 
 static ssize_t target_stat_lu_resets_show(struct config_item *item, char *page)
@@ -1019,8 +1040,11 @@ static ssize_t target_stat_auth_num_cmds_show(struct config_item *item,
 {
        struct se_lun_acl *lacl = auth_to_lacl(item);
        struct se_node_acl *nacl = lacl->se_lun_nacl;
+       struct se_dev_entry_io_stats *stats;
        struct se_dev_entry *deve;
+       unsigned int cpu;
        ssize_t ret;
+       u32 cmds = 0;
 
        rcu_read_lock();
        deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
@@ -1028,9 +1052,14 @@ static ssize_t target_stat_auth_num_cmds_show(struct config_item *item,
                rcu_read_unlock();
                return -ENODEV;
        }
+
+       for_each_possible_cpu(cpu) {
+               stats = per_cpu_ptr(deve->stats, cpu);
+               cmds += stats->total_cmds;
+       }
+
        /* scsiAuthIntrOutCommands */
-       ret = snprintf(page, PAGE_SIZE, "%lu\n",
-                      atomic_long_read(&deve->total_cmds));
+       ret = snprintf(page, PAGE_SIZE, "%u\n", cmds);
        rcu_read_unlock();
        return ret;
 }
@@ -1040,8 +1069,11 @@ static ssize_t target_stat_auth_read_mbytes_show(struct config_item *item,
 {
        struct se_lun_acl *lacl = auth_to_lacl(item);
        struct se_node_acl *nacl = lacl->se_lun_nacl;
+       struct se_dev_entry_io_stats *stats;
        struct se_dev_entry *deve;
+       unsigned int cpu;
        ssize_t ret;
+       u32 bytes = 0;
 
        rcu_read_lock();
        deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
@@ -1049,9 +1081,14 @@ static ssize_t target_stat_auth_read_mbytes_show(struct config_item *item,
                rcu_read_unlock();
                return -ENODEV;
        }
+
+       for_each_possible_cpu(cpu) {
+               stats = per_cpu_ptr(deve->stats, cpu);
+               bytes += stats->read_bytes;
+       }
+
        /* scsiAuthIntrReadMegaBytes */
-       ret = snprintf(page, PAGE_SIZE, "%u\n",
-                     (u32)(atomic_long_read(&deve->read_bytes) >> 20));
+       ret = snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
        rcu_read_unlock();
        return ret;
 }
@@ -1061,8 +1098,11 @@ static ssize_t target_stat_auth_write_mbytes_show(struct config_item *item,
 {
        struct se_lun_acl *lacl = auth_to_lacl(item);
        struct se_node_acl *nacl = lacl->se_lun_nacl;
+       struct se_dev_entry_io_stats *stats;
        struct se_dev_entry *deve;
+       unsigned int cpu;
        ssize_t ret;
+       u32 bytes = 0;
 
        rcu_read_lock();
        deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
@@ -1070,9 +1110,14 @@ static ssize_t target_stat_auth_write_mbytes_show(struct config_item *item,
                rcu_read_unlock();
                return -ENODEV;
        }
+
+       for_each_possible_cpu(cpu) {
+               stats = per_cpu_ptr(deve->stats, cpu);
+               bytes += stats->write_bytes;
+       }
+
        /* scsiAuthIntrWrittenMegaBytes */
-       ret = snprintf(page, PAGE_SIZE, "%u\n",
-                     (u32)(atomic_long_read(&deve->write_bytes) >> 20));
+       ret = snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
        rcu_read_unlock();
        return ret;
 }
include/target/target_core_base.h
index 97099a5e3f6c800f94f011b93ef7bf97c28a905b..05e3673607b8c069ffd4b66d25536d7e6ad420ec 100644
@@ -669,15 +669,19 @@ struct se_lun_acl {
        struct se_ml_stat_grps  ml_stat_grps;
 };
 
+struct se_dev_entry_io_stats {
+       u32                     total_cmds;
+       u32                     read_bytes;
+       u32                     write_bytes;
+};
+
 struct se_dev_entry {
        u64                     mapped_lun;
        u64                     pr_res_key;
        u64                     creation_time;
        bool                    lun_access_ro;
        u32                     attach_count;
-       atomic_long_t           total_cmds;
-       atomic_long_t           read_bytes;
-       atomic_long_t           write_bytes;
+       struct se_dev_entry_io_stats __percpu   *stats;
        /* Used for PR SPEC_I_PT=1 and REGISTER_AND_MOVE */
        struct kref             pr_kref;
        struct completion       pr_comp;
@@ -800,6 +804,12 @@ struct se_device_queue {
        struct se_cmd_queue     sq;
 };
 
+struct se_dev_io_stats {
+       u32                     total_cmds;
+       u32                     read_bytes;
+       u32                     write_bytes;
+};
+
 struct se_device {
        /* Used for SAM Task Attribute ordering */
        u32                     dev_cur_ordered_id;
@@ -821,9 +831,7 @@ struct se_device {
        atomic_long_t           num_resets;
        atomic_long_t           aborts_complete;
        atomic_long_t           aborts_no_task;
-       atomic_long_t           num_cmds;
-       atomic_long_t           read_bytes;
-       atomic_long_t           write_bytes;
+       struct se_dev_io_stats __percpu *stats;
        /* Active commands on this virtual SE device */
        atomic_t                non_ordered;
        bool                    ordered_sync_in_progress;