blk-cgroup: don't update io stat for root cgroup

[linux-block.git] / block / blk-cgroup.c
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c

index ed761c62ad0a72b581d58f44d4be22194e003901..9ac1efb053e08cca76d732b90989a219536bb802 100644 (file)
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -33,6 +33,7 @@
  #include "blk-cgroup.h"
  #include "blk-ioprio.h"
  #include "blk-throttle.h"
+#include "blk-rq-qos.h"
  
  /*
   * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
@@ -59,6 +60,37 @@ static struct workqueue_struct *blkcg_punt_bio_wq;
  
  #define BLKG_DESTROY_BATCH_SIZE  64
  
+/*
+ * Lockless lists for tracking IO stats update
+ *
+ * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
+ * There are multiple blkg's (one for each block device) attached to each
+ * blkcg. The rstat code keeps track of which cpu has IO stats updated,
+ * but it doesn't know which blkg has the updated stats. If there are many
+ * block devices in a system, the cost of iterating all the blkg's to flush
+ * out the IO stats can be high. To reduce such overhead, a set of percpu
+ * lockless lists (lhead) per blkcg are used to track the set of recently
+ * updated iostat_cpu's since the last flush. An iostat_cpu will be put
+ * onto the lockless list on the update side [blk_cgroup_bio_start()] if
+ * not there yet and then removed when being flushed [blkcg_rstat_flush()].
+ * A reference to the blkg is taken when its iostat_cpu is queued on the
+ * update side and dropped again once the flush side has processed it, to
+ * protect against blkg removal while the entry sits on a list.
+ *
+ * init_blkcg_llists() sets up that per-blkcg state: it allocates
+ * @blkcg->lhead as a percpu array of llist_head (GFP_KERNEL) and
+ * initializes the list head of every possible CPU to empty. On success
+ * the allocation is stored in @blkcg->lhead and must eventually be
+ * released with free_percpu(); on failure @blkcg->lhead is left NULL and
+ * nothing needs to be undone.
+ *
+ * Return: 0 if successful or -ENOMEM if allocation fails.
+ */
+static int init_blkcg_llists(struct blkcg *blkcg)
+{
+       int cpu;
+
+       blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
+       if (!blkcg->lhead)
+               return -ENOMEM;
+
+       for_each_possible_cpu(cpu)
+               init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
+       return 0;
+}
+
  /**
   * blkcg_css - find the current css
   *
@@ -236,8 +268,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
         blkg->blkcg = blkcg;
  
         u64_stats_init(&blkg->iostat.sync);
-       for_each_possible_cpu(cpu)
+       for_each_possible_cpu(cpu) {
                 u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
+               per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
+       }
  
         for (i = 0; i < BLKCG_MAX_POLS; i++) {
                 struct blkcg_policy *pol = blkcg_policy[i];
@@ -577,7 +611,7 @@ EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
   * @pd: policy private data of interest
   * @v: value to print
   *
- * Print @v to @sf for the device assocaited with @pd.
+ * Print @v to @sf for the device associated with @pd.
   */
  u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
  {
@@ -765,7 +799,7 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
  
  /**
   * blkg_conf_finish - finish up per-blkg config update
- * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep()
+ * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
   *
   * Finish up after per-blkg config update.  This function must be paired
   * with blkg_conf_prep().
@@ -827,7 +861,9 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
  static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
  {
         struct blkcg *blkcg = css_to_blkcg(css);
-       struct blkcg_gq *blkg;
+       struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+       struct llist_node *lnode;
+       struct blkg_iostat_set *bisc, *next_bisc;
  
         /* Root-level stats are sourced from system-wide IO stats */
         if (!cgroup_parent(css->cgroup))
@@ -835,12 +871,21 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
  
         rcu_read_lock();
  
-       hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+       lnode = llist_del_all(lhead);
+       if (!lnode)
+               goto out;
+
+       /*
+        * Iterate only the iostat_cpu's queued in the lockless list.
+        */
+       llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
+               struct blkcg_gq *blkg = bisc->blkg;
                 struct blkcg_gq *parent = blkg->parent;
-               struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
                 struct blkg_iostat cur;
                 unsigned int seq;
  
+               WRITE_ONCE(bisc->lqueued, false);
+
                 /* fetch the current per-cpu values */
                 do {
                         seq = u64_stats_fetch_begin(&bisc->sync);
@@ -853,8 +898,10 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
                 if (parent && parent->parent)
                         blkcg_iostat_update(parent, &blkg->iostat.cur,
                                             &blkg->iostat.last);
+               percpu_ref_put(&blkg->refcnt);
         }
  
+out:
         rcu_read_unlock();
  }
  
@@ -1132,6 +1179,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
  
         mutex_unlock(&blkcg_pol_mutex);
  
+       free_percpu(blkcg->lhead);
         kfree(blkcg);
  }
  
@@ -1139,7 +1187,6 @@ static struct cgroup_subsys_state *
  blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
  {
         struct blkcg *blkcg;
-       struct cgroup_subsys_state *ret;
         int i;
  
         mutex_lock(&blkcg_pol_mutex);
@@ -1148,12 +1195,13 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
                 blkcg = &blkcg_root;
         } else {
                 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
-               if (!blkcg) {
-                       ret = ERR_PTR(-ENOMEM);
+               if (!blkcg)
                         goto unlock;
-               }
         }
  
+       if (init_blkcg_llists(blkcg))
+               goto free_blkcg;
+
         for (i = 0; i < BLKCG_MAX_POLS ; i++) {
                 struct blkcg_policy *pol = blkcg_policy[i];
                 struct blkcg_policy_data *cpd;
@@ -1168,10 +1216,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
                         continue;
  
                 cpd = pol->cpd_alloc_fn(GFP_KERNEL);
-               if (!cpd) {
-                       ret = ERR_PTR(-ENOMEM);
+               if (!cpd)
                         goto free_pd_blkcg;
-               }
+
                 blkcg->cpd[i] = cpd;
                 cpd->blkcg = blkcg;
                 cpd->plid = i;
@@ -1195,12 +1242,13 @@ free_pd_blkcg:
         for (i--; i >= 0; i--)
                 if (blkcg->cpd[i])
                         blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
-
+       free_percpu(blkcg->lhead);
+free_blkcg:
         if (blkcg != &blkcg_root)
                 kfree(blkcg);
  unlock:
         mutex_unlock(&blkcg_pol_mutex);
-       return ret;
+       return ERR_PTR(-ENOMEM);
  }
  
  static int blkcg_css_online(struct cgroup_subsys_state *css)
@@ -1275,6 +1323,7 @@ err_unlock:
  void blkcg_exit_disk(struct gendisk *disk)
  {
         blkg_destroy_all(disk);
+       rq_qos_exit(disk->queue);
         blk_throtl_exit(disk);
  }
  
@@ -1406,6 +1455,10 @@ retry:
                 list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
                         pol->pd_init_fn(blkg->pd[pol->plid]);
  
+       if (pol->pd_online_fn)
+               list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
+                       pol->pd_online_fn(blkg->pd[pol->plid]);
+
         __set_bit(pol->plid, q->blkcg_pols);
         ret = 0;
  
@@ -1784,7 +1837,7 @@ out:
  
  /**
   * blkcg_schedule_throttle - this task needs to check for throttling
- * @gendisk: disk to throttle
+ * @disk: disk to throttle
   * @use_memdelay: do we charge this to memory delay for PSI
   *
   * This is called by the IO controller when we know there's delay accumulated
@@ -1943,10 +1996,15 @@ static int blk_cgroup_io_type(struct bio *bio)
  
  void blk_cgroup_bio_start(struct bio *bio)
  {
+       struct blkcg *blkcg = bio->bi_blkg->blkcg;
         int rwd = blk_cgroup_io_type(bio), cpu;
         struct blkg_iostat_set *bis;
         unsigned long flags;
  
+       /* Root-level stats are sourced from system-wide IO stats */
+       if (!cgroup_parent(blkcg->css.cgroup))
+               return;
+
         cpu = get_cpu();
         bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
         flags = u64_stats_update_begin_irqsave(&bis->sync);
@@ -1961,9 +2019,21 @@ void blk_cgroup_bio_start(struct bio *bio)
         }
         bis->cur.ios[rwd]++;
  
+       /*
+        * If the iostat_cpu isn't in a lockless list, put it into the
+        * list to indicate that a stat update is pending.
+        */
+       if (!READ_ONCE(bis->lqueued)) {
+               struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
+
+               llist_add(&bis->lnode, lhead);
+               WRITE_ONCE(bis->lqueued, true);
+               percpu_ref_get(&bis->blkg->refcnt);
+       }
+
         u64_stats_update_end_irqrestore(&bis->sync, flags);
         if (cgroup_subsys_on_dfl(io_cgrp_subsys))
-               cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
+               cgroup_rstat_updated(blkcg->css.cgroup, cpu);
         put_cpu();
  }