writeback, cgroup: keep list of inodes attached to bdi_writeback
authorRoman Gushchin <guro@fb.com>
Tue, 29 Jun 2021 02:35:53 +0000 (19:35 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Tue, 29 Jun 2021 17:53:48 +0000 (10:53 -0700)
Currently there is no way to iterate over inodes attached to a specific
cgwb structure.  It limits the ability to efficiently reclaim the
writeback structure itself and associated memory and block cgroup
structures without scanning all inodes belonging to a sb, which can be
prohibitively expensive.

While an inode is dirty or under active writeback, it belongs to one of the
bdi_writeback's io lists: b_dirty, b_io, b_more_io and b_dirty_time.  Once
cleaned up, it's removed from all io lists.  So the inode->i_io_list can
be reused to maintain the list of inodes, attached to a bdi_writeback
structure.

This patch introduces a new wb->b_attached list, which contains all inodes
which were dirty at least once and are attached to the given cgwb.  Inodes
attached to the root bdi_writeback structures are never placed on such
list.  The following patch will use this list to try to release cgwb
structures more efficiently.

Link: https://lkml.kernel.org/r/20210608230225.2078447-6-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Suggested-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Dennis Zhou <dennis@kernel.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Jan Kara <jack@suse.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/fs-writeback.c
include/linux/backing-dev-defs.h
mm/backing-dev.c

index 96974e13a20358b31d87c05921294f60bc0a1236..87b305ee5348c30b366f4da3bc7e65cbf27db39c 100644 (file)
@@ -131,25 +131,6 @@ static bool inode_io_list_move_locked(struct inode *inode,
        return false;
 }
 
-/**
- * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
- * @inode: inode to be removed
- * @wb: bdi_writeback @inode is being removed from
- *
- * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
- * clear %WB_has_dirty_io if all are empty afterwards.
- */
-static void inode_io_list_del_locked(struct inode *inode,
-                                    struct bdi_writeback *wb)
-{
-       assert_spin_locked(&wb->list_lock);
-       assert_spin_locked(&inode->i_lock);
-
-       inode->i_state &= ~I_SYNC_QUEUED;
-       list_del_init(&inode->i_io_list);
-       wb_io_lists_depopulated(wb);
-}
-
 static void wb_wakeup(struct bdi_writeback *wb)
 {
        spin_lock_bh(&wb->work_lock);
@@ -278,6 +259,28 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
 }
 EXPORT_SYMBOL_GPL(__inode_attach_wb);
 
+/**
+ * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
+ * @inode: inode of interest with i_lock held
+ * @wb: target bdi_writeback
+ *
+ * Remove the inode from wb's io lists and if necessary put onto b_attached
+ * list.  Only inodes attached to cgwb's are kept on this list.
+ */
+static void inode_cgwb_move_to_attached(struct inode *inode,
+                                       struct bdi_writeback *wb)
+{
+       assert_spin_locked(&wb->list_lock);
+       assert_spin_locked(&inode->i_lock);
+
+       inode->i_state &= ~I_SYNC_QUEUED;
+       if (wb != &wb->bdi->wb)
+               list_move(&inode->i_io_list, &wb->b_attached);
+       else
+               list_del_init(&inode->i_io_list);
+       wb_io_lists_depopulated(wb);
+}
+
 /**
  * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
  * @inode: inode of interest with i_lock held
@@ -418,21 +421,28 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
        wb_get(new_wb);
 
        /*
-        * Transfer to @new_wb's IO list if necessary.  The specific list
-        * @inode was on is ignored and the inode is put on ->b_dirty which
-        * is always correct including from ->b_dirty_time.  The transfer
-        * preserves @inode->dirtied_when ordering.
+        * Transfer to @new_wb's IO list if necessary.  If the @inode is dirty,
+        * the specific list @inode was on is ignored and the @inode is put on
+        * ->b_dirty which is always correct including from ->b_dirty_time.
+        * The transfer preserves @inode->dirtied_when ordering.  If the @inode
+        * was clean, it means it was on the b_attached list, so move it onto
+        * the b_attached list of @new_wb.
         */
        if (!list_empty(&inode->i_io_list)) {
-               struct inode *pos;
-
-               inode_io_list_del_locked(inode, old_wb);
                inode->i_wb = new_wb;
-               list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
-                       if (time_after_eq(inode->dirtied_when,
-                                         pos->dirtied_when))
-                               break;
-               inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
+
+               if (inode->i_state & I_DIRTY_ALL) {
+                       struct inode *pos;
+
+                       list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
+                               if (time_after_eq(inode->dirtied_when,
+                                                 pos->dirtied_when))
+                                       break;
+                       inode_io_list_move_locked(inode, new_wb,
+                                                 pos->i_io_list.prev);
+               } else {
+                       inode_cgwb_move_to_attached(inode, new_wb);
+               }
        } else {
                inode->i_wb = new_wb;
        }
@@ -1021,6 +1031,17 @@ fs_initcall(cgroup_writeback_init);
 static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
 static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
 
+static void inode_cgwb_move_to_attached(struct inode *inode,
+                                       struct bdi_writeback *wb)
+{
+       assert_spin_locked(&wb->list_lock);
+       assert_spin_locked(&inode->i_lock);
+
+       inode->i_state &= ~I_SYNC_QUEUED;
+       list_del_init(&inode->i_io_list);
+       wb_io_lists_depopulated(wb);
+}
+
 static struct bdi_writeback *
 locked_inode_to_wb_and_lock_list(struct inode *inode)
        __releases(&inode->i_lock)
@@ -1121,7 +1142,11 @@ void inode_io_list_del(struct inode *inode)
 
        wb = inode_to_wb_and_lock_list(inode);
        spin_lock(&inode->i_lock);
-       inode_io_list_del_locked(inode, wb);
+
+       inode->i_state &= ~I_SYNC_QUEUED;
+       list_del_init(&inode->i_io_list);
+       wb_io_lists_depopulated(wb);
+
        spin_unlock(&inode->i_lock);
        spin_unlock(&wb->list_lock);
 }
@@ -1434,7 +1459,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
                inode->i_state &= ~I_SYNC_QUEUED;
        } else {
                /* The inode is clean. Remove from writeback lists. */
-               inode_io_list_del_locked(inode, wb);
+               inode_cgwb_move_to_attached(inode, wb);
        }
 }
 
@@ -1586,7 +1611,7 @@ static int writeback_single_inode(struct inode *inode,
         * responsible for the writeback lists.
         */
        if (!(inode->i_state & I_DIRTY_ALL))
-               inode_io_list_del_locked(inode, wb);
+               inode_cgwb_move_to_attached(inode, wb);
        spin_unlock(&wb->list_lock);
        inode_sync_complete(inode);
 out:
index fff9367a6348562b07d12240da34f0b55bd7fc94..e5dc238ebe4f85de626903a152fa070f227b23b3 100644 (file)
@@ -154,6 +154,7 @@ struct bdi_writeback {
        struct cgroup_subsys_state *blkcg_css; /* and blkcg */
        struct list_head memcg_node;    /* anchored at memcg->cgwb_list */
        struct list_head blkcg_node;    /* anchored at blkcg->cgwb_list */
+       struct list_head b_attached;    /* attached inodes, protected by list_lock */
 
        union {
                struct work_struct release_work;
index 576220acd686a63ad1276610425461ffbd72cbcf..54c5dc4b8c24eb99d7f7405f2fb50ca60800bfe9 100644 (file)
@@ -396,6 +396,7 @@ static void cgwb_release_workfn(struct work_struct *work)
        fprop_local_destroy_percpu(&wb->memcg_completions);
        percpu_ref_exit(&wb->refcnt);
        wb_exit(wb);
+       WARN_ON_ONCE(!list_empty(&wb->b_attached));
        kfree_rcu(wb, rcu);
 }
 
@@ -472,6 +473,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
 
        wb->memcg_css = memcg_css;
        wb->blkcg_css = blkcg_css;
+       INIT_LIST_HEAD(&wb->b_attached);
        INIT_WORK(&wb->release_work, cgwb_release_workfn);
        set_bit(WB_registered, &wb->state);