ext4: rework fast commit commit path
authorHarshad Shirwadkar <harshadshirwadkar@gmail.com>
Thu, 8 May 2025 17:59:03 +0000 (17:59 +0000)
committerTheodore Ts'o <tytso@mit.edu>
Fri, 9 May 2025 01:56:17 +0000 (21:56 -0400)
This patch reworks fast commit's commit path to remove locking the
journal for the entire duration of a fast commit. Instead, we only lock
the journal while marking all the eligible inodes as "committing". This
allows handles to make progress in parallel with the fast commit.

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Link: https://patch.msgid.link/20250508175908.1004880-5-harshadshirwadkar@gmail.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
fs/ext4/ext4.h
fs/ext4/fast_commit.c
fs/jbd2/journal.c

index 79dfb57a7046c4af7ad87dcad514e8656e1ab40d..493d9ac7a577e81109d686de589362c772eb7620 100644 (file)
@@ -1916,6 +1916,7 @@ enum {
        EXT4_STATE_LUSTRE_EA_INODE,     /* Lustre-style ea_inode */
        EXT4_STATE_VERITY_IN_PROGRESS,  /* building fs-verity Merkle tree */
        EXT4_STATE_FC_COMMITTING,       /* Fast commit ongoing */
+       EXT4_STATE_FC_FLUSHING_DATA,    /* Fast commit flushing data */
        EXT4_STATE_ORPHAN_FILE,         /* Inode orphaned in orphan file */
 };
 
index c4d3c71d5e6c616835d15b383c0b918d472bedee..a2cb4d965dc14c7abfab4b0ffd060dcac2889d9c 100644 (file)
@@ -287,24 +287,55 @@ void ext4_fc_del(struct inode *inode)
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_fc_dentry_update *fc_dentry;
+       wait_queue_head_t *wq;
 
        if (ext4_fc_disabled(inode->i_sb))
                return;
 
-restart:
        spin_lock(&sbi->s_fc_lock);
        if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
                spin_unlock(&sbi->s_fc_lock);
                return;
        }
 
-       if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
-               ext4_fc_wait_committing_inode(inode);
-               goto restart;
+       /*
+        * Since ext4_fc_del is called from ext4_evict_inode while having a
+        * handle open, there is no need for us to wait here even if a fast
+        * commit is going on. That is because, if this inode is being
+        * committed, ext4_mark_inode_dirty would have waited for inode commit
+        * operation to finish before we come here. So, by the time we come
+        * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
+        * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
+        * here.
+        *
+        * We may come here without any handles open in the "no_delete" case of
+        * ext4_evict_inode as well. However, if that happens, we first mark the
+        * file system as fast commit ineligible anyway. So, even in that case,
+        * it is okay to remove the inode from the fc list.
+        */
+       WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
+               && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
+       while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
+#if (BITS_PER_LONG < 64)
+               DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+                               EXT4_STATE_FC_FLUSHING_DATA);
+               wq = bit_waitqueue(&ei->i_state_flags,
+                                  EXT4_STATE_FC_FLUSHING_DATA);
+#else
+               DEFINE_WAIT_BIT(wait, &ei->i_flags,
+                               EXT4_STATE_FC_FLUSHING_DATA);
+               wq = bit_waitqueue(&ei->i_flags,
+                                  EXT4_STATE_FC_FLUSHING_DATA);
+#endif
+               prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+               if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
+                       spin_unlock(&sbi->s_fc_lock);
+                       schedule();
+                       spin_lock(&sbi->s_fc_lock);
+               }
+               finish_wait(wq, &wait.wq_entry);
        }
-
-       if (!list_empty(&ei->i_fc_list))
-               list_del_init(&ei->i_fc_list);
+       list_del_init(&ei->i_fc_list);
 
        /*
         * Since this inode is getting removed, let's also remove all FC
@@ -325,8 +356,6 @@ restart:
 
        release_dentry_name_snapshot(&fc_dentry->fcd_name);
        kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
-
-       return;
 }
 
 /*
@@ -590,9 +619,6 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
        if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
                return;
 
-       if (!list_empty(&ei->i_fc_list))
-               return;
-
        /*
         * If we come here, we may sleep while waiting for the inode to
         * commit. We shouldn't be holding i_data_sem when we go to sleep since
@@ -988,61 +1014,25 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 }
 
 
-/* Submit data for all the fast commit inodes */
-static int ext4_fc_submit_inode_data_all(journal_t *journal)
+/* Flushes data of all the inodes in the commit queue. */
+static int ext4_fc_flush_data(journal_t *journal)
 {
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei;
        int ret = 0;
 
-       spin_lock(&sbi->s_fc_lock);
        list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
-               ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
-               while (atomic_read(&ei->i_fc_updates)) {
-                       DEFINE_WAIT(wait);
-
-                       prepare_to_wait(&ei->i_fc_wait, &wait,
-                                               TASK_UNINTERRUPTIBLE);
-                       if (atomic_read(&ei->i_fc_updates)) {
-                               spin_unlock(&sbi->s_fc_lock);
-                               schedule();
-                               spin_lock(&sbi->s_fc_lock);
-                       }
-                       finish_wait(&ei->i_fc_wait, &wait);
-               }
-               spin_unlock(&sbi->s_fc_lock);
                ret = jbd2_submit_inode_data(journal, ei->jinode);
                if (ret)
                        return ret;
-               spin_lock(&sbi->s_fc_lock);
        }
-       spin_unlock(&sbi->s_fc_lock);
-
-       return ret;
-}
-
-/* Wait for completion of data for all the fast commit inodes */
-static int ext4_fc_wait_inode_data_all(journal_t *journal)
-{
-       struct super_block *sb = journal->j_private;
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
-       struct ext4_inode_info *pos, *n;
-       int ret = 0;
 
-       spin_lock(&sbi->s_fc_lock);
-       list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
-               if (!ext4_test_inode_state(&pos->vfs_inode,
-                                          EXT4_STATE_FC_COMMITTING))
-                       continue;
-               spin_unlock(&sbi->s_fc_lock);
-
-               ret = jbd2_wait_inode_data(journal, pos->jinode);
+       list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+               ret = jbd2_wait_inode_data(journal, ei->jinode);
                if (ret)
                        return ret;
-               spin_lock(&sbi->s_fc_lock);
        }
-       spin_unlock(&sbi->s_fc_lock);
 
        return 0;
 }
@@ -1123,26 +1113,81 @@ static int ext4_fc_perform_commit(journal_t *journal)
        int ret = 0;
        u32 crc = 0;
 
-       ret = ext4_fc_submit_inode_data_all(journal);
-       if (ret)
-               return ret;
+       /*
+        * Step 1: Mark all inodes on s_fc_q[MAIN] with
+        * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being
+        * freed until the data flush is over.
+        */
+       spin_lock(&sbi->s_fc_lock);
+       list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+               ext4_set_inode_state(&iter->vfs_inode,
+                                    EXT4_STATE_FC_FLUSHING_DATA);
+       }
+       spin_unlock(&sbi->s_fc_lock);
+
+       /* Step 2: Flush data for all the eligible inodes. */
+       ret = ext4_fc_flush_data(journal);
 
-       ret = ext4_fc_wait_inode_data_all(journal);
+       /*
+        * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning
+        * any error from step 2. This ensures that waiters waiting on
+        * EXT4_STATE_FC_FLUSHING_DATA can resume.
+        */
+       spin_lock(&sbi->s_fc_lock);
+       list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+               ext4_clear_inode_state(&iter->vfs_inode,
+                                      EXT4_STATE_FC_FLUSHING_DATA);
+#if (BITS_PER_LONG < 64)
+               wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA);
+#else
+               wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA);
+#endif
+       }
+
+       /*
+        * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before
+        * the waiter checks the bit. Pairs with implicit barrier in
+        * prepare_to_wait() in ext4_fc_del().
+        */
+       smp_mb();
+       spin_unlock(&sbi->s_fc_lock);
+
+       /*
+        * If we encountered error in Step 2, return it now after clearing
+        * EXT4_STATE_FC_FLUSHING_DATA bit.
+        */
        if (ret)
                return ret;
 
+
+       /* Step 4: Mark all inodes as being committed. */
+       jbd2_journal_lock_updates(journal);
        /*
-        * If file system device is different from journal device, issue a cache
-        * flush before we start writing fast commit blocks.
+        * The journal is now locked. No more handles can start and all the
+        * previous handles are now drained. We now mark the inodes on the
+        * commit queue as being committed.
+        */
+       spin_lock(&sbi->s_fc_lock);
+       list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+               ext4_set_inode_state(&iter->vfs_inode,
+                                    EXT4_STATE_FC_COMMITTING);
+       }
+       spin_unlock(&sbi->s_fc_lock);
+       jbd2_journal_unlock_updates(journal);
+
+       /*
+        * Step 5: If file system device is different from journal device,
+        * issue a cache flush before we start writing fast commit blocks.
         */
        if (journal->j_fs_dev != journal->j_dev)
                blkdev_issue_flush(journal->j_fs_dev);
 
        blk_start_plug(&plug);
+       /* Step 6: Write fast commit blocks to disk. */
        if (sbi->s_fc_bytes == 0) {
                /*
-                * Add a head tag only if this is the first fast commit
-                * in this TID.
+                * Step 6.1: Add a head tag only if this is the first fast
+                * commit in this TID.
                 */
                head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
                head.fc_tid = cpu_to_le32(
@@ -1154,6 +1199,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
                }
        }
 
+       /* Step 6.2: Now write all the dentry updates. */
        spin_lock(&sbi->s_fc_lock);
        ret = ext4_fc_commit_dentry_updates(journal, &crc);
        if (ret) {
@@ -1161,6 +1207,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
                goto out;
        }
 
+       /* Step 6.3: Now write all the changed inodes to disk. */
        list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                inode = &iter->vfs_inode;
                if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
@@ -1173,10 +1220,8 @@ static int ext4_fc_perform_commit(journal_t *journal)
                ret = ext4_fc_write_inode(inode, &crc);
                if (ret)
                        goto out;
-               spin_lock(&sbi->s_fc_lock);
        }
-       spin_unlock(&sbi->s_fc_lock);
-
+       /* Step 6.4: Finally write tail tag to conclude this fast commit. */
        ret = ext4_fc_write_tail(sb, crc);
 
 out:
@@ -1298,7 +1343,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 {
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-       struct ext4_inode_info *iter, *iter_n;
+       struct ext4_inode_info *ei;
        struct ext4_fc_dentry_update *fc_dentry;
 
        if (full && sbi->s_fc_bh)
@@ -1308,13 +1353,15 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
        jbd2_fc_release_bufs(journal);
 
        spin_lock(&sbi->s_fc_lock);
-       list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
-                                i_fc_list) {
-               list_del_init(&iter->i_fc_list);
-               ext4_clear_inode_state(&iter->vfs_inode,
+       while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
+               ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
+                                       struct ext4_inode_info,
+                                       i_fc_list);
+               list_del_init(&ei->i_fc_list);
+               ext4_clear_inode_state(&ei->vfs_inode,
                                       EXT4_STATE_FC_COMMITTING);
-               if (tid_geq(tid, iter->i_sync_tid)) {
-                       ext4_fc_reset_inode(&iter->vfs_inode);
+               if (tid_geq(tid, ei->i_sync_tid)) {
+                       ext4_fc_reset_inode(&ei->vfs_inode);
                } else if (full) {
                        /*
                         * We are called after a full commit, inode has been
@@ -1325,15 +1372,19 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
                         * time in that case (and tid doesn't increase so
                         * tid check above isn't reliable).
                         */
-                       list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list,
+                       list_add_tail(&ei->i_fc_list,
                                      &sbi->s_fc_q[FC_Q_STAGING]);
                }
-               /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
+               /*
+                * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+                * visible before we send the wakeup. Pairs with implicit
+                * barrier in prepare_to_wait() in ext4_fc_track_inode().
+                */
                smp_mb();
 #if (BITS_PER_LONG < 64)
-               wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
+               wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING);
 #else
-               wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
+               wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
 #endif
        }
 
index 743a1d7633cdabb3231eb893193575c739751743..bfaa14bb10490a95dfe8d967a16f8cd730072732 100644 (file)
@@ -728,7 +728,6 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
        }
        journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
        write_unlock(&journal->j_state_lock);
-       jbd2_journal_lock_updates(journal);
 
        return 0;
 }
@@ -742,7 +741,6 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
 {
        if (journal->j_fc_cleanup_callback)
                journal->j_fc_cleanup_callback(journal, 0, tid);
-       jbd2_journal_unlock_updates(journal);
        write_lock(&journal->j_state_lock);
        journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
        if (fallback)