ext4: data=journal: write-protect pages on j_submit_inode_data_buffers()
authorMauricio Faria de Oliveira <mfo@canonical.com>
Tue, 6 Oct 2020 00:48:41 +0000 (21:48 -0300)
committerTheodore Ts'o <tytso@mit.edu>
Sun, 18 Oct 2020 14:37:15 +0000 (10:37 -0400)
This implements journal callbacks j_submit|finish_inode_data_buffers()
with different behavior for data=journal: to write-protect pages under
commit, preventing changes to buffers writeably mapped to userspace.

If a buffer's content changes between commit's checksum calculation
and write-out to disk, it can cause journal recovery/mount failures
upon a kernel crash or power loss.

    [   27.334874] EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!
    [   27.339492] JBD2: Invalid checksum recovering data block 8705 in log
    [   27.342716] JBD2: recovery failed
    [   27.343316] EXT4-fs (loop0): error loading journal
    mount: /ext4: can't read superblock on /dev/loop0.

In j_submit_inode_data_buffers() we write-protect the inode's pages
with write_cache_pages() and redirty w/ writepage callback if needed.

In j_finish_inode_data_buffers() there is nothing do to.

And in order to use the callbacks, inodes are added to the inode list
in transaction in __ext4_journalled_writepage() and ext4_page_mkwrite().

In ext4_page_mkwrite() we must make sure that the buffers are attached
to the transaction as jbddirty with write_end_fn(), as already done in
__ext4_journalled_writepage().

Signed-off-by: Mauricio Faria de Oliveira <mfo@canonical.com>
Reported-by: Dann Frazier <dann.frazier@canonical.com>
Reported-by: kernel test robot <lkp@intel.com> # wbc.nr_to_write
Suggested-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20201006004841.600488-5-mfo@canonical.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
fs/ext4/inode.c
fs/ext4/super.c

index 804fd14fe5e0cbfd373ef65b643a89d4dd56a795..2ac294fc224751baffd70806119642463aa88cbc 100644 (file)
@@ -1911,6 +1911,9 @@ static int __ext4_journalled_writepage(struct page *page,
                err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
                                             write_end_fn);
        }
+       if (ret == 0)
+               ret = err;
+       err = ext4_jbd2_inode_add_write(handle, inode, 0, len);
        if (ret == 0)
                ret = err;
        EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
@@ -6062,10 +6065,8 @@ retry_alloc:
                size = i_size_read(inode);
                /* Page got truncated from under us? */
                if (page->mapping != mapping || page_offset(page) > size) {
-                       unlock_page(page);
                        ret = VM_FAULT_NOPAGE;
-                       ext4_journal_stop(handle);
-                       goto out;
+                       goto out_error;
                }
 
                if (page->index == size >> PAGE_SHIFT)
@@ -6075,13 +6076,15 @@ retry_alloc:
 
                err = __block_write_begin(page, 0, len, ext4_get_block);
                if (!err) {
+                       ret = VM_FAULT_SIGBUS;
                        if (ext4_walk_page_buffers(handle, page_buffers(page),
-                                       0, len, NULL, do_journal_get_write_access)) {
-                               unlock_page(page);
-                               ret = VM_FAULT_SIGBUS;
-                               ext4_journal_stop(handle);
-                               goto out;
-                       }
+                                       0, len, NULL, do_journal_get_write_access))
+                               goto out_error;
+                       if (ext4_walk_page_buffers(handle, page_buffers(page),
+                                       0, len, NULL, write_end_fn))
+                               goto out_error;
+                       if (ext4_jbd2_inode_add_write(handle, inode, 0, len))
+                               goto out_error;
                        ext4_set_inode_state(inode, EXT4_STATE_JDATA);
                } else {
                        unlock_page(page);
@@ -6096,6 +6099,10 @@ out:
        up_read(&EXT4_I(inode)->i_mmap_sem);
        sb_end_pagefault(inode->i_sb);
        return ret;
+out_error:
+       unlock_page(page);
+       ext4_journal_stop(handle);
+       goto out;
 }
 
 vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
index a3e57f554f1ba2e3e1edc4181364da966d91c6e1..901c1c938276f48dd209bfe3f7e88f8aa310bf67 100644 (file)
@@ -571,6 +571,89 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
        spin_unlock(&sbi->s_md_lock);
 }
 
+/*
+ * This writepage callback for write_cache_pages()
+ * takes care of a few cases after page cleaning.
+ *
+ * write_cache_pages() already checks for dirty pages
+ * and calls clear_page_dirty_for_io(), which we want,
+ * to write protect the pages.
+ *
+ * However, we may have to redirty a page (see below.)
+ */
+static int ext4_journalled_writepage_callback(struct page *page,
+                                             struct writeback_control *wbc,
+                                             void *data)
+{
+       transaction_t *transaction = (transaction_t *) data;
+       struct buffer_head *bh, *head;
+       struct journal_head *jh;
+
+       bh = head = page_buffers(page);
+       do {
+               /*
+                * We have to redirty a page in these cases:
+                * 1) If buffer is dirty, it means the page was dirty because it
+                * contains a buffer that needs checkpointing. So the dirty bit
+                * needs to be preserved so that checkpointing writes the buffer
+                * properly.
+                * 2) If buffer is not part of the committing transaction
+                * (we may have just accidentally come across this buffer because
+                * inode range tracking is not exact) or if the currently running
+                * transaction already contains this buffer as well, dirty bit
+                * needs to be preserved so that the buffer gets writeprotected
+                * properly on running transaction's commit.
+                */
+               jh = bh2jh(bh);
+               if (buffer_dirty(bh) ||
+                   (jh && (jh->b_transaction != transaction ||
+                           jh->b_next_transaction))) {
+                       redirty_page_for_writepage(wbc, page);
+                       goto out;
+               }
+       } while ((bh = bh->b_this_page) != head);
+
+out:
+       return AOP_WRITEPAGE_ACTIVATE;
+}
+
+static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+       struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
+       struct writeback_control wbc = {
+               .sync_mode =  WB_SYNC_ALL,
+               .nr_to_write = LONG_MAX,
+               .range_start = jinode->i_dirty_start,
+               .range_end = jinode->i_dirty_end,
+        };
+
+       return write_cache_pages(mapping, &wbc,
+                                ext4_journalled_writepage_callback,
+                                jinode->i_transaction);
+}
+
+static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+       int ret;
+
+       if (ext4_should_journal_data(jinode->i_vfs_inode))
+               ret = ext4_journalled_submit_inode_data_buffers(jinode);
+       else
+               ret = jbd2_journal_submit_inode_data_buffers(jinode);
+
+       return ret;
+}
+
+static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
+{
+       int ret = 0;
+
+       if (!ext4_should_journal_data(jinode->i_vfs_inode))
+               ret = jbd2_journal_finish_inode_data_buffers(jinode);
+
+       return ret;
+}
+
 static bool system_going_down(void)
 {
        return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
@@ -4753,9 +4836,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
        sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
        sbi->s_journal->j_submit_inode_data_buffers =
-               jbd2_journal_submit_inode_data_buffers;
+               ext4_journal_submit_inode_data_buffers;
        sbi->s_journal->j_finish_inode_data_buffers =
-               jbd2_journal_finish_inode_data_buffers;
+               ext4_journal_finish_inode_data_buffers;
 
 no_journal:
        if (!test_opt(sb, NO_MBCACHE)) {