Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
author: Linus Torvalds <torvalds@linux-foundation.org>
Tue, 12 Mar 2019 22:03:21 +0000 (15:03 -0700)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Tue, 12 Mar 2019 22:03:21 +0000 (15:03 -0700)
Pull ext4 updates from Ted Ts'o:
 "A large number of bug fixes and cleanups.

  One new feature to allow users to more easily find the jbd2 journal
  thread for a particular ext4 file system"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (25 commits)
  jbd2: jbd2_get_transaction does not need to return a value
  jbd2: fix invalid descriptor block checksum
  ext4: fix bigalloc cluster freeing when hole punching under load
  ext4: add sysfs attr /sys/fs/ext4/<disk>/journal_task
  ext4: Change debugging support help prefix from EXT4 to Ext4
  ext4: fix compile error when using BUFFER_TRACE
  jbd2: fix compile warning when using JBUFFER_TRACE
  ext4: fix some error pointer dereferences
  ext4: annotate more implicit fall throughs
  ext4: annotate implicit fall throughs
  ext4: don't update s_rev_level if not required
  jbd2: fold jbd2_superblock_csum_{verify,set} into their callers
  jbd2: fix race when writing superblock
  ext4: fix crash during online resizing
  ext4: disallow files with EXT4_JOURNAL_DATA_FL from EXT4_IOC_SWAP_BOOT
  ext4: add mask of ext4 flags to swap
  ext4: update quota information while swapping boot loader inode
  ext4: cleanup pagecache before swap i_data
  ext4: fix check of inode in swap_inode_boot_loader
  ext4: unlock unused_pages timely when doing writeback
  ...

18 files changed:
Documentation/ABI/testing/sysfs-fs-ext4
fs/ext4/Kconfig
fs/ext4/ext4.h
fs/ext4/extents.c
fs/ext4/hash.c
fs/ext4/indirect.c
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/mballoc.c
fs/ext4/page-io.c
fs/ext4/resize.c
fs/ext4/super.c
fs/ext4/sysfs.c
fs/ext4/xattr.c
fs/jbd2/checkpoint.c
fs/jbd2/commit.c
fs/jbd2/journal.c
fs/jbd2/transaction.c

index c631253cf85c763de6cfd347a15ce063fd71be07..78604db56279966246980c2070657bf32667a0cf 100644 (file)
@@ -109,3 +109,10 @@ Description:
                write operation (since a 4k random write might turn
                into a much larger write due to the zeroout
                operation).
+
+What:          /sys/fs/ext4/<disk>/journal_task
+Date:          February 2019
+Contact:       "Theodore Ts'o" <tytso@mit.edu>
+Description:
+               This file is read-only and shows the pid of journal thread in
+               current pid-namespace or 0 if task is unreachable.
index 031e5a82d556dbe9d4bd2cff39d1b8c4e162b659..06f77ca7f36ee737018016883df254948b5851e7 100644 (file)
@@ -97,7 +97,7 @@ config EXT4_FS_SECURITY
          extended attributes for file security labels, say N.
 
 config EXT4_DEBUG
-       bool "EXT4 debugging support"
+       bool "Ext4 debugging support"
        depends on EXT4_FS
        help
          Enables run-time debugging support for the ext4 filesystem.
index 5012ddb6daf9fb1561053a304e39d4ec8900262a..82ffdacdc7fac30b63da30a39bac9349a1244717 100644 (file)
@@ -425,6 +425,9 @@ struct flex_groups {
 /* Flags that are appropriate for non-directories/regular files. */
 #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
 
+/* The only flags that should be swapped */
+#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)
+
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 {
@@ -1661,6 +1664,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_INLINE_DATA      0x8000 /* data in inode */
 #define EXT4_FEATURE_INCOMPAT_ENCRYPT          0x10000
 
+extern void ext4_update_dynamic_rev(struct super_block *sb);
+
 #define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \
 static inline bool ext4_has_feature_##name(struct super_block *sb) \
 { \
@@ -1669,6 +1674,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \
 } \
 static inline void ext4_set_feature_##name(struct super_block *sb) \
 { \
+       ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_compat |= \
                cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
 } \
@@ -1686,6 +1692,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \
 } \
 static inline void ext4_set_feature_##name(struct super_block *sb) \
 { \
+       ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_ro_compat |= \
                cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
 } \
@@ -1703,6 +1710,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \
 } \
 static inline void ext4_set_feature_##name(struct super_block *sb) \
 { \
+       ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_incompat |= \
                cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
 } \
@@ -2666,7 +2674,6 @@ do {                                                                      \
 
 #endif
 
-extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
                                        __u32 compat);
 extern int ext4_update_rocompat_feature(handle_t *handle,
index 79d986dbf5af551533dd95d5ec61e4f53ca586fc..0f89f5190cd7241aa1b6da7d878816d6787a52db 100644 (file)
@@ -2956,14 +2956,17 @@ again:
                        if (err < 0)
                                goto out;
 
-               } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) {
+               } else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
+                          partial.state == initial) {
                        /*
-                        * If there's an extent to the right its first cluster
-                        * contains the immediate right boundary of the
-                        * truncated/punched region.  Set partial_cluster to
-                        * its negative value so it won't be freed if shared
-                        * with the current extent.  The end < ee_block case
-                        * is handled in ext4_ext_rm_leaf().
+                        * If we're punching, there's an extent to the right.
+                        * If the partial cluster hasn't been set, set it to
+                        * that extent's first cluster and its state to nofree
+                        * so it won't be freed should it contain blocks to be
+                        * removed. If it's already set (tofree/nofree), we're
+                        * retrying and keep the original partial cluster info
+                        * so a cluster marked tofree as a result of earlier
+                        * extent removal is not lost.
                         */
                        lblk = ex_end + 1;
                        err = ext4_ext_search_right(inode, path, &lblk, &pblk,
@@ -4048,18 +4051,8 @@ out:
        } else
                allocated = ret;
        map->m_flags |= EXT4_MAP_NEW;
-       /*
-        * if we allocated more blocks than requested
-        * we need to make sure we unmap the extra block
-        * allocated. The actual needed block will get
-        * unmapped later when we find the buffer_head marked
-        * new.
-        */
-       if (allocated > map->m_len) {
-               clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len,
-                                  allocated - map->m_len);
+       if (allocated > map->m_len)
                allocated = map->m_len;
-       }
        map->m_len = allocated;
 
 map_out:
index e22dcfab308bcc529e32218507ffea5c648c8edd..46b24da33a287d8c07af76844904ad3d37147bc5 100644 (file)
@@ -231,6 +231,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
                break;
        case DX_HASH_HALF_MD4_UNSIGNED:
                str2hashbuf = str2hashbuf_unsigned;
+               /* fall through */
        case DX_HASH_HALF_MD4:
                p = name;
                while (len > 0) {
@@ -244,6 +245,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
                break;
        case DX_HASH_TEA_UNSIGNED:
                str2hashbuf = str2hashbuf_unsigned;
+               /* fall through */
        case DX_HASH_TEA:
                p = name;
                while (len > 0) {
index bf7fa1507e811221523a785cee1ed41159d7a0ca..c2225f0d31b511bbb0c62b176324e4f483437638 100644 (file)
@@ -1183,18 +1183,21 @@ do_indirects:
                        ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
                        i_data[EXT4_IND_BLOCK] = 0;
                }
+               /* fall through */
        case EXT4_IND_BLOCK:
                nr = i_data[EXT4_DIND_BLOCK];
                if (nr) {
                        ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
                        i_data[EXT4_DIND_BLOCK] = 0;
                }
+               /* fall through */
        case EXT4_DIND_BLOCK:
                nr = i_data[EXT4_TIND_BLOCK];
                if (nr) {
                        ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
                        i_data[EXT4_TIND_BLOCK] = 0;
                }
+               /* fall through */
        case EXT4_TIND_BLOCK:
                ;
        }
@@ -1433,6 +1436,7 @@ do_indirects:
                        ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
                        i_data[EXT4_IND_BLOCK] = 0;
                }
+               /* fall through */
        case EXT4_IND_BLOCK:
                if (++n >= n2)
                        return 0;
@@ -1441,6 +1445,7 @@ do_indirects:
                        ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
                        i_data[EXT4_DIND_BLOCK] = 0;
                }
+               /* fall through */
        case EXT4_DIND_BLOCK:
                if (++n >= n2)
                        return 0;
@@ -1449,6 +1454,7 @@ do_indirects:
                        ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
                        i_data[EXT4_TIND_BLOCK] = 0;
                }
+               /* fall through */
        case EXT4_TIND_BLOCK:
                ;
        }
index 4356ef6d728e4019e06742587adba90764caeba3..b54b261ded36f92076d95197e6c456e5bd17698a 100644 (file)
@@ -391,7 +391,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
         * inode's preallocations.
         */
        if ((ei->i_reserved_data_blocks == 0) &&
-           (atomic_read(&inode->i_writecount) == 0))
+           !inode_is_open_for_write(inode))
                ext4_discard_preallocations(inode);
 }
 
@@ -678,8 +678,6 @@ found:
                if (flags & EXT4_GET_BLOCKS_ZERO &&
                    map->m_flags & EXT4_MAP_MAPPED &&
                    map->m_flags & EXT4_MAP_NEW) {
-                       clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
-                                          map->m_len);
                        ret = ext4_issue_zeroout(inode, map->m_lblk,
                                                 map->m_pblk, map->m_len);
                        if (ret) {
@@ -1194,7 +1192,6 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
                        if (err)
                                break;
                        if (buffer_new(bh)) {
-                               clean_bdev_bh_alias(bh);
                                if (PageUptodate(page)) {
                                        clear_buffer_new(bh);
                                        set_buffer_uptodate(bh);
@@ -2489,10 +2486,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
        }
 
        BUG_ON(map->m_len == 0);
-       if (map->m_flags & EXT4_MAP_NEW) {
-               clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
-                                  map->m_len);
-       }
        return 0;
 }
 
@@ -2835,12 +2828,12 @@ retry:
                goto unplug;
        }
        ret = mpage_prepare_extent_to_map(&mpd);
+       /* Unlock pages we didn't use */
+       mpage_release_unused_pages(&mpd, false);
        /* Submit prepared bio */
        ext4_io_submit(&mpd.io_submit);
        ext4_put_io_end_defer(mpd.io_submit.io_end);
        mpd.io_submit.io_end = NULL;
-       /* Unlock pages we didn't use */
-       mpage_release_unused_pages(&mpd, false);
        if (ret < 0)
                goto unplug;
 
@@ -2908,10 +2901,11 @@ retry:
                        handle = NULL;
                        mpd.do_map = 0;
                }
-               /* Submit prepared bio */
-               ext4_io_submit(&mpd.io_submit);
                /* Unlock pages we didn't use */
                mpage_release_unused_pages(&mpd, give_up_on_write);
+               /* Submit prepared bio */
+               ext4_io_submit(&mpd.io_submit);
+
                /*
                 * Drop our io_end reference we got from init. We have
                 * to be careful and use deferred io_end finishing if
@@ -5349,7 +5343,6 @@ static int ext4_do_update_inode(handle_t *handle,
                err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
                if (err)
                        goto out_brelse;
-               ext4_update_dynamic_rev(sb);
                ext4_set_feature_large_file(sb);
                ext4_handle_sync(handle);
                err = ext4_handle_dirty_super(handle, sb);
@@ -6000,7 +5993,7 @@ int ext4_expand_extra_isize(struct inode *inode,
 
        ext4_write_lock_xattr(inode, &no_expand);
 
-       BUFFER_TRACE(iloc.bh, "get_write_access");
+       BUFFER_TRACE(iloc->bh, "get_write_access");
        error = ext4_journal_get_write_access(handle, iloc->bh);
        if (error) {
                brelse(iloc->bh);
index d26bcac291bbc0299ad3f85899ebe12f9a9faca0..3c4f8bb59f8abfd23ceaf36f93c7fceffac0134e 100644 (file)
@@ -63,18 +63,20 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
        loff_t isize;
        struct ext4_inode_info *ei1;
        struct ext4_inode_info *ei2;
+       unsigned long tmp;
 
        ei1 = EXT4_I(inode1);
        ei2 = EXT4_I(inode2);
 
        swap(inode1->i_version, inode2->i_version);
-       swap(inode1->i_blocks, inode2->i_blocks);
-       swap(inode1->i_bytes, inode2->i_bytes);
        swap(inode1->i_atime, inode2->i_atime);
        swap(inode1->i_mtime, inode2->i_mtime);
 
        memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
-       swap(ei1->i_flags, ei2->i_flags);
+       tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP;
+       ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) |
+               (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP);
+       ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP);
        swap(ei1->i_disksize, ei2->i_disksize);
        ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
        ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
@@ -115,28 +117,42 @@ static long swap_inode_boot_loader(struct super_block *sb,
        int err;
        struct inode *inode_bl;
        struct ext4_inode_info *ei_bl;
-
-       if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
-           IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
-           ext4_has_inline_data(inode))
-               return -EINVAL;
-
-       if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
-           !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
-               return -EPERM;
+       qsize_t size, size_bl, diff;
+       blkcnt_t blocks;
+       unsigned short bytes;
 
        inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL);
        if (IS_ERR(inode_bl))
                return PTR_ERR(inode_bl);
        ei_bl = EXT4_I(inode_bl);
 
-       filemap_flush(inode->i_mapping);
-       filemap_flush(inode_bl->i_mapping);
-
        /* Protect orig inodes against a truncate and make sure,
         * that only 1 swap_inode_boot_loader is running. */
        lock_two_nondirectories(inode, inode_bl);
 
+       if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
+           IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
+           (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) ||
+           ext4_has_inline_data(inode)) {
+               err = -EINVAL;
+               goto journal_err_out;
+       }
+
+       if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
+           !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+               err = -EPERM;
+               goto journal_err_out;
+       }
+
+       down_write(&EXT4_I(inode)->i_mmap_sem);
+       err = filemap_write_and_wait(inode->i_mapping);
+       if (err)
+               goto err_out;
+
+       err = filemap_write_and_wait(inode_bl->i_mapping);
+       if (err)
+               goto err_out;
+
        /* Wait for all existing dio workers */
        inode_dio_wait(inode);
        inode_dio_wait(inode_bl);
@@ -147,7 +163,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
        handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
        if (IS_ERR(handle)) {
                err = -EINVAL;
-               goto journal_err_out;
+               goto err_out;
        }
 
        /* Protect extent tree against block allocations via delalloc */
@@ -170,6 +186,13 @@ static long swap_inode_boot_loader(struct super_block *sb,
                        memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
        }
 
+       err = dquot_initialize(inode);
+       if (err)
+               goto err_out1;
+
+       size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes;
+       size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes;
+       diff = size - size_bl;
        swap_inode_data(inode, inode_bl);
 
        inode->i_ctime = inode_bl->i_ctime = current_time(inode);
@@ -183,27 +206,51 @@ static long swap_inode_boot_loader(struct super_block *sb,
 
        err = ext4_mark_inode_dirty(handle, inode);
        if (err < 0) {
+               /* No need to update quota information. */
                ext4_warning(inode->i_sb,
                        "couldn't mark inode #%lu dirty (err %d)",
                        inode->i_ino, err);
                /* Revert all changes: */
                swap_inode_data(inode, inode_bl);
                ext4_mark_inode_dirty(handle, inode);
-       } else {
-               err = ext4_mark_inode_dirty(handle, inode_bl);
-               if (err < 0) {
-                       ext4_warning(inode_bl->i_sb,
-                               "couldn't mark inode #%lu dirty (err %d)",
-                               inode_bl->i_ino, err);
-                       /* Revert all changes: */
-                       swap_inode_data(inode, inode_bl);
-                       ext4_mark_inode_dirty(handle, inode);
-                       ext4_mark_inode_dirty(handle, inode_bl);
-               }
+               goto err_out1;
+       }
+
+       blocks = inode_bl->i_blocks;
+       bytes = inode_bl->i_bytes;
+       inode_bl->i_blocks = inode->i_blocks;
+       inode_bl->i_bytes = inode->i_bytes;
+       err = ext4_mark_inode_dirty(handle, inode_bl);
+       if (err < 0) {
+               /* No need to update quota information. */
+               ext4_warning(inode_bl->i_sb,
+                       "couldn't mark inode #%lu dirty (err %d)",
+                       inode_bl->i_ino, err);
+               goto revert;
+       }
+
+       /* Bootloader inode should not be counted into quota information. */
+       if (diff > 0)
+               dquot_free_space(inode, diff);
+       else
+               err = dquot_alloc_space(inode, -1 * diff);
+
+       if (err < 0) {
+revert:
+               /* Revert all changes: */
+               inode_bl->i_blocks = blocks;
+               inode_bl->i_bytes = bytes;
+               swap_inode_data(inode, inode_bl);
+               ext4_mark_inode_dirty(handle, inode);
+               ext4_mark_inode_dirty(handle, inode_bl);
        }
+
+err_out1:
        ext4_journal_stop(handle);
        ext4_double_up_write_data_sem(inode, inode_bl);
 
+err_out:
+       up_write(&EXT4_I(inode)->i_mmap_sem);
 journal_err_out:
        unlock_two_nondirectories(inode, inode_bl);
        iput(inode_bl);
index e2248083cdcaec02dfd54fdab63acfd753f467ec..6fb76d408093022dfd44bf4375af1acebaedf4bf 100644 (file)
@@ -4176,9 +4176,8 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
        isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
                >> bsbits;
 
-       if ((size == isize) &&
-           !ext4_fs_is_busy(sbi) &&
-           (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+       if ((size == isize) && !ext4_fs_is_busy(sbi) &&
+           !inode_is_open_for_write(ac->ac_inode)) {
                ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
                return;
        }
@@ -4258,7 +4257,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
                        (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
                        (unsigned) ar->lleft, (unsigned) ar->pleft,
                        (unsigned) ar->lright, (unsigned) ar->pright,
-                       atomic_read(&ar->inode->i_writecount) ? "" : "non-");
+                       inode_is_open_for_write(ar->inode) ? "" : "non-");
        return 0;
 
 }
index 6f5305e9a6acccc62c025caed44f47ded8af9947..3e9298e6a705a00b80475fdaf173f58a1177ea85 100644 (file)
@@ -468,10 +468,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
                                ext4_io_submit(io);
                        continue;
                }
-               if (buffer_new(bh)) {
+               if (buffer_new(bh))
                        clear_buffer_new(bh);
-                       clean_bdev_bh_alias(bh);
-               }
                set_buffer_async_write(bh);
                nr_to_submit++;
        } while ((bh = bh->b_this_page) != head);
index 48421de803b7bf237450c43ea369e7abe25c69e8..3d9b18505c0c799b272553d0adf81fa26e6a6833 100644 (file)
@@ -1960,7 +1960,8 @@ retry:
                                le16_to_cpu(es->s_reserved_gdt_blocks);
                        n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb);
                        n_blocks_count = (ext4_fsblk_t)n_group *
-                               EXT4_BLOCKS_PER_GROUP(sb);
+                               EXT4_BLOCKS_PER_GROUP(sb) +
+                               le32_to_cpu(es->s_first_data_block);
                        n_group--; /* set to last group number */
                }
 
index 60da0a6e4d8617bc301a43c86530214145f32fd5..f5b828bf1299f1998d7b8ac2696b979c8f303079 100644 (file)
@@ -2249,7 +2249,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
        le16_add_cpu(&es->s_mnt_count, 1);
        ext4_update_tstamp(es, s_mtime);
-       ext4_update_dynamic_rev(sb);
        if (sbi->s_journal)
                ext4_set_feature_journal_needs_recovery(sb);
 
index 5e4e78fc0b3a2b342a3db60f5cff747f3b95e259..616c075da062a3fdb0cf5e4c23be1f90e7722d60 100644 (file)
@@ -30,6 +30,7 @@ typedef enum {
        attr_feature,
        attr_pointer_ui,
        attr_pointer_atomic,
+       attr_journal_task,
 } attr_id_t;
 
 typedef enum {
@@ -125,6 +126,14 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi,
        return count;
 }
 
+static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf)
+{
+       if (!sbi->s_journal)
+               return snprintf(buf, PAGE_SIZE, "<none>\n");
+       return snprintf(buf, PAGE_SIZE, "%d\n",
+                       task_pid_vnr(sbi->s_journal->j_task));
+}
+
 #define EXT4_ATTR(_name,_mode,_id)                                     \
 static struct ext4_attr ext4_attr_##_name = {                          \
        .attr = {.name = __stringify(_name), .mode = _mode },           \
@@ -188,6 +197,7 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
 EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
 EXT4_ATTR(first_error_time, 0444, first_error_time);
 EXT4_ATTR(last_error_time, 0444, last_error_time);
+EXT4_ATTR(journal_task, 0444, journal_task);
 
 static unsigned int old_bump_val = 128;
 EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -217,6 +227,7 @@ static struct attribute *ext4_attrs[] = {
        ATTR_LIST(errors_count),
        ATTR_LIST(first_error_time),
        ATTR_LIST(last_error_time),
+       ATTR_LIST(journal_task),
        NULL,
 };
 
@@ -304,6 +315,8 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
                return print_tstamp(buf, sbi->s_es, s_first_error_time);
        case attr_last_error_time:
                return print_tstamp(buf, sbi->s_es, s_last_error_time);
+       case attr_journal_task:
+               return journal_task_show(sbi, buf);
        }
 
        return 0;
index 86ed9c6862493eb676515ce61cf8d13c94ca4be3..dc82e7757f67de061a432a82faa8e75ca2ce95c7 100644 (file)
@@ -829,6 +829,7 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
                bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
                if (IS_ERR(bh)) {
                        ret = PTR_ERR(bh);
+                       bh = NULL;
                        goto out;
                }
 
@@ -2903,6 +2904,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
                        if (error == -EIO)
                                EXT4_ERROR_INODE(inode, "block %llu read error",
                                                 EXT4_I(inode)->i_file_acl);
+                       bh = NULL;
                        goto cleanup;
                }
                error = ext4_xattr_check_block(inode, bh);
@@ -3059,6 +3061,7 @@ ext4_xattr_block_cache_find(struct inode *inode,
                if (IS_ERR(bh)) {
                        if (PTR_ERR(bh) == -ENOMEM)
                                return NULL;
+                       bh = NULL;
                        EXT4_ERROR_INODE(inode, "block %lu read error",
                                         (unsigned long)ce->e_value);
                } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
index 26f8d7e46462e1ba402ff7fc3050650aac1f096a..02e0b79753e7068e9b4bb896f8b1f0ddfe972ef2 100644 (file)
@@ -113,7 +113,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
        nblocks = jbd2_space_needed(journal);
        while (jbd2_log_space_left(journal) < nblocks) {
                write_unlock(&journal->j_state_lock);
-               mutex_lock(&journal->j_checkpoint_mutex);
+               mutex_lock_io(&journal->j_checkpoint_mutex);
 
                /*
                 * Test again, another process may have checkpointed while we
@@ -276,9 +276,22 @@ restart:
                "JBD2: %s: Waiting for Godot: block %llu\n",
                journal->j_devname, (unsigned long long) bh->b_blocknr);
 
+                       if (batch_count)
+                               __flush_batch(journal, &batch_count);
                        jbd2_log_start_commit(journal, tid);
+                       /*
+                        * jbd2_journal_commit_transaction() may want
+                        * to take the checkpoint_mutex if JBD2_FLUSHED
+                        * is set, jbd2_update_log_tail() called by
+                        * jbd2_journal_commit_transaction() may also take
+                        * checkpoint_mutex.  So we need to temporarily
+                        * drop it.
+                        */
+                       mutex_unlock(&journal->j_checkpoint_mutex);
                        jbd2_log_wait_commit(journal, tid);
-                       goto retry;
+                       mutex_lock_io(&journal->j_checkpoint_mutex);
+                       spin_lock(&journal->j_list_lock);
+                       goto restart;
                }
                if (!buffer_dirty(bh)) {
                        if (unlikely(buffer_write_io_error(bh)) && !result)
index 2eb55c3361a8b170afd64bc807c28408d24127e7..efd0ce9489ae9d453d37abd4a66b8ad574fa5b4b 100644 (file)
@@ -694,9 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
                            the last tag we set up. */
 
                        tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
-
-                       jbd2_descriptor_block_csum_set(journal, descriptor);
 start_journal_io:
+                       if (descriptor)
+                               jbd2_descriptor_block_csum_set(journal,
+                                                       descriptor);
+
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
                                /*
index 8ef6b6daaa7a641bc39f507b42ad89db488f98ce..382c030cc78b85516cf6b9869094dbc70bafb4f3 100644 (file)
@@ -142,22 +142,6 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
        return cpu_to_be32(csum);
 }
 
-static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
-{
-       if (!jbd2_journal_has_csum_v2or3(j))
-               return 1;
-
-       return sb->s_checksum == jbd2_superblock_csum(j, sb);
-}
-
-static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
-{
-       if (!jbd2_journal_has_csum_v2or3(j))
-               return;
-
-       sb->s_checksum = jbd2_superblock_csum(j, sb);
-}
-
 /*
  * Helper function used to manage commit timeouts
  */
@@ -1356,6 +1340,10 @@ static int journal_reset(journal_t *journal)
        return jbd2_journal_start_thread(journal);
 }
 
+/*
+ * This function expects that the caller will have locked the journal
+ * buffer head, and will return with it unlocked
+ */
 static int jbd2_write_superblock(journal_t *journal, int write_flags)
 {
        struct buffer_head *bh = journal->j_sb_buffer;
@@ -1365,7 +1353,6 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
        trace_jbd2_write_superblock(journal, write_flags);
        if (!(journal->j_flags & JBD2_BARRIER))
                write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
-       lock_buffer(bh);
        if (buffer_write_io_error(bh)) {
                /*
                 * Oh, dear.  A previous attempt to write the journal
@@ -1381,7 +1368,8 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
                clear_buffer_write_io_error(bh);
                set_buffer_uptodate(bh);
        }
-       jbd2_superblock_csum_set(journal, sb);
+       if (jbd2_journal_has_csum_v2or3(journal))
+               sb->s_checksum = jbd2_superblock_csum(journal, sb);
        get_bh(bh);
        bh->b_end_io = end_buffer_write_sync;
        ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
@@ -1424,6 +1412,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
        jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
                  tail_block, tail_tid);
 
+       lock_buffer(journal->j_sb_buffer);
        sb->s_sequence = cpu_to_be32(tail_tid);
        sb->s_start    = cpu_to_be32(tail_block);
 
@@ -1454,18 +1443,17 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
        journal_superblock_t *sb = journal->j_superblock;
 
        BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
-       read_lock(&journal->j_state_lock);
-       /* Is it already empty? */
-       if (sb->s_start == 0) {
-               read_unlock(&journal->j_state_lock);
+       lock_buffer(journal->j_sb_buffer);
+       if (sb->s_start == 0) {         /* Is it already empty? */
+               unlock_buffer(journal->j_sb_buffer);
                return;
        }
+
        jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
                  journal->j_tail_sequence);
 
        sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
        sb->s_start    = cpu_to_be32(0);
-       read_unlock(&journal->j_state_lock);
 
        jbd2_write_superblock(journal, write_op);
 
@@ -1488,9 +1476,8 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
        journal_superblock_t *sb = journal->j_superblock;
        int errcode;
 
-       read_lock(&journal->j_state_lock);
+       lock_buffer(journal->j_sb_buffer);
        errcode = journal->j_errno;
-       read_unlock(&journal->j_state_lock);
        if (errcode == -ESHUTDOWN)
                errcode = 0;
        jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
@@ -1595,17 +1582,18 @@ static int journal_get_superblock(journal_t *journal)
                }
        }
 
-       /* Check superblock checksum */
-       if (!jbd2_superblock_csum_verify(journal, sb)) {
-               printk(KERN_ERR "JBD2: journal checksum error\n");
-               err = -EFSBADCRC;
-               goto out;
-       }
+       if (jbd2_journal_has_csum_v2or3(journal)) {
+               /* Check superblock checksum */
+               if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
+                       printk(KERN_ERR "JBD2: journal checksum error\n");
+                       err = -EFSBADCRC;
+                       goto out;
+               }
 
-       /* Precompute checksum seed for all metadata */
-       if (jbd2_journal_has_csum_v2or3(journal))
+               /* Precompute checksum seed for all metadata */
                journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
                                                   sizeof(sb->s_uuid));
+       }
 
        set_buffer_verified(bh);
 
@@ -1894,28 +1882,27 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
 
        sb = journal->j_superblock;
 
+       /* Load the checksum driver if necessary */
+       if ((journal->j_chksum_driver == NULL) &&
+           INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
+               journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+               if (IS_ERR(journal->j_chksum_driver)) {
+                       printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
+                       journal->j_chksum_driver = NULL;
+                       return 0;
+               }
+               /* Precompute checksum seed for all metadata */
+               journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+                                                  sizeof(sb->s_uuid));
+       }
+
+       lock_buffer(journal->j_sb_buffer);
+
        /* If enabling v3 checksums, update superblock */
        if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
                sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
                sb->s_feature_compat &=
                        ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
-
-               /* Load the checksum driver */
-               if (journal->j_chksum_driver == NULL) {
-                       journal->j_chksum_driver = crypto_alloc_shash("crc32c",
-                                                                     0, 0);
-                       if (IS_ERR(journal->j_chksum_driver)) {
-                               printk(KERN_ERR "JBD2: Cannot load crc32c "
-                                      "driver.\n");
-                               journal->j_chksum_driver = NULL;
-                               return 0;
-                       }
-
-                       /* Precompute checksum seed for all metadata */
-                       journal->j_csum_seed = jbd2_chksum(journal, ~0,
-                                                          sb->s_uuid,
-                                                          sizeof(sb->s_uuid));
-               }
        }
 
        /* If enabling v1 checksums, downgrade superblock */
@@ -1927,6 +1914,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
        sb->s_feature_compat    |= cpu_to_be32(compat);
        sb->s_feature_ro_compat |= cpu_to_be32(ro);
        sb->s_feature_incompat  |= cpu_to_be32(incompat);
+       unlock_buffer(journal->j_sb_buffer);
 
        return 1;
 #undef COMPAT_FEATURE_ON
@@ -2067,7 +2055,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
        err = jbd2_journal_skip_recovery(journal);
        if (write) {
                /* Lock to make assertions happy... */
-               mutex_lock(&journal->j_checkpoint_mutex);
+               mutex_lock_io(&journal->j_checkpoint_mutex);
                jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
                mutex_unlock(&journal->j_checkpoint_mutex);
        }
index cc35537232f28317b9f81691d5fada1276f3e94b..f940d31c2adc5553365160279f5ddd2e3e62c05e 100644 (file)
@@ -63,7 +63,7 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
 /*
  * jbd2_get_transaction: obtain a new transaction_t object.
  *
- * Simply allocate and initialise a new transaction.  Create it in
+ * Simply initialise a new transaction. Initialize it in
  * RUNNING state and add it to the current journal (which should not
  * have an existing running transaction: we only make a new transaction
  * once we have started to commit the old one).
@@ -75,8 +75,8 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
  *
  */
 
-static transaction_t *
-jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
+static void jbd2_get_transaction(journal_t *journal,
+                               transaction_t *transaction)
 {
        transaction->t_journal = journal;
        transaction->t_state = T_RUNNING;
@@ -100,8 +100,6 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
        transaction->t_max_wait = 0;
        transaction->t_start = jiffies;
        transaction->t_requested = 0;
-
-       return transaction;
 }
 
 /*
@@ -1252,11 +1250,12 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
        struct journal_head *jh;
        char *committed_data = NULL;
 
-       JBUFFER_TRACE(jh, "entry");
        if (jbd2_write_access_granted(handle, bh, true))
                return 0;
 
        jh = jbd2_journal_add_journal_head(bh);
+       JBUFFER_TRACE(jh, "entry");
+
        /*
         * Do this first --- it can drop the journal lock, so we want to
         * make sure that obtaining the committed_data is done
@@ -1367,15 +1366,17 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 
        if (is_handle_aborted(handle))
                return -EROFS;
-       if (!buffer_jbd(bh)) {
-               ret = -EUCLEAN;
-               goto out;
-       }
+       if (!buffer_jbd(bh))
+               return -EUCLEAN;
+
        /*
         * We don't grab jh reference here since the buffer must be part
         * of the running transaction.
         */
        jh = bh2jh(bh);
+       jbd_debug(5, "journal_head %p\n", jh);
+       JBUFFER_TRACE(jh, "entry");
+
        /*
         * This and the following assertions are unreliable since we may see jh
         * in inconsistent state unless we grab bh_state lock. But this is
@@ -1409,9 +1410,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
        }
 
        journal = transaction->t_journal;
-       jbd_debug(5, "journal_head %p\n", jh);
-       JBUFFER_TRACE(jh, "entry");
-
        jbd_lock_bh_state(bh);
 
        if (jh->b_modified == 0) {
@@ -1597,9 +1595,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
                        __jbd2_journal_unfile_buffer(jh);
                        if (!buffer_jbd(bh)) {
                                spin_unlock(&journal->j_list_lock);
-                               jbd_unlock_bh_state(bh);
-                               __bforget(bh);
-                               goto drop;
+                               goto not_jbd;
                        }
                }
                spin_unlock(&journal->j_list_lock);
@@ -1609,14 +1605,21 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
                /* However, if the buffer is still owned by a prior
                 * (committing) transaction, we can't drop it yet... */
                JBUFFER_TRACE(jh, "belongs to older transaction");
-               /* ... but we CAN drop it from the new transaction if we
-                * have also modified it since the original commit. */
+               /* ... but we CAN drop it from the new transaction by
+                * marking the buffer as freed and setting b_next_transaction
+                * to the new transaction, so that not only the commit code
+                * knows it should clear dirty bits when it is done with the
+                * buffer, but also the buffer can be checkpointed only
+                * after the new transaction commits. */
 
-               if (jh->b_next_transaction) {
-                       J_ASSERT(jh->b_next_transaction == transaction);
+               set_buffer_freed(bh);
+
+               if (!jh->b_next_transaction) {
                        spin_lock(&journal->j_list_lock);
-                       jh->b_next_transaction = NULL;
+                       jh->b_next_transaction = transaction;
                        spin_unlock(&journal->j_list_lock);
+               } else {
+                       J_ASSERT(jh->b_next_transaction == transaction);
 
                        /*
                         * only drop a reference if this transaction modified
@@ -1625,9 +1628,40 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
                        if (was_modified)
                                drop_reserve = 1;
                }
+       } else {
+               /*
+                * Finally, if the buffer does not belong to any
+                * transaction, we can just drop it now if it has no
+                * checkpoint.
+                */
+               spin_lock(&journal->j_list_lock);
+               if (!jh->b_cp_transaction) {
+                       JBUFFER_TRACE(jh, "belongs to none transaction");
+                       spin_unlock(&journal->j_list_lock);
+                       goto not_jbd;
+               }
+
+               /*
+                * Otherwise, if the buffer has been written to disk,
+                * it is safe to remove the checkpoint and drop it.
+                */
+               if (!buffer_dirty(bh)) {
+                       __jbd2_journal_remove_checkpoint(jh);
+                       spin_unlock(&journal->j_list_lock);
+                       goto not_jbd;
+               }
+
+               /*
+                * The buffer is still not written to disk, so we should
+                * attach this buffer to current transaction so that the
+                * buffer can be checkpointed only after the current
+                * transaction commits.
+                */
+               clear_buffer_dirty(bh);
+               __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
+               spin_unlock(&journal->j_list_lock);
        }
 
-not_jbd:
        jbd_unlock_bh_state(bh);
        __brelse(bh);
 drop:
@@ -1636,6 +1670,11 @@ drop:
                handle->h_buffer_credits++;
        }
        return err;
+
+not_jbd:
+       jbd_unlock_bh_state(bh);
+       __bforget(bh);
+       goto drop;
 }
 
 /**