Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso...
authorLinus Torvalds <torvalds@linux-foundation.org>
Wed, 2 Jan 2013 17:57:34 +0000 (09:57 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 2 Jan 2013 17:57:34 +0000 (09:57 -0800)
Pull ext4 bug fixes from Ted Ts'o:
 "Various bug fixes for ext4.  Perhaps the most serious bug fixed is one
  which could cause file system corruptions when performing file punch
  operations."

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: avoid hang when mounting non-journal filesystems with orphan list
  ext4: lock i_mutex when truncating orphan inodes
  ext4: do not try to write superblock on ro remount w/o journal
  ext4: include journal blocks in df overhead calcs
  ext4: remove unaligned AIO warning printk
  ext4: fix an incorrect comment about i_mutex
  ext4: fix deadlock in journal_unmap_buffer()
  ext4: split off ext4_journalled_invalidatepage()
  jbd2: fix assertion failure in jbd2_journal_flush()
  ext4: check dioread_nolock on remount
  ext4: fix extent tree corruption caused by hole punch

fs/ext4/extents.c
fs/ext4/file.c
fs/ext4/fsync.c
fs/ext4/inode.c
fs/ext4/namei.c
fs/ext4/super.c
fs/jbd2/transaction.c
include/linux/jbd2.h
include/trace/events/ext4.h

index 26af22832a846d43d94c20e67e92a2ab825d835c..5ae1674ec12f16b7eea6635244f9b68f252d9ba2 100644 (file)
@@ -2226,13 +2226,14 @@ errout:
  * removes index from the index block.
  */
 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
-                       struct ext4_ext_path *path)
+                       struct ext4_ext_path *path, int depth)
 {
        int err;
        ext4_fsblk_t leaf;
 
        /* free index block */
-       path--;
+       depth--;
+       path = path + depth;
        leaf = ext4_idx_pblock(path->p_idx);
        if (unlikely(path->p_hdr->eh_entries == 0)) {
                EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
@@ -2257,6 +2258,19 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 
        ext4_free_blocks(handle, inode, NULL, leaf, 1,
                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+
+       while (--depth >= 0) {
+               if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
+                       break;
+               path--;
+               err = ext4_ext_get_access(handle, inode, path);
+               if (err)
+                       break;
+               path->p_idx->ei_block = (path+1)->p_idx->ei_block;
+               err = ext4_ext_dirty(handle, inode, path);
+               if (err)
+                       break;
+       }
        return err;
 }
 
@@ -2599,7 +2613,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
        /* if this leaf is free, then we should
         * remove it from index block above */
        if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
-               err = ext4_ext_rm_idx(handle, inode, path + depth);
+               err = ext4_ext_rm_idx(handle, inode, path, depth);
 
 out:
        return err;
@@ -2802,7 +2816,7 @@ again:
                                /* index is empty, remove it;
                                 * handle must be already prepared by the
                                 * truncatei_leaf() */
-                               err = ext4_ext_rm_idx(handle, inode, path + i);
+                               err = ext4_ext_rm_idx(handle, inode, path, i);
                        }
                        /* root level has p_bh == NULL, brelse() eats this */
                        brelse(path[i].p_bh);
index d07c27ca594a4578887ed898e71c9b60684730df..405565a62277c5bbbf9d5ea91efbeea4118531b2 100644 (file)
@@ -108,14 +108,6 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
 
        /* Unaligned direct AIO must be serialized; see comment above */
        if (unaligned_aio) {
-               static unsigned long unaligned_warn_time;
-
-               /* Warn about this once per day */
-               if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
-                       ext4_msg(inode->i_sb, KERN_WARNING,
-                                "Unaligned AIO/DIO on inode %ld by %s; "
-                                "performance will be poor.",
-                                inode->i_ino, current->comm);
                mutex_lock(ext4_aio_mutex(inode));
                ext4_unwritten_wait(inode);
        }
index dfbc1fe9667487518d965ecb361d1724f822f983..3278e64e57b61ac51a41db3ceebeecd21003985a 100644 (file)
@@ -109,8 +109,6 @@ static int __sync_inode(struct inode *inode, int datasync)
  *
  * What we do is just kick off a commit and wait on it.  This will snapshot the
  * inode to disk.
- *
- * i_mutex lock is held when entering and exiting this function
  */
 
 int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
index cb1c1ab2720bd6c08c641879adb5ac77c11ca104..cbfe13bf5b2aa3f39b4845fe4fce4f45a02b5f43 100644 (file)
@@ -2880,8 +2880,6 @@ static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offs
 
 static void ext4_invalidatepage(struct page *page, unsigned long offset)
 {
-       journal_t *journal = EXT4_JOURNAL(page->mapping->host);
-
        trace_ext4_invalidatepage(page, offset);
 
        /*
@@ -2889,16 +2887,34 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
         */
        if (ext4_should_dioread_nolock(page->mapping->host))
                ext4_invalidatepage_free_endio(page, offset);
+
+       /* No journalling happens on data buffers when this function is used */
+       WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
+
+       block_invalidatepage(page, offset);
+}
+
+static int __ext4_journalled_invalidatepage(struct page *page,
+                                           unsigned long offset)
+{
+       journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+
+       trace_ext4_journalled_invalidatepage(page, offset);
+
        /*
         * If it's a full truncate we just forget about the pending dirtying
         */
        if (offset == 0)
                ClearPageChecked(page);
 
-       if (journal)
-               jbd2_journal_invalidatepage(journal, page, offset);
-       else
-               block_invalidatepage(page, offset);
+       return jbd2_journal_invalidatepage(journal, page, offset);
+}
+
+/* Wrapper for aops... */
+static void ext4_journalled_invalidatepage(struct page *page,
+                                          unsigned long offset)
+{
+       WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0);
 }
 
 static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3264,7 +3280,7 @@ static const struct address_space_operations ext4_journalled_aops = {
        .write_end              = ext4_journalled_write_end,
        .set_page_dirty         = ext4_journalled_set_page_dirty,
        .bmap                   = ext4_bmap,
-       .invalidatepage         = ext4_invalidatepage,
+       .invalidatepage         = ext4_journalled_invalidatepage,
        .releasepage            = ext4_releasepage,
        .direct_IO              = ext4_direct_IO,
        .is_partially_uptodate  = block_is_partially_uptodate,
@@ -4304,6 +4320,47 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
        return err;
 }
 
+/*
+ * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
+ * buffers that are attached to a page stradding i_size and are undergoing
+ * commit. In that case we have to wait for commit to finish and try again.
+ */
+static void ext4_wait_for_tail_page_commit(struct inode *inode)
+{
+       struct page *page;
+       unsigned offset;
+       journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+       tid_t commit_tid = 0;
+       int ret;
+
+       offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+       /*
+        * All buffers in the last page remain valid? Then there's nothing to
+        * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE ==
+        * blocksize case
+        */
+       if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits))
+               return;
+       while (1) {
+               page = find_lock_page(inode->i_mapping,
+                                     inode->i_size >> PAGE_CACHE_SHIFT);
+               if (!page)
+                       return;
+               ret = __ext4_journalled_invalidatepage(page, offset);
+               unlock_page(page);
+               page_cache_release(page);
+               if (ret != -EBUSY)
+                       return;
+               commit_tid = 0;
+               read_lock(&journal->j_state_lock);
+               if (journal->j_committing_transaction)
+                       commit_tid = journal->j_committing_transaction->t_tid;
+               read_unlock(&journal->j_state_lock);
+               if (commit_tid)
+                       jbd2_log_wait_commit(journal, commit_tid);
+       }
+}
+
 /*
  * ext4_setattr()
  *
@@ -4417,16 +4474,28 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
        }
 
        if (attr->ia_valid & ATTR_SIZE) {
-               if (attr->ia_size != i_size_read(inode)) {
-                       truncate_setsize(inode, attr->ia_size);
-                       /* Inode size will be reduced, wait for dio in flight.
-                        * Temporarily disable dioread_nolock to prevent
-                        * livelock. */
+               if (attr->ia_size != inode->i_size) {
+                       loff_t oldsize = inode->i_size;
+
+                       i_size_write(inode, attr->ia_size);
+                       /*
+                        * Blocks are going to be removed from the inode. Wait
+                        * for dio in flight.  Temporarily disable
+                        * dioread_nolock to prevent livelock.
+                        */
                        if (orphan) {
-                               ext4_inode_block_unlocked_dio(inode);
-                               inode_dio_wait(inode);
-                               ext4_inode_resume_unlocked_dio(inode);
+                               if (!ext4_should_journal_data(inode)) {
+                                       ext4_inode_block_unlocked_dio(inode);
+                                       inode_dio_wait(inode);
+                                       ext4_inode_resume_unlocked_dio(inode);
+                               } else
+                                       ext4_wait_for_tail_page_commit(inode);
                        }
+                       /*
+                        * Truncate pagecache after we've waited for commit
+                        * in data=journal mode to make pages freeable.
+                        */
+                       truncate_pagecache(inode, oldsize, inode->i_size);
                }
                ext4_truncate(inode);
        }
index cac44828233159bfeb1d93c502943fe82b813b2a..8990165346ee6aa01a7f9a3264a092fc9669e85b 100644 (file)
@@ -2648,7 +2648,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
        struct ext4_iloc iloc;
        int err = 0;
 
-       if (!EXT4_SB(inode->i_sb)->s_journal)
+       if ((!EXT4_SB(inode->i_sb)->s_journal) &&
+           !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS))
                return 0;
 
        mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
index 3cdb0a2fc64856b041ec4b5f184fb41a5a346771..3d4fb81bacd540ca7b81fd4005f392aa7a516b53 100644 (file)
@@ -1645,9 +1645,7 @@ static int parse_options(char *options, struct super_block *sb,
                         unsigned int *journal_ioprio,
                         int is_remount)
 {
-#ifdef CONFIG_QUOTA
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-#endif
        char *p;
        substring_t args[MAX_OPT_ARGS];
        int token;
@@ -1696,6 +1694,16 @@ static int parse_options(char *options, struct super_block *sb,
                }
        }
 #endif
+       if (test_opt(sb, DIOREAD_NOLOCK)) {
+               int blocksize =
+                       BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
+
+               if (blocksize < PAGE_CACHE_SIZE) {
+                       ext4_msg(sb, KERN_ERR, "can't mount with "
+                                "dioread_nolock if block size != PAGE_SIZE");
+                       return 0;
+               }
+       }
        return 1;
 }
 
@@ -2212,7 +2220,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
                                __func__, inode->i_ino, inode->i_size);
                        jbd_debug(2, "truncating inode %lu to %lld bytes\n",
                                  inode->i_ino, inode->i_size);
+                       mutex_lock(&inode->i_mutex);
                        ext4_truncate(inode);
+                       mutex_unlock(&inode->i_mutex);
                        nr_truncates++;
                } else {
                        ext4_msg(sb, KERN_DEBUG,
@@ -3223,6 +3233,10 @@ int ext4_calculate_overhead(struct super_block *sb)
                        memset(buf, 0, PAGE_SIZE);
                cond_resched();
        }
+       /* Add the journal blocks as well */
+       if (sbi->s_journal)
+               overhead += EXT4_B2C(sbi, sbi->s_journal->j_maxlen);
+
        sbi->s_overhead = overhead;
        smp_wmb();
        free_page((unsigned long) buf);
@@ -3436,15 +3450,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                        clear_opt(sb, DELALLOC);
        }
 
-       blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
-       if (test_opt(sb, DIOREAD_NOLOCK)) {
-               if (blocksize < PAGE_SIZE) {
-                       ext4_msg(sb, KERN_ERR, "can't mount with "
-                                "dioread_nolock if block size != PAGE_SIZE");
-                       goto failed_mount;
-               }
-       }
-
        sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
                (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 
@@ -3486,6 +3491,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
                goto failed_mount;
 
+       blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
        if (blocksize < EXT4_MIN_BLOCK_SIZE ||
            blocksize > EXT4_MAX_BLOCK_SIZE) {
                ext4_msg(sb, KERN_ERR,
@@ -4725,7 +4731,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
        }
 
        ext4_setup_system_zone(sb);
-       if (sbi->s_journal == NULL)
+       if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY))
                ext4_commit_super(sb, 1);
 
 #ifdef CONFIG_QUOTA
index 42f6615af0ac47d984fc5084623570d231832cfb..df9f29760efa99931bef9fdd9609e7580d8852cd 100644 (file)
@@ -209,7 +209,8 @@ repeat:
                if (!new_transaction)
                        goto alloc_transaction;
                write_lock(&journal->j_state_lock);
-               if (!journal->j_running_transaction) {
+               if (!journal->j_running_transaction &&
+                   !journal->j_barrier_count) {
                        jbd2_get_transaction(journal, new_transaction);
                        new_transaction = NULL;
                }
@@ -1839,7 +1840,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
 
        BUFFER_TRACE(bh, "entry");
 
-retry:
        /*
         * It is safe to proceed here without the j_list_lock because the
         * buffers cannot be stolen by try_to_free_buffers as long as we are
@@ -1934,14 +1934,11 @@ retry:
                 * for commit and try again.
                 */
                if (partial_page) {
-                       tid_t tid = journal->j_committing_transaction->t_tid;
-
                        jbd2_journal_put_journal_head(jh);
                        spin_unlock(&journal->j_list_lock);
                        jbd_unlock_bh_state(bh);
                        write_unlock(&journal->j_state_lock);
-                       jbd2_log_wait_commit(journal, tid);
-                       goto retry;
+                       return -EBUSY;
                }
                /*
                 * OK, buffer won't be reachable after truncate. We just set
@@ -2002,21 +1999,23 @@ zap_buffer_unlocked:
  * @page:    page to flush
  * @offset:  length of page to invalidate.
  *
- * Reap page buffers containing data after offset in page.
- *
+ * Reap page buffers containing data after offset in page. Can return -EBUSY
+ * if buffers are part of the committing transaction and the page is straddling
+ * i_size. Caller then has to wait for current commit and try again.
  */
-void jbd2_journal_invalidatepage(journal_t *journal,
-                     struct page *page,
-                     unsigned long offset)
+int jbd2_journal_invalidatepage(journal_t *journal,
+                               struct page *page,
+                               unsigned long offset)
 {
        struct buffer_head *head, *bh, *next;
        unsigned int curr_off = 0;
        int may_free = 1;
+       int ret = 0;
 
        if (!PageLocked(page))
                BUG();
        if (!page_has_buffers(page))
-               return;
+               return 0;
 
        /* We will potentially be playing with lists other than just the
         * data lists (especially for journaled data mode), so be
@@ -2030,9 +2029,11 @@ void jbd2_journal_invalidatepage(journal_t *journal,
                if (offset <= curr_off) {
                        /* This block is wholly outside the truncation point */
                        lock_buffer(bh);
-                       may_free &= journal_unmap_buffer(journal, bh,
-                                                        offset > 0);
+                       ret = journal_unmap_buffer(journal, bh, offset > 0);
                        unlock_buffer(bh);
+                       if (ret < 0)
+                               return ret;
+                       may_free &= ret;
                }
                curr_off = next_off;
                bh = next;
@@ -2043,6 +2044,7 @@ void jbd2_journal_invalidatepage(journal_t *journal,
                if (may_free && try_to_free_buffers(page))
                        J_ASSERT(!page_has_buffers(page));
        }
+       return 0;
 }
 
 /*
index 1be23d9fdacb5151a6af6f8b30b2086e50c50bb1..e30b66346942a90a4c79cdc5a0362b3899db0521 100644 (file)
@@ -1098,7 +1098,7 @@ void               jbd2_journal_set_triggers(struct buffer_head *,
 extern int      jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
 extern int      jbd2_journal_forget (handle_t *, struct buffer_head *);
 extern void     journal_sync_buffer (struct buffer_head *);
-extern void     jbd2_journal_invalidatepage(journal_t *,
+extern int      jbd2_journal_invalidatepage(journal_t *,
                                struct page *, unsigned long);
 extern int      jbd2_journal_try_to_free_buffers(journal_t *, struct page *, gfp_t);
 extern int      jbd2_journal_stop(handle_t *);
index f6372b01136657cd5a0b9213090733c87347250e..7e8c36bc708225c0f3eb79c0b33977c1dcb17519 100644 (file)
@@ -451,7 +451,7 @@ DEFINE_EVENT(ext4__page_op, ext4_releasepage,
        TP_ARGS(page)
 );
 
-TRACE_EVENT(ext4_invalidatepage,
+DECLARE_EVENT_CLASS(ext4_invalidatepage_op,
        TP_PROTO(struct page *page, unsigned long offset),
 
        TP_ARGS(page, offset),
@@ -477,6 +477,18 @@ TRACE_EVENT(ext4_invalidatepage,
                  (unsigned long) __entry->index, __entry->offset)
 );
 
+DEFINE_EVENT(ext4_invalidatepage_op, ext4_invalidatepage,
+       TP_PROTO(struct page *page, unsigned long offset),
+
+       TP_ARGS(page, offset)
+);
+
+DEFINE_EVENT(ext4_invalidatepage_op, ext4_journalled_invalidatepage,
+       TP_PROTO(struct page *page, unsigned long offset),
+
+       TP_ARGS(page, offset)
+);
+
 TRACE_EVENT(ext4_discard_blocks,
        TP_PROTO(struct super_block *sb, unsigned long long blk,
                        unsigned long long count),