Merge branch 'master' into upstream
[linux-2.6-block.git] / fs / ocfs2 / aops.c
index bf931ba1d36478e323f3e1c4ebc54be12ea335fb..3d7c082a8f58288f8bdd2cfd1bd1f4fdf2fbab4d 100644 (file)
@@ -276,13 +276,29 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
        return ret;
 }
 
+/* This can also be called from ocfs2_write_zero_page() which has done
+ * it's own cluster locking. */
+int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
+                              unsigned from, unsigned to)
+{
+       int ret;
+
+       down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+       ret = block_prepare_write(page, from, to, ocfs2_get_block);
+
+       up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+       return ret;
+}
+
 /*
  * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
  * from loopback.  It must be able to perform its own locking around
  * ocfs2_get_block().
  */
-int ocfs2_prepare_write(struct file *file, struct page *page,
-                       unsigned from, unsigned to)
+static int ocfs2_prepare_write(struct file *file, struct page *page,
+                              unsigned from, unsigned to)
 {
        struct inode *inode = page->mapping->host;
        int ret;
@@ -295,11 +311,7 @@ int ocfs2_prepare_write(struct file *file, struct page *page,
                goto out;
        }
 
-       down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-       ret = block_prepare_write(page, from, to, ocfs2_get_block);
-
-       up_read(&OCFS2_I(inode)->ip_alloc_sem);
+       ret = ocfs2_prepare_write_nolock(inode, page, from, to);
 
        ocfs2_meta_unlock(inode, 0);
 out:
@@ -379,31 +391,28 @@ out:
 static int ocfs2_commit_write(struct file *file, struct page *page,
                              unsigned from, unsigned to)
 {
-       int ret, extending = 0, locklevel = 0;
-       loff_t new_i_size;
+       int ret;
        struct buffer_head *di_bh = NULL;
        struct inode *inode = page->mapping->host;
        struct ocfs2_journal_handle *handle = NULL;
+       struct ocfs2_dinode *di;
 
        mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
 
        /* NOTE: ocfs2_file_aio_write has ensured that it's safe for
-        * us to sample inode->i_size here without the metadata lock:
+        * us to continue here without rechecking the I/O against
+        * changed inode values.
         *
         * 1) We're currently holding the inode alloc lock, so no
         *    nodes can change it underneath us.
         *
         * 2) We've had to take the metadata lock at least once
-        *    already to check for extending writes, hence insuring
-        *    that our current copy is also up to date.
+        *    already to check for extending writes, suid removal, etc.
+        *    The meta data update code then ensures that we don't get a
+        *    stale inode allocation image (i_size, i_clusters, etc).
         */
-       new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-       if (new_i_size > i_size_read(inode)) {
-               extending = 1;
-               locklevel = 1;
-       }
 
-       ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
+       ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, 1, page);
        if (ret != 0) {
                mlog_errno(ret);
                goto out;
@@ -415,23 +424,20 @@ static int ocfs2_commit_write(struct file *file, struct page *page,
                goto out_unlock_meta;
        }
 
-       if (extending) {
-               handle = ocfs2_start_walk_page_trans(inode, page, from, to);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       handle = NULL;
-                       goto out_unlock_data;
-               }
+       handle = ocfs2_start_walk_page_trans(inode, page, from, to);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out_unlock_data;
+       }
 
-               /* Mark our buffer early. We'd rather catch this error up here
-                * as opposed to after a successful commit_write which would
-                * require us to set back inode->i_size. */
-               ret = ocfs2_journal_access(handle, inode, di_bh,
-                                          OCFS2_JOURNAL_ACCESS_WRITE);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       goto out_commit;
-               }
+       /* Mark our buffer early. We'd rather catch this error up here
+        * as opposed to after a successful commit_write which would
+        * require us to set back inode->i_size. */
+       ret = ocfs2_journal_access(handle, inode, di_bh,
+                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out_commit;
        }
 
        /* might update i_size */
@@ -441,37 +447,28 @@ static int ocfs2_commit_write(struct file *file, struct page *page,
                goto out_commit;
        }
 
-       if (extending) {
-               loff_t size = (u64) i_size_read(inode);
-               struct ocfs2_dinode *di =
-                       (struct ocfs2_dinode *)di_bh->b_data;
+       di = (struct ocfs2_dinode *)di_bh->b_data;
 
-               /* ocfs2_mark_inode_dirty is too heavy to use here. */
-               inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
-               inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+       /* ocfs2_mark_inode_dirty() is too heavy to use here. */
+       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+       di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+       di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
 
-               di->i_size = cpu_to_le64(size);
-               di->i_ctime = di->i_mtime = 
-                               cpu_to_le64(inode->i_mtime.tv_sec);
-               di->i_ctime_nsec = di->i_mtime_nsec = 
-                               cpu_to_le32(inode->i_mtime.tv_nsec);
+       inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
+       di->i_size = cpu_to_le64((u64)i_size_read(inode));
 
-               ret = ocfs2_journal_dirty(handle, di_bh);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       goto out_commit;
-               }
+       ret = ocfs2_journal_dirty(handle, di_bh);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out_commit;
        }
 
-       BUG_ON(extending && (i_size_read(inode) != new_i_size));
-
 out_commit:
-       if (handle)
-               ocfs2_commit_trans(handle);
+       ocfs2_commit_trans(handle);
 out_unlock_data:
        ocfs2_data_unlock(inode, 1);
 out_unlock_meta:
-       ocfs2_meta_unlock(inode, locklevel);
+       ocfs2_meta_unlock(inode, 1);
 out:
        if (di_bh)
                brelse(di_bh);
@@ -540,21 +537,14 @@ bail:
  *                                     fs_count, map_bh, dio->rw == WRITE);
  */
 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
-                                    unsigned long max_blocks,
                                     struct buffer_head *bh_result, int create)
 {
        int ret;
        u64 vbo_max; /* file offset, max_blocks from iblock */
        u64 p_blkno;
        int contig_blocks;
-       unsigned char blocksize_bits;
-
-       if (!inode || !bh_result) {
-               mlog(ML_ERROR, "inode or bh_result is null\n");
-               return -EIO;
-       }
-
-       blocksize_bits = inode->i_sb->s_blocksize_bits;
+       unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
+       unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
 
        /* This function won't even be called if the request isn't all
         * nicely aligned and of the right size, so there's no need
@@ -625,16 +615,36 @@ static ssize_t ocfs2_direct_IO(int rw,
        int ret;
 
        mlog_entry_void();
+
+       /*
+        * We get PR data locks even for O_DIRECT.  This allows
+        * concurrent O_DIRECT I/O but doesn't let O_DIRECT with
+        * extending and buffered zeroing writes race.  If they did
+        * race then the buffered zeroing could be written back after
+        * the O_DIRECT I/O.  It's one thing to tell people not to mix
+        * buffered and O_DIRECT writes, but expecting them to
+        * understand that file extension is also an implicit buffered
+        * write is too much.  By getting the PR we force writeback of
+        * the buffered zeroing before proceeding.
+        */
+       ret = ocfs2_data_lock(inode, 0);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out;
+       }
+       ocfs2_data_unlock(inode, 0);
+
        ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
                                            inode->i_sb->s_bdev, iov, offset,
                                            nr_segs, 
                                            ocfs2_direct_IO_get_blocks,
                                            ocfs2_dio_end_io);
+out:
        mlog_exit(ret);
        return ret;
 }
 
-struct address_space_operations ocfs2_aops = {
+const struct address_space_operations ocfs2_aops = {
        .readpage       = ocfs2_readpage,
        .writepage      = ocfs2_writepage,
        .prepare_write  = ocfs2_prepare_write,