Merge branch 'master' into upstream

[linux-2.6-block.git] / fs / ocfs2 / aops.c
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c

index bf931ba1d36478e323f3e1c4ebc54be12ea335fb..3d7c082a8f58288f8bdd2cfd1bd1f4fdf2fbab4d 100644 (file)
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -276,13 +276,29 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
         return ret;
  }
  
+/* This can also be called from ocfs2_write_zero_page() which has done
+ * it's own cluster locking. */
+int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
+                              unsigned from, unsigned to)
+{
+       int ret;
+
+       down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+       ret = block_prepare_write(page, from, to, ocfs2_get_block);
+
+       up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+       return ret;
+}
+
  /*
   * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
   * from loopback.  It must be able to perform its own locking around
   * ocfs2_get_block().
   */
-int ocfs2_prepare_write(struct file *file, struct page *page,
-                       unsigned from, unsigned to)
+static int ocfs2_prepare_write(struct file *file, struct page *page,
+                              unsigned from, unsigned to)
  {
         struct inode *inode = page->mapping->host;
         int ret;
@@ -295,11 +311,7 @@ int ocfs2_prepare_write(struct file *file, struct page *page,
                 goto out;
         }
  
-       down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-       ret = block_prepare_write(page, from, to, ocfs2_get_block);
-
-       up_read(&OCFS2_I(inode)->ip_alloc_sem);
+       ret = ocfs2_prepare_write_nolock(inode, page, from, to);
  
         ocfs2_meta_unlock(inode, 0);
  out:
@@ -379,31 +391,28 @@ out:
  static int ocfs2_commit_write(struct file *file, struct page *page,
                               unsigned from, unsigned to)
  {
-       int ret, extending = 0, locklevel = 0;
-       loff_t new_i_size;
+       int ret;
         struct buffer_head *di_bh = NULL;
         struct inode *inode = page->mapping->host;
         struct ocfs2_journal_handle *handle = NULL;
+       struct ocfs2_dinode *di;
  
         mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
  
         /* NOTE: ocfs2_file_aio_write has ensured that it's safe for
-        * us to sample inode->i_size here without the metadata lock:
+        * us to continue here without rechecking the I/O against
+        * changed inode values.
          *
          * 1) We're currently holding the inode alloc lock, so no
          *    nodes can change it underneath us.
          *
          * 2) We've had to take the metadata lock at least once
-        *    already to check for extending writes, hence insuring
-        *    that our current copy is also up to date.
+        *    already to check for extending writes, suid removal, etc.
+        *    The meta data update code then ensures that we don't get a
+        *    stale inode allocation image (i_size, i_clusters, etc).
          */
-       new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-       if (new_i_size > i_size_read(inode)) {
-               extending = 1;
-               locklevel = 1;
-       }
  
-       ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
+       ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, 1, page);
         if (ret != 0) {
                 mlog_errno(ret);
                 goto out;
@@ -415,23 +424,20 @@ static int ocfs2_commit_write(struct file *file, struct page *page,
                 goto out_unlock_meta;
         }
  
-       if (extending) {
-               handle = ocfs2_start_walk_page_trans(inode, page, from, to);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       handle = NULL;
-                       goto out_unlock_data;
-               }
+       handle = ocfs2_start_walk_page_trans(inode, page, from, to);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out_unlock_data;
+       }
  
-               /* Mark our buffer early. We'd rather catch this error up here
-                * as opposed to after a successful commit_write which would
-                * require us to set back inode->i_size. */
-               ret = ocfs2_journal_access(handle, inode, di_bh,
-                                          OCFS2_JOURNAL_ACCESS_WRITE);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       goto out_commit;
-               }
+       /* Mark our buffer early. We'd rather catch this error up here
+        * as opposed to after a successful commit_write which would
+        * require us to set back inode->i_size. */
+       ret = ocfs2_journal_access(handle, inode, di_bh,
+                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out_commit;
         }
  
         /* might update i_size */
@@ -441,37 +447,28 @@ static int ocfs2_commit_write(struct file *file, struct page *page,
                 goto out_commit;
         }
  
-       if (extending) {
-               loff_t size = (u64) i_size_read(inode);
-               struct ocfs2_dinode *di =
-                       (struct ocfs2_dinode *)di_bh->b_data;
+       di = (struct ocfs2_dinode *)di_bh->b_data;
  
-               /* ocfs2_mark_inode_dirty is too heavy to use here. */
-               inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
-               inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+       /* ocfs2_mark_inode_dirty() is too heavy to use here. */
+       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+       di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+       di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
  
-               di->i_size = cpu_to_le64(size);
-               di->i_ctime = di->i_mtime = 
-                               cpu_to_le64(inode->i_mtime.tv_sec);
-               di->i_ctime_nsec = di->i_mtime_nsec = 
-                               cpu_to_le32(inode->i_mtime.tv_nsec);
+       inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
+       di->i_size = cpu_to_le64((u64)i_size_read(inode));
  
-               ret = ocfs2_journal_dirty(handle, di_bh);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       goto out_commit;
-               }
+       ret = ocfs2_journal_dirty(handle, di_bh);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out_commit;
         }
  
-       BUG_ON(extending && (i_size_read(inode) != new_i_size));
-
  out_commit:
-       if (handle)
-               ocfs2_commit_trans(handle);
+       ocfs2_commit_trans(handle);
  out_unlock_data:
         ocfs2_data_unlock(inode, 1);
  out_unlock_meta:
-       ocfs2_meta_unlock(inode, locklevel);
+       ocfs2_meta_unlock(inode, 1);
  out:
         if (di_bh)
                 brelse(di_bh);
@@ -540,21 +537,14 @@ bail:
   *                                     fs_count, map_bh, dio->rw == WRITE);
   */
  static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
-                                    unsigned long max_blocks,
                                      struct buffer_head *bh_result, int create)
  {
         int ret;
         u64 vbo_max; /* file offset, max_blocks from iblock */
         u64 p_blkno;
         int contig_blocks;
-       unsigned char blocksize_bits;
-
-       if (!inode || !bh_result) {
-               mlog(ML_ERROR, "inode or bh_result is null\n");
-               return -EIO;
-       }
-
-       blocksize_bits = inode->i_sb->s_blocksize_bits;
+       unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
+       unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
  
         /* This function won't even be called if the request isn't all
          * nicely aligned and of the right size, so there's no need
@@ -625,16 +615,36 @@ static ssize_t ocfs2_direct_IO(int rw,
         int ret;
  
         mlog_entry_void();
+
+       /*
+        * We get PR data locks even for O_DIRECT.  This allows
+        * concurrent O_DIRECT I/O but doesn't let O_DIRECT with
+        * extending and buffered zeroing writes race.  If they did
+        * race then the buffered zeroing could be written back after
+        * the O_DIRECT I/O.  It's one thing to tell people not to mix
+        * buffered and O_DIRECT writes, but expecting them to
+        * understand that file extension is also an implicit buffered
+        * write is too much.  By getting the PR we force writeback of
+        * the buffered zeroing before proceeding.
+        */
+       ret = ocfs2_data_lock(inode, 0);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out;
+       }
+       ocfs2_data_unlock(inode, 0);
+
         ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
                                             inode->i_sb->s_bdev, iov, offset,
                                             nr_segs, 
                                             ocfs2_direct_IO_get_blocks,
                                             ocfs2_dio_end_io);
+out:
         mlog_exit(ret);
         return ret;
  }
  
-struct address_space_operations ocfs2_aops = {
+const struct address_space_operations ocfs2_aops = {
         .readpage       = ocfs2_readpage,
         .writepage      = ocfs2_writepage,
         .prepare_write  = ocfs2_prepare_write,