Merge tag 'xfs-for-linus-3.15-rc2' of git://oss.sgi.com/xfs/xfs

author Linus Torvalds <torvalds@linux-foundation.org>

Fri, 18 Apr 2014 17:17:37 +0000 (10:17 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 18 Apr 2014 17:17:37 +0000 (10:17 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 18 Apr 2014 17:17:37 +0000 (10:17 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 18 Apr 2014 17:17:37 +0000 (10:17 -0700)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c

index 75df77d09f757d4786c889679c35c5cac2ca00b3..0479c32c5eb1703ba0081f21af9b380e5a3c95cb 100644 (file)
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1344,6 +1344,14 @@ __xfs_get_blocks(
         /*
          * If this is O_DIRECT or the mpage code calling tell them how large
          * the mapping is, so that we can avoid repeated get_blocks calls.
+        *
+        * If the mapping spans EOF, then we have to break the mapping up as the
+        * mapping for blocks beyond EOF must be marked new so that sub block
+        * regions can be correctly zeroed. We can't do this for mappings within
+        * EOF unless the mapping was just allocated or is unwritten, otherwise
+        * the callers would overwrite existing data with zeros. Hence we have
+        * to split the mapping into a range up to and including EOF, and a
+        * second mapping for beyond EOF.
          */
         if (direct || size > (1 << inode->i_blkbits)) {
                 xfs_off_t               mapping_size;
@@ -1354,6 +1362,12 @@ __xfs_get_blocks(
                 ASSERT(mapping_size > 0);
                 if (mapping_size > size)
                         mapping_size = size;
+               if (offset < i_size_read(inode) &&
+                   offset + mapping_size >= i_size_read(inode)) {
+                       /* limit mapping to block that spans EOF */
+                       mapping_size = roundup_64(i_size_read(inode) - offset,
+                                                 1 << inode->i_blkbits);
+               }
                 if (mapping_size > LONG_MAX)
                         mapping_size = LONG_MAX;
  
@@ -1566,6 +1580,16 @@ xfs_vm_write_failed(
  
                 xfs_vm_kill_delalloc_range(inode, block_offset,
                                            block_offset + bh->b_size);
+
+               /*
+                * This buffer does not contain data anymore. Make sure anyone
+                * who finds it knows that for certain.
+                */
+               clear_buffer_delay(bh);
+               clear_buffer_uptodate(bh);
+               clear_buffer_mapped(bh);
+               clear_buffer_new(bh);
+               clear_buffer_dirty(bh);
         }
  
  }
@@ -1599,12 +1623,21 @@ xfs_vm_write_begin(
         status = __block_write_begin(page, pos, len, xfs_get_blocks);
         if (unlikely(status)) {
                 struct inode    *inode = mapping->host;
+               size_t          isize = i_size_read(inode);
  
                 xfs_vm_write_failed(inode, page, pos, len);
                 unlock_page(page);
  
-               if (pos + len > i_size_read(inode))
-                       truncate_pagecache(inode, i_size_read(inode));
+               /*
+                * If the write is beyond EOF, we only want to kill blocks
+                * allocated in this write, not blocks that were previously
+                * written successfully.
+                */
+               if (pos + len > isize) {
+                       ssize_t start = max_t(ssize_t, pos, isize);
+
+                       truncate_pagecache_range(inode, start, pos + len);
+               }
  
                 page_cache_release(page);
                 page = NULL;
@@ -1615,9 +1648,12 @@ xfs_vm_write_begin(
  }
  
  /*
- * On failure, we only need to kill delalloc blocks beyond EOF because they
- * will never be written. For blocks within EOF, generic_write_end() zeros them
- * so they are safe to leave alone and be written with all the other valid data.
+ * On failure, we only need to kill delalloc blocks beyond EOF in the range of
+ * this specific write because they will never be written. Previous writes
+ * beyond EOF where block allocation succeeded do not need to be trashed, so
+ * only new blocks from this write should be trashed. For blocks within
+ * EOF, generic_write_end() zeros them so they are safe to leave alone and be
+ * written with all the other valid data.
   */
  STATIC int
  xfs_vm_write_end(
@@ -1640,8 +1676,11 @@ xfs_vm_write_end(
                 loff_t          to = pos + len;
  
                 if (to > isize) {
-                       truncate_pagecache(inode, isize);
+                       /* only kill blocks in this write beyond EOF */
+                       if (pos > isize)
+                               isize = pos;
                         xfs_vm_kill_delalloc_range(inode, isize, to);
+                       truncate_pagecache_range(inode, isize, to);
                 }
         }
         return ret;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c

index 5b6092ef51efa9eb6e02c980980c6aa99e486170..f0efc7e970ef10658e01be3def9f4b55de0ac6ab 100644 (file)
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5413,6 +5413,7 @@ xfs_bmap_shift_extents(
         int                             whichfork = XFS_DATA_FORK;
         int                             logflags;
         xfs_filblks_t                   blockcount = 0;
+       int                             total_extents;
  
         if (unlikely(XFS_TEST_ERROR(
             (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5429,7 +5430,6 @@ xfs_bmap_shift_extents(
         ASSERT(current_ext != NULL);
  
         ifp = XFS_IFORK_PTR(ip, whichfork);
-
         if (!(ifp->if_flags & XFS_IFEXTENTS)) {
                 /* Read in all the extents */
                 error = xfs_iread_extents(tp, ip, whichfork);
@@ -5456,7 +5456,6 @@ xfs_bmap_shift_extents(
  
         /* We are going to change core inode */
         logflags = XFS_ILOG_CORE;
-
         if (ifp->if_flags & XFS_IFBROOT) {
                 cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
                 cur->bc_private.b.firstblock = *firstblock;
@@ -5467,8 +5466,14 @@ xfs_bmap_shift_extents(
                 logflags |= XFS_ILOG_DEXT;
         }
  
-       while (nexts++ < num_exts &&
-              *current_ext <  XFS_IFORK_NEXTENTS(ip, whichfork)) {
+       /*
+        * There may be delalloc extents in the data fork before the range we
+        * are collapsing out, so we cannot
+        * use the count of real extents here. Instead we have to calculate it
+        * from the incore fork.
+        */
+       total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+       while (nexts++ < num_exts && *current_ext < total_extents) {
  
                 gotp = xfs_iext_get_ext(ifp, *current_ext);
                 xfs_bmbt_get_all(gotp, &got);
@@ -5556,10 +5561,11 @@ xfs_bmap_shift_extents(
                 }
  
                 (*current_ext)++;
+               total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
         }
  
         /* Check if we are done */
-       if (*current_ext ==  XFS_IFORK_NEXTENTS(ip, whichfork))
+       if (*current_ext == total_extents)
                 *done = 1;
  
  del_cursor:
@@ -5568,6 +5574,5 @@ del_cursor:
                         error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
  
         xfs_trans_log_inode(tp, ip, logflags);
-
         return error;
  }
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c

index 01f6a646caa121895265cfe33a7d97860f786a18..296160b8e78c694c811e4e4391a06ced33bb3eef 100644 (file)
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1418,6 +1418,8 @@ xfs_zero_file_space(
         xfs_off_t               end_boundary;
         int                     error;
  
+       trace_xfs_zero_file_space(ip);
+
         granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
  
         /*
@@ -1432,9 +1434,18 @@ xfs_zero_file_space(
         ASSERT(end_boundary <= offset + len);
  
         if (start_boundary < end_boundary - 1) {
-               /* punch out the page cache over the conversion range */
+               /*
+                * punch out delayed allocation blocks and the page cache over
+                * the conversion range
+                */
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               error = xfs_bmap_punch_delalloc_range(ip,
+                               XFS_B_TO_FSBT(mp, start_boundary),
+                               XFS_B_TO_FSB(mp, end_boundary - start_boundary));
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
                 truncate_pagecache_range(VFS_I(ip), start_boundary,
                                          end_boundary - 1);
+
                 /* convert the blocks */
                 error = xfs_alloc_file_space(ip, start_boundary,
                                         end_boundary - start_boundary - 1,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c

index 107f2fdfe41fb9ef0e6941bd60542ee12a0510e3..cb10a0aaab3aa7b55f9a6735cf70500d509843f3 100644 (file)
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1372,21 +1372,29 @@ xfs_buf_iorequest(
                 xfs_buf_wait_unpin(bp);
         xfs_buf_hold(bp);
  
-       /* Set the count to 1 initially, this will stop an I/O
+       /*
+        * Set the count to 1 initially, this will stop an I/O
          * completion callout which happens before we have started
          * all the I/O from calling xfs_buf_ioend too early.
          */
         atomic_set(&bp->b_io_remaining, 1);
         _xfs_buf_ioapply(bp);
-       _xfs_buf_ioend(bp, 1);
+       /*
+        * If _xfs_buf_ioapply failed, we'll get back here with
+        * only the reference we took above.  _xfs_buf_ioend will
+        * drop it to zero, so we'd better not queue it for later,
+        * or we'll free it before it's done.
+        */
+       _xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
  
         xfs_buf_rele(bp);
  }
  
  /*
   * Waits for I/O to complete on the buffer supplied.  It returns immediately if
- * no I/O is pending or there is already a pending error on the buffer.  It
- * returns the I/O error code, if any, or 0 if there was no error.
+ * no I/O is pending or there is already a pending error on the buffer, in which
+ * case nothing will ever complete.  It returns the I/O error code, if any, or
+ * 0 if there was no error.
   */
  int
  xfs_buf_iowait(
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c

index 79e96ce987331cad3aab2d0a7513ce19cad2fcb4..82afdcb33183951350df18d3ce05b3aeecdf3e76 100644 (file)
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -679,7 +679,7 @@ xfs_file_dio_aio_write(
                 goto out;
  
         if (mapping->nrpages) {
-               ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+               ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
                                                     pos, -1);
                 if (ret)
                         goto out;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c

index 5e7a38fa6ee6bd82e43c05f66cdc6d4b8404225e..768087bedbac58f9dea71b8f534c303e65972b42 100644 (file)
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1334,7 +1334,8 @@ int
  xfs_create_tmpfile(
         struct xfs_inode        *dp,
         struct dentry           *dentry,
-       umode_t                 mode)
+       umode_t                 mode,
+       struct xfs_inode        **ipp)
  {
         struct xfs_mount        *mp = dp->i_mount;
         struct xfs_inode        *ip = NULL;
@@ -1402,7 +1403,6 @@ xfs_create_tmpfile(
         xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
  
         ip->i_d.di_nlink--;
-       d_tmpfile(dentry, VFS_I(ip));
         error = xfs_iunlink(tp, ip);
         if (error)
                 goto out_trans_abort;
@@ -1415,6 +1415,7 @@ xfs_create_tmpfile(
         xfs_qm_dqrele(gdqp);
         xfs_qm_dqrele(pdqp);
  
+       *ipp = ip;
         return 0;
  
   out_trans_abort:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h

index 396cc1fafd0d5e358c5ea8ccc21d3525d2396bfa..f2fcde52b66db98c26286682796e341bca751b33 100644 (file)
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -334,7 +334,7 @@ int         xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
  int            xfs_create(struct xfs_inode *dp, struct xfs_name *name,
                            umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
  int            xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry,
-                          umode_t mode);
+                          umode_t mode, struct xfs_inode **ipp);
  int            xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
                            struct xfs_inode *ip);
  int            xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c

index 89b07e43ca28811349db39aa1ab2534de220fd87..ef1ca010f417713358c0d0f3869189121e2f0ff0 100644 (file)
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1053,11 +1053,25 @@ xfs_vn_tmpfile(
         struct dentry   *dentry,
         umode_t         mode)
  {
-       int             error;
+       int                     error;
+       struct xfs_inode        *ip;
+       struct inode            *inode;
  
-       error = xfs_create_tmpfile(XFS_I(dir), dentry, mode);
+       error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip);
+       if (unlikely(error))
+               return -error;
  
-       return -error;
+       inode = VFS_I(ip);
+
+       error = xfs_init_security(inode, dir, &dentry->d_name);
+       if (unlikely(error)) {
+               iput(inode);
+               return -error;
+       }
+
+       d_tmpfile(dentry, inode);
+
+       return 0;
  }
  
  static const struct inode_operations xfs_inode_operations = {
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c

index 8497a00e399d0ba5960117c977376ae4a23cbbb3..08624dc67317185b1e044fd7d67ef4573d941ef6 100644 (file)
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1181,11 +1181,14 @@ xlog_iodone(xfs_buf_t *bp)
         /* log I/O is always issued ASYNC */
         ASSERT(XFS_BUF_ISASYNC(bp));
         xlog_state_done_syncing(iclog, aborted);
+
         /*
-        * do not reference the buffer (bp) here as we could race
-        * with it being freed after writing the unmount record to the
-        * log.
+        * drop the buffer lock now that we are done. Nothing references
+        * the buffer after this, so an unmount waiting on this lock can now
+        * tear it down safely. As such, it is unsafe to reference the buffer
+        * (bp) after the unlock as we could race with it being freed.
          */
+       xfs_buf_unlock(bp);
  }
  
  /*
@@ -1368,8 +1371,16 @@ xlog_alloc_log(
         bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
         if (!bp)
                 goto out_free_log;
-       bp->b_iodone = xlog_iodone;
+
+       /*
+        * The iclogbuf buffer locks are held over IO but we are not going to do
+        * IO yet.  Hence unlock the buffer so that the log IO path can grab it
+        * when appropriate.
+        */
         ASSERT(xfs_buf_islocked(bp));
+       xfs_buf_unlock(bp);
+
+       bp->b_iodone = xlog_iodone;
         log->l_xbuf = bp;
  
         spin_lock_init(&log->l_icloglock);
@@ -1398,6 +1409,9 @@ xlog_alloc_log(
                 if (!bp)
                         goto out_free_iclog;
  
+               ASSERT(xfs_buf_islocked(bp));
+               xfs_buf_unlock(bp);
+
                 bp->b_iodone = xlog_iodone;
                 iclog->ic_bp = bp;
                 iclog->ic_data = bp->b_addr;
@@ -1422,7 +1436,6 @@ xlog_alloc_log(
                 iclog->ic_callback_tail = &(iclog->ic_callback);
                 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
  
-               ASSERT(xfs_buf_islocked(iclog->ic_bp));
                 init_waitqueue_head(&iclog->ic_force_wait);
                 init_waitqueue_head(&iclog->ic_write_wait);
  
@@ -1631,6 +1644,12 @@ xlog_cksum(
   * we transition the iclogs to IOERROR state *after* flushing all existing
   * iclogs to disk. This is because we don't want anymore new transactions to be
   * started or completed afterwards.
+ *
+ * We lock the iclogbufs here so that we can serialise against IO completion
+ * during unmount. We might be processing a shutdown triggered during unmount,
+ * and that can occur asynchronously to the unmount thread, and hence we need to
+ * ensure that completes before tearing down the iclogbufs. Hence we need to
+ * hold the buffer lock across the log IO to achieve that.
   */
  STATIC int
  xlog_bdstrat(
@@ -1638,6 +1657,7 @@ xlog_bdstrat(
  {
         struct xlog_in_core     *iclog = bp->b_fspriv;
  
+       xfs_buf_lock(bp);
         if (iclog->ic_state & XLOG_STATE_IOERROR) {
                 xfs_buf_ioerror(bp, EIO);
                 xfs_buf_stale(bp);
@@ -1645,7 +1665,8 @@ xlog_bdstrat(
                 /*
                  * It would seem logical to return EIO here, but we rely on
                  * the log state machine to propagate I/O errors instead of
-                * doing it here.
+                * doing it here. Similarly, IO completion will unlock the
+                * buffer, so we don't do it here.
                  */
                 return 0;
         }
@@ -1847,14 +1868,28 @@ xlog_dealloc_log(
         xlog_cil_destroy(log);
  
         /*
-        * always need to ensure that the extra buffer does not point to memory
-        * owned by another log buffer before we free it.
+        * Cycle all the iclogbuf locks to make sure all log IO completion
+        * is done before we tear down these buffers.
          */
+       iclog = log->l_iclog;
+       for (i = 0; i < log->l_iclog_bufs; i++) {
+               xfs_buf_lock(iclog->ic_bp);
+               xfs_buf_unlock(iclog->ic_bp);
+               iclog = iclog->ic_next;
+       }
+
+       /*
+        * Always need to ensure that the extra buffer does not point to memory
+        * owned by another log buffer before we free it. Also, cycle the lock
+        * first to ensure we've completed IO on it.
+        */
+       xfs_buf_lock(log->l_xbuf);
+       xfs_buf_unlock(log->l_xbuf);
         xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
         xfs_buf_free(log->l_xbuf);
  
         iclog = log->l_iclog;
-       for (i=0; i<log->l_iclog_bufs; i++) {
+       for (i = 0; i < log->l_iclog_bufs; i++) {
                 xfs_buf_free(iclog->ic_bp);
                 next_iclog = iclog->ic_next;
                 kmem_free(iclog);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h

index a4ae41c179a8a66a5772914a61642b8a53be1c4b..65d8c793a25cb10c5ded9bfb89fcfda725c58b5f 100644 (file)
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -603,6 +603,7 @@ DEFINE_INODE_EVENT(xfs_readlink);
  DEFINE_INODE_EVENT(xfs_inactive_symlink);
  DEFINE_INODE_EVENT(xfs_alloc_file_space);
  DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_zero_file_space);
  DEFINE_INODE_EVENT(xfs_collapse_file_space);
  DEFINE_INODE_EVENT(xfs_readdir);
  #ifdef CONFIG_XFS_POSIX_ACL
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 18 Apr 2014 17:17:37 +0000 (10:17 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 18 Apr 2014 17:17:37 +0000 (10:17 -0700)
fs/xfs/xfs_aops.c		patch \| blob \| blame \| history
fs/xfs/xfs_bmap.c		patch \| blob \| blame \| history
fs/xfs/xfs_bmap_util.c		patch \| blob \| blame \| history
fs/xfs/xfs_buf.c		patch \| blob \| blame \| history
fs/xfs/xfs_file.c		patch \| blob \| blame \| history
fs/xfs/xfs_inode.c		patch \| blob \| blame \| history
fs/xfs/xfs_inode.h		patch \| blob \| blame \| history
fs/xfs/xfs_iops.c		patch \| blob \| blame \| history
fs/xfs/xfs_log.c		patch \| blob \| blame \| history
fs/xfs/xfs_trace.h		patch \| blob \| blame \| history