Merge tag 'xfs-4.20-fixes-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

author Linus Torvalds <torvalds@linux-foundation.org>

Sat, 24 Nov 2018 17:11:52 +0000 (09:11 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 24 Nov 2018 17:11:52 +0000 (09:11 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 24 Nov 2018 17:11:52 +0000 (09:11 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 24 Nov 2018 17:11:52 +0000 (09:11 -0800)
diff --git a/fs/iomap.c b/fs/iomap.c

index 64ce240217a18985dd0510fd968b257dce214cd4..3ffb776fbebe33492be0c05c5dee41f5f166b753 100644 (file)
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -142,13 +142,14 @@ static void
  iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
                 loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
  {
+       loff_t orig_pos = *pos;
+       loff_t isize = i_size_read(inode);
         unsigned block_bits = inode->i_blkbits;
         unsigned block_size = (1 << block_bits);
         unsigned poff = offset_in_page(*pos);
         unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
         unsigned first = poff >> block_bits;
         unsigned last = (poff + plen - 1) >> block_bits;
-       unsigned end = offset_in_page(i_size_read(inode)) >> block_bits;
  
         /*
          * If the block size is smaller than the page size we need to check the
@@ -183,8 +184,12 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
          * handle both halves separately so that we properly zero data in the
          * page cache for blocks that are entirely outside of i_size.
          */
-       if (first <= end && last > end)
-               plen -= (last - end) * block_size;
+       if (orig_pos <= isize && orig_pos + length > isize) {
+               unsigned end = offset_in_page(isize - 1) >> block_bits;
+
+               if (first <= end && last > end)
+                       plen -= (last - end) * block_size;
+       }
  
         *offp = poff;
         *lenp = plen;
@@ -1580,7 +1585,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
         struct bio *bio;
         bool need_zeroout = false;
         bool use_fua = false;
-       int nr_pages, ret;
+       int nr_pages, ret = 0;
         size_t copied = 0;
  
         if ((pos | length | align) & ((1 << blkbits) - 1))
@@ -1596,12 +1601,13 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
  
         if (iomap->flags & IOMAP_F_NEW) {
                 need_zeroout = true;
-       } else {
+       } else if (iomap->type == IOMAP_MAPPED) {
                 /*
-                * Use a FUA write if we need datasync semantics, this
-                * is a pure data IO that doesn't require any metadata
-                * updates and the underlying device supports FUA. This
-                * allows us to avoid cache flushes on IO completion.
+                * Use a FUA write if we need datasync semantics, this is a pure
+                * data IO that doesn't require any metadata updates (including
+                * after IO completion such as unwritten extent conversion) and
+                * the underlying device supports FUA. This allows us to avoid
+                * cache flushes on IO completion.
                  */
                 if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
                     (dio->flags & IOMAP_DIO_WRITE_FUA) &&
@@ -1644,8 +1650,14 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
  
                 ret = bio_iov_iter_get_pages(bio, &iter);
                 if (unlikely(ret)) {
+                       /*
+                        * We have to stop part way through an IO. We must fall
+                        * through to the sub-block tail zeroing here, otherwise
+                        * this short IO may expose stale data in the tail of
+                        * the block we haven't written data to.
+                        */
                         bio_put(bio);
-                       return copied ? copied : ret;
+                       goto zero_tail;
                 }
  
                 n = bio->bi_iter.bi_size;
@@ -1676,13 +1688,21 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
                 dio->submit.cookie = submit_bio(bio);
         } while (nr_pages);
  
-       if (need_zeroout) {
+       /*
+        * We need to zeroout the tail of a sub-block write if the extent type
+        * requires zeroing or the write extends beyond EOF. If we don't zero
+        * the block tail in the latter case, we can expose stale data via mmap
+        * reads of the EOF block.
+        */
+zero_tail:
+       if (need_zeroout ||
+           ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
                 /* zero out from the end of the write to the end of the block */
                 pad = pos & (fs_block_size - 1);
                 if (pad)
                         iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
         }
-       return copied;
+       return copied ? copied : ret;
  }
  
  static loff_t
@@ -1857,6 +1877,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                                 dio->wait_for_completion = true;
                                 ret = 0;
                         }
+
+                       /*
+                        * Splicing to pipes can fail on a full pipe. We have to
+                        * swallow this to make it look like a short IO
+                        * otherwise the higher splice layers will completely
+                        * mishandle the error and stop moving data.
+                        */
+                       if (ret == -EFAULT)
+                               ret = 0;
                         break;
                 }
                 pos += ret;
diff --git a/fs/read_write.c b/fs/read_write.c

index bfcb4ced5664c00f2fab706ba483094a4bc1ca5a..4dae0399c75a7227f8c53a5d2a06d985fa31b3a0 100644 (file)
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -2094,17 +2094,18 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
         off = same->src_offset;
         len = same->src_length;
  
-       ret = -EISDIR;
         if (S_ISDIR(src->i_mode))
-               goto out;
+               return -EISDIR;
  
-       ret = -EINVAL;
         if (!S_ISREG(src->i_mode))
-               goto out;
+               return -EINVAL;
+
+       if (!file->f_op->remap_file_range)
+               return -EOPNOTSUPP;
  
         ret = remap_verify_area(file, off, len, false);
         if (ret < 0)
-               goto out;
+               return ret;
         ret = 0;
  
         if (off + len > i_size_read(src))
@@ -2147,10 +2148,8 @@ next_fdput:
                 fdput(dst_fd);
  next_loop:
                 if (fatal_signal_pending(current))
-                       goto out;
+                       break;
         }
-
-out:
         return ret;
  }
  EXPORT_SYMBOL(vfs_dedupe_file_range);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c

index 74d7228e755b3ade097457c0ae68e07e252e67a4..19e921d1586f273f7f234d0c32a8982683821f69 100644 (file)
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1694,10 +1694,13 @@ xfs_bmap_add_extent_delay_real(
         case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
                 /*
                  * Filling in all of a previously delayed allocation extent.
-                * The right neighbor is contiguous, the left is not.
+                * The right neighbor is contiguous, the left is not. Take care
+                * with delay -> unwritten extent allocation here because the
+                * delalloc record we are overwriting is always written.
                  */
                 PREV.br_startblock = new->br_startblock;
                 PREV.br_blockcount += RIGHT.br_blockcount;
+               PREV.br_state = new->br_state;
  
                 xfs_iext_next(ifp, &bma->icur);
                 xfs_iext_remove(bma->ip, &bma->icur, state);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c

index 86c50208a14374e2a1588b5686df8d30dc677c57..7fbf8af0b15949fb1e329e270cf66ef9f519eb3f 100644 (file)
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -538,15 +538,18 @@ xfs_inobt_rec_check_count(
  
  static xfs_extlen_t
  xfs_inobt_max_size(
-       struct xfs_mount        *mp)
+       struct xfs_mount        *mp,
+       xfs_agnumber_t          agno)
  {
+       xfs_agblock_t           agblocks = xfs_ag_block_count(mp, agno);
+
         /* Bail out if we're uninitialized, which can happen in mkfs. */
         if (mp->m_inobt_mxr[0] == 0)
                 return 0;
  
         return xfs_btree_calc_size(mp->m_inobt_mnr,
-               (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
-                               XFS_INODES_PER_CHUNK);
+                               (uint64_t)agblocks * mp->m_sb.sb_inopblock /
+                                       XFS_INODES_PER_CHUNK);
  }
  
  static int
@@ -594,7 +597,7 @@ xfs_finobt_calc_reserves(
         if (error)
                 return error;
  
-       *ask += xfs_inobt_max_size(mp);
+       *ask += xfs_inobt_max_size(mp, agno);
         *used += tree_len;
         return 0;
  }
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c

index 5d263dfdb3bcc60ca30708622397e42de9fbaeac..404e581f1ea1e879e4a32099d303e53a89fbb91a 100644 (file)
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1042,7 +1042,7 @@ out_trans_cancel:
         goto out_unlock;
  }
  
-static int
+int
  xfs_flush_unmap_range(
         struct xfs_inode        *ip,
         xfs_off_t               offset,
@@ -1195,13 +1195,7 @@ xfs_prepare_shift(
          * Writeback and invalidate cache for the remainder of the file as we're
          * about to shift down every extent from offset to EOF.
          */
-       error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1);
-       if (error)
-               return error;
-       error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
-                                       offset >> PAGE_SHIFT, -1);
-       if (error)
-               return error;
+       error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip));
  
         /*
          * Clean out anything hanging around in the cow fork now that
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h

index 87363d136bb618145c396223da5b4755bdd572c8..7a78229cf1a79807c7794e5f468eff6e4bf8ff7f 100644 (file)
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -80,4 +80,7 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
                           int whichfork, xfs_extnum_t *nextents,
                           xfs_filblks_t *count);
  
+int    xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset,
+                             xfs_off_t len);
+
  #endif /* __XFS_BMAP_UTIL_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c

index 12d8455bfbb29114887744046d52cb75428bc911..010db5f8fb00f81deb3524c6356b35f71a2fd7d3 100644 (file)
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1233,9 +1233,23 @@ xfs_buf_iodone(
  }
  
  /*
- * Requeue a failed buffer for writeback
+ * Requeue a failed buffer for writeback.
   *
- * Return true if the buffer has been re-queued properly, false otherwise
+ * We clear the log item failed state here as well, but we have to be careful
+ * about reference counts because the only active reference counts on the buffer
+ * may be the failed log items. Hence if we clear the log item failed state
+ * before queuing the buffer for IO we can release all active references to
+ * the buffer and free it, leading to use after free problems in
+ * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which
+ * order we process them in - the buffer is locked, and we own the buffer list
+ * so nothing on them is going to change while we are performing this action.
+ *
+ * Hence we can safely queue the buffer for IO before we clear the failed log
+ * item state, therefore  always having an active reference to the buffer and
+ * avoiding the transient zero-reference state that leads to use-after-free.
+ *
+ * Return true if the buffer was added to the buffer list, false if it was
+ * already on the buffer list.
   */
  bool
  xfs_buf_resubmit_failed_buffers(
@@ -1243,16 +1257,16 @@ xfs_buf_resubmit_failed_buffers(
         struct list_head        *buffer_list)
  {
         struct xfs_log_item     *lip;
+       bool                    ret;
+
+       ret = xfs_buf_delwri_queue(bp, buffer_list);
  
         /*
-        * Clear XFS_LI_FAILED flag from all items before resubmit
-        *
-        * XFS_LI_FAILED set/clear is protected by ail_lock, caller  this
+        * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this
          * function already have it acquired
          */
         list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
                 xfs_clear_li_failed(lip);
  
-       /* Add this buffer back to the delayed write list */
-       return xfs_buf_delwri_queue(bp, buffer_list);
+       return ret;
  }
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c

index 53c9ab8fb777f4d78803da906b151624cfc7053f..e47425071e654473f4b34e7899015cecce19ef5e 100644 (file)
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -920,7 +920,7 @@ out_unlock:
  }
  
  
-loff_t
+STATIC loff_t
  xfs_file_remap_range(
         struct file             *file_in,
         loff_t                  pos_in,
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c

index ecdb086bc23e559f6fcfc45ca70590533d04b3c7..322a852ce284a017382ceb9e75bb74762dfbad78 100644 (file)
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -296,6 +296,7 @@ xfs_reflink_reserve_cow(
         if (error)
                 return error;
  
+       xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
         trace_xfs_reflink_cow_alloc(ip, &got);
         return 0;
  }
@@ -1351,10 +1352,19 @@ xfs_reflink_remap_prep(
         if (ret)
                 goto out_unlock;
  
-       /* Zap any page cache for the destination file's range. */
-       truncate_inode_pages_range(&inode_out->i_data,
-                       round_down(pos_out, PAGE_SIZE),
-                       round_up(pos_out + *len, PAGE_SIZE) - 1);
+       /*
+        * If pos_out > EOF, we may have dirtied blocks between EOF and
+        * pos_out. In that case, we need to extend the flush and unmap to cover
+        * from EOF to the end of the copy length.
+        */
+       if (pos_out > XFS_ISIZE(dest)) {
+               loff_t  flen = *len + (pos_out - XFS_ISIZE(dest));
+               ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
+       } else {
+               ret = xfs_flush_unmap_range(dest, pos_out, *len);
+       }
+       if (ret)
+               goto out_unlock;
  
         return 1;
  out_unlock:
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h

index 3043e5ed6495580de11de6932117addca0e85aec..8a6532aae779b49299e8c99b55b6d770d1866be1 100644 (file)
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -280,7 +280,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
         ),
         TP_fast_assign(
                 __entry->dev = bp->b_target->bt_dev;
-               __entry->bno = bp->b_bn;
+               if (bp->b_bn == XFS_BUF_DADDR_NULL)
+                       __entry->bno = bp->b_maps[0].bm_bn;
+               else
+                       __entry->bno = bp->b_bn;
                 __entry->nblks = bp->b_length;
                 __entry->hold = atomic_read(&bp->b_hold);
                 __entry->pincount = atomic_read(&bp->b_pin_count);
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 24 Nov 2018 17:11:52 +0000 (09:11 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 24 Nov 2018 17:11:52 +0000 (09:11 -0800)
fs/iomap.c		patch \| blob \| blame \| history
fs/read_write.c		patch \| blob \| blame \| history
fs/xfs/libxfs/xfs_bmap.c		patch \| blob \| blame \| history
fs/xfs/libxfs/xfs_ialloc_btree.c		patch \| blob \| blame \| history
fs/xfs/xfs_bmap_util.c		patch \| blob \| blame \| history
fs/xfs/xfs_bmap_util.h		patch \| blob \| blame \| history
fs/xfs/xfs_buf_item.c		patch \| blob \| blame \| history
fs/xfs/xfs_file.c		patch \| blob \| blame \| history
fs/xfs/xfs_reflink.c		patch \| blob \| blame \| history
fs/xfs/xfs_trace.h		patch \| blob \| blame \| history