bcachefs: Fallocate fixes
authorKent Overstreet <kent.overstreet@gmail.com>
Sat, 6 Nov 2021 17:39:42 +0000 (13:39 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:16 +0000 (17:09 -0400)
- fpunch wasn't always correctly updating i_size - when we drop buffered
  writes that were extending a file, we become responsible for writing
  i_size.

- fzero was sometimes zeroing out more data that it should have -
  block_start and block_end were being rounded in the wrong directions

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
fs/bcachefs/fs-io.c

index 7de6b7a7aa6081413b7e604034c27fbef736e59c..12b785c5005f5a528a0e1bdbfde8163a5c6912eb 100644 (file)
@@ -2296,6 +2296,14 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
                s->s[i].state           = SECTOR_UNALLOCATED;
        }
 
+       /*
+        * Caller needs to know whether this page will be written out by
+        * writeback - doing an i_size update if necessary - or whether it will
+        * be responsible for the i_size update:
+        */
+       ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT),
+                         PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY;
+
        zero_user_segment(page, start_offset, end_offset);
 
        /*
@@ -2304,8 +2312,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
         * XXX: because we aren't currently tracking whether the page has actual
         * data in it (vs. just 0s, or only partially written) this wrong. ick.
         */
-       ret = bch2_get_page_disk_reservation(c, inode, page, false);
-       BUG_ON(ret);
+       BUG_ON(bch2_get_page_disk_reservation(c, inode, page, false));
 
        /*
         * This removes any writeable userspace mappings; we need to force
@@ -2327,6 +2334,20 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
                                    from, round_up(from, PAGE_SIZE));
 }
 
+static int bch2_truncate_pages(struct bch_inode_info *inode,
+                              loff_t start, loff_t end)
+{
+       int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT,
+                                      start, end);
+
+       if (ret >= 0 &&
+           start >> PAGE_SHIFT != end >> PAGE_SHIFT)
+               ret = __bch2_truncate_page(inode,
+                                          end >> PAGE_SHIFT,
+                                          start, end);
+       return ret;
+}
+
 static int bch2_extend(struct mnt_idmap *idmap,
                       struct bch_inode_info *inode,
                       struct bch_inode_unpacked *inode_u,
@@ -2417,7 +2438,7 @@ int bch2_truncate(struct mnt_idmap *idmap,
        iattr->ia_valid &= ~ATTR_SIZE;
 
        ret = bch2_truncate_page(inode, iattr->ia_size);
-       if (unlikely(ret))
+       if (unlikely(ret < 0))
                goto err;
 
        /*
@@ -2483,48 +2504,39 @@ static int inode_update_times_fn(struct bch_inode_info *inode,
 static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       u64 discard_start = round_up(offset, block_bytes(c)) >> 9;
-       u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9;
+       u64 end         = offset + len;
+       u64 block_start = round_up(offset, block_bytes(c));
+       u64 block_end   = round_down(end, block_bytes(c));
+       bool truncated_last_page;
        int ret = 0;
 
-       inode_lock(&inode->v);
-       inode_dio_wait(&inode->v);
-       bch2_pagecache_block_get(&inode->ei_pagecache_lock);
-
-       ret = __bch2_truncate_page(inode,
-                                  offset >> PAGE_SHIFT,
-                                  offset, offset + len);
-       if (unlikely(ret))
+       ret = bch2_truncate_pages(inode, offset, end);
+       if (unlikely(ret < 0))
                goto err;
 
-       if (offset >> PAGE_SHIFT !=
-           (offset + len) >> PAGE_SHIFT) {
-               ret = __bch2_truncate_page(inode,
-                                          (offset + len) >> PAGE_SHIFT,
-                                          offset, offset + len);
-               if (unlikely(ret))
-                       goto err;
-       }
+       truncated_last_page = ret;
 
-       truncate_pagecache_range(&inode->v, offset, offset + len - 1);
+       truncate_pagecache_range(&inode->v, offset, end - 1);
 
-       if (discard_start < discard_end) {
+       if (block_start < block_end ) {
                s64 i_sectors_delta = 0;
 
                ret = bch2_fpunch(c, inode_inum(inode),
-                                 discard_start, discard_end,
+                                 block_start >> 9, block_end >> 9,
                                  &i_sectors_delta);
                i_sectors_acct(c, inode, NULL, i_sectors_delta);
        }
 
        mutex_lock(&inode->ei_update_lock);
-       ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
-                              ATTR_MTIME|ATTR_CTIME) ?: ret;
+       if (end >= inode->v.i_size && !truncated_last_page) {
+               ret = bch2_write_inode_size(c, inode, inode->v.i_size,
+                                           ATTR_MTIME|ATTR_CTIME);
+       } else {
+               ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+                                      ATTR_MTIME|ATTR_CTIME);
+       }
        mutex_unlock(&inode->ei_update_lock);
 err:
-       bch2_pagecache_block_put(&inode->ei_pagecache_lock);
-       inode_unlock(&inode->v);
-
        return ret;
 }
 
@@ -2544,31 +2556,18 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
        if ((offset | len) & (block_bytes(c) - 1))
                return -EINVAL;
 
-       /*
-        * We need i_mutex to keep the page cache consistent with the extents
-        * btree, and the btree consistent with i_size - we don't need outside
-        * locking for the extents btree itself, because we're using linked
-        * iterators
-        */
-       inode_lock(&inode->v);
-       inode_dio_wait(&inode->v);
-       bch2_pagecache_block_get(&inode->ei_pagecache_lock);
-
        if (insert) {
-               ret = -EFBIG;
                if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
-                       goto err;
+                       return -EFBIG;
 
-               ret = -EINVAL;
                if (offset >= inode->v.i_size)
-                       goto err;
+                       return -EINVAL;
 
                src_start       = U64_MAX;
                shift           = len;
        } else {
-               ret = -EINVAL;
                if (offset + len >= inode->v.i_size)
-                       goto err;
+                       return -EINVAL;
 
                src_start       = offset + len;
                shift           = -len;
@@ -2578,7 +2577,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
 
        ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
        if (ret)
-               goto err;
+               return ret;
 
        if (insert) {
                i_size_write(&inode->v, new_size);
@@ -2595,7 +2594,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
                i_sectors_acct(c, inode, NULL, i_sectors_delta);
 
                if (ret)
-                       goto err;
+                       return ret;
        }
 
        bch2_bkey_buf_init(&copy);
@@ -2708,18 +2707,19 @@ reassemble:
        bch2_bkey_buf_exit(&copy, c);
 
        if (ret)
-               goto err;
+               return ret;
 
+       mutex_lock(&inode->ei_update_lock);
        if (!insert) {
                i_size_write(&inode->v, new_size);
-               mutex_lock(&inode->ei_update_lock);
                ret = bch2_write_inode_size(c, inode, new_size,
                                            ATTR_MTIME|ATTR_CTIME);
-               mutex_unlock(&inode->ei_update_lock);
+       } else {
+               /* We need an inode update to update bi_journal_seq for fsync: */
+               ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+                                      ATTR_MTIME|ATTR_CTIME);
        }
-err:
-       bch2_pagecache_block_put(&inode->ei_pagecache_lock);
-       inode_unlock(&inode->v);
+       mutex_unlock(&inode->ei_update_lock);
        return ret;
 }
 
@@ -2814,6 +2814,17 @@ bkey_err:
                if (ret == -EINTR)
                        ret = 0;
        }
+
+       if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) {
+               struct quota_res quota_res = { 0 };
+               s64 i_sectors_delta = 0;
+
+               bch2_fpunch_at(&trans, &iter, inode_inum(inode),
+                              end_sector, &i_sectors_delta);
+               i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+               bch2_quota_reservation_put(c, inode, &quota_res);
+       }
+
        bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
@@ -2822,77 +2833,58 @@ bkey_err:
 static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
                            loff_t offset, loff_t len)
 {
-       struct address_space *mapping = inode->v.i_mapping;
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       loff_t end              = offset + len;
-       loff_t block_start      = round_down(offset,    block_bytes(c));
-       loff_t block_end        = round_up(end,         block_bytes(c));
-       int ret;
-
-       inode_lock(&inode->v);
-       inode_dio_wait(&inode->v);
-       bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+       u64 end         = offset + len;
+       u64 block_start = round_down(offset,    block_bytes(c));
+       u64 block_end   = round_up(end,         block_bytes(c));
+       bool truncated_last_page = false;
+       int ret, ret2 = 0;
 
        if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
                ret = inode_newsize_ok(&inode->v, end);
                if (ret)
-                       goto err;
+                       return ret;
        }
 
        if (mode & FALLOC_FL_ZERO_RANGE) {
-               ret = __bch2_truncate_page(inode,
-                                          offset >> PAGE_SHIFT,
-                                          offset, end);
-
-               if (!ret &&
-                   offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
-                       ret = __bch2_truncate_page(inode,
-                                                  end >> PAGE_SHIFT,
-                                                  offset, end);
+               ret = bch2_truncate_pages(inode, offset, end);
+               if (unlikely(ret < 0))
+                       return ret;
 
-               if (unlikely(ret))
-                       goto err;
+               truncated_last_page = ret;
 
                truncate_pagecache_range(&inode->v, offset, end - 1);
+
+               block_start     = round_up(offset,      block_bytes(c));
+               block_end       = round_down(end,       block_bytes(c));
        }
 
        ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
-       if (ret)
-               goto err;
 
        /*
-        * Do we need to extend the file?
-        *
-        * If we zeroed up to the end of the file, we dropped whatever writes
-        * were going to write out the current i_size, so we have to extend
-        * manually even if FL_KEEP_SIZE was set:
+        * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
+        * so that the VFS cache i_size is consistent with the btree i_size:
         */
-       if (end >= inode->v.i_size &&
-           (!(mode & FALLOC_FL_KEEP_SIZE) ||
-            (mode & FALLOC_FL_ZERO_RANGE))) {
+       if (ret &&
+           !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)))
+               return ret;
 
-               /*
-                * Sync existing appends before extending i_size,
-                * as in bch2_extend():
-                */
-               ret = filemap_write_and_wait_range(mapping,
-                                       inode->ei_inode.bi_size, S64_MAX);
-               if (ret)
-                       goto err;
+       if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
+               end = inode->v.i_size;
 
-               if (mode & FALLOC_FL_KEEP_SIZE)
-                       end = inode->v.i_size;
-               else
-                       i_size_write(&inode->v, end);
+       if (end >= inode->v.i_size &&
+           (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
+            !(mode & FALLOC_FL_KEEP_SIZE))) {
+               spin_lock(&inode->v.i_lock);
+               i_size_write(&inode->v, end);
+               spin_unlock(&inode->v.i_lock);
 
                mutex_lock(&inode->ei_update_lock);
-               ret = bch2_write_inode_size(c, inode, end, 0);
+               ret2 = bch2_write_inode_size(c, inode, end, 0);
                mutex_unlock(&inode->ei_update_lock);
        }
-err:
-       bch2_pagecache_block_put(&inode->ei_pagecache_lock);
-       inode_unlock(&inode->v);
-       return ret;
+
+       return ret ?: ret2;
 }
 
 long bch2_fallocate_dispatch(struct file *file, int mode,
@@ -2905,6 +2897,10 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
        if (!percpu_ref_tryget(&c->writes))
                return -EROFS;
 
+       inode_lock(&inode->v);
+       inode_dio_wait(&inode->v);
+       bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
        if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
                ret = bchfs_fallocate(inode, mode, offset, len);
        else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
@@ -2916,6 +2912,9 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
        else
                ret = -EOPNOTSUPP;
 
+
+       bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+       inode_unlock(&inode->v);
        percpu_ref_put(&c->writes);
 
        return ret;