f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc
authorJaegeuk Kim <jaegeuk@kernel.org>
Wed, 25 Jul 2018 03:11:56 +0000 (12:11 +0900)
committerJaegeuk Kim <jaegeuk@kernel.org>
Tue, 21 Aug 2018 06:13:42 +0000 (23:13 -0700)
The f2fs_gc() called by f2fs_balance_fs() requires to be called outside of
fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop.

If it hits the miximum retrials in GC, let's give a chance to release
gc_mutex for a short time in order not to go into live lock in the worst
case.

Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
fs/f2fs/data.c
fs/f2fs/f2fs.h
fs/f2fs/file.c
fs/f2fs/gc.c
fs/f2fs/segment.c
fs/f2fs/segment.h

index bdcb023506a7a5d892236571f536779ae7b31bd9..e73ce11de02d44dcbc156ca264dc206a42933e54 100644 (file)
@@ -2217,14 +2217,14 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
        loff_t i_size = i_size_read(inode);
 
        if (to > i_size) {
-               down_write(&F2FS_I(inode)->i_mmap_sem);
                down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+               down_write(&F2FS_I(inode)->i_mmap_sem);
 
                truncate_pagecache(inode, i_size);
                f2fs_truncate_blocks(inode, i_size, true);
 
-               up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
                up_write(&F2FS_I(inode)->i_mmap_sem);
+               up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
        }
 }
 
index 170573f8a04a389f4a0274d38260150e2870167c..96bde026636ffbcfb84fdafed879f03e550710c7 100644 (file)
@@ -1243,6 +1243,7 @@ struct f2fs_sb_info {
        unsigned int gc_mode;                   /* current GC state */
        /* for skip statistic */
        unsigned long long skipped_atomic_files[2];     /* FG_GC and BG_GC */
+       unsigned long long skipped_gc_rwsem;            /* FG_GC only */
 
        /* threshold for gc trials on pinned files */
        u64 gc_pin_file_threshold;
index 1f76cc3fc46b076dbb343c7f6828e603584f004e..5474aaa274b91d52c8259d31cfc072e5f8674d54 100644 (file)
@@ -797,8 +797,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
        if (attr->ia_valid & ATTR_SIZE) {
                bool to_smaller = (attr->ia_size <= i_size_read(inode));
 
-               down_write(&F2FS_I(inode)->i_mmap_sem);
                down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+               down_write(&F2FS_I(inode)->i_mmap_sem);
 
                truncate_setsize(inode, attr->ia_size);
 
@@ -808,8 +808,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
                 * do not trim all blocks after i_size if target size is
                 * larger than i_size.
                 */
-               up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
                up_write(&F2FS_I(inode)->i_mmap_sem);
+               up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
 
                if (err)
                        return err;
@@ -962,8 +962,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
                        blk_start = (loff_t)pg_start << PAGE_SHIFT;
                        blk_end = (loff_t)pg_end << PAGE_SHIFT;
 
-                       down_write(&F2FS_I(inode)->i_mmap_sem);
                        down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+                       down_write(&F2FS_I(inode)->i_mmap_sem);
 
                        truncate_inode_pages_range(mapping, blk_start,
                                        blk_end - 1);
@@ -972,8 +972,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
                        ret = f2fs_truncate_hole(inode, pg_start, pg_end);
                        f2fs_unlock_op(sbi);
 
-                       up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
                        up_write(&F2FS_I(inode)->i_mmap_sem);
+                       up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
                }
        }
 
@@ -1188,25 +1188,33 @@ roll_back:
        return ret;
 }
 
-static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
+static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
 {
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+       pgoff_t start = offset >> PAGE_SHIFT;
+       pgoff_t end = (offset + len) >> PAGE_SHIFT;
        int ret;
 
        f2fs_balance_fs(sbi, true);
-       f2fs_lock_op(sbi);
 
-       f2fs_drop_extent_tree(inode);
+       /* avoid gc operation during block exchange */
+       down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+       down_write(&F2FS_I(inode)->i_mmap_sem);
 
+       f2fs_lock_op(sbi);
+       f2fs_drop_extent_tree(inode);
+       truncate_pagecache(inode, offset);
        ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true);
        f2fs_unlock_op(sbi);
+
+       up_write(&F2FS_I(inode)->i_mmap_sem);
+       up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
        return ret;
 }
 
 static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 {
-       pgoff_t pg_start, pg_end;
        loff_t new_size;
        int ret;
 
@@ -1221,25 +1229,17 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
        if (ret)
                return ret;
 
-       pg_start = offset >> PAGE_SHIFT;
-       pg_end = (offset + len) >> PAGE_SHIFT;
-
-       /* avoid gc operation during block exchange */
-       down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-
-       down_write(&F2FS_I(inode)->i_mmap_sem);
        /* write out all dirty pages from offset */
        ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
        if (ret)
-               goto out_unlock;
-
-       truncate_pagecache(inode, offset);
+               return ret;
 
-       ret = f2fs_do_collapse(inode, pg_start, pg_end);
+       ret = f2fs_do_collapse(inode, offset, len);
        if (ret)
-               goto out_unlock;
+               return ret;
 
        /* write out all moved pages, if possible */
+       down_write(&F2FS_I(inode)->i_mmap_sem);
        filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
        truncate_pagecache(inode, offset);
 
@@ -1247,11 +1247,9 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
        truncate_pagecache(inode, new_size);
 
        ret = f2fs_truncate_blocks(inode, new_size, true);
+       up_write(&F2FS_I(inode)->i_mmap_sem);
        if (!ret)
                f2fs_i_size_write(inode, new_size);
-out_unlock:
-       up_write(&F2FS_I(inode)->i_mmap_sem);
-       up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
        return ret;
 }
 
@@ -1317,10 +1315,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
        if (ret)
                return ret;
 
-       down_write(&F2FS_I(inode)->i_mmap_sem);
        ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1);
        if (ret)
-               goto out_sem;
+               return ret;
 
        pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
        pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
@@ -1332,7 +1329,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
                ret = fill_zero(inode, pg_start, off_start,
                                                off_end - off_start);
                if (ret)
-                       goto out_sem;
+                       return ret;
 
                new_size = max_t(loff_t, new_size, offset + len);
        } else {
@@ -1340,7 +1337,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
                        ret = fill_zero(inode, pg_start++, off_start,
                                                PAGE_SIZE - off_start);
                        if (ret)
-                               goto out_sem;
+                               return ret;
 
                        new_size = max_t(loff_t, new_size,
                                        (loff_t)pg_start << PAGE_SHIFT);
@@ -1352,6 +1349,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
                        pgoff_t end;
 
                        down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+                       down_write(&F2FS_I(inode)->i_mmap_sem);
 
                        truncate_pagecache_range(inode,
                                (loff_t)index << PAGE_SHIFT,
@@ -1363,6 +1361,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
                        ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE);
                        if (ret) {
                                f2fs_unlock_op(sbi);
+                               up_write(&F2FS_I(inode)->i_mmap_sem);
                                up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
                                goto out;
                        }
@@ -1374,6 +1373,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
                        f2fs_put_dnode(&dn);
 
                        f2fs_unlock_op(sbi);
+                       up_write(&F2FS_I(inode)->i_mmap_sem);
                        up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
 
                        f2fs_balance_fs(sbi, dn.node_changed);
@@ -1402,9 +1402,6 @@ out:
                else
                        f2fs_i_size_write(inode, new_size);
        }
-out_sem:
-       up_write(&F2FS_I(inode)->i_mmap_sem);
-
        return ret;
 }
 
@@ -1433,26 +1430,27 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 
        f2fs_balance_fs(sbi, true);
 
-       /* avoid gc operation during block exchange */
-       down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-
        down_write(&F2FS_I(inode)->i_mmap_sem);
        ret = f2fs_truncate_blocks(inode, i_size_read(inode), true);
+       up_write(&F2FS_I(inode)->i_mmap_sem);
        if (ret)
-               goto out;
+               return ret;
 
        /* write out all dirty pages from offset */
        ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
        if (ret)
-               goto out;
-
-       truncate_pagecache(inode, offset);
+               return ret;
 
        pg_start = offset >> PAGE_SHIFT;
        pg_end = (offset + len) >> PAGE_SHIFT;
        delta = pg_end - pg_start;
        idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
 
+       /* avoid gc operation during block exchange */
+       down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+       down_write(&F2FS_I(inode)->i_mmap_sem);
+       truncate_pagecache(inode, offset);
+
        while (!ret && idx > pg_start) {
                nr = idx - pg_start;
                if (nr > delta)
@@ -1466,16 +1464,17 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
                                        idx + delta, nr, false);
                f2fs_unlock_op(sbi);
        }
+       up_write(&F2FS_I(inode)->i_mmap_sem);
+       up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
 
        /* write out all moved pages, if possible */
+       down_write(&F2FS_I(inode)->i_mmap_sem);
        filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
        truncate_pagecache(inode, offset);
+       up_write(&F2FS_I(inode)->i_mmap_sem);
 
        if (!ret)
                f2fs_i_size_write(inode, new_size);
-out:
-       up_write(&F2FS_I(inode)->i_mmap_sem);
-       up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
        return ret;
 }
 
@@ -1722,8 +1721,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
 
        inode_lock(inode);
 
-       down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-
        if (f2fs_is_atomic_file(inode)) {
                if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST))
                        ret = -EINVAL;
@@ -1734,6 +1731,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
        if (ret)
                goto out;
 
+       down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+
        if (!get_dirty_pages(inode))
                goto skip_flush;
 
@@ -1741,18 +1740,20 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
                "Unexpected flush for atomic writes: ino=%lu, npages=%u",
                                        inode->i_ino, get_dirty_pages(inode));
        ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
-       if (ret)
+       if (ret) {
+               up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
                goto out;
+       }
 skip_flush:
        set_inode_flag(inode, FI_ATOMIC_FILE);
        clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
-       f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+       up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
 
+       f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
        F2FS_I(inode)->inmem_task = current;
        stat_inc_atomic_write(inode);
        stat_update_max_atomic_write(inode);
 out:
-       up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
        inode_unlock(inode);
        mnt_drop_write_file(filp);
        return ret;
@@ -1770,9 +1771,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
        if (ret)
                return ret;
 
-       inode_lock(inode);
+       f2fs_balance_fs(F2FS_I_SB(inode), true);
 
-       down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+       inode_lock(inode);
 
        if (f2fs_is_volatile_file(inode)) {
                ret = -EINVAL;
@@ -1798,7 +1799,6 @@ err_out:
                clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
                ret = -EINVAL;
        }
-       up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
        inode_unlock(inode);
        mnt_drop_write_file(filp);
        return ret;
@@ -2394,15 +2394,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
        }
 
        inode_lock(src);
-       down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
        if (src != dst) {
                ret = -EBUSY;
                if (!inode_trylock(dst))
                        goto out;
-               if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) {
-                       inode_unlock(dst);
-                       goto out;
-               }
        }
 
        ret = -EINVAL;
@@ -2447,6 +2442,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
                goto out_unlock;
 
        f2fs_balance_fs(sbi, true);
+
+       down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
+       if (src != dst) {
+               ret = -EBUSY;
+               if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE]))
+                       goto out_src;
+       }
+
        f2fs_lock_op(sbi);
        ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS,
                                pos_out >> F2FS_BLKSIZE_BITS,
@@ -2459,13 +2462,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
                        f2fs_i_size_write(dst, dst_osize);
        }
        f2fs_unlock_op(sbi);
-out_unlock:
-       if (src != dst) {
+
+       if (src != dst)
                up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]);
+out_src:
+       up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
+out_unlock:
+       if (src != dst)
                inode_unlock(dst);
-       }
 out:
-       up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
        inode_unlock(src);
        return ret;
 }
index 76a22b3773bc0d95e991f402196114d8178aedb1..c598ae5ecbfa47866b4c84614f7b6d764c257f86 100644 (file)
@@ -882,6 +882,7 @@ next_step:
                        if (!down_write_trylock(
                                &F2FS_I(inode)->i_gc_rwsem[WRITE])) {
                                iput(inode);
+                               sbi->skipped_gc_rwsem++;
                                continue;
                        }
 
@@ -911,6 +912,7 @@ next_step:
                                        continue;
                                if (!down_write_trylock(
                                                &fi->i_gc_rwsem[WRITE])) {
+                                       sbi->skipped_gc_rwsem++;
                                        up_write(&fi->i_gc_rwsem[READ]);
                                        continue;
                                }
@@ -1048,6 +1050,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
                .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
        };
        unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC];
+       unsigned long long first_skipped;
        unsigned int skipped_round = 0, round = 0;
 
        trace_f2fs_gc_begin(sbi->sb, sync, background,
@@ -1060,6 +1063,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
                                prefree_segments(sbi));
 
        cpc.reason = __get_cp_reason(sbi);
+       sbi->skipped_gc_rwsem = 0;
+       first_skipped = last_skipped;
 gc_more:
        if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) {
                ret = -EINVAL;
@@ -1101,7 +1106,8 @@ gc_more:
        total_freed += seg_freed;
 
        if (gc_type == FG_GC) {
-               if (sbi->skipped_atomic_files[FG_GC] > last_skipped)
+               if (sbi->skipped_atomic_files[FG_GC] > last_skipped ||
+                                               sbi->skipped_gc_rwsem)
                        skipped_round++;
                last_skipped = sbi->skipped_atomic_files[FG_GC];
                round++;
@@ -1110,15 +1116,23 @@ gc_more:
        if (gc_type == FG_GC)
                sbi->cur_victim_sec = NULL_SEGNO;
 
-       if (!sync) {
-               if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
-                       if (skipped_round > MAX_SKIP_ATOMIC_COUNT &&
-                               skipped_round * 2 >= round)
-                               f2fs_drop_inmem_pages_all(sbi, true);
+       if (sync)
+               goto stop;
+
+       if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
+               if (skipped_round <= MAX_SKIP_GC_COUNT ||
+                                       skipped_round * 2 < round) {
                        segno = NULL_SEGNO;
                        goto gc_more;
                }
 
+               if (first_skipped < last_skipped &&
+                               (last_skipped - first_skipped) >
+                                               sbi->skipped_gc_rwsem) {
+                       f2fs_drop_inmem_pages_all(sbi, true);
+                       segno = NULL_SEGNO;
+                       goto gc_more;
+               }
                if (gc_type == FG_GC)
                        ret = f2fs_write_checkpoint(sbi, &cpc);
        }
index 20650e25117b296b01b3aaa541a0552b8654e070..7dcfe38e70cc1d7e346da0ccc27274b387338076 100644 (file)
@@ -445,8 +445,10 @@ int f2fs_commit_inmem_pages(struct inode *inode)
        int err;
 
        f2fs_balance_fs(sbi, true);
-       f2fs_lock_op(sbi);
 
+       down_write(&fi->i_gc_rwsem[WRITE]);
+
+       f2fs_lock_op(sbi);
        set_inode_flag(inode, FI_ATOMIC_COMMIT);
 
        mutex_lock(&fi->inmem_lock);
@@ -461,6 +463,8 @@ int f2fs_commit_inmem_pages(struct inode *inode)
        clear_inode_flag(inode, FI_ATOMIC_COMMIT);
 
        f2fs_unlock_op(sbi);
+       up_write(&fi->i_gc_rwsem[WRITE]);
+
        return err;
 }
 
index 50495515f0a0d1fa8e542abbab3b27d49bb1c78a..b3d9e317ff0c142b111488911edeb222dfebdbcb 100644 (file)
@@ -215,7 +215,7 @@ struct segment_allocation {
 #define IS_DUMMY_WRITTEN_PAGE(page)                    \
                (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE)
 
-#define MAX_SKIP_ATOMIC_COUNT                  16
+#define MAX_SKIP_GC_COUNT                      16
 
 struct inmem_pages {
        struct list_head list;