ext4: add RWF_DONTCACHE write support [buffered-uncached-fs.11]
author    Jens Axboe <axboe@kernel.dk>
          Sat, 9 Nov 2024 16:51:58 +0000 (09:51 -0700)
committer Jens Axboe <axboe@kernel.dk>
          Sat, 22 Feb 2025 00:25:00 +0000 (17:25 -0700)
IOCB_DONTCACHE IO needs to prune writeback regions on IO completion,
and hence needs the same worker punt that ext4 already does for
unwritten extents. Add an io_end flag, EXT4_IO_DONTCACHE, to manage
that.
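
In short: dropping folios from the page cache can block, so completions
that may run in interrupt context must defer the work. A simplified
sketch of the decision this adds to ext4_put_io_end_defer() in
fs/ext4/page-io.c (names taken from the hunks below; the real code also
directly releases unwritten io_ends whose list_vec is empty):

static void put_io_end_defer_sketch(ext4_io_end_t *io_end)
{
	if (!refcount_dec_and_test(&io_end->count))
		return;
	/*
	 * Punt to the reserved-conversion workqueue, which later runs
	 * ext4_end_io_end() in process context.
	 */
	if (io_end->flag & (EXT4_IO_END_UNWRITTEN | EXT4_IO_DONTCACHE))
		ext4_add_complete_io(io_end);
	else
		ext4_release_io_end(io_end);
}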

If foliop is marked dropbehind in ext4_write_begin() (tested with
foliop_is_dropbehind()), set FGP_DONTCACHE so that
__filemap_get_folio() marks newly created folios as dropbehind. That in
turn makes writeback completion drop these ranges from the page cache.
The same handling is added to ext4_da_write_begin() and
ext4_try_to_write_inline_data().
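
The per-call-site change boils down to the following pattern, repeated
at each of the write_begin sites in the diff below (illustrative
fragment; index stands for the folio index at the write position):

	fgf_t fgp_flags = FGP_WRITEBEGIN;

	/* generic_perform_write() signals a dropbehind write via *foliop */
	if (foliop_is_dropbehind(foliop))
		fgp_flags |= FGP_DONTCACHE;

	folio = __filemap_get_folio(mapping, index, fgp_flags,
					mapping_gfp_mask(mapping));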

Now that ext4 supports both uncached reads and writes, set
FOP_DONTCACHE in its fop_flags to enable the feature.
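
For reference, a minimal userspace sketch of exercising this path with
pwritev2(2). The RWF_DONTCACHE fallback define matches the uapi value
from this series, but treat it as an assumption when building against
older headers; kernels or filesystems without support fail the call
with EOPNOTSUPP:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

#ifndef RWF_DONTCACHE
#define RWF_DONTCACHE	0x00000080	/* assumed uapi value */
#endif

int main(void)
{
	char buf[4096];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	int fd = open("file", O_CREAT | O_WRONLY | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	memset(buf, 0xaa, sizeof(buf));
	/* buffered write; the range is dropped once writeback completes */
	if (pwritev2(fd, &iov, 1, 0, RWF_DONTCACHE) < 0)
		perror("pwritev2(RWF_DONTCACHE)");
	return 0;
}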

Signed-off-by: Jens Axboe <axboe@kernel.dk>
fs/ext4/ext4.h
fs/ext4/file.c
fs/ext4/inline.c
fs/ext4/inode.c
fs/ext4/page-io.c

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4e7de7eaa374a0cd404318f6a8a452e64c6ea95c..23ee0e903bb23c019eb43bebc7824d67a11eb47f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -279,6 +279,7 @@ struct ext4_system_blocks {
  * Flags for ext4_io_end->flags
  */
 #define        EXT4_IO_END_UNWRITTEN   0x0001
+#define EXT4_IO_DONTCACHE      0x0002
 
 struct ext4_io_end_vec {
        struct list_head list;          /* list of io_end_vec */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index a5205149adba352c45b1730483fed5730989b3a8..33ee867b6f25337e9d24a04e8fc71acdfee44512 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -969,7 +969,7 @@ const struct file_operations ext4_file_operations = {
        .splice_write   = iter_file_splice_write,
        .fallocate      = ext4_fallocate,
        .fop_flags      = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
-                         FOP_DIO_PARALLEL_WRITE,
+                         FOP_DIO_PARALLEL_WRITE | FOP_DONTCACHE,
 };
 
 const struct inode_operations ext4_file_inode_operations = {
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3536ca7e4fccab77b8d4c8fd4cd14cbaa20e7f83..3a3b27bae32ecf4661838aea858dcf8be2f2c30c 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -667,6 +667,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
        handle_t *handle;
        struct folio *folio;
        struct ext4_iloc iloc;
+       fgf_t fgp_flags;
 
        if (pos + len > ext4_get_max_inline_size(inode))
                goto convert;
@@ -702,7 +703,11 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
        if (ret)
                goto out;
 
-       folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
+       fgp_flags = FGP_WRITEBEGIN | FGP_NOFS;
+       if (foliop_is_dropbehind(foliop))
+               fgp_flags |= FGP_DONTCACHE;
+
+       folio = __filemap_get_folio(mapping, 0, fgp_flags,
                                        mapping_gfp_mask(mapping));
        if (IS_ERR(folio)) {
                ret = PTR_ERR(folio);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7c54ae5fcbd4540e8ba69b942ffd98dded320339..015fe5cb9a86c1b093b7a7817a0c5c813d22dc69 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1145,6 +1145,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
        int ret, needed_blocks;
        handle_t *handle;
        int retries = 0;
+       fgf_t fgp_flags;
        struct folio *folio;
        pgoff_t index;
        unsigned from, to;
@@ -1171,6 +1172,15 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
                        return 0;
        }
 
+       /*
+        * Set FGP_WRITEBEGIN, and FGP_DONTCACHE if foliop is marked as
+        * dropbehind. That's how generic_perform_write() informs us that this
+        * is a dropbehind write.
+        */
+       fgp_flags = FGP_WRITEBEGIN;
+       if (foliop_is_dropbehind(foliop))
+               fgp_flags |= FGP_DONTCACHE;
+
        /*
         * __filemap_get_folio() can take a long time if the
         * system is thrashing due to memory pressure, or if the folio
@@ -1179,7 +1189,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
         * the folio (if needed) without using GFP_NOFS.
         */
 retry_grab:
-       folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+       folio = __filemap_get_folio(mapping, index, fgp_flags,
                                        mapping_gfp_mask(mapping));
        if (IS_ERR(folio))
                return PTR_ERR(folio);
@@ -2914,6 +2924,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
        struct folio *folio;
        pgoff_t index;
        struct inode *inode = mapping->host;
+       fgf_t fgp_flags;
 
        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;
@@ -2937,8 +2948,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                        return 0;
        }
 
+       fgp_flags = FGP_WRITEBEGIN;
+       if (foliop_is_dropbehind(foliop))
+               fgp_flags |= FGP_DONTCACHE;
 retry:
-       folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+       folio = __filemap_get_folio(mapping, index, fgp_flags,
                        mapping_gfp_mask(mapping));
        if (IS_ERR(folio))
                return PTR_ERR(folio);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 69b8a7221a2b19cf14a0a701cb06488f5bdbd478..ad011a7dc41083e79824ab8e8acf0f8d31a5c6ec 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -226,8 +226,6 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
        unsigned long flags;
 
        /* Only reserved conversions from writeback should enter here */
-       WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
-       WARN_ON(!io_end->handle && sbi->s_journal);
        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
        wq = sbi->rsv_conversion_wq;
        if (list_empty(&ei->i_rsv_conversion_list))
@@ -252,7 +250,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
 
        while (!list_empty(&unwritten)) {
                io_end = list_entry(unwritten.next, ext4_io_end_t, list);
-               BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+               BUG_ON(!(io_end->flag & (EXT4_IO_END_UNWRITTEN|EXT4_IO_DONTCACHE)));
                list_del_init(&io_end->list);
 
                err = ext4_end_io_end(io_end);
@@ -287,14 +285,15 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
 
 void ext4_put_io_end_defer(ext4_io_end_t *io_end)
 {
-       if (refcount_dec_and_test(&io_end->count)) {
-               if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
-                               list_empty(&io_end->list_vec)) {
-                       ext4_release_io_end(io_end);
-                       return;
-               }
-               ext4_add_complete_io(io_end);
+       if (!refcount_dec_and_test(&io_end->count))
+               return;
+       if ((!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
+           list_empty(&io_end->list_vec)) &&
+           !(io_end->flag & EXT4_IO_DONTCACHE)) {
+               ext4_release_io_end(io_end);
+               return;
        }
+       ext4_add_complete_io(io_end);
 }
 
 int ext4_put_io_end(ext4_io_end_t *io_end)
@@ -348,7 +347,7 @@ static void ext4_end_bio(struct bio *bio)
                                blk_status_to_errno(bio->bi_status));
        }
 
-       if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+       if (io_end->flag & (EXT4_IO_END_UNWRITTEN|EXT4_IO_DONTCACHE)) {
                /*
                 * Link bio into list hanging from io_end. We have to do it
                 * atomically as bio completions can be racing against each
@@ -420,6 +419,10 @@ submit_and_retry:
        if (io->io_bio == NULL) {
                io_submit_init_bio(io, bh);
                io->io_bio->bi_write_hint = inode->i_write_hint;
+               if (folio_test_dropbehind(folio)) {
+                       ext4_io_end_t *io_end = io->io_bio->bi_private;
+                       io_end->flag |= EXT4_IO_DONTCACHE;
+               }
        }
        if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh)))
                goto submit_and_retry;