fs/btrfs/file.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2007 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/fs.h>
   7 #include <linux/pagemap.h>
   8 #include <linux/time.h>
   9 #include <linux/init.h>
  10 #include <linux/string.h>
  11 #include <linux/backing-dev.h>
  12 #include <linux/falloc.h>
  13 #include <linux/writeback.h>
  14 #include <linux/compat.h>
  15 #include <linux/slab.h>
  16 #include <linux/btrfs.h>
  17 #include <linux/uio.h>
  18 #include <linux/iversion.h>
  19 #include <linux/fsverity.h>
  20 #include "ctree.h"
  21 #include "direct-io.h"
  22 #include "disk-io.h"
  23 #include "transaction.h"
  24 #include "btrfs_inode.h"
  25 #include "tree-log.h"
  26 #include "locking.h"
  27 #include "qgroup.h"
  28 #include "compression.h"
  29 #include "delalloc-space.h"
  30 #include "reflink.h"
  31 #include "subpage.h"
  32 #include "fs.h"
  33 #include "accessors.h"
  34 #include "extent-tree.h"
  35 #include "file-item.h"
  36 #include "ioctl.h"
  37 #include "file.h"
  38 #include "super.h"
  39 #include "print-tree.h"
  40
  41 /*
  42  * Unlock folio after btrfs_file_write() is done with it.
  43  */
  44 static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
  45                              u64 pos, u64 copied)
  46 {
  47         u64 block_start = round_down(pos, fs_info->sectorsize);
  48         u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
  49
  50         ASSERT(block_len <= U32_MAX);
  51         /*
  52          * Folio checked is some magic around finding folios that have been
  53          * modified without going through btrfs_dirty_folio().  Clear it here.
  54          * There should be no need to mark the pages accessed as
  55          * prepare_one_folio() should have marked them accessed in
  56          * prepare_one_folio() via find_or_create_page()
  57          */
  58         btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
  59         folio_unlock(folio);
  60         folio_put(folio);
  61 }
  62
  63 /*
  64  * After copy_folio_from_iter_atomic(), update the following things for delalloc:
  65  * - Mark newly dirtied folio as DELALLOC in the io tree.
  66  *   Used to advise which range is to be written back.
  67  * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
  68  * - Update inode size for past EOF write
  69  */
  70 int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
  71                       size_t write_bytes, struct extent_state **cached, bool noreserve)
  72 {
  73         struct btrfs_fs_info *fs_info = inode->root->fs_info;
  74         int ret = 0;
  75         u64 num_bytes;
  76         u64 start_pos;
  77         u64 end_of_last_block;
  78         u64 end_pos = pos + write_bytes;
  79         loff_t isize = i_size_read(&inode->vfs_inode);
  80         unsigned int extra_bits = 0;
  81
  82         if (write_bytes == 0)
  83                 return 0;
  84
  85         if (noreserve)
  86                 extra_bits |= EXTENT_NORESERVE;
  87
  88         start_pos = round_down(pos, fs_info->sectorsize);
  89         num_bytes = round_up(write_bytes + pos - start_pos,
  90                              fs_info->sectorsize);
  91         ASSERT(num_bytes <= U32_MAX);
  92         ASSERT(folio_pos(folio) <= pos &&
  93                folio_pos(folio) + folio_size(folio) >= pos + write_bytes);
  94
  95         end_of_last_block = start_pos + num_bytes - 1;
  96
  97         /*
  98          * The pages may have already been dirty, clear out old accounting so
  99          * we can set things up properly
 100          */
 101         btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
 102                                EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 103                                cached);
 104
 105         ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
 106                                         extra_bits, cached);
 107         if (ret)
 108                 return ret;
 109
 110         btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
 111         btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
 112         btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);
 113
 114         /*
 115          * we've only changed i_size in ram, and we haven't updated
 116          * the disk i_size.  There is no need to log the inode
 117          * at this time.
 118          */
 119         if (end_pos > isize)
 120                 i_size_write(&inode->vfs_inode, end_pos);
 121         return 0;
 122 }
 123
 124 /*
 125  * This is very complex, but the basic idea is to drop all extents
 126  * in the range [args->start, args->end) of @inode from @root.
 127  * NOTE(review): this comment used to describe a 'hint_block' output; no such
      * parameter or @args field is visible in this function.
 128  *
 129  * If an extent intersects the range but is not entirely inside the range
 130  * it is either truncated or split.  Anything entirely inside the range
 131  * is deleted from the tree.
 132  *
 133  * Note: the VFS' inode number of bytes is not updated, it's up to the caller
 134  * to deal with that. We set the field 'bytes_found' of the arguments structure
 135  * with the number of allocated bytes found in the target range, so that the
 136  * caller can update the inode's number of bytes in an atomic way when
 137  * replacing extents in a range to avoid races with stat(2).
      *
      * Fields of @args read here: start, end, path, replace_extent, drop_cache and
      * extent_item_size.  Fields written here: bytes_found, extent_inserted and
      * drop_end.
      *
      * When args->path is NULL a temporary path is allocated and freed before
      * returning.  When args->replace_extent is set and the leaf left in the path
      * is still write locked with enough free space, an empty item of
      * args->extent_item_size bytes keyed (ino, BTRFS_EXTENT_DATA_KEY, args->start)
      * is set up, args->extent_inserted is set to true and the caller's path is
      * not released.
      *
      * Returns 0 on success or a negative errno: -ENOMEM if no path could be
      * allocated, -EOPNOTSUPP if an inline extent would have to be partially
      * dropped, -EINVAL for the inconsistencies flagged with WARN_ON() below, or
      * the error returned by one of the btrfs helpers called here.
 138  */
 139 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 140                        struct btrfs_root *root, struct btrfs_inode *inode,
 141                        struct btrfs_drop_extents_args *args)
 142 {
 143         struct btrfs_fs_info *fs_info = root->fs_info;
 144         struct extent_buffer *leaf;
 145         struct btrfs_file_extent_item *fi;
 146         struct btrfs_key key;
 147         struct btrfs_key new_key;
 148         u64 ino = btrfs_ino(inode);
 149         u64 search_start = args->start;
 150         u64 disk_bytenr = 0;
 151         u64 num_bytes = 0;
 152         u64 extent_offset = 0;
 153         u64 extent_end = 0;
 154         u64 last_end = args->start;
             /*
              * Slots [del_slot, del_slot + del_nr) of the current leaf are queued
              * for deletion and removed with a single btrfs_del_items() call.
              */
 155         int del_nr = 0;
 156         int del_slot = 0;
 157         int extent_type;
 158         int recow;
 159         int ret;
             /*
              * Passed as the last argument of btrfs_lookup_file_extent().
              * NOTE(review): presumably -1 prepares the path for deleting items and
              * 0 is a lookup without modification - confirm against that helper.
              */
 160         int modify_tree = -1;
 161         int update_refs;
 162         int found = 0;
 163         struct btrfs_path *path = args->path;
 164
 165         args->bytes_found = 0;
 166         args->extent_inserted = false;
 167
 168         /* Must always have a path if ->replace_extent is true */
 169         ASSERT(!(args->replace_extent && !args->path));
 170
             /* No caller supplied path: use a private one, freed before returning. */
 171         if (!path) {
 172                 path = btrfs_alloc_path();
 173                 if (!path) {
 174                         ret = -ENOMEM;
 175                         goto out;
 176                 }
 177         }
 178
             /* The helper is given an inclusive end; args->end itself is exclusive. */
 179         if (args->drop_cache)
 180                 btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
 181
             /*
              * The range starts at or beyond disk_i_size and nothing is going to be
              * inserted: start with modify_tree = 0.  It is switched back to -1
              * below as soon as an extent overlapping the range is found.
              */
 182         if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
 183                 modify_tree = 0;
 184
             /*
              * Extent references and args->bytes_found are only adjusted when the
              * id of @root is not BTRFS_TREE_LOG_OBJECTID.
              */
 185         update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
 186         while (1) {
 187                 recow = 0;
 188                 ret = btrfs_lookup_file_extent(trans, root, path, ino,
 189                                                search_start, modify_tree);
 190                 if (ret < 0)
 191                         break;
                     /*
                      * No exact match while still searching from args->start: if the
                      * preceding item is a file extent item of this inode, start from
                      * it, as it may extend into the range.
                      */
 192                 if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
 193                         leaf = path->nodes[0];
 194                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
 195                         if (key.objectid == ino &&
 196                             key.type == BTRFS_EXTENT_DATA_KEY)
 197                                 path->slots[0]--;
 198                 }
 199                 ret = 0;
 200 next_slot:
 201                 leaf = path->nodes[0];
                     /*
                      * Ran off the end of the leaf.  No deletions may be pending at
                      * this point; the delete case below flushes them before the
                      * last slot of a leaf is left behind.
                      */
 202                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 203                         if (WARN_ON(del_nr > 0)) {
 204                                 btrfs_print_leaf(leaf);
 205                                 ret = -EINVAL;
 206                                 break;
 207                         }
 208                         ret = btrfs_next_leaf(root, path);
 209                         if (ret < 0)
 210                                 break;
                             /* A positive return ends the scan without an error. */
 211                         if (ret > 0) {
 212                                 ret = 0;
 213                                 break;
 214                         }
 215                         leaf = path->nodes[0];
                             /*
                              * This leaf was reached through btrfs_next_leaf(): force
                              * a fresh lookup before anything in it is modified (see
                              * the recow check below).
                              */
 216                         recow = 1;
 217                 }
 218
 219                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 220
                     /*
                      * Items of a later inode end the scan.  Items that sort before
                      * this inode's file extent items are skipped.
                      */
 221                 if (key.objectid > ino)
 222                         break;
 223                 if (WARN_ON_ONCE(key.objectid < ino) ||
 224                     key.type < BTRFS_EXTENT_DATA_KEY) {
 225                         ASSERT(del_nr == 0);
 226                         path->slots[0]++;
 227                         goto next_slot;
 228                 }
                     /* Past the file extent items, or past the end of the range. */
 229                 if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
 230                         break;
 231
 232                 fi = btrfs_item_ptr(leaf, path->slots[0],
 233                                     struct btrfs_file_extent_item);
 234                 extent_type = btrfs_file_extent_type(leaf, fi);
 235
                     /*
                      * Compute where the extent ends in the file.  The disk location
                      * and the offset into it are only read for regular and
                      * preallocated extents; for inline extents only extent_end is
                      * computed here.
                      */
 236                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
 237                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 238                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 239                         num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
 240                         extent_offset = btrfs_file_extent_offset(leaf, fi);
 241                         extent_end = key.offset +
 242                                 btrfs_file_extent_num_bytes(leaf, fi);
 243                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 244                         extent_end = key.offset +
 245                                 btrfs_file_extent_ram_bytes(leaf, fi);
 246                 } else {
 247                         /* can't happen */
 248                         BUG();
 249                 }
 250
 251                 /*
 252                  * Don't skip extent items representing 0 byte lengths. They
 253                  * used to be created (bug) if while punching holes we hit
 254                  * -ENOSPC condition. So if we find one here, just ensure we
 255                  * delete it, otherwise we would insert a new file extent item
 256                  * with the same key (offset) as that 0 bytes length file
 257                  * extent item in the call to setup_items_for_insert() later
 258                  * in this function.
 259                  */
 260                 if (extent_end == key.offset && extent_end >= search_start) {
 261                         last_end = extent_end;
 262                         goto delete_extent_item;
 263                 }
 264
                     /* The extent ends at or before search_start: not affected. */
 265                 if (extent_end <= search_start) {
 266                         path->slots[0]++;
 267                         goto next_slot;
 268                 }
 269
                     /*
                      * An extent overlapping the range exists.  If the path came from
                      * btrfs_next_leaf() or from a modify_tree == 0 lookup, redo the
                      * lookup from this extent with modify_tree = -1 before touching
                      * the leaf.
                      */
 270                 found = 1;
 271                 search_start = max(key.offset, args->start);
 272                 if (recow || !modify_tree) {
 273                         modify_tree = -1;
 274                         btrfs_release_path(path);
 275                         continue;
 276                 }
 277
 278                 /*
 279                  *     | - range to drop - |
 280                  *  | -------- extent -------- |
                      *
                      * The range lies strictly inside the extent: split the item in
                      * two at args->start.  The right half is then handled by the
                      * "range covers the head of the extent" case further down.
 281                  */
 282                 if (args->start > key.offset && args->end < extent_end) {
 283                         if (WARN_ON(del_nr > 0)) {
 284                                 btrfs_print_leaf(leaf);
 285                                 ret = -EINVAL;
 286                                 break;
 287                         }
 288                         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 289                                 ret = -EOPNOTSUPP;
 290                                 break;
 291                         }
 292
 293                         memcpy(&new_key, &key, sizeof(new_key));
 294                         new_key.offset = args->start;
 295                         ret = btrfs_duplicate_item(trans, root, path,
 296                                                    &new_key);
                             /* -EAGAIN: drop the path and redo the lookup. */
 297                         if (ret == -EAGAIN) {
 298                                 btrfs_release_path(path);
 299                                 continue;
 300                         }
 301                         if (ret < 0)
 302                                 break;
 303
                             /*
                              * The item at slots[0] - 1 keeps [key.offset, args->start).
                              * The item at slots[0] becomes [args->start, extent_end)
                              * with its offset into the extent advanced accordingly.
                              */
 304                         leaf = path->nodes[0];
 305                         fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 306                                             struct btrfs_file_extent_item);
 307                         btrfs_set_file_extent_num_bytes(leaf, fi,
 308                                                         args->start - key.offset);
 309
 310                         fi = btrfs_item_ptr(leaf, path->slots[0],
 311                                             struct btrfs_file_extent_item);
 312
 313                         extent_offset += args->start - key.offset;
 314                         btrfs_set_file_extent_offset(leaf, fi, extent_offset);
 315                         btrfs_set_file_extent_num_bytes(leaf, fi,
 316                                                         extent_end - args->start);
 317
                             /*
                              * Two items now point at the same extent, so queue one
                              * more reference for it.
                              * NOTE(review): disk_bytenr == 0 presumably denotes a
                              * hole, which has no extent to reference - confirm.
                              */
 318                         if (update_refs && disk_bytenr > 0) {
 319                                 struct btrfs_ref ref = {
 320                                         .action = BTRFS_ADD_DELAYED_REF,
 321                                         .bytenr = disk_bytenr,
 322                                         .num_bytes = num_bytes,
 323                                         .parent = 0,
 324                                         .owning_root = btrfs_root_id(root),
 325                                         .ref_root = btrfs_root_id(root),
 326                                 };
 327                                 btrfs_init_data_ref(&ref, new_key.objectid,
 328                                                     args->start - extent_offset,
 329                                                     0, false);
 330                                 ret = btrfs_inc_extent_ref(trans, &ref);
 331                                 if (ret) {
 332                                         btrfs_abort_transaction(trans, ret);
 333                                         break;
 334                                 }
 335                         }
                             /* Continue below as if the extent started at args->start. */
 336                         key.offset = args->start;
 337                 }
 338                 /*
 339                  * From here on out we will have actually dropped something, so
 340                  * last_end can be updated.
 341                  */
 342                 last_end = extent_end;
 343
 344                 /*
 345                  *  | ---- range to drop ----- |
 346                  *      | -------- extent -------- |
                      *
                      * The range covers the head of the extent: move the item's key
                      * to args->end, advance its offset into the extent and shrink
                      * it.  Nothing past args->end needs processing, so stop.
 347                  */
 348                 if (args->start <= key.offset && args->end < extent_end) {
 349                         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 350                                 ret = -EOPNOTSUPP;
 351                                 break;
 352                         }
 353
 354                         memcpy(&new_key, &key, sizeof(new_key));
 355                         new_key.offset = args->end;
 356                         btrfs_set_item_key_safe(trans, path, &new_key);
 357
 358                         extent_offset += args->end - key.offset;
 359                         btrfs_set_file_extent_offset(leaf, fi, extent_offset);
 360                         btrfs_set_file_extent_num_bytes(leaf, fi,
 361                                                         extent_end - args->end);
 362                         if (update_refs && disk_bytenr > 0)
 363                                 args->bytes_found += args->end - key.offset;
 364                         break;
 365                 }
 366
 367                 search_start = extent_end;
 368                 /*
 369                  *       | ---- range to drop ----- |
 370                  *  | -------- extent -------- |
                      *
                      * The range covers the tail of the extent: shrink the item so
                      * that it ends at args->start, then move on to the next slot
                      * unless the range ends exactly where this extent ended.
 371                  */
 372                 if (args->start > key.offset && args->end >= extent_end) {
 373                         if (WARN_ON(del_nr > 0)) {
 374                                 btrfs_print_leaf(leaf);
 375                                 ret = -EINVAL;
 376                                 break;
 377                         }
 378                         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 379                                 ret = -EOPNOTSUPP;
 380                                 break;
 381                         }
 382
 383                         btrfs_set_file_extent_num_bytes(leaf, fi,
 384                                                         args->start - key.offset);
 385                         if (update_refs && disk_bytenr > 0)
 386                                 args->bytes_found += extent_end - args->start;
 387                         if (args->end == extent_end)
 388                                 break;
 389
 390                         path->slots[0]++;
 391                         goto next_slot;
 392                 }
 393
 394                 /*
 395                  *  | ---- range to drop ----- |
 396                  *    | ------ extent ------ |
                      *
                      * The extent lies entirely inside the range: queue its item for
                      * deletion.  Queued slots must be contiguous.
 397                  */
 398                 if (args->start <= key.offset && args->end >= extent_end) {
 399 delete_extent_item:
 400                         if (del_nr == 0) {
 401                                 del_slot = path->slots[0];
 402                                 del_nr = 1;
 403                         } else {
 404                                 if (WARN_ON(del_slot + del_nr != path->slots[0])) {
 405                                         btrfs_print_leaf(leaf);
 406                                         ret = -EINVAL;
 407                                         break;
 408                                 }
 409                                 del_nr++;
 410                         }
 411
                             /*
                              * Inline extents are only accounted in bytes_found, and
                              * extent_end is rounded up to the sector size for the
                              * args->end comparison below.  Extents with a disk
                              * location get their reference dropped.
                              */
 412                         if (update_refs &&
 413                             extent_type == BTRFS_FILE_EXTENT_INLINE) {
 414                                 args->bytes_found += extent_end - key.offset;
 415                                 extent_end = ALIGN(extent_end,
 416                                                    fs_info->sectorsize);
 417                         } else if (update_refs && disk_bytenr > 0) {
 418                                 struct btrfs_ref ref = {
 419                                         .action = BTRFS_DROP_DELAYED_REF,
 420                                         .bytenr = disk_bytenr,
 421                                         .num_bytes = num_bytes,
 422                                         .parent = 0,
 423                                         .owning_root = btrfs_root_id(root),
 424                                         .ref_root = btrfs_root_id(root),
 425                                 };
 426                                 btrfs_init_data_ref(&ref, key.objectid,
 427                                                     key.offset - extent_offset,
 428                                                     0, false);
 429                                 ret = btrfs_free_extent(trans, &ref);
 430                                 if (ret) {
 431                                         btrfs_abort_transaction(trans, ret);
 432                                         break;
 433                                 }
 434                                 args->bytes_found += extent_end - key.offset;
 435                         }
 436
                             /* The range ends with this extent; deletion happens after the loop. */
 437                         if (args->end == extent_end)
 438                                 break;
 439
                             /* More items in this leaf: keep collecting. */
 440                         if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
 441                                 path->slots[0]++;
 442                                 goto next_slot;
 443                         }
 444
                             /*
                              * Last item of the leaf: delete what was queued so far
                              * and restart the lookup from search_start.
                              */
 445                         ret = btrfs_del_items(trans, root, path, del_slot,
 446                                               del_nr);
 447                         if (ret) {
 448                                 btrfs_abort_transaction(trans, ret);
 449                                 break;
 450                         }
 451
 452                         del_nr = 0;
 453                         del_slot = 0;
 454
 455                         btrfs_release_path(path);
 456                         continue;
 457                 }
 458
                     /* The four overlap cases above are exhaustive. */
 459                 BUG();
 460         }
 461
             /* Delete whatever is still queued when the loop ended without error. */
 462         if (!ret && del_nr > 0) {
 463                 /*
 464                  * Set path->slots[0] to first slot, so that after the delete
 465                  * if items are moved off from our leaf to its immediate left or
 466                  * right neighbor leafs, we end up with a correct and adjusted
 467                  * path->slots[0] for our insertion (if args->replace_extent).
 468                  */
 469                 path->slots[0] = del_slot;
 470                 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 471                 if (ret)
 472                         btrfs_abort_transaction(trans, ret);
 473         }
 474
 475         leaf = path->nodes[0];
 476         /*
 477          * If btrfs_del_items() was called, it might have deleted a leaf, in
 478          * which case it unlocked our path, so check path->locks[0] matches a
 479          * write lock.
              *
              * When the leaf also has room for one more item of
              * args->extent_item_size bytes, set up an empty item keyed at
              * args->start right here and report it through args->extent_inserted.
 480          */
 481         if (!ret && args->replace_extent &&
 482             path->locks[0] == BTRFS_WRITE_LOCK &&
 483             btrfs_leaf_free_space(leaf) >=
 484             sizeof(struct btrfs_item) + args->extent_item_size) {
 485
 486                 key.objectid = ino;
 487                 key.type = BTRFS_EXTENT_DATA_KEY;
 488                 key.offset = args->start;
                     /*
                      * Nothing was deleted in the final pass, so slots[0] may still
                      * point at an item that sorts before the new key; step past it.
                      */
 489                 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
 490                         struct btrfs_key slot_key;
 491
 492                         btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
 493                         if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
 494                                 path->slots[0]++;
 495                 }
 496                 btrfs_setup_item_for_insert(trans, root, path, &key,
 497                                             args->extent_item_size);
 498                 args->extent_inserted = true;
 499         }
 500
             /*
              * Free a path allocated here.  A caller supplied path is released
              * unless an item was set up above, in which case it is left as is.
              */
 501         if (!args->path)
 502                 btrfs_free_path(path);
 503         else if (!args->extent_inserted)
 504                 btrfs_release_path(path);
 505 out:
             /*
              * drop_end is capped at args->end.  If no extent overlapping the range
              * was found, it is args->end itself.
              */
 506         args->drop_end = found ? min(args->end, last_end) : args->end;
 507
 508         return ret;
 509 }
 510
 511 static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid,
 512                              u64 bytenr, u64 orig_offset, u64 *start, u64 *end)
 513 {
 514         struct btrfs_file_extent_item *fi;
 515         struct btrfs_key key;
 516         u64 extent_end;
 517
 518         if (slot < 0 || slot >= btrfs_header_nritems(leaf))
 519                 return false;
 520
 521         btrfs_item_key_to_cpu(leaf, &key, slot);
 522         if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
 523                 return false;
 524
 525         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 526         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
 527             btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
 528             btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
 529             btrfs_file_extent_compression(leaf, fi) ||
 530             btrfs_file_extent_encryption(leaf, fi) ||
 531             btrfs_file_extent_other_encoding(leaf, fi))
 532                 return false;
 533
 534         extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
 535         if ((*start && *start != key.offset) || (*end && *end != extent_end))
 536                 return false;
 537
 538         *start = key.offset;
 539         *end = extent_end;
 540         return true;
 541 }
 542
 543 /*
 544  * Mark extent in the range start - end as written.
 545  *
 546  * This changes extent type from 'pre-allocated' to 'regular'. If only
 547  * part of extent is marked as written, the extent will be split into
 548  * two or three.
 549  */
 550 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 551                               struct btrfs_inode *inode, u64 start, u64 end)
 552 {
 553         struct btrfs_root *root = inode->root;
 554         struct extent_buffer *leaf;
 555         BTRFS_PATH_AUTO_FREE(path);
 556         struct btrfs_file_extent_item *fi;
 557         struct btrfs_ref ref = { 0 };
 558         struct btrfs_key key;
 559         struct btrfs_key new_key;
 560         u64 bytenr;
 561         u64 num_bytes;
 562         u64 extent_end;
 563         u64 orig_offset;
 564         u64 other_start;
 565         u64 other_end;
 566         u64 split;
 567         int del_nr = 0;
 568         int del_slot = 0;
 569         int recow;
 570         int ret = 0;
 571         u64 ino = btrfs_ino(inode);
 572
 573         path = btrfs_alloc_path();
 574         if (!path)
 575                 return -ENOMEM;
 576 again:
 577         recow = 0;
 578         split = start;
 579         key.objectid = ino;
 580         key.type = BTRFS_EXTENT_DATA_KEY;
 581         key.offset = split;
 582
 583         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 584         if (ret < 0)
 585                 goto out;
 586         if (ret > 0 && path->slots[0] > 0)
 587                 path->slots[0]--;
 588
 589         leaf = path->nodes[0];
 590         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 591         if (key.objectid != ino ||
 592             key.type != BTRFS_EXTENT_DATA_KEY) {
 593                 ret = -EINVAL;
 594                 btrfs_abort_transaction(trans, ret);
 595                 goto out;
 596         }
 597         fi = btrfs_item_ptr(leaf, path->slots[0],
 598                             struct btrfs_file_extent_item);
 599         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
 600                 ret = -EINVAL;
 601                 btrfs_abort_transaction(trans, ret);
 602                 goto out;
 603         }
 604         extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
 605         if (key.offset > start || extent_end < end) {
 606                 ret = -EINVAL;
 607                 btrfs_abort_transaction(trans, ret);
 608                 goto out;
 609         }
 610
 611         bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 612         num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
 613         orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
 614         memcpy(&new_key, &key, sizeof(new_key));
 615
 616         if (start == key.offset && end < extent_end) {
 617                 other_start = 0;
 618                 other_end = start;
 619                 if (extent_mergeable(leaf, path->slots[0] - 1,
 620                                      ino, bytenr, orig_offset,
 621                                      &other_start, &other_end)) {
 622                         new_key.offset = end;
 623                         btrfs_set_item_key_safe(trans, path, &new_key);
 624                         fi = btrfs_item_ptr(leaf, path->slots[0],
 625                                             struct btrfs_file_extent_item);
 626                         btrfs_set_file_extent_generation(leaf, fi,
 627                                                          trans->transid);
 628                         btrfs_set_file_extent_num_bytes(leaf, fi,
 629                                                         extent_end - end);
 630                         btrfs_set_file_extent_offset(leaf, fi,
 631                                                      end - orig_offset);
 632                         fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 633                                             struct btrfs_file_extent_item);
 634                         btrfs_set_file_extent_generation(leaf, fi,
 635                                                          trans->transid);
 636                         btrfs_set_file_extent_num_bytes(leaf, fi,
 637                                                         end - other_start);
 638                         goto out;
 639                 }
 640         }
 641
 642         if (start > key.offset && end == extent_end) {
 643                 other_start = end;
 644                 other_end = 0;
 645                 if (extent_mergeable(leaf, path->slots[0] + 1,
 646                                      ino, bytenr, orig_offset,
 647                                      &other_start, &other_end)) {
 648                         fi = btrfs_item_ptr(leaf, path->slots[0],
 649                                             struct btrfs_file_extent_item);
 650                         btrfs_set_file_extent_num_bytes(leaf, fi,
 651                                                         start - key.offset);
 652                         btrfs_set_file_extent_generation(leaf, fi,
 653                                                          trans->transid);
 654                         path->slots[0]++;
 655                         new_key.offset = start;
 656                         btrfs_set_item_key_safe(trans, path, &new_key);
 657
 658                         fi = btrfs_item_ptr(leaf, path->slots[0],
 659                                             struct btrfs_file_extent_item);
 660                         btrfs_set_file_extent_generation(leaf, fi,
 661                                                          trans->transid);
 662                         btrfs_set_file_extent_num_bytes(leaf, fi,
 663                                                         other_end - start);
 664                         btrfs_set_file_extent_offset(leaf, fi,
 665                                                      start - orig_offset);
 666                         goto out;
 667                 }
 668         }
 669
 670         while (start > key.offset || end < extent_end) {
 671                 if (key.offset == start)
 672                         split = end;
 673
 674                 new_key.offset = split;
 675                 ret = btrfs_duplicate_item(trans, root, path, &new_key);
 676                 if (ret == -EAGAIN) {
 677                         btrfs_release_path(path);
 678                         goto again;
 679                 }
 680                 if (ret < 0) {
 681                         btrfs_abort_transaction(trans, ret);
 682                         goto out;
 683                 }
 684
 685                 leaf = path->nodes[0];
 686                 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 687                                     struct btrfs_file_extent_item);
 688                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 689                 btrfs_set_file_extent_num_bytes(leaf, fi,
 690                                                 split - key.offset);
 691
 692                 fi = btrfs_item_ptr(leaf, path->slots[0],
 693                                     struct btrfs_file_extent_item);
 694
 695                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 696                 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
 697                 btrfs_set_file_extent_num_bytes(leaf, fi,
 698                                                 extent_end - split);
 699
 700                 ref.action = BTRFS_ADD_DELAYED_REF;
 701                 ref.bytenr = bytenr;
 702                 ref.num_bytes = num_bytes;
 703                 ref.parent = 0;
 704                 ref.owning_root = btrfs_root_id(root);
 705                 ref.ref_root = btrfs_root_id(root);
 706                 btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
 707                 ret = btrfs_inc_extent_ref(trans, &ref);
 708                 if (ret) {
 709                         btrfs_abort_transaction(trans, ret);
 710                         goto out;
 711                 }
 712
 713                 if (split == start) {
 714                         key.offset = start;
 715                 } else {
 716                         if (start != key.offset) {
 717                                 ret = -EINVAL;
 718                                 btrfs_abort_transaction(trans, ret);
 719                                 goto out;
 720                         }
 721                         path->slots[0]--;
 722                         extent_end = end;
 723                 }
 724                 recow = 1;
 725         }
 726
 727         other_start = end;
 728         other_end = 0;
 729
 730         ref.action = BTRFS_DROP_DELAYED_REF;
 731         ref.bytenr = bytenr;
 732         ref.num_bytes = num_bytes;
 733         ref.parent = 0;
 734         ref.owning_root = btrfs_root_id(root);
 735         ref.ref_root = btrfs_root_id(root);
 736         btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
 737         if (extent_mergeable(leaf, path->slots[0] + 1,
 738                              ino, bytenr, orig_offset,
 739                              &other_start, &other_end)) {
 740                 if (recow) {
 741                         btrfs_release_path(path);
 742                         goto again;
 743                 }
 744                 extent_end = other_end;
 745                 del_slot = path->slots[0] + 1;
 746                 del_nr++;
 747                 ret = btrfs_free_extent(trans, &ref);
 748                 if (ret) {
 749                         btrfs_abort_transaction(trans, ret);
 750                         goto out;
 751                 }
 752         }
 753         other_start = 0;
 754         other_end = start;
 755         if (extent_mergeable(leaf, path->slots[0] - 1,
 756                              ino, bytenr, orig_offset,
 757                              &other_start, &other_end)) {
 758                 if (recow) {
 759                         btrfs_release_path(path);
 760                         goto again;
 761                 }
 762                 key.offset = other_start;
 763                 del_slot = path->slots[0];
 764                 del_nr++;
 765                 ret = btrfs_free_extent(trans, &ref);
 766                 if (ret) {
 767                         btrfs_abort_transaction(trans, ret);
 768                         goto out;
 769                 }
 770         }
 771         if (del_nr == 0) {
 772                 fi = btrfs_item_ptr(leaf, path->slots[0],
 773                            struct btrfs_file_extent_item);
 774                 btrfs_set_file_extent_type(leaf, fi,
 775                                            BTRFS_FILE_EXTENT_REG);
 776                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 777         } else {
 778                 fi = btrfs_item_ptr(leaf, del_slot - 1,
 779                            struct btrfs_file_extent_item);
 780                 btrfs_set_file_extent_type(leaf, fi,
 781                                            BTRFS_FILE_EXTENT_REG);
 782                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 783                 btrfs_set_file_extent_num_bytes(leaf, fi,
 784                                                 extent_end - key.offset);
 785
 786                 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 787                 if (ret < 0) {
 788                         btrfs_abort_transaction(trans, ret);
 789                         goto out;
 790                 }
 791         }
 792 out:
 793         return ret;
 794 }
 795
 796 /*
 797  * On error return an unlocked folio and the error value
 798  * On success return a locked folio and 0
 799  */
 800 static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
 801                                   u64 len)
 802 {
 803         u64 clamp_start = max_t(u64, pos, folio_pos(folio));
 804         u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));
 805         const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
 806         int ret = 0;
 807
 808         if (folio_test_uptodate(folio))
 809                 return 0;
 810
 811         if (IS_ALIGNED(clamp_start, blocksize) &&
 812             IS_ALIGNED(clamp_end, blocksize))
 813                 return 0;
 814
 815         ret = btrfs_read_folio(NULL, folio);
 816         if (ret)
 817                 return ret;
 818         folio_lock(folio);
 819         if (!folio_test_uptodate(folio)) {
 820                 folio_unlock(folio);
 821                 return -EIO;
 822         }
 823
 824         /*
 825          * Since btrfs_read_folio() will unlock the folio before it returns,
 826          * there is a window where btrfs_release_folio() can be called to
 827          * release the page.  Here we check both inode mapping and page
 828          * private to make sure the page was not released.
 829          *
 830          * The private flag check is essential for subpage as we need to store
 831          * extra bitmap using folio private.
 832          */
 833         if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
 834                 folio_unlock(folio);
 835                 return -EAGAIN;
 836         }
 837         return 0;
 838 }
 839
 840 static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
 841 {
 842         gfp_t gfp;
 843
 844         gfp = btrfs_alloc_write_mask(inode->i_mapping);
 845         if (nowait) {
 846                 gfp &= ~__GFP_DIRECT_RECLAIM;
 847                 gfp |= GFP_NOWAIT;
 848         }
 849
 850         return gfp;
 851 }
 852
 853 /*
 854  * Get folio into the page cache and lock it.
 855  */
 856 static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
 857                                       loff_t pos, size_t write_bytes,
 858                                       bool nowait)
 859 {
 860         unsigned long index = pos >> PAGE_SHIFT;
 861         gfp_t mask = get_prepare_gfp_flags(inode, nowait);
 862         fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) |
 863                           fgf_set_order(write_bytes);
 864         struct folio *folio;
 865         int ret = 0;
 866
 867 again:
 868         folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
 869         if (IS_ERR(folio))
 870                 return PTR_ERR(folio);
 871
 872         ret = set_folio_extent_mapped(folio);
 873         if (ret < 0) {
 874                 folio_unlock(folio);
 875                 folio_put(folio);
 876                 return ret;
 877         }
 878         ret = prepare_uptodate_folio(inode, folio, pos, write_bytes);
 879         if (ret) {
 880                 /* The folio is already unlocked. */
 881                 folio_put(folio);
 882                 if (!nowait && ret == -EAGAIN) {
 883                         ret = 0;
 884                         goto again;
 885                 }
 886                 return ret;
 887         }
 888         *folio_ret = folio;
 889         return 0;
 890 }
 891
 892 /*
 893  * Locks the extent and properly waits for data=ordered extents to finish
 894  * before allowing the folios to be modified if need.
 895  *
 896  * Return:
 897  * 1 - the extent is locked
 898  * 0 - the extent is not locked, and everything is OK
 899  * -EAGAIN - need to prepare the folios again
 900  */
 901 static noinline int
 902 lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
 903                                 loff_t pos, size_t write_bytes,
 904                                 u64 *lockstart, u64 *lockend, bool nowait,
 905                                 struct extent_state **cached_state)
 906 {
 907         struct btrfs_fs_info *fs_info = inode->root->fs_info;
 908         u64 start_pos;
 909         u64 last_pos;
 910         int ret = 0;
 911
 912         start_pos = round_down(pos, fs_info->sectorsize);
 913         last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
 914
 915         if (start_pos < inode->vfs_inode.i_size) {
 916                 struct btrfs_ordered_extent *ordered;
 917
 918                 if (nowait) {
 919                         if (!btrfs_try_lock_extent(&inode->io_tree, start_pos,
 920                                                    last_pos, cached_state)) {
 921                                 folio_unlock(folio);
 922                                 folio_put(folio);
 923                                 return -EAGAIN;
 924                         }
 925                 } else {
 926                         btrfs_lock_extent(&inode->io_tree, start_pos, last_pos,
 927                                           cached_state);
 928                 }
 929
 930                 ordered = btrfs_lookup_ordered_range(inode, start_pos,
 931                                                      last_pos - start_pos + 1);
 932                 if (ordered &&
 933                     ordered->file_offset + ordered->num_bytes > start_pos &&
 934                     ordered->file_offset <= last_pos) {
 935                         btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos,
 936                                             cached_state);
 937                         folio_unlock(folio);
 938                         folio_put(folio);
 939                         btrfs_start_ordered_extent(ordered);
 940                         btrfs_put_ordered_extent(ordered);
 941                         return -EAGAIN;
 942                 }
 943                 if (ordered)
 944                         btrfs_put_ordered_extent(ordered);
 945
 946                 *lockstart = start_pos;
 947                 *lockend = last_pos;
 948                 ret = 1;
 949         }
 950
 951         /*
 952          * We should be called after prepare_one_folio() which should have locked
 953          * all pages in the range.
 954          */
 955         WARN_ON(!folio_test_locked(folio));
 956
 957         return ret;
 958 }
 959
 960 /*
 961  * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
 962  *
 963  * @pos:         File offset.
 964  * @write_bytes: The length to write, will be updated to the nocow writeable
 965  *               range.
 966  *
 967  * This function will flush ordered extents in the range to ensure proper
 968  * nocow checks.
 969  *
 970  * Return:
 971  * > 0          If we can nocow, and updates @write_bytes.
 972  *  0           If we can't do a nocow write.
 973  * -EAGAIN      If we can't do a nocow write because snapshoting of the inode's
 974  *              root is in progress.
 975  * < 0          If an error happened.
 976  *
 977  * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
 978  */
 979 int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
 980                            size_t *write_bytes, bool nowait)
 981 {
 982         struct btrfs_fs_info *fs_info = inode->root->fs_info;
 983         struct btrfs_root *root = inode->root;
 984         struct extent_state *cached_state = NULL;
 985         u64 lockstart, lockend;
 986         u64 num_bytes;
 987         int ret;
 988
 989         if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
 990                 return 0;
 991
 992         if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
 993                 return -EAGAIN;
 994
 995         lockstart = round_down(pos, fs_info->sectorsize);
 996         lockend = round_up(pos + *write_bytes,
 997                            fs_info->sectorsize) - 1;
 998         num_bytes = lockend - lockstart + 1;
 999
1000         if (nowait) {
1001                 if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
1002                                                   &cached_state)) {
1003                         btrfs_drew_write_unlock(&root->snapshot_lock);
1004                         return -EAGAIN;
1005                 }
1006         } else {
1007                 btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
1008                                                    &cached_state);
1009         }
1010         ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, nowait);
1011         if (ret <= 0)
1012                 btrfs_drew_write_unlock(&root->snapshot_lock);
1013         else
1014                 *write_bytes = min_t(size_t, *write_bytes ,
1015                                      num_bytes - pos + lockstart);
1016         btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
1017
1018         return ret;
1019 }
1020
1021 void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1022 {
1023         btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1024 }
1025
1026 int btrfs_write_check(struct kiocb *iocb, size_t count)
1027 {
1028         struct file *file = iocb->ki_filp;
1029         struct inode *inode = file_inode(file);
1030         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1031         loff_t pos = iocb->ki_pos;
1032         int ret;
1033         loff_t oldsize;
1034
1035         /*
1036          * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
1037          * prealloc flags, as without those flags we always have to COW. We will
1038          * later check if we can really COW into the target range (using
1039          * can_nocow_extent() at btrfs_get_blocks_direct_write()).
1040          */
1041         if ((iocb->ki_flags & IOCB_NOWAIT) &&
1042             !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1043                 return -EAGAIN;
1044
1045         ret = file_remove_privs(file);
1046         if (ret)
1047                 return ret;
1048
1049         /*
1050          * We reserve space for updating the inode when we reserve space for the
1051          * extent we are going to write, so we will enospc out there.  We don't
1052          * need to start yet another transaction to update the inode as we will
1053          * update the inode when we finish writing whatever data we write.
1054          */
1055         if (!IS_NOCMTIME(inode)) {
1056                 inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
1057                 inode_inc_iversion(inode);
1058         }
1059
1060         oldsize = i_size_read(inode);
1061         if (pos > oldsize) {
1062                 /* Expand hole size to cover write data, preventing empty gap */
1063                 loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1064
1065                 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1066                 if (ret)
1067                         return ret;
1068         }
1069
1070         return 0;
1071 }
1072
1073 static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved,
1074                           u64 start, u64 len, bool only_release_metadata)
1075 {
1076         if (len == 0)
1077                 return;
1078
1079         if (only_release_metadata) {
1080                 btrfs_check_nocow_unlock(inode);
1081                 btrfs_delalloc_release_metadata(inode, len, true);
1082         } else {
1083                 const struct btrfs_fs_info *fs_info = inode->root->fs_info;
1084
1085                 btrfs_delalloc_release_space(inode, data_reserved,
1086                                              round_down(start, fs_info->sectorsize),
1087                                              len, true);
1088         }
1089 }
1090
1091 /*
1092  * Reserve data and metadata space for this buffered write range.
1093  *
1094  * Return >0 for the number of bytes reserved, which is always block aligned.
1095  * Return <0 for error.
1096  */
1097 static ssize_t reserve_space(struct btrfs_inode *inode,
1098                              struct extent_changeset **data_reserved,
1099                              u64 start, size_t *len, bool nowait,
1100                              bool *only_release_metadata)
1101 {
1102         const struct btrfs_fs_info *fs_info = inode->root->fs_info;
1103         const unsigned int block_offset = (start & (fs_info->sectorsize - 1));
1104         size_t reserve_bytes;
1105         int ret;
1106
1107         ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait);
1108         if (ret < 0) {
1109                 int can_nocow;
1110
1111                 if (nowait && (ret == -ENOSPC || ret == -EAGAIN))
1112                         return -EAGAIN;
1113
1114                 /*
1115                  * If we don't have to COW at the offset, reserve metadata only.
1116                  * write_bytes may get smaller than requested here.
1117                  */
1118                 can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait);
1119                 if (can_nocow < 0)
1120                         ret = can_nocow;
1121                 if (can_nocow > 0)
1122                         ret = 0;
1123                 if (ret)
1124                         return ret;
1125                 *only_release_metadata = true;
1126         }
1127
1128         reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize);
1129         WARN_ON(reserve_bytes == 0);
1130         ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes,
1131                                               reserve_bytes, nowait);
1132         if (ret) {
1133                 if (!*only_release_metadata)
1134                         btrfs_free_reserved_data_space(inode, *data_reserved,
1135                                                        start, *len);
1136                 else
1137                         btrfs_check_nocow_unlock(inode);
1138
1139                 if (nowait && ret == -ENOSPC)
1140                         ret = -EAGAIN;
1141                 return ret;
1142         }
1143         return reserve_bytes;
1144 }
1145
1146 /* Shrink the reserved data and metadata space from @reserved_len to @new_len. */
1147 static void shrink_reserved_space(struct btrfs_inode *inode,
1148                                   struct extent_changeset *data_reserved,
1149                                   u64 reserved_start, u64 reserved_len,
1150                                   u64 new_len, bool only_release_metadata)
1151 {
1152         const u64 diff = reserved_len - new_len;
1153
1154         ASSERT(new_len <= reserved_len);
1155         btrfs_delalloc_shrink_extents(inode, reserved_len, new_len);
1156         if (only_release_metadata)
1157                 btrfs_delalloc_release_metadata(inode, diff, true);
1158         else
1159                 btrfs_delalloc_release_space(inode, data_reserved,
1160                                              reserved_start + new_len, diff, true);
1161 }
1162
1163 /* Calculate the maximum amount of bytes we can write into one folio. */
1164 static size_t calc_write_bytes(const struct btrfs_inode *inode,
1165                                const struct iov_iter *iter, u64 start)
1166 {
1167         const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping);
1168
1169         return min(max_folio_size - (start & (max_folio_size - 1)),
1170                    iov_iter_count(iter));
1171 }
1172
/*
 * Do the heavy-lifting work to copy one range into one folio of the page cache.
 *
 * @inode:         The inode being written to.
 * @iter:          Source of the data; advanced by the number of bytes copied.
 * @data_reserved: Changeset tracking the reserved data space, released and
 *                 reused on every call.
 * @start:         File offset to write at.
 * @nowait:        Whether this is a NOWAIT write.
 *
 * The steps are: reserve space, get and lock the folio, lock the extent range
 * if needed, copy the data, mark the range dirty, then unlock and release
 * whatever part of the reservation ended up unused.
 *
 * Return > 0 in case we copied all bytes or just some of them.
 * Return 0 if no bytes were copied, in which case the caller should retry.
 * Return <0 on error.
 */
static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter,
                          struct extent_changeset **data_reserved, u64 start,
                          bool nowait)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct extent_state *cached_state = NULL;
        size_t write_bytes = calc_write_bytes(inode, iter, start);
        size_t copied;
        const u64 reserved_start = round_down(start, fs_info->sectorsize);
        u64 reserved_len;
        struct folio *folio = NULL;
        int extents_locked;
        u64 lockstart;
        u64 lockend;
        bool only_release_metadata = false;
        const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
        int ret;

        /*
         * Fault all pages before locking them in prepare_one_folio() to avoid
         * recursive lock.
         */
        if (unlikely(fault_in_iov_iter_readable(iter, write_bytes)))
                return -EFAULT;
        extent_changeset_release(*data_reserved);
        /*
         * write_bytes may come back smaller, and only_release_metadata is set
         * when the reservation fell back to a nocow (metadata only) one.
         */
        ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait,
                            &only_release_metadata);
        if (ret < 0)
                return ret;
        reserved_len = ret;
        /* Write range must be inside the reserved range. */
        ASSERT(reserved_start <= start);
        ASSERT(start + write_bytes <= reserved_start + reserved_len);

again:
        ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping,
                                                    bdp_flags);
        if (ret) {
                btrfs_delalloc_release_extents(inode, reserved_len);
                release_space(inode, *data_reserved, reserved_start, reserved_len,
                              only_release_metadata);
                return ret;
        }

        /*
         * NOTE(review): a literal false is passed here rather than @nowait —
         * confirm this is intentional.
         */
        ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false);
        if (ret) {
                btrfs_delalloc_release_extents(inode, reserved_len);
                release_space(inode, *data_reserved, reserved_start, reserved_len,
                              only_release_metadata);
                return ret;
        }

        /*
         * The reserved range goes beyond the current folio, shrink the reserved
         * space to the folio boundary.
         */
        if (reserved_start + reserved_len > folio_pos(folio) + folio_size(folio)) {
                const u64 last_block = folio_pos(folio) + folio_size(folio);

                shrink_reserved_space(inode, *data_reserved, reserved_start,
                                      reserved_len, last_block - reserved_start,
                                      only_release_metadata);
                write_bytes = last_block - start;
                reserved_len = last_block - reserved_start;
        }

        extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start,
                                                         write_bytes, &lockstart,
                                                         &lockend, nowait,
                                                         &cached_state);
        if (extents_locked < 0) {
                /*
                 * On -EAGAIN the folio has already been unlocked and put, so
                 * a blocking write simply prepares the folio again.
                 */
                if (!nowait && extents_locked == -EAGAIN)
                        goto again;

                btrfs_delalloc_release_extents(inode, reserved_len);
                release_space(inode, *data_reserved, reserved_start, reserved_len,
                              only_release_metadata);
                ret = extents_locked;
                return ret;
        }

        copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start),
                                             write_bytes, iter);
        flush_dcache_folio(folio);

        if (unlikely(copied < write_bytes)) {
                u64 last_block;

                /*
                 * The original write range doesn't need an uptodate folio as
                 * the range is block aligned. But now a short copy happened.
                 * We cannot handle it without an uptodate folio.
                 *
                 * So just revert the range and we will retry.
                 */
                if (!folio_test_uptodate(folio)) {
                        iov_iter_revert(iter, copied);
                        copied = 0;
                }

                /* No copied bytes, unlock, release reserved space and exit. */
                if (copied == 0) {
                        if (extents_locked)
                                btrfs_unlock_extent(&inode->io_tree, lockstart, lockend,
                                                    &cached_state);
                        else
                                btrfs_free_extent_state(cached_state);
                        btrfs_delalloc_release_extents(inode, reserved_len);
                        release_space(inode, *data_reserved, reserved_start, reserved_len,
                                      only_release_metadata);
                        btrfs_drop_folio(fs_info, folio, start, copied);
                        return 0;
                }

                /* Release the reserved space beyond the last block. */
                last_block = round_up(start + copied, fs_info->sectorsize);

                shrink_reserved_space(inode, *data_reserved, reserved_start,
                                      reserved_len, last_block - reserved_start,
                                      only_release_metadata);
                reserved_len = last_block - reserved_start;
        }

        ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state,
                                only_release_metadata);
        /*
         * If we have not locked the extent range, because the range's start
         * offset is >= i_size, we might still have a non-NULL cached extent
         * state, acquired while marking the extent range as delalloc through
         * btrfs_dirty_folio(). Therefore free any possible cached extent state
         * to avoid a memory leak.
         */
        if (extents_locked)
                btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
        else
                btrfs_free_extent_state(cached_state);

        btrfs_delalloc_release_extents(inode, reserved_len);
        if (ret) {
                btrfs_drop_folio(fs_info, folio, start, copied);
                release_space(inode, *data_reserved, reserved_start, reserved_len,
                              only_release_metadata);
                return ret;
        }
        /*
         * Success with a metadata only reservation: the reservation is kept,
         * so only the nocow lock is dropped here.  On the error paths
         * release_space() drops it.
         */
        if (only_release_metadata)
                btrfs_check_nocow_unlock(inode);

        btrfs_drop_folio(fs_info, folio, start, copied);
        return copied;
}
1330
1331 ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
1332 {
1333         struct file *file = iocb->ki_filp;
1334         loff_t pos;
1335         struct inode *inode = file_inode(file);
1336         struct extent_changeset *data_reserved = NULL;
1337         size_t num_written = 0;
1338         ssize_t ret;
1339         loff_t old_isize;
1340         unsigned int ilock_flags = 0;
1341         const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
1342
1343         if (nowait)
1344                 ilock_flags |= BTRFS_ILOCK_TRY;
1345
1346         ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
1347         if (ret < 0)
1348                 return ret;
1349
1350         /*
1351          * We can only trust the isize with inode lock held, or it can race with
1352          * other buffered writes and cause incorrect call of
1353          * pagecache_isize_extended() to overwrite existing data.
1354          */
1355         old_isize = i_size_read(inode);
1356
1357         ret = generic_write_checks(iocb, iter);
1358         if (ret <= 0)
1359                 goto out;
1360
1361         ret = btrfs_write_check(iocb, ret);
1362         if (ret < 0)
1363                 goto out;
1364
1365         pos = iocb->ki_pos;
1366         while (iov_iter_count(iter) > 0) {
1367                 ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait);
1368                 if (ret < 0)
1369                         break;
1370                 pos += ret;
1371                 num_written += ret;
1372                 cond_resched();
1373         }
1374
1375         extent_changeset_free(data_reserved);
1376         if (num_written > 0) {
1377                 pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1378                 iocb->ki_pos += num_written;
1379         }
1380 out:
1381         btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1382         return num_written ? num_written : ret;
1383 }
1384
1385 static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
1386                         const struct btrfs_ioctl_encoded_io_args *encoded)
1387 {
1388         struct file *file = iocb->ki_filp;
1389         struct inode *inode = file_inode(file);
1390         loff_t count;
1391         ssize_t ret;
1392
1393         btrfs_inode_lock(BTRFS_I(inode), 0);
1394         count = encoded->len;
1395         ret = generic_write_checks_count(iocb, &count);
1396         if (ret == 0 && count != encoded->len) {
1397                 /*
1398                  * The write got truncated by generic_write_checks_count(). We
1399                  * can't do a partial encoded write.
1400                  */
1401                 ret = -EFBIG;
1402         }
1403         if (ret || encoded->len == 0)
1404                 goto out;
1405
1406         ret = btrfs_write_check(iocb, encoded->len);
1407         if (ret < 0)
1408                 goto out;
1409
1410         ret = btrfs_do_encoded_write(iocb, from, encoded);
1411 out:
1412         btrfs_inode_unlock(BTRFS_I(inode), 0);
1413         return ret;
1414 }
1415
1416 ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
1417                             const struct btrfs_ioctl_encoded_io_args *encoded)
1418 {
1419         struct file *file = iocb->ki_filp;
1420         struct btrfs_inode *inode = BTRFS_I(file_inode(file));
1421         ssize_t num_written, num_sync;
1422
1423         /*
1424          * If the fs flips readonly due to some impossible error, although we
1425          * have opened a file as writable, we have to stop this write operation
1426          * to ensure consistency.
1427          */
1428         if (BTRFS_FS_ERROR(inode->root->fs_info))
1429                 return -EROFS;
1430
1431         if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
1432                 return -EOPNOTSUPP;
1433
1434         if (encoded) {
1435                 num_written = btrfs_encoded_write(iocb, from, encoded);
1436                 num_sync = encoded->len;
1437         } else if (iocb->ki_flags & IOCB_DIRECT) {
1438                 num_written = btrfs_direct_write(iocb, from);
1439                 num_sync = num_written;
1440         } else {
1441                 num_written = btrfs_buffered_write(iocb, from);
1442                 num_sync = num_written;
1443         }
1444
1445         btrfs_set_inode_last_sub_trans(inode);
1446
1447         if (num_sync > 0) {
1448                 num_sync = generic_write_sync(iocb, num_sync);
1449                 if (num_sync < 0)
1450                         num_written = num_sync;
1451         }
1452
1453         return num_written;
1454 }
1455
1456 static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1457 {
1458         return btrfs_do_write_iter(iocb, from, NULL);
1459 }
1460
1461 int btrfs_release_file(struct inode *inode, struct file *filp)
1462 {
1463         struct btrfs_file_private *private = filp->private_data;
1464
1465         if (private) {
1466                 kfree(private->filldir_buf);
1467                 btrfs_free_extent_state(private->llseek_cached_state);
1468                 kfree(private);
1469                 filp->private_data = NULL;
1470         }
1471
1472         /*
1473          * Set by setattr when we are about to truncate a file from a non-zero
1474          * size to a zero size.  This tries to flush down new bytes that may
1475          * have been written if the application were using truncate to replace
1476          * a file in place.
1477          */
1478         if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
1479                                &BTRFS_I(inode)->runtime_flags))
1480                         filemap_flush(inode->i_mapping);
1481         return 0;
1482 }
1483
1484 static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end)
1485 {
1486         int ret;
1487         struct blk_plug plug;
1488
1489         /*
1490          * This is only called in fsync, which would do synchronous writes, so
1491          * a plug can merge adjacent IOs as much as possible.  Esp. in case of
1492          * multiple disks using raid profile, a large IO can be split to
1493          * several segments of stripe length (currently 64K).
1494          */
1495         blk_start_plug(&plug);
1496         ret = btrfs_fdatawrite_range(inode, start, end);
1497         blk_finish_plug(&plug);
1498
1499         return ret;
1500 }
1501
1502 static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
1503 {
1504         struct btrfs_inode *inode = ctx->inode;
1505         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1506
1507         if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
1508             list_empty(&ctx->ordered_extents))
1509                 return true;
1510
1511         /*
1512          * If we are doing a fast fsync we can not bail out if the inode's
1513          * last_trans is <= then the last committed transaction, because we only
1514          * update the last_trans of the inode during ordered extent completion,
1515          * and for a fast fsync we don't wait for that, we only wait for the
1516          * writeback to complete.
1517          */
1518         if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
1519             (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
1520              list_empty(&ctx->ordered_extents)))
1521                 return true;
1522
1523         return false;
1524 }
1525
1526 /*
1527  * fsync call for both files and directories.  This logs the inode into
1528  * the tree log instead of forcing full commits whenever possible.
1529  *
1530  * It needs to call filemap_fdatawait so that all ordered extent updates are
1531  * in the metadata btree are up to date for copying to the log.
1532  *
1533  * It drops the inode mutex before doing the tree log commit.  This is an
1534  * important optimization for directories because holding the mutex prevents
1535  * new operations on the dir while we write to disk.
1536  */
1537 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1538 {
1539         struct dentry *dentry = file_dentry(file);
1540         struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
1541         struct btrfs_root *root = inode->root;
1542         struct btrfs_fs_info *fs_info = root->fs_info;
1543         struct btrfs_trans_handle *trans;
1544         struct btrfs_log_ctx ctx;
1545         int ret = 0, err;
1546         u64 len;
1547         bool full_sync;
1548         bool skip_ilock = false;
1549
1550         if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
1551                 skip_ilock = true;
1552                 current->journal_info = NULL;
1553                 btrfs_assert_inode_locked(inode);
1554         }
1555
1556         trace_btrfs_sync_file(file, datasync);
1557
1558         btrfs_init_log_ctx(&ctx, inode);
1559
1560         /*
1561          * Always set the range to a full range, otherwise we can get into
1562          * several problems, from missing file extent items to represent holes
1563          * when not using the NO_HOLES feature, to log tree corruption due to
1564          * races between hole detection during logging and completion of ordered
1565          * extents outside the range, to missing checksums due to ordered extents
1566          * for which we flushed only a subset of their pages.
1567          */
1568         start = 0;
1569         end = LLONG_MAX;
1570         len = (u64)LLONG_MAX + 1;
1571
1572         /*
1573          * We write the dirty pages in the range and wait until they complete
1574          * out of the ->i_mutex. If so, we can flush the dirty pages by
1575          * multi-task, and make the performance up.  See
1576          * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1577          */
1578         ret = start_ordered_ops(inode, start, end);
1579         if (ret)
1580                 goto out;
1581
1582         if (skip_ilock)
1583                 down_write(&inode->i_mmap_lock);
1584         else
1585                 btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
1586
1587         atomic_inc(&root->log_batch);
1588
1589         /*
1590          * Before we acquired the inode's lock and the mmap lock, someone may
1591          * have dirtied more pages in the target range. We need to make sure
1592          * that writeback for any such pages does not start while we are logging
1593          * the inode, because if it does, any of the following might happen when
1594          * we are not doing a full inode sync:
1595          *
1596          * 1) We log an extent after its writeback finishes but before its
1597          *    checksums are added to the csum tree, leading to -EIO errors
1598          *    when attempting to read the extent after a log replay.
1599          *
1600          * 2) We can end up logging an extent before its writeback finishes.
1601          *    Therefore after the log replay we will have a file extent item
1602          *    pointing to an unwritten extent (and no data checksums as well).
1603          *
1604          * So trigger writeback for any eventual new dirty pages and then we
1605          * wait for all ordered extents to complete below.
1606          */
1607         ret = start_ordered_ops(inode, start, end);
1608         if (ret) {
1609                 if (skip_ilock)
1610                         up_write(&inode->i_mmap_lock);
1611                 else
1612                         btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1613                 goto out;
1614         }
1615
1616         /*
1617          * Always check for the full sync flag while holding the inode's lock,
1618          * to avoid races with other tasks. The flag must be either set all the
1619          * time during logging or always off all the time while logging.
1620          * We check the flag here after starting delalloc above, because when
1621          * running delalloc the full sync flag may be set if we need to drop
1622          * extra extent map ranges due to temporary memory allocation failures.
1623          */
1624         full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
1625
1626         /*
1627          * We have to do this here to avoid the priority inversion of waiting on
1628          * IO of a lower priority task while holding a transaction open.
1629          *
1630          * For a full fsync we wait for the ordered extents to complete while
1631          * for a fast fsync we wait just for writeback to complete, and then
1632          * attach the ordered extents to the transaction so that a transaction
1633          * commit waits for their completion, to avoid data loss if we fsync,
1634          * the current transaction commits before the ordered extents complete
1635          * and a power failure happens right after that.
1636          *
1637          * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
1638          * logical address recorded in the ordered extent may change. We need
1639          * to wait for the IO to stabilize the logical address.
1640          */
1641         if (full_sync || btrfs_is_zoned(fs_info)) {
1642                 ret = btrfs_wait_ordered_range(inode, start, len);
1643                 clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
1644         } else {
1645                 /*
1646                  * Get our ordered extents as soon as possible to avoid doing
1647                  * checksum lookups in the csum tree, and use instead the
1648                  * checksums attached to the ordered extents.
1649                  */
1650                 btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents);
1651                 ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end);
1652                 if (ret)
1653                         goto out_release_extents;
1654
1655                 /*
1656                  * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
1657                  * starting and waiting for writeback, because for buffered IO
1658                  * it may have been set during the end IO callback
1659                  * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
1660                  * case an error happened and we need to wait for ordered
1661                  * extents to complete so that any extent maps that point to
1662                  * unwritten locations are dropped and we don't log them.
1663                  */
1664                 if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags))
1665                         ret = btrfs_wait_ordered_range(inode, start, len);
1666         }
1667
1668         if (ret)
1669                 goto out_release_extents;
1670
1671         atomic_inc(&root->log_batch);
1672
1673         if (skip_inode_logging(&ctx)) {
1674                 /*
1675                  * We've had everything committed since the last time we were
1676                  * modified so clear this flag in case it was set for whatever
1677                  * reason, it's no longer relevant.
1678                  */
1679                 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
1680                 /*
1681                  * An ordered extent might have started before and completed
1682                  * already with io errors, in which case the inode was not
1683                  * updated and we end up here. So check the inode's mapping
1684                  * for any errors that might have happened since we last
1685                  * checked called fsync.
1686                  */
1687                 ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err);
1688                 goto out_release_extents;
1689         }
1690
1691         btrfs_init_log_ctx_scratch_eb(&ctx);
1692
1693         /*
1694          * We use start here because we will need to wait on the IO to complete
1695          * in btrfs_sync_log, which could require joining a transaction (for
1696          * example checking cross references in the nocow path).  If we use join
1697          * here we could get into a situation where we're waiting on IO to
1698          * happen that is blocked on a transaction trying to commit.  With start
1699          * we inc the extwriter counter, so we wait for all extwriters to exit
1700          * before we start blocking joiners.  This comment is to keep somebody
1701          * from thinking they are super smart and changing this to
1702          * btrfs_join_transaction *cough*Josef*cough*.
1703          */
1704         trans = btrfs_start_transaction(root, 0);
1705         if (IS_ERR(trans)) {
1706                 ret = PTR_ERR(trans);
1707                 goto out_release_extents;
1708         }
1709         trans->in_fsync = true;
1710
1711         ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
1712         /*
1713          * Scratch eb no longer needed, release before syncing log or commit
1714          * transaction, to avoid holding unnecessary memory during such long
1715          * operations.
1716          */
1717         if (ctx.scratch_eb) {
1718                 free_extent_buffer(ctx.scratch_eb);
1719                 ctx.scratch_eb = NULL;
1720         }
1721         btrfs_release_log_ctx_extents(&ctx);
1722         if (ret < 0) {
1723                 /* Fallthrough and commit/free transaction. */
1724                 ret = BTRFS_LOG_FORCE_COMMIT;
1725         }
1726
1727         /* we've logged all the items and now have a consistent
1728          * version of the file in the log.  It is possible that
1729          * someone will come in and modify the file, but that's
1730          * fine because the log is consistent on disk, and we
1731          * have references to all of the file's extents
1732          *
1733          * It is possible that someone will come in and log the
1734          * file again, but that will end up using the synchronization
1735          * inside btrfs_sync_log to keep things safe.
1736          */
1737         if (skip_ilock)
1738                 up_write(&inode->i_mmap_lock);
1739         else
1740                 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1741
1742         if (ret == BTRFS_NO_LOG_SYNC) {
1743                 ret = btrfs_end_transaction(trans);
1744                 goto out;
1745         }
1746
1747         /* We successfully logged the inode, attempt to sync the log. */
1748         if (!ret) {
1749                 ret = btrfs_sync_log(trans, root, &ctx);
1750                 if (!ret) {
1751                         ret = btrfs_end_transaction(trans);
1752                         goto out;
1753                 }
1754         }
1755
1756         /*
1757          * At this point we need to commit the transaction because we had
1758          * btrfs_need_log_full_commit() or some other error.
1759          *
1760          * If we didn't do a full sync we have to stop the trans handle, wait on
1761          * the ordered extents, start it again and commit the transaction.  If
1762          * we attempt to wait on the ordered extents here we could deadlock with
1763          * something like fallocate() that is holding the extent lock trying to
1764          * start a transaction while some other thread is trying to commit the
1765          * transaction while we (fsync) are currently holding the transaction
1766          * open.
1767          */
1768         if (!full_sync) {
1769                 ret = btrfs_end_transaction(trans);
1770                 if (ret)
1771                         goto out;
1772                 ret = btrfs_wait_ordered_range(inode, start, len);
1773                 if (ret)
1774                         goto out;
1775
1776                 /*
1777                  * This is safe to use here because we're only interested in
1778                  * making sure the transaction that had the ordered extents is
1779                  * committed.  We aren't waiting on anything past this point,
1780                  * we're purely getting the transaction and committing it.
1781                  */
1782                 trans = btrfs_attach_transaction_barrier(root);
1783                 if (IS_ERR(trans)) {
1784                         ret = PTR_ERR(trans);
1785
1786                         /*
1787                          * We committed the transaction and there's no currently
1788                          * running transaction, this means everything we care
1789                          * about made it to disk and we are done.
1790                          */
1791                         if (ret == -ENOENT)
1792                                 ret = 0;
1793                         goto out;
1794                 }
1795         }
1796
1797         ret = btrfs_commit_transaction(trans);
1798 out:
1799         free_extent_buffer(ctx.scratch_eb);
1800         ASSERT(list_empty(&ctx.list));
1801         ASSERT(list_empty(&ctx.conflict_inodes));
1802         err = file_check_and_advance_wb_err(file);
1803         if (!ret)
1804                 ret = err;
1805         return ret > 0 ? -EIO : ret;
1806
1807 out_release_extents:
1808         btrfs_release_log_ctx_extents(&ctx);
1809         if (skip_ilock)
1810                 up_write(&inode->i_mmap_lock);
1811         else
1812                 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1813         goto out;
1814 }
1815
1816 /*
1817  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
1818  * called from a page fault handler when a page is first dirtied. Hence we must
1819  * be careful to check for EOF conditions here. We set the page up correctly
1820  * for a written page which means we get ENOSPC checking when writing into
1821  * holes and correct delalloc and unwritten extent mapping on filesystems that
1822  * support these features.
1823  *
1824  * We are not allowed to take the i_mutex here so we have to play games to
1825  * protect against truncate races as the page could now be beyond EOF.  Because
1826  * truncate_setsize() writes the inode size before removing pages, once we have
1827  * the page lock we can determine safely if the page is beyond EOF. If it is not
1828  * beyond EOF, then the page is guaranteed safe against truncation until we
1829  * unlock the page.
1830  */
1831 static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
1832 {
1833         struct page *page = vmf->page;
1834         struct folio *folio = page_folio(page);
1835         struct inode *inode = file_inode(vmf->vma->vm_file);
1836         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1837         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1838         struct btrfs_ordered_extent *ordered;
1839         struct extent_state *cached_state = NULL;
1840         struct extent_changeset *data_reserved = NULL;
1841         unsigned long zero_start;
1842         loff_t size;
1843         size_t fsize = folio_size(folio);
1844         int ret;
1845         u64 reserved_space;
1846         u64 page_start;
1847         u64 page_end;
1848         u64 end;
1849
1850         reserved_space = fsize;
1851
1852         sb_start_pagefault(inode->i_sb);
1853         page_start = folio_pos(folio);
1854         page_end = page_start + folio_size(folio) - 1;
1855         end = page_end;
1856
1857         /*
1858          * Reserving delalloc space after obtaining the page lock can lead to
1859          * deadlock. For example, if a dirty page is locked by this function
1860          * and the call to btrfs_delalloc_reserve_space() ends up triggering
1861          * dirty page write out, then the btrfs_writepages() function could
1862          * end up waiting indefinitely to get a lock on the page currently
1863          * being processed by btrfs_page_mkwrite() function.
1864          */
1865         ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
1866                                            page_start, reserved_space);
1867         if (ret < 0)
1868                 goto out_noreserve;
1869
1870         ret = file_update_time(vmf->vma->vm_file);
1871         if (ret < 0)
1872                 goto out;
1873 again:
1874         down_read(&BTRFS_I(inode)->i_mmap_lock);
1875         folio_lock(folio);
1876         size = i_size_read(inode);
1877
1878         if ((folio->mapping != inode->i_mapping) ||
1879             (page_start >= size)) {
1880                 /* Page got truncated out from underneath us. */
1881                 goto out_unlock;
1882         }
1883         folio_wait_writeback(folio);
1884
1885         btrfs_lock_extent(io_tree, page_start, page_end, &cached_state);
1886         ret = set_folio_extent_mapped(folio);
1887         if (ret < 0) {
1888                 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1889                 goto out_unlock;
1890         }
1891
1892         /*
1893          * We can't set the delalloc bits if there are pending ordered
1894          * extents.  Drop our locks and wait for them to finish.
1895          */
1896         ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize);
1897         if (ordered) {
1898                 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1899                 folio_unlock(folio);
1900                 up_read(&BTRFS_I(inode)->i_mmap_lock);
1901                 btrfs_start_ordered_extent(ordered);
1902                 btrfs_put_ordered_extent(ordered);
1903                 goto again;
1904         }
1905
1906         if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) {
1907                 reserved_space = round_up(size - page_start, fs_info->sectorsize);
1908                 if (reserved_space < fsize) {
1909                         end = page_start + reserved_space - 1;
1910                         btrfs_delalloc_release_space(BTRFS_I(inode),
1911                                         data_reserved, end + 1,
1912                                         fsize - reserved_space, true);
1913                 }
1914         }
1915
1916         /*
1917          * page_mkwrite gets called when the page is firstly dirtied after it's
1918          * faulted in, but write(2) could also dirty a page and set delalloc
1919          * bits, thus in this case for space account reason, we still need to
1920          * clear any delalloc bits within this page range since we have to
1921          * reserve data&meta space before lock_page() (see above comments).
1922          */
1923         btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
1924                                EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
1925                                EXTENT_DEFRAG, &cached_state);
1926
1927         ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
1928                                         &cached_state);
1929         if (ret < 0) {
1930                 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1931                 goto out_unlock;
1932         }
1933
1934         /* Page is wholly or partially inside EOF. */
1935         if (page_start + folio_size(folio) > size)
1936                 zero_start = offset_in_folio(folio, size);
1937         else
1938                 zero_start = fsize;
1939
1940         if (zero_start != fsize)
1941                 folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
1942
1943         btrfs_folio_clear_checked(fs_info, folio, page_start, fsize);
1944         btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
1945         btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
1946
1947         btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
1948
1949         btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1950         up_read(&BTRFS_I(inode)->i_mmap_lock);
1951
1952         btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
1953         sb_end_pagefault(inode->i_sb);
1954         extent_changeset_free(data_reserved);
1955         return VM_FAULT_LOCKED;
1956
1957 out_unlock:
1958         folio_unlock(folio);
1959         up_read(&BTRFS_I(inode)->i_mmap_lock);
1960 out:
1961         btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
1962         btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
1963                                      reserved_space, true);
1964         extent_changeset_free(data_reserved);
1965 out_noreserve:
1966         sb_end_pagefault(inode->i_sb);
1967
1968         if (ret < 0)
1969                 return vmf_error(ret);
1970
1971         /* Make the VM retry the fault. */
1972         return VM_FAULT_NOPAGE;
1973 }
1974
1975 static const struct vm_operations_struct btrfs_file_vm_ops = {
1976         .fault          = filemap_fault,
1977         .map_pages      = filemap_map_pages,
1978         .page_mkwrite   = btrfs_page_mkwrite,
1979 };
1980
1981 static int btrfs_file_mmap(struct file  *filp, struct vm_area_struct *vma)
1982 {
1983         struct address_space *mapping = filp->f_mapping;
1984
1985         if (!mapping->a_ops->read_folio)
1986                 return -ENOEXEC;
1987
1988         file_accessed(filp);
1989         vma->vm_ops = &btrfs_file_vm_ops;
1990
1991         return 0;
1992 }
1993
1994 static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
1995                            int slot, u64 start, u64 end)
1996 {
1997         struct btrfs_file_extent_item *fi;
1998         struct btrfs_key key;
1999
2000         if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2001                 return false;
2002
2003         btrfs_item_key_to_cpu(leaf, &key, slot);
2004         if (key.objectid != btrfs_ino(inode) ||
2005             key.type != BTRFS_EXTENT_DATA_KEY)
2006                 return false;
2007
2008         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2009
2010         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2011                 return false;
2012
2013         if (btrfs_file_extent_disk_bytenr(leaf, fi))
2014                 return false;
2015
2016         if (key.offset == end)
2017                 return true;
2018         if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2019                 return true;
2020         return false;
2021 }
2022
2023 static int fill_holes(struct btrfs_trans_handle *trans,
2024                 struct btrfs_inode *inode,
2025                 struct btrfs_path *path, u64 offset, u64 end)
2026 {
2027         struct btrfs_fs_info *fs_info = trans->fs_info;
2028         struct btrfs_root *root = inode->root;
2029         struct extent_buffer *leaf;
2030         struct btrfs_file_extent_item *fi;
2031         struct extent_map *hole_em;
2032         struct btrfs_key key;
2033         int ret;
2034
2035         if (btrfs_fs_incompat(fs_info, NO_HOLES))
2036                 goto out;
2037
2038         key.objectid = btrfs_ino(inode);
2039         key.type = BTRFS_EXTENT_DATA_KEY;
2040         key.offset = offset;
2041
2042         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2043         if (ret <= 0) {
2044                 /*
2045                  * We should have dropped this offset, so if we find it then
2046                  * something has gone horribly wrong.
2047                  */
2048                 if (ret == 0)
2049                         ret = -EINVAL;
2050                 return ret;
2051         }
2052
2053         leaf = path->nodes[0];
2054         if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2055                 u64 num_bytes;
2056
2057                 path->slots[0]--;
2058                 fi = btrfs_item_ptr(leaf, path->slots[0],
2059                                     struct btrfs_file_extent_item);
2060                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2061                         end - offset;
2062                 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2063                 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2064                 btrfs_set_file_extent_offset(leaf, fi, 0);
2065                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2066                 goto out;
2067         }
2068
2069         if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2070                 u64 num_bytes;
2071
2072                 key.offset = offset;
2073                 btrfs_set_item_key_safe(trans, path, &key);
2074                 fi = btrfs_item_ptr(leaf, path->slots[0],
2075                                     struct btrfs_file_extent_item);
2076                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2077                         offset;
2078                 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2079                 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2080                 btrfs_set_file_extent_offset(leaf, fi, 0);
2081                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2082                 goto out;
2083         }
2084         btrfs_release_path(path);
2085
2086         ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
2087                                        end - offset);
2088         if (ret)
2089                 return ret;
2090
2091 out:
2092         btrfs_release_path(path);
2093
2094         hole_em = btrfs_alloc_extent_map();
2095         if (!hole_em) {
2096                 btrfs_drop_extent_map_range(inode, offset, end - 1, false);
2097                 btrfs_set_inode_full_sync(inode);
2098         } else {
2099                 hole_em->start = offset;
2100                 hole_em->len = end - offset;
2101                 hole_em->ram_bytes = hole_em->len;
2102
2103                 hole_em->disk_bytenr = EXTENT_MAP_HOLE;
2104                 hole_em->disk_num_bytes = 0;
2105                 hole_em->generation = trans->transid;
2106
2107                 ret = btrfs_replace_extent_map_range(inode, hole_em, true);
2108                 btrfs_free_extent_map(hole_em);
2109                 if (ret)
2110                         btrfs_set_inode_full_sync(inode);
2111         }
2112
2113         return 0;
2114 }
2115
2116 /*
2117  * Find a hole extent on given inode and change start/len to the end of hole
2118  * extent.(hole/vacuum extent whose em->start <= start &&
2119  *         em->start + em->len > start)
2120  * When a hole extent is found, return 1 and modify start/len.
2121  */
2122 static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2123 {
2124         struct btrfs_fs_info *fs_info = inode->root->fs_info;
2125         struct extent_map *em;
2126         int ret = 0;
2127
2128         em = btrfs_get_extent(inode, NULL,
2129                               round_down(*start, fs_info->sectorsize),
2130                               round_up(*len, fs_info->sectorsize));
2131         if (IS_ERR(em))
2132                 return PTR_ERR(em);
2133
2134         /* Hole or vacuum extent(only exists in no-hole mode) */
2135         if (em->disk_bytenr == EXTENT_MAP_HOLE) {
2136                 ret = 1;
2137                 *len = em->start + em->len > *start + *len ?
2138                        0 : *start + *len - em->start - em->len;
2139                 *start = em->start + em->len;
2140         }
2141         btrfs_free_extent_map(em);
2142         return ret;
2143 }
2144
2145 /*
2146  * Check if there is no folio in the range.
2147  *
2148  * We cannot utilize filemap_range_has_page() in a filemap with large folios
2149  * as we can hit the following false positive:
2150  *
2151  *        start                            end
2152  *        |                                |
2153  *  |//|//|//|//|  |  |  |  |  |  |  |  |//|//|
2154  *   \         /                         \   /
2155  *    Folio A                            Folio B
2156  *
2157  * That large folio A and B cover the start and end indexes.
2158  * In that case filemap_range_has_page() will always return true, but the above
2159  * case is fine for btrfs_punch_hole_lock_range() usage.
2160  *
2161  * So here we only ensure that no other folios is in the range, excluding the
2162  * head/tail large folio.
2163  */
2164 static bool check_range_has_page(struct inode *inode, u64 start, u64 end)
2165 {
2166         struct folio_batch fbatch;
2167         bool ret = false;
2168         /*
2169          * For subpage case, if the range is not at page boundary, we could
2170          * have pages at the leading/tailing part of the range.
2171          * This could lead to dead loop since filemap_range_has_page()
2172          * will always return true.
2173          * So here we need to do extra page alignment for
2174          * filemap_range_has_page().
2175          *
2176          * And do not decrease page_lockend right now, as it can be 0.
2177          */
2178         const u64 page_lockstart = round_up(start, PAGE_SIZE);
2179         const u64 page_lockend = round_down(end + 1, PAGE_SIZE);
2180         const pgoff_t start_index = page_lockstart >> PAGE_SHIFT;
2181         const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT;
2182         pgoff_t tmp = start_index;
2183         int found_folios;
2184
2185         /* The same page or adjacent pages. */
2186         if (page_lockend <= page_lockstart)
2187                 return false;
2188
2189         folio_batch_init(&fbatch);
2190         found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch);
2191         for (int i = 0; i < found_folios; i++) {
2192                 struct folio *folio = fbatch.folios[i];
2193
2194                 /* A large folio begins before the start. Not a target. */
2195                 if (folio->index < start_index)
2196                         continue;
2197                 /* A large folio extends beyond the end. Not a target. */
2198                 if (folio->index + folio_nr_pages(folio) > end_index)
2199                         continue;
2200                 /* A folio doesn't cover the head/tail index. Found a target. */
2201                 ret = true;
2202                 break;
2203         }
2204         folio_batch_release(&fbatch);
2205         return ret;
2206 }
2207
2208 static void btrfs_punch_hole_lock_range(struct inode *inode,
2209                                         const u64 lockstart, const u64 lockend,
2210                                         struct extent_state **cached_state)
2211 {
2212         while (1) {
2213                 truncate_pagecache_range(inode, lockstart, lockend);
2214
2215                 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2216                                   cached_state);
2217                 /*
2218                  * We can't have ordered extents in the range, nor dirty/writeback
2219                  * pages, because we have locked the inode's VFS lock in exclusive
2220                  * mode, we have locked the inode's i_mmap_lock in exclusive mode,
2221                  * we have flushed all delalloc in the range and we have waited
2222                  * for any ordered extents in the range to complete.
2223                  * We can race with anyone reading pages from this range, so after
2224                  * locking the range check if we have pages in the range, and if
2225                  * we do, unlock the range and retry.
2226                  */
2227                 if (!check_range_has_page(inode, lockstart, lockend))
2228                         break;
2229
2230                 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2231                                     cached_state);
2232         }
2233
2234         btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
2235 }
2236
2237 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2238                                      struct btrfs_inode *inode,
2239                                      struct btrfs_path *path,
2240                                      struct btrfs_replace_extent_info *extent_info,
2241                                      const u64 replace_len,
2242                                      const u64 bytes_to_drop)
2243 {
2244         struct btrfs_fs_info *fs_info = trans->fs_info;
2245         struct btrfs_root *root = inode->root;
2246         struct btrfs_file_extent_item *extent;
2247         struct extent_buffer *leaf;
2248         struct btrfs_key key;
2249         int slot;
2250         int ret;
2251
2252         if (replace_len == 0)
2253                 return 0;
2254
2255         if (extent_info->disk_offset == 0 &&
2256             btrfs_fs_incompat(fs_info, NO_HOLES)) {
2257                 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2258                 return 0;
2259         }
2260
2261         key.objectid = btrfs_ino(inode);
2262         key.type = BTRFS_EXTENT_DATA_KEY;
2263         key.offset = extent_info->file_offset;
2264         ret = btrfs_insert_empty_item(trans, root, path, &key,
2265                                       sizeof(struct btrfs_file_extent_item));
2266         if (ret)
2267                 return ret;
2268         leaf = path->nodes[0];
2269         slot = path->slots[0];
2270         write_extent_buffer(leaf, extent_info->extent_buf,
2271                             btrfs_item_ptr_offset(leaf, slot),
2272                             sizeof(struct btrfs_file_extent_item));
2273         extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2274         ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2275         btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2276         btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2277         if (extent_info->is_new_extent)
2278                 btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2279         btrfs_release_path(path);
2280
2281         ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2282                                                 replace_len);
2283         if (ret)
2284                 return ret;
2285
2286         /* If it's a hole, nothing more needs to be done. */
2287         if (extent_info->disk_offset == 0) {
2288                 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2289                 return 0;
2290         }
2291
2292         btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2293
2294         if (extent_info->is_new_extent && extent_info->insertions == 0) {
2295                 key.objectid = extent_info->disk_offset;
2296                 key.type = BTRFS_EXTENT_ITEM_KEY;
2297                 key.offset = extent_info->disk_len;
2298                 ret = btrfs_alloc_reserved_file_extent(trans, root,
2299                                                        btrfs_ino(inode),
2300                                                        extent_info->file_offset,
2301                                                        extent_info->qgroup_reserved,
2302                                                        &key);
2303         } else {
2304                 struct btrfs_ref ref = {
2305                         .action = BTRFS_ADD_DELAYED_REF,
2306                         .bytenr = extent_info->disk_offset,
2307                         .num_bytes = extent_info->disk_len,
2308                         .owning_root = btrfs_root_id(root),
2309                         .ref_root = btrfs_root_id(root),
2310                 };
2311                 u64 ref_offset;
2312
2313                 ref_offset = extent_info->file_offset - extent_info->data_offset;
2314                 btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false);
2315                 ret = btrfs_inc_extent_ref(trans, &ref);
2316         }
2317
2318         extent_info->insertions++;
2319
2320         return ret;
2321 }
2322
/*
 * Drop the file extent items in the range [start, end] of an inode and either
 * leave a hole behind or replace the range with the extent described by
 * @extent_info.
 *
 * The respective range must have been previously locked, as well as the inode.
 * The end offset is inclusive (last byte of the range).
 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
 * the file range with an extent.
 * When not punching a hole, we don't want to end up in a state where we dropped
 * extents without inserting a new one, so we must abort the transaction to avoid
 * a corruption.
 *
 * @inode:       inode whose file extent items are dropped/replaced
 * @path:        path handed to btrfs_drop_extents(), fill_holes() and
 *               btrfs_insert_replace_extent()
 * @start:       first byte of the range
 * @end:         last byte of the range (inclusive), must be greater than @start
 * @extent_info: NULL when punching a hole, otherwise the replacement extent;
 *               its file_offset, data_offset and data_len are advanced as
 *               pieces of it get inserted
 * @trans_out:   on success, set to the transaction handle that is still open
 *               and that the caller has to end
 *
 * Return: 0 on success, -EINVAL if @end <= @start, -ENOMEM if the temporary
 * block reserve can not be allocated, or another negative errno. On failure
 * any transaction handle still held is ended here and *@trans_out is not
 * written.
 */
int btrfs_replace_file_extents(struct btrfs_inode *inode,
                               struct btrfs_path *path, const u64 start,
                               const u64 end,
                               struct btrfs_replace_extent_info *extent_info,
                               struct btrfs_trans_handle **trans_out)
{
        struct btrfs_drop_extents_args drop_args = { 0 };
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
        u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
        struct btrfs_trans_handle *trans = NULL;
        struct btrfs_block_rsv *rsv;
        unsigned int rsv_count;
        u64 cur_offset;
        /* Only used inside the loop below, where it is recomputed before use. */
        u64 len = end - start;
        int ret = 0;

        /* @end is inclusive, so this also rejects a single byte range. */
        if (end <= start)
                return -EINVAL;

        /*
         * Temporary reserve used while dropping extents. It is refilled with
         * min_size from the transaction reserve each time a transaction is
         * started.
         * NOTE(review): failfast presumably makes btrfs_drop_extents() return
         * -ENOSPC once the reserve is used up, which is what drives the loop
         * below - confirm against the block reserve code.
         */
        rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
        if (!rsv) {
                ret = -ENOMEM;
                goto out;
        }
        rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
        rsv->failfast = true;

        /*
         * 1 - update the inode
         * 1 - removing the extents in the range
         * 1 - adding the hole extent if no_holes isn't set or if we are
         *     replacing the range with a new extent
         */
        if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
                rsv_count = 3;
        else
                rsv_count = 2;

        trans = btrfs_start_transaction(root, rsv_count);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                trans = NULL;
                goto out_free;
        }

        ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
                                      min_size, false);
        if (WARN_ON(ret))
                goto out_trans;
        trans->block_rsv = rsv;

        cur_offset = start;
        drop_args.path = path;
        /* drop_args.end is exclusive, while @end is inclusive. */
        drop_args.end = end + 1;
        drop_args.drop_cache = true;
        /*
         * Each iteration drops as much as it can starting at cur_offset. The
         * loop only continues when the drop stopped with -ENOSPC, in which
         * case the dropped part is processed, the inode is updated and a new
         * transaction handle is obtained before retrying.
         */
        while (cur_offset < end) {
                drop_args.start = cur_offset;
                ret = btrfs_drop_extents(trans, root, inode, &drop_args);
                /* If we are punching a hole decrement the inode's byte count */
                if (!extent_info)
                        btrfs_update_inode_bytes(inode, 0,
                                                 drop_args.bytes_found);
                if (ret != -ENOSPC) {
                        /*
                         * The only time we don't want to abort is if we are
                         * attempting to clone a partial inline extent, in which
                         * case we'll get EOPNOTSUPP.  However if we aren't
                         * clone we need to abort no matter what, because if we
                         * got EOPNOTSUPP via prealloc then we messed up and
                         * need to abort.
                         */
                        if (ret &&
                            (ret != -EOPNOTSUPP ||
                             (extent_info && extent_info->is_new_extent)))
                                btrfs_abort_transaction(trans, ret);
                        break;
                }

                /* The remaining work uses the transaction's own reserve. */
                trans->block_rsv = &fs_info->trans_block_rsv;

                if (!extent_info && cur_offset < drop_args.drop_end &&
                    cur_offset < ino_size) {
                        ret = fill_holes(trans, inode, path, cur_offset,
                                         drop_args.drop_end);
                        if (ret) {
                                /*
                                 * If we failed then we didn't insert our hole
                                 * entries for the area we dropped, so now the
                                 * fs is corrupted, so we must abort the
                                 * transaction.
                                 */
                                btrfs_abort_transaction(trans, ret);
                                break;
                        }
                } else if (!extent_info && cur_offset < drop_args.drop_end) {
                        /*
                         * We are past the i_size here, but since we didn't
                         * insert holes we need to clear the mapped area so we
                         * know to not set disk_i_size in this area until a new
                         * file extent is inserted here.
                         */
                        ret = btrfs_inode_clear_file_extent_range(inode,
                                        cur_offset,
                                        drop_args.drop_end - cur_offset);
                        if (ret) {
                                /*
                                 * We couldn't clear our area, so we could
                                 * presumably adjust up and corrupt the fs, so
                                 * we need to abort.
                                 */
                                btrfs_abort_transaction(trans, ret);
                                break;
                        }
                }

                /*
                 * When replacing, insert the part of the new extent that
                 * covers what was dropped so far and advance @extent_info
                 * past it.
                 */
                if (extent_info &&
                    drop_args.drop_end > extent_info->file_offset) {
                        u64 replace_len = drop_args.drop_end -
                                          extent_info->file_offset;

                        ret = btrfs_insert_replace_extent(trans, inode, path,
                                        extent_info, replace_len,
                                        drop_args.bytes_found);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                break;
                        }
                        extent_info->data_len -= replace_len;
                        extent_info->data_offset += replace_len;
                        extent_info->file_offset += replace_len;
                }

                /*
                 * We are releasing our handle on the transaction, balance the
                 * dirty pages of the btree inode and flush delayed items, and
                 * then get a new transaction handle, which may now point to a
                 * new transaction in case someone else may have committed the
                 * transaction we used to replace/drop file extent items. So
                 * bump the inode's iversion and update mtime and ctime except
                 * if we are called from a dedupe context. This is because a
                 * power failure/crash may happen after the transaction is
                 * committed and before we finish replacing/dropping all the
                 * file extent items we need.
                 */
                inode_inc_iversion(&inode->vfs_inode);

                if (!extent_info || extent_info->update_times)
                        inode_set_mtime_to_ts(&inode->vfs_inode,
                                              inode_set_ctime_current(&inode->vfs_inode));

                ret = btrfs_update_inode(trans, inode);
                if (ret)
                        break;

                btrfs_end_transaction(trans);
                btrfs_btree_balance_dirty(fs_info);

                trans = btrfs_start_transaction(root, rsv_count);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        /* Nothing to end at out_trans. */
                        trans = NULL;
                        break;
                }

                ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
                                              rsv, min_size, false);
                if (WARN_ON(ret))
                        break;
                trans->block_rsv = rsv;

                /*
                 * Resume after the part already dropped. When punching a hole,
                 * skip a hole that may follow and stop if the rest of the range
                 * is entirely a hole.
                 */
                cur_offset = drop_args.drop_end;
                len = end - cur_offset;
                if (!extent_info && len) {
                        ret = find_first_non_hole(inode, &cur_offset, &len);
                        if (unlikely(ret < 0))
                                break;
                        if (ret && !len) {
                                ret = 0;
                                break;
                        }
                }
        }

        /*
         * If we were cloning, force the next fsync to be a full one since we
         * replaced (or just dropped in the case of cloning holes when
         * NO_HOLES is enabled) file extent items and did not setup new extent
         * maps for the replacement extents (or holes).
         */
        if (extent_info && !extent_info->is_new_extent)
                btrfs_set_inode_full_sync(inode);

        if (ret)
                goto out_trans;

        trans->block_rsv = &fs_info->trans_block_rsv;
        /*
         * If we are using the NO_HOLES feature we might have had already an
         * hole that overlaps a part of the region [start, end] and ends at (or
         * beyond) end. Since we have no file extent items to represent holes,
         * drop_end can be less than end and so we must make sure we have an
         * extent map representing the existing hole (the call to
         * btrfs_drop_extents() might have dropped the existing extent map
         * representing the existing hole), otherwise the fast fsync path will
         * not record the existence of the hole region
         * [existing_hole_start, end].
         */
        if (drop_args.drop_end <= end)
                drop_args.drop_end = end + 1;
        /*
         * Don't insert file hole extent item if it's for a range beyond eof
         * (because it's useless) or if it represents a 0 bytes range (when
         * cur_offset == drop_end).
         */
        if (!extent_info && cur_offset < ino_size &&
            cur_offset < drop_args.drop_end) {
                ret = fill_holes(trans, inode, path, cur_offset,
                                 drop_args.drop_end);
                if (ret) {
                        /*
                         * The hole entries for the dropped area were not
                         * inserted, so the transaction must be aborted.
                         */
                        btrfs_abort_transaction(trans, ret);
                        goto out_trans;
                }
        } else if (!extent_info && cur_offset < drop_args.drop_end) {
                /* See the comment in the loop above for the reasoning here. */
                ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
                                        drop_args.drop_end - cur_offset);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out_trans;
                }

        }
        /* Insert whatever is left of the replacement extent. */
        if (extent_info) {
                ret = btrfs_insert_replace_extent(trans, inode, path,
                                extent_info, extent_info->data_len,
                                drop_args.bytes_found);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out_trans;
                }
        }

out_trans:
        /* trans is NULL if (re)starting the transaction failed. */
        if (!trans)
                goto out_free;

        /* Never leave the handle pointing at the reserve freed below. */
        trans->block_rsv = &fs_info->trans_block_rsv;
        if (ret)
                btrfs_end_transaction(trans);
        else
                *trans_out = trans;
out_free:
        btrfs_free_block_rsv(fs_info, rsv);
out:
        return ret;
}
2591
/*
 * Punch a hole in the byte range [offset, offset + len) of a file.
 *
 * Parts of the range that find_first_non_hole() reports as being holes already
 * are skipped. The unaligned head and tail blocks are handed to
 * btrfs_truncate_block() when they are below the sector aligned i_size, and
 * the sector aligned middle part [lockstart, lockend] is dropped through
 * btrfs_replace_file_extents() with a NULL extent_info.
 *
 * The inode is locked with BTRFS_ILOCK_MMAP for the whole operation.
 *
 * Return: 0 on success or a negative errno.
 */
static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
{
        struct inode *inode = file_inode(file);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_state *cached_state = NULL;
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans = NULL;
        /* Sector aligned range to lock and drop, lockend being inclusive. */
        u64 lockstart;
        u64 lockend;
        u64 tail_start;
        u64 tail_len;
        /* The range as requested, before any hole skipping below. */
        const u64 orig_start = offset;
        const u64 orig_end = offset + len - 1;
        int ret = 0;
        bool same_block;
        u64 ino_size;
        /* Set once btrfs_truncate_block() was called for some block. */
        bool truncated_block = false;
        /* Set once the inode item was updated in the main path. */
        bool updated_inode = false;

        btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);

        ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len);
        if (ret)
                goto out_only_mutex;

        ino_size = round_up(inode->i_size, fs_info->sectorsize);
        /* Skip a hole at the start of the range, adjusting offset and len. */
        ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
        if (ret < 0)
                goto out_only_mutex;
        if (ret && !len) {
                /* Already in a large hole */
                ret = 0;
                goto out_only_mutex;
        }

        ret = file_modified(file);
        if (ret)
                goto out_only_mutex;

        /*
         * Largest sector aligned range inside [offset, offset + len). It is
         * empty (lockend < lockstart) when the range covers no full block.
         */
        lockstart = round_up(offset, fs_info->sectorsize);
        lockend = round_down(offset + len, fs_info->sectorsize) - 1;
        same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
                == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
        /*
         * Only do this if we are in the same block and we aren't doing the
         * entire block.
         */
        if (same_block && len < fs_info->sectorsize) {
                if (offset < ino_size) {
                        truncated_block = true;
                        ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
                                                   orig_start, orig_end);
                } else {
                        ret = 0;
                }
                goto out_only_mutex;
        }

        /* zero back part of the first block */
        if (offset < ino_size) {
                truncated_block = true;
                ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end);
                if (ret) {
                        /*
                         * Only the inode lock is held at this point. With ret
                         * set, the inode update at out_only_mutex would be
                         * skipped anyway.
                         */
                        btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
                        return ret;
                }
        }

        /*
         * Check the aligned pages after the first unaligned page. If
         * offset != orig_start, the first unaligned page and several
         * following pages are already in holes, so the extra check can be
         * skipped.
         */
        if (offset == orig_start) {
                /* after truncate page, check hole again */
                len = offset + len - lockstart;
                offset = lockstart;
                ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
                if (ret < 0)
                        goto out_only_mutex;
                if (ret && !len) {
                        ret = 0;
                        goto out_only_mutex;
                }
                lockstart = offset;
        }

        /* Check the tail unaligned part is in a hole */
        tail_start = lockend + 1;
        tail_len = offset + len - tail_start;
        if (tail_len) {
                ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
                if (unlikely(ret < 0))
                        goto out_only_mutex;
                if (!ret) {
                        /* zero the front end of the last page */
                        if (tail_start + tail_len < ino_size) {
                                truncated_block = true;
                                ret = btrfs_truncate_block(BTRFS_I(inode),
                                                        tail_start + tail_len - 1,
                                                        orig_start, orig_end);
                                if (ret)
                                        goto out_only_mutex;
                        }
                }
        }

        /* No full block left to drop, only partial blocks were involved. */
        if (lockend < lockstart) {
                ret = 0;
                goto out_only_mutex;
        }

        btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        /* On success this hands back an open transaction handle in trans. */
        ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
                                         lockend, NULL, &trans);
        btrfs_free_path(path);
        if (ret)
                goto out;

        ASSERT(trans != NULL);
        inode_inc_iversion(inode);
        inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
        ret = btrfs_update_inode(trans, BTRFS_I(inode));
        updated_inode = true;
        /*
         * NOTE(review): the return value of btrfs_end_transaction() is not
         * checked here, unlike in the out_only_mutex path below.
         */
        btrfs_end_transaction(trans);
        btrfs_btree_balance_dirty(fs_info);
out:
        btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                            &cached_state);
out_only_mutex:
        if (!updated_inode && truncated_block && !ret) {
                /*
                 * If we only end up zeroing part of a page, we still need to
                 * update the inode item, so that all the time fields are
                 * updated as well as the necessary btrfs inode in memory fields
                 * for detecting, at fsync time, if the inode isn't yet in the
                 * log tree or it's there but not up to date.
                 */
                struct timespec64 now = inode_set_ctime_current(inode);

                inode_inc_iversion(inode);
                inode_set_mtime_to_ts(inode, now);
                trans = btrfs_start_transaction(root, 1);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                } else {
                        int ret2;

                        /* Report the inode update error first, if any. */
                        ret = btrfs_update_inode(trans, BTRFS_I(inode));
                        ret2 = btrfs_end_transaction(trans);
                        if (!ret)
                                ret = ret2;
                }
        }
        btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
        return ret;
}
2756
/* Helper structure to record which range is already reserved */
struct falloc_range {
        /* Links the range into the list passed to add_falloc_range(). */
        struct list_head list;
        /* Start of the range. */
        u64 start;
        /* Length of the range; grows when an adjacent range is merged in. */
        u64 len;
};
2763
2764 /*
2765  * Helper function to add falloc range
2766  *
2767  * Caller should have locked the larger range of extent containing
2768  * [start, len)
2769  */
2770 static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2771 {
2772         struct falloc_range *range = NULL;
2773
2774         if (!list_empty(head)) {
2775                 /*
2776                  * As fallocate iterates by bytenr order, we only need to check
2777                  * the last range.
2778                  */
2779                 range = list_last_entry(head, struct falloc_range, list);
2780                 if (range->start + range->len == start) {
2781                         range->len += len;
2782                         return 0;
2783                 }
2784         }
2785
2786         range = kmalloc(sizeof(*range), GFP_KERNEL);
2787         if (!range)
2788                 return -ENOMEM;
2789         range->start = start;
2790         range->len = len;
2791         list_add_tail(&range->list, head);
2792         return 0;
2793 }
2794
2795 static int btrfs_fallocate_update_isize(struct inode *inode,
2796                                         const u64 end,
2797                                         const int mode)
2798 {
2799         struct btrfs_trans_handle *trans;
2800         struct btrfs_root *root = BTRFS_I(inode)->root;
2801         int ret;
2802         int ret2;
2803
2804         if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
2805                 return 0;
2806
2807         trans = btrfs_start_transaction(root, 1);
2808         if (IS_ERR(trans))
2809                 return PTR_ERR(trans);
2810
2811         inode_set_ctime_current(inode);
2812         i_size_write(inode, end);
2813         btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
2814         ret = btrfs_update_inode(trans, BTRFS_I(inode));
2815         ret2 = btrfs_end_transaction(trans);
2816
2817         return ret ? ret : ret2;
2818 }
2819
/*
 * Kind of extent found at a range boundary, as returned by
 * btrfs_zero_range_check_range_boundary().
 */
enum {
        /* The extent map is neither a hole nor flagged as prealloc. */
        RANGE_BOUNDARY_WRITTEN_EXTENT,
        /* The extent map has EXTENT_FLAG_PREALLOC set. */
        RANGE_BOUNDARY_PREALLOC_EXTENT,
        /* The extent map's disk_bytenr is EXTENT_MAP_HOLE. */
        RANGE_BOUNDARY_HOLE,
};
2825
2826 static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
2827                                                  u64 offset)
2828 {
2829         const u64 sectorsize = inode->root->fs_info->sectorsize;
2830         struct extent_map *em;
2831         int ret;
2832
2833         offset = round_down(offset, sectorsize);
2834         em = btrfs_get_extent(inode, NULL, offset, sectorsize);
2835         if (IS_ERR(em))
2836                 return PTR_ERR(em);
2837
2838         if (em->disk_bytenr == EXTENT_MAP_HOLE)
2839                 ret = RANGE_BOUNDARY_HOLE;
2840         else if (em->flags & EXTENT_FLAG_PREALLOC)
2841                 ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
2842         else
2843                 ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
2844
2845         btrfs_free_extent_map(em);
2846         return ret;
2847 }
2848
/*
 * Handle FALLOC_FL_ZERO_RANGE for the byte range [@offset, @offset + @len).
 *
 * Parts of the range that are already prealloc extents are left alone.
 * Partially covered boundary blocks that map to a written extent are handed
 * to btrfs_truncate_block(), while boundary blocks that are holes and all the
 * blocks fully inside the range are covered by a new prealloc extent through
 * btrfs_prealloc_file_range(). Finally the inode's i_size is updated, subject
 * to @mode, through btrfs_fallocate_update_isize().
 *
 * The caller visible in this file, btrfs_fallocate(), calls this with the
 * inode locked (BTRFS_ILOCK_MMAP) and after waiting for the ordered extents
 * in the block aligned range.
 *
 * @inode:   The inode to operate on.
 * @offset:  Start offset of the range. Does not need to be block aligned.
 * @len:     Length of the range in bytes.
 * @mode:    The fallocate mode flags, passed on to the helpers.
 *
 * Returns 0 on success or the error returned by the helper that failed.
 */
static int btrfs_zero_range(struct inode *inode,
                            loff_t offset,
                            loff_t len,
                            const int mode)
{
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
        struct extent_map *em;
        struct extent_changeset *data_reserved = NULL;
        int ret;
        u64 alloc_hint = 0;
        const u64 sectorsize = fs_info->sectorsize;
        /*
         * The range as requested by the caller. @offset and @len may get
         * trimmed below, these keep the original boundaries for the calls to
         * btrfs_truncate_block().
         */
        const u64 orig_start = offset;
        const u64 orig_end = offset + len - 1;
        u64 alloc_start = round_down(offset, sectorsize);
        u64 alloc_end = round_up(offset + len, sectorsize);
        u64 bytes_to_reserve = 0;
        bool space_reserved = false;

        em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start,
                              alloc_end - alloc_start);
        if (IS_ERR(em)) {
                ret = PTR_ERR(em);
                goto out;
        }

        /*
         * Avoid hole punching and extent allocation for some cases. More cases
         * could be considered, but these are unlikely common and we keep things
         * as simple as possible for now. Also, intentionally, if the target
         * range contains one or more prealloc extents together with regular
         * extents and holes, we drop all the existing extents and allocate a
         * new prealloc extent, so that we get a larger contiguous disk extent.
         */
        if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
                const u64 em_end = em->start + em->len;

                if (em_end >= offset + len) {
                        /*
                         * The whole range is already a prealloc extent,
                         * do nothing except updating the inode's i_size if
                         * needed.
                         */
                        btrfs_free_extent_map(em);
                        ret = btrfs_fallocate_update_isize(inode, offset + len,
                                                           mode);
                        goto out;
                }
                /*
                 * Part of the range is already a prealloc extent, so operate
                 * only on the remaining part of the range.
                 */
                alloc_start = em_end;
                ASSERT(IS_ALIGNED(alloc_start, sectorsize));
                len = offset + len - alloc_start;
                offset = alloc_start;
                /*
                 * Hint the allocator at the disk location right after the
                 * existing prealloc extent (presumably to favour a contiguous
                 * allocation).
                 */
                alloc_hint = btrfs_extent_map_block_start(em) + em->len;
        }
        btrfs_free_extent_map(em);

        /*
         * The (possibly trimmed) range starts and ends in the same block, so
         * there is a single block to deal with.
         */
        if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
            BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
                em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize);
                if (IS_ERR(em)) {
                        ret = PTR_ERR(em);
                        goto out;
                }

                /* The block is prealloc already, only i_size may need an update. */
                if (em->flags & EXTENT_FLAG_PREALLOC) {
                        btrfs_free_extent_map(em);
                        ret = btrfs_fallocate_update_isize(inode, offset + len,
                                                           mode);
                        goto out;
                }
                /*
                 * The range covers only part of a block that is not a hole, so
                 * no allocation is done and the block is handed to
                 * btrfs_truncate_block() instead.
                 */
                if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) {
                        btrfs_free_extent_map(em);
                        ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
                                                   orig_start, orig_end);
                        if (!ret)
                                ret = btrfs_fallocate_update_isize(inode,
                                                                   offset + len,
                                                                   mode);
                        /*
                         * Nothing has been reserved at this point, so the
                         * cleanup under the out label is not needed.
                         */
                        return ret;
                }
                /* A hole or a fully covered block: allocate exactly one block. */
                btrfs_free_extent_map(em);
                alloc_start = round_down(offset, sectorsize);
                alloc_end = alloc_start + sectorsize;
                goto reserve_space;
        }

        /*
         * Start with only the blocks fully inside the range, the boundary
         * checks below may widen this to include partially covered blocks.
         */
        alloc_start = round_up(offset, sectorsize);
        alloc_end = round_down(offset + len, sectorsize);

        /*
         * For unaligned ranges, check the pages at the boundaries, they might
         * map to an extent, in which case we need to partially zero them, or
         * they might map to a hole, in which case we need our allocation range
         * to cover them.
         */
        if (!IS_ALIGNED(offset, sectorsize)) {
                ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
                                                            offset);
                if (ret < 0)
                        goto out;
                if (ret == RANGE_BOUNDARY_HOLE) {
                        alloc_start = round_down(offset, sectorsize);
                        ret = 0;
                } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
                        ret = btrfs_truncate_block(BTRFS_I(inode), offset,
                                                   orig_start, orig_end);
                        if (ret)
                                goto out;
                } else {
                        /* Prealloc boundary block, nothing to do for it. */
                        ret = 0;
                }
        }

        if (!IS_ALIGNED(offset + len, sectorsize)) {
                ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
                                                            offset + len);
                if (ret < 0)
                        goto out;
                if (ret == RANGE_BOUNDARY_HOLE) {
                        alloc_end = round_up(offset + len, sectorsize);
                        ret = 0;
                } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
                        ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
                                                   orig_start, orig_end);
                        if (ret)
                                goto out;
                } else {
                        /* Prealloc boundary block, nothing to do for it. */
                        ret = 0;
                }
        }

reserve_space:
        /* Nothing to allocate if no block ended up in the allocation range. */
        if (alloc_start < alloc_end) {
                struct extent_state *cached_state = NULL;
                const u64 lockstart = alloc_start;
                const u64 lockend = alloc_end - 1;

                bytes_to_reserve = alloc_end - alloc_start;
                ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
                                                      bytes_to_reserve);
                if (ret < 0)
                        goto out;
                /* From here on a failure must release the space under out. */
                space_reserved = true;
                btrfs_punch_hole_lock_range(inode, lockstart, lockend,
                                            &cached_state);
                ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
                                                alloc_start, bytes_to_reserve);
                if (ret) {
                        btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
                                            lockend, &cached_state);
                        goto out;
                }
                ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
                                                alloc_end - alloc_start,
                                                fs_info->sectorsize,
                                                offset + len, &alloc_hint);
                btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                                    &cached_state);
                /* btrfs_prealloc_file_range releases reserved space on error */
                if (ret) {
                        space_reserved = false;
                        goto out;
                }
        }
        ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
 out:
        /* Give back the data space only if we still own it and we failed. */
        if (ret && space_reserved)
                btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
                                               alloc_start, bytes_to_reserve);
        extent_changeset_free(data_reserved);

        return ret;
}
3025
/*
 * Preallocate, zero or punch a hole in the range [@offset, @offset + @len) of
 * a file.
 *
 * NOTE(review): expected to be the ->fallocate file operation of btrfs
 * regular files - confirm at the file_operations definition.
 *
 * Zoned filesystems and mode bits other than FALLOC_FL_KEEP_SIZE,
 * FALLOC_FL_PUNCH_HOLE and FALLOC_FL_ZERO_RANGE are rejected with -EOPNOTSUPP.
 * Hole punching is delegated to btrfs_punch_hole() and zeroing a range to
 * btrfs_zero_range(). For a plain preallocation the block aligned range is
 * walked through its extent maps and every subrange that is a hole, or that
 * starts at or beyond i_size and is not a prealloc extent, gets qgroup and
 * data space reserved and is then allocated with btrfs_prealloc_file_range().
 *
 * @file:    The file to operate on.
 * @mode:    The fallocate mode flags (FALLOC_FL_*).
 * @offset:  Start offset of the range. Does not need to be block aligned.
 * @len:     Length of the range in bytes.
 *
 * Returns 0 on success or the error returned by the step that failed.
 */
static long btrfs_fallocate(struct file *file, int mode,
                            loff_t offset, loff_t len)
{
        struct inode *inode = file_inode(file);
        struct extent_state *cached_state = NULL;
        struct extent_changeset *data_reserved = NULL;
        struct falloc_range *range;
        struct falloc_range *tmp;
        LIST_HEAD(reserve_list);
        u64 cur_offset;
        u64 last_byte;
        u64 alloc_start;
        u64 alloc_end;
        u64 alloc_hint = 0;
        u64 locked_end;
        u64 actual_end = 0;
        u64 data_space_needed = 0;
        u64 data_space_reserved = 0;
        u64 qgroup_reserved = 0;
        struct extent_map *em;
        int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
        int ret;

        /* Do not allow fallocate in ZONED mode */
        if (btrfs_is_zoned(inode_to_fs_info(inode)))
                return -EOPNOTSUPP;

        alloc_start = round_down(offset, blocksize);
        alloc_end = round_up(offset + len, blocksize);
        cur_offset = alloc_start;

        /* Make sure we aren't being given a mode we do not support. */
        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
                     FALLOC_FL_ZERO_RANGE))
                return -EOPNOTSUPP;

        /* Hole punching takes care of everything (including locking) itself. */
        if (mode & FALLOC_FL_PUNCH_HOLE)
                return btrfs_punch_hole(file, offset, len);

        btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);

        /* Only validate the new size if this call can actually grow i_size. */
        if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
                ret = inode_newsize_ok(inode, offset + len);
                if (ret)
                        goto out;
        }

        ret = file_modified(file);
        if (ret)
                goto out;

        /*
         * TODO: Move these two operations after we have checked
         * accurate reserved space, or fallocate can still fail but
         * with page truncated or size expanded.
         *
         * But that's a minor problem and won't do much harm BTW.
         */
        if (alloc_start > inode->i_size) {
                ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
                                        alloc_start);
                if (ret)
                        goto out;
        } else if (offset + len > inode->i_size) {
                /*
                 * If we are fallocating from the end of the file onward we
                 * need to zero out the end of the block if i_size lands in the
                 * middle of a block.
                 */
                ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size,
                                           inode->i_size, (u64)-1);
                if (ret)
                        goto out;
        }

        /*
         * We have locked the inode at the VFS level (in exclusive mode) and we
         * have locked the i_mmap_lock lock (in exclusive mode). Now before
         * locking the file range, flush all delalloc in the range and wait for
         * all ordered extents in the range to complete. After this we can lock
         * the file range and, due to the previous locking we did, we know there
         * can't be more delalloc or ordered extents in the range.
         */
        ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start,
                                       alloc_end - alloc_start);
        if (ret)
                goto out;

        /*
         * Zeroing a range is handled separately. Nothing was added to
         * data_reserved so far, so unlocking the inode is all the cleanup
         * that is needed here.
         */
        if (mode & FALLOC_FL_ZERO_RANGE) {
                ret = btrfs_zero_range(inode, offset, len, mode);
                btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
                return ret;
        }

        locked_end = alloc_end - 1;
        btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
                          &cached_state);

        btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);

        /* First, check if we exceed the qgroup limit */
        while (cur_offset < alloc_end) {
                em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset,
                                      alloc_end - cur_offset);
                if (IS_ERR(em)) {
                        ret = PTR_ERR(em);
                        break;
                }
                last_byte = min(btrfs_extent_map_end(em), alloc_end);
                /*
                 * After the loop this holds the end offset used for the
                 * i_size update, never beyond the end of the requested range.
                 */
                actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len);
                last_byte = ALIGN(last_byte, blocksize);
                /*
                 * Holes need an allocation, and so does anything starting at
                 * or beyond i_size that is not already a prealloc extent.
                 */
                if (em->disk_bytenr == EXTENT_MAP_HOLE ||
                    (cur_offset >= inode->i_size &&
                     !(em->flags & EXTENT_FLAG_PREALLOC))) {
                        const u64 range_len = last_byte - cur_offset;

                        ret = add_falloc_range(&reserve_list, cur_offset, range_len);
                        if (ret < 0) {
                                btrfs_free_extent_map(em);
                                break;
                        }
                        ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
                                        &data_reserved, cur_offset, range_len);
                        if (ret < 0) {
                                btrfs_free_extent_map(em);
                                break;
                        }
                        qgroup_reserved += range_len;
                        data_space_needed += range_len;
                }
                btrfs_free_extent_map(em);
                cur_offset = last_byte;
        }

        if (!ret && data_space_needed > 0) {
                /*
                 * We are safe to reserve space here as we can't have delalloc
                 * in the range, see above.
                 */
                ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
                                                      data_space_needed);
                if (!ret)
                        data_space_reserved = data_space_needed;
        }

        /*
         * If ret is still 0, means we're OK to fallocate.
         * Or just cleanup the list and exit.
         *
         * data_space_reserved and qgroup_reserved track how much of each
         * reservation is still held, which decides how the ranges left on
         * the list are released once an error happened.
         */
        list_for_each_entry_safe(range, tmp, &reserve_list, list) {
                if (!ret) {
                        ret = btrfs_prealloc_file_range(inode, mode,
                                        range->start,
                                        range->len, blocksize,
                                        offset + len, &alloc_hint);
                        /*
                         * btrfs_prealloc_file_range() releases space even
                         * if it returns an error.
                         */
                        data_space_reserved -= range->len;
                        qgroup_reserved -= range->len;
                } else if (data_space_reserved > 0) {
                        /* Both data space and qgroup space are still held. */
                        btrfs_free_reserved_data_space(BTRFS_I(inode),
                                               data_reserved, range->start,
                                               range->len);
                        data_space_reserved -= range->len;
                        qgroup_reserved -= range->len;
                } else if (qgroup_reserved > 0) {
                        /* Only the qgroup reservation was made for this range. */
                        btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
                                               range->start, range->len, NULL);
                        qgroup_reserved -= range->len;
                }
                list_del(&range->list);
                kfree(range);
        }
        if (ret < 0)
                goto out_unlock;

        /*
         * We didn't need to allocate any more space, but we still extended the
         * size of the file so we need to update i_size and the inode item.
         */
        ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
out_unlock:
        btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
                            &cached_state);
out:
        btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
        extent_changeset_free(data_reserved);
        return ret;
}
3217
3218 /*
3219  * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
3220  * that has unflushed and/or flushing delalloc. There might be other adjacent
3221  * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
3222  * looping while it gets adjacent subranges, and merging them together.
3223  */
3224 static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
3225                                    struct extent_state **cached_state,
3226                                    bool *search_io_tree,
3227                                    u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3228 {
3229         u64 len = end + 1 - start;
3230         u64 delalloc_len = 0;
3231         struct btrfs_ordered_extent *oe;
3232         u64 oe_start;
3233         u64 oe_end;
3234
3235         /*
3236          * Search the io tree first for EXTENT_DELALLOC. If we find any, it
3237          * means we have delalloc (dirty pages) for which writeback has not
3238          * started yet.
3239          */
3240         if (*search_io_tree) {
3241                 spin_lock(&inode->lock);
3242                 if (inode->delalloc_bytes > 0) {
3243                         spin_unlock(&inode->lock);
3244                         *delalloc_start_ret = start;
3245                         delalloc_len = btrfs_count_range_bits(&inode->io_tree,
3246                                                               delalloc_start_ret, end,
3247                                                               len, EXTENT_DELALLOC, 1,
3248                                                               cached_state);
3249                 } else {
3250                         spin_unlock(&inode->lock);
3251                 }
3252         }
3253
3254         if (delalloc_len > 0) {
3255                 /*
3256                  * If delalloc was found then *delalloc_start_ret has a sector size
3257                  * aligned value (rounded down).
3258                  */
3259                 *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
3260
3261                 if (*delalloc_start_ret == start) {
3262                         /* Delalloc for the whole range, nothing more to do. */
3263                         if (*delalloc_end_ret == end)
3264                                 return true;
3265                         /* Else trim our search range for ordered extents. */
3266                         start = *delalloc_end_ret + 1;
3267                         len = end + 1 - start;
3268                 }
3269         } else {
3270                 /* No delalloc, future calls don't need to search again. */
3271                 *search_io_tree = false;
3272         }
3273
3274         /*
3275          * Now also check if there's any ordered extent in the range.
3276          * We do this because:
3277          *
3278          * 1) When delalloc is flushed, the file range is locked, we clear the
3279          *    EXTENT_DELALLOC bit from the io tree and create an extent map and
3280          *    an ordered extent for the write. So we might just have been called
3281          *    after delalloc is flushed and before the ordered extent completes
3282          *    and inserts the new file extent item in the subvolume's btree;
3283          *
3284          * 2) We may have an ordered extent created by flushing delalloc for a
3285          *    subrange that starts before the subrange we found marked with
3286          *    EXTENT_DELALLOC in the io tree.
3287          *
3288          * We could also use the extent map tree to find such delalloc that is
3289          * being flushed, but using the ordered extents tree is more efficient
3290          * because it's usually much smaller as ordered extents are removed from
3291          * the tree once they complete. With the extent maps, we mau have them
3292          * in the extent map tree for a very long time, and they were either
3293          * created by previous writes or loaded by read operations.
3294          */
3295         oe = btrfs_lookup_first_ordered_range(inode, start, len);
3296         if (!oe)
3297                 return (delalloc_len > 0);
3298
3299         /* The ordered extent may span beyond our search range. */
3300         oe_start = max(oe->file_offset, start);
3301         oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
3302
3303         btrfs_put_ordered_extent(oe);
3304
3305         /* Don't have unflushed delalloc, return the ordered extent range. */
3306         if (delalloc_len == 0) {
3307                 *delalloc_start_ret = oe_start;
3308                 *delalloc_end_ret = oe_end;
3309                 return true;
3310         }
3311
3312         /*
3313          * We have both unflushed delalloc (io_tree) and an ordered extent.
3314          * If the ranges are adjacent returned a combined range, otherwise
3315          * return the leftmost range.
3316          */
3317         if (oe_start < *delalloc_start_ret) {
3318                 if (oe_end < *delalloc_start_ret)
3319                         *delalloc_end_ret = oe_end;
3320                 *delalloc_start_ret = oe_start;
3321         } else if (*delalloc_end_ret + 1 == oe_start) {
3322                 *delalloc_end_ret = oe_end;
3323         }
3324
3325         return true;
3326 }
3327
3328 /*
3329  * Check if there's delalloc in a given range.
3330  *
3331  * @inode:               The inode.
3332  * @start:               The start offset of the range. It does not need to be
3333  *                       sector size aligned.
3334  * @end:                 The end offset (inclusive value) of the search range.
3335  *                       It does not need to be sector size aligned.
3336  * @cached_state:        Extent state record used for speeding up delalloc
3337  *                       searches in the inode's io_tree. Can be NULL.
3338  * @delalloc_start_ret:  Output argument, set to the start offset of the
3339  *                       subrange found with delalloc (may not be sector size
3340  *                       aligned).
3341  * @delalloc_end_ret:    Output argument, set to he end offset (inclusive value)
3342  *                       of the subrange found with delalloc.
3343  *
3344  * Returns true if a subrange with delalloc is found within the given range, and
3345  * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
3346  * end offsets of the subrange.
3347  */
3348 bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
3349                                   struct extent_state **cached_state,
3350                                   u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3351 {
3352         u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
3353         u64 prev_delalloc_end = 0;
3354         bool search_io_tree = true;
3355         bool ret = false;
3356
3357         while (cur_offset <= end) {
3358                 u64 delalloc_start;
3359                 u64 delalloc_end;
3360                 bool delalloc;
3361
3362                 delalloc = find_delalloc_subrange(inode, cur_offset, end,
3363                                                   cached_state, &search_io_tree,
3364                                                   &delalloc_start,
3365                                                   &delalloc_end);
3366                 if (!delalloc)
3367                         break;
3368
3369                 if (prev_delalloc_end == 0) {
3370                         /* First subrange found. */
3371                         *delalloc_start_ret = max(delalloc_start, start);
3372                         *delalloc_end_ret = delalloc_end;
3373                         ret = true;
3374                 } else if (delalloc_start == prev_delalloc_end + 1) {
3375                         /* Subrange adjacent to the previous one, merge them. */
3376                         *delalloc_end_ret = delalloc_end;
3377                 } else {
3378                         /* Subrange not adjacent to the previous one, exit. */
3379                         break;
3380                 }
3381
3382                 prev_delalloc_end = delalloc_end;
3383                 cur_offset = delalloc_end + 1;
3384                 cond_resched();
3385         }
3386
3387         return ret;
3388 }
3389
3390 /*
3391  * Check if there's a hole or delalloc range in a range representing a hole (or
3392  * prealloc extent) found in the inode's subvolume btree.
3393  *
3394  * @inode:      The inode.
3395  * @whence:     Seek mode (SEEK_DATA or SEEK_HOLE).
3396  * @start:      Start offset of the hole region. It does not need to be sector
3397  *              size aligned.
3398  * @end:        End offset (inclusive value) of the hole region. It does not
3399  *              need to be sector size aligned.
3400  * @start_ret:  Return parameter, used to set the start of the subrange in the
3401  *              hole that matches the search criteria (seek mode), if such
3402  *              subrange is found (return value of the function is true).
3403  *              The value returned here may not be sector size aligned.
3404  *
3405  * Returns true if a subrange matching the given seek mode is found, and if one
3406  * is found, it updates @start_ret with the start of the subrange.
3407  */
3408 static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
3409                                         struct extent_state **cached_state,
3410                                         u64 start, u64 end, u64 *start_ret)
3411 {
3412         u64 delalloc_start;
3413         u64 delalloc_end;
3414         bool delalloc;
3415
3416         delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
3417                                                 &delalloc_start, &delalloc_end);
3418         if (delalloc && whence == SEEK_DATA) {
3419                 *start_ret = delalloc_start;
3420                 return true;
3421         }
3422
3423         if (delalloc && whence == SEEK_HOLE) {
3424                 /*
3425                  * We found delalloc but it starts after out start offset. So we
3426                  * have a hole between our start offset and the delalloc start.
3427                  */
3428                 if (start < delalloc_start) {
3429                         *start_ret = start;
3430                         return true;
3431                 }
3432                 /*
3433                  * Delalloc range starts at our start offset.
3434                  * If the delalloc range's length is smaller than our range,
3435                  * then it means we have a hole that starts where the delalloc
3436                  * subrange ends.
3437                  */
3438                 if (delalloc_end < end) {
3439                         *start_ret = delalloc_end + 1;
3440                         return true;
3441                 }
3442
3443                 /* There's delalloc for the whole range. */
3444                 return false;
3445         }
3446
3447         if (!delalloc && whence == SEEK_HOLE) {
3448                 *start_ret = start;
3449                 return true;
3450         }
3451
3452         /*
3453          * No delalloc in the range and we are seeking for data. The caller has
3454          * to iterate to the next extent item in the subvolume btree.
3455          */
3456         return false;
3457 }
3458
3459 static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
3460 {
3461         struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
3462         struct btrfs_file_private *private;
3463         struct btrfs_fs_info *fs_info = inode->root->fs_info;
3464         struct extent_state *cached_state = NULL;
3465         struct extent_state **delalloc_cached_state;
3466         const loff_t i_size = i_size_read(&inode->vfs_inode);
3467         const u64 ino = btrfs_ino(inode);
3468         struct btrfs_root *root = inode->root;
3469         struct btrfs_path *path;
3470         struct btrfs_key key;
3471         u64 last_extent_end;
3472         u64 lockstart;
3473         u64 lockend;
3474         u64 start;
3475         int ret;
3476         bool found = false;
3477
3478         if (i_size == 0 || offset >= i_size)
3479                 return -ENXIO;
3480
3481         /*
3482          * Quick path. If the inode has no prealloc extents and its number of
3483          * bytes used matches its i_size, then it can not have holes.
3484          */
3485         if (whence == SEEK_HOLE &&
3486             !(inode->flags & BTRFS_INODE_PREALLOC) &&
3487             inode_get_bytes(&inode->vfs_inode) == i_size)
3488                 return i_size;
3489
3490         spin_lock(&inode->lock);
3491         private = file->private_data;
3492         spin_unlock(&inode->lock);
3493
3494         if (private && private->owner_task != current) {
3495                 /*
3496                  * Not allocated by us, don't use it as its cached state is used
3497                  * by the task that allocated it and we don't want neither to
3498                  * mess with it nor get incorrect results because it reflects an
3499                  * invalid state for the current task.
3500                  */
3501                 private = NULL;
3502         } else if (!private) {
3503                 private = kzalloc(sizeof(*private), GFP_KERNEL);
3504                 /*
3505                  * No worries if memory allocation failed.
3506                  * The private structure is used only for speeding up multiple
3507                  * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
3508                  * so everything will still be correct.
3509                  */
3510                 if (private) {
3511                         bool free = false;
3512
3513                         private->owner_task = current;
3514
3515                         spin_lock(&inode->lock);
3516                         if (file->private_data)
3517                                 free = true;
3518                         else
3519                                 file->private_data = private;
3520                         spin_unlock(&inode->lock);
3521
3522                         if (free) {
3523                                 kfree(private);
3524                                 private = NULL;
3525                         }
3526                 }
3527         }
3528
3529         if (private)
3530                 delalloc_cached_state = &private->llseek_cached_state;
3531         else
3532                 delalloc_cached_state = NULL;
3533
3534         /*
3535          * offset can be negative, in this case we start finding DATA/HOLE from
3536          * the very start of the file.
3537          */
3538         start = max_t(loff_t, 0, offset);
3539
3540         lockstart = round_down(start, fs_info->sectorsize);
3541         lockend = round_up(i_size, fs_info->sectorsize);
3542         if (lockend <= lockstart)
3543                 lockend = lockstart + fs_info->sectorsize;
3544         lockend--;
3545
3546         path = btrfs_alloc_path();
3547         if (!path)
3548                 return -ENOMEM;
3549         path->reada = READA_FORWARD;
3550
3551         key.objectid = ino;
3552         key.type = BTRFS_EXTENT_DATA_KEY;
3553         key.offset = start;
3554
3555         last_extent_end = lockstart;
3556
3557         btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3558
3559         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3560         if (ret < 0) {
3561                 goto out;
3562         } else if (ret > 0 && path->slots[0] > 0) {
3563                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
3564                 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
3565                         path->slots[0]--;
3566         }
3567
3568         while (start < i_size) {
3569                 struct extent_buffer *leaf = path->nodes[0];
3570                 struct btrfs_file_extent_item *extent;
3571                 u64 extent_end;
3572                 u8 type;
3573
3574                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3575                         ret = btrfs_next_leaf(root, path);
3576                         if (ret < 0)
3577                                 goto out;
3578                         else if (ret > 0)
3579                                 break;
3580
3581                         leaf = path->nodes[0];
3582                 }
3583
3584                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3585                 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
3586                         break;
3587
3588                 extent_end = btrfs_file_extent_end(path);
3589
3590                 /*
3591                  * In the first iteration we may have a slot that points to an
3592                  * extent that ends before our start offset, so skip it.
3593                  */
3594                 if (extent_end <= start) {
3595                         path->slots[0]++;
3596                         continue;
3597                 }
3598
3599                 /* We have an implicit hole, NO_HOLES feature is likely set. */
3600                 if (last_extent_end < key.offset) {
3601                         u64 search_start = last_extent_end;
3602                         u64 found_start;
3603
3604                         /*
3605                          * First iteration, @start matches @offset and it's
3606                          * within the hole.
3607                          */
3608                         if (start == offset)
3609                                 search_start = offset;
3610
3611                         found = find_desired_extent_in_hole(inode, whence,
3612                                                             delalloc_cached_state,
3613                                                             search_start,
3614                                                             key.offset - 1,
3615                                                             &found_start);
3616                         if (found) {
3617                                 start = found_start;
3618                                 break;
3619                         }
3620                         /*
3621                          * Didn't find data or a hole (due to delalloc) in the
3622                          * implicit hole range, so need to analyze the extent.
3623                          */
3624                 }
3625
3626                 extent = btrfs_item_ptr(leaf, path->slots[0],
3627                                         struct btrfs_file_extent_item);
3628                 type = btrfs_file_extent_type(leaf, extent);
3629
3630                 /*
3631                  * Can't access the extent's disk_bytenr field if this is an
3632                  * inline extent, since at that offset, it's where the extent
3633                  * data starts.
3634                  */
3635                 if (type == BTRFS_FILE_EXTENT_PREALLOC ||
3636                     (type == BTRFS_FILE_EXTENT_REG &&
3637                      btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
3638                         /*
3639                          * Explicit hole or prealloc extent, search for delalloc.
3640                          * A prealloc extent is treated like a hole.
3641                          */
3642                         u64 search_start = key.offset;
3643                         u64 found_start;
3644
3645                         /*
3646                          * First iteration, @start matches @offset and it's
3647                          * within the hole.
3648                          */
3649                         if (start == offset)
3650                                 search_start = offset;
3651
3652                         found = find_desired_extent_in_hole(inode, whence,
3653                                                             delalloc_cached_state,
3654                                                             search_start,
3655                                                             extent_end - 1,
3656                                                             &found_start);
3657                         if (found) {
3658                                 start = found_start;
3659                                 break;
3660                         }
3661                         /*
3662                          * Didn't find data or a hole (due to delalloc) in the
3663                          * implicit hole range, so need to analyze the next
3664                          * extent item.
3665                          */
3666                 } else {
3667                         /*
3668                          * Found a regular or inline extent.
3669                          * If we are seeking for data, adjust the start offset
3670                          * and stop, we're done.
3671                          */
3672                         if (whence == SEEK_DATA) {
3673                                 start = max_t(u64, key.offset, offset);
3674                                 found = true;
3675                                 break;
3676                         }
3677                         /*
3678                          * Else, we are seeking for a hole, check the next file
3679                          * extent item.
3680                          */
3681                 }
3682
3683                 start = extent_end;
3684                 last_extent_end = extent_end;
3685                 path->slots[0]++;
3686                 if (fatal_signal_pending(current)) {
3687                         ret = -EINTR;
3688                         goto out;
3689                 }
3690                 cond_resched();
3691         }
3692
3693         /* We have an implicit hole from the last extent found up to i_size. */
3694         if (!found && start < i_size) {
3695                 found = find_desired_extent_in_hole(inode, whence,
3696                                                     delalloc_cached_state, start,
3697                                                     i_size - 1, &start);
3698                 if (!found)
3699                         start = i_size;
3700         }
3701
3702 out:
3703         btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3704         btrfs_free_path(path);
3705
3706         if (ret < 0)
3707                 return ret;
3708
3709         if (whence == SEEK_DATA && start >= i_size)
3710                 return -ENXIO;
3711
3712         return min_t(loff_t, start, i_size);
3713 }
3714
3715 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3716 {
3717         struct inode *inode = file->f_mapping->host;
3718
3719         switch (whence) {
3720         default:
3721                 return generic_file_llseek(file, offset, whence);
3722         case SEEK_DATA:
3723         case SEEK_HOLE:
3724                 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3725                 offset = find_desired_extent(file, offset, whence);
3726                 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3727                 break;
3728         }
3729
3730         if (offset < 0)
3731                 return offset;
3732
3733         return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3734 }
3735
3736 static int btrfs_file_open(struct inode *inode, struct file *filp)
3737 {
3738         int ret;
3739
3740         filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
3741
3742         ret = fsverity_file_open(inode, filp);
3743         if (ret)
3744                 return ret;
3745         return generic_file_open(inode, filp);
3746 }
3747
3748 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3749 {
3750         ssize_t ret = 0;
3751
3752         if (iocb->ki_flags & IOCB_DIRECT) {
3753                 ret = btrfs_direct_read(iocb, to);
3754                 if (ret < 0 || !iov_iter_count(to) ||
3755                     iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3756                         return ret;
3757         }
3758
3759         return filemap_read(iocb, to, ret);
3760 }
3761
3762 const struct file_operations btrfs_file_operations = {
3763         .llseek         = btrfs_file_llseek,
3764         .read_iter      = btrfs_file_read_iter,
3765         .splice_read    = filemap_splice_read,
3766         .write_iter     = btrfs_file_write_iter,
3767         .splice_write   = iter_file_splice_write,
3768         .mmap           = btrfs_file_mmap,
3769         .open           = btrfs_file_open,
3770         .release        = btrfs_release_file,
3771         .get_unmapped_area = thp_get_unmapped_area,
3772         .fsync          = btrfs_sync_file,
3773         .fallocate      = btrfs_fallocate,
3774         .unlocked_ioctl = btrfs_ioctl,
3775 #ifdef CONFIG_COMPAT
3776         .compat_ioctl   = btrfs_compat_ioctl,
3777 #endif
3778         .remap_file_range = btrfs_remap_file_range,
3779         .uring_cmd      = btrfs_uring_cmd,
3780         .fop_flags      = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
3781 };
3782
3783 int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
3784 {
3785         struct address_space *mapping = inode->vfs_inode.i_mapping;
3786         int ret;
3787
3788         /*
3789          * So with compression we will find and lock a dirty page and clear the
3790          * first one as dirty, setup an async extent, and immediately return
3791          * with the entire range locked but with nobody actually marked with
3792          * writeback.  So we can't just filemap_write_and_wait_range() and
3793          * expect it to work since it will just kick off a thread to do the
3794          * actual work.  So we need to call filemap_fdatawrite_range _again_
3795          * since it will wait on the page lock, which won't be unlocked until
3796          * after the pages have been marked as writeback and so we're good to go
3797          * from there.  We have to do this otherwise we'll miss the ordered
3798          * extents and that results in badness.  Please Josef, do not think you
3799          * know better and pull this out at some point in the future, it is
3800          * right and you are wrong.
3801          */
3802         ret = filemap_fdatawrite_range(mapping, start, end);
3803         if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags))
3804                 ret = filemap_fdatawrite_range(mapping, start, end);
3805
3806         return ret;
3807 }