fs/btrfs/tree-log.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2008 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/sched.h>
   7 #include <linux/slab.h>
   8 #include <linux/blkdev.h>
   9 #include <linux/list_sort.h>
  10 #include <linux/iversion.h>
  11 #include "misc.h"
  12 #include "ctree.h"
  13 #include "tree-log.h"
  14 #include "disk-io.h"
  15 #include "locking.h"
  16 #include "backref.h"
  17 #include "compression.h"
  18 #include "qgroup.h"
  19 #include "block-group.h"
  20 #include "space-info.h"
  21 #include "inode-item.h"
  22 #include "fs.h"
  23 #include "accessors.h"
  24 #include "extent-tree.h"
  25 #include "root-tree.h"
  26 #include "dir-item.h"
  27 #include "file-item.h"
  28 #include "file.h"
  29 #include "orphan.h"
  30 #include "tree-checker.h"
  31
  32 #define MAX_CONFLICT_INODES 10
  33
  34 /* magic values for the inode_only field in btrfs_log_inode:
  35  *
  36  * LOG_INODE_ALL means to log everything
  37  * LOG_INODE_EXISTS means to log just enough to recreate the inode
  38  * during log replay
  39  */
  40 enum {
  41         LOG_INODE_ALL,
  42         LOG_INODE_EXISTS,
  43 };
  44
  45 /*
  46  * directory trouble cases
  47  *
  48  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
  49  * log, we must force a full commit before doing an fsync of the directory
  50  * where the unlink was done.
  51  * ---> record transid of last unlink/rename per directory
  52  *
  53  * mkdir foo/some_dir
  54  * normal commit
  55  * rename foo/some_dir foo2/some_dir
  56  * mkdir foo/some_dir
  57  * fsync foo/some_dir/some_file
  58  *
  59  * The fsync above will unlink the original some_dir without recording
  60  * it in its new location (foo2).  After a crash, some_dir will be gone
  61  * unless the fsync of some_file forces a full commit
  62  *
  63  * 2) we must log any new names for any file or dir that is in the fsync
  64  * log. ---> check inode while renaming/linking.
  65  *
  66  * 2a) we must log any new names for any file or dir during rename
  67  * when the directory they are being removed from was logged.
  68  * ---> check inode and old parent dir during rename
  69  *
  70  *  2a is actually the more important variant.  Without the extra logging
  71  *  a crash might unlink the old name without recreating the new one
  72  *
  73  * 3) after a crash, we must go through any directories with a link count
  74  * of zero and redo the rm -rf
  75  *
  76  * mkdir f1/foo
  77  * normal commit
  78  * rm -rf f1/foo
  79  * fsync(f1)
  80  *
  81  * The directory f1/foo was fully removed from the FS, but fsync was never
  82  * called on f1/foo, only its parent dir (f1).  After a crash the rm -rf must
  83  * be replayed.  This must be able to recurse down the entire
  84  * directory tree.  The inode link count fixup code takes care of the
  85  * ugly details.
  86  */
  87
  88 /*
  89  * stages for the tree walking.  The first
  90  * stage (0) is to only pin down the blocks we find
  91  * the second stage (1) is to make sure that all the inodes
  92  * we find in the log are created in the subvolume.
  93  *
  94  * The last stage is to deal with directories and links and extents
  95  * and all the other fun semantics
  96  */
  97 enum {
  98         LOG_WALK_PIN_ONLY,
  99         LOG_WALK_REPLAY_INODES,
 100         LOG_WALK_REPLAY_DIR_INDEX,
 101         LOG_WALK_REPLAY_ALL,
 102 };
 103
 104 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 105                            struct btrfs_inode *inode,
 106                            int inode_only,
 107                            struct btrfs_log_ctx *ctx);
 108 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 109                              struct btrfs_root *root,
 110                              struct btrfs_path *path, u64 objectid);
 111 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 112                                        struct btrfs_root *root,
 113                                        struct btrfs_root *log,
 114                                        struct btrfs_path *path,
 115                                        u64 dirid, int del_all);
 116 static void wait_log_commit(struct btrfs_root *root, int transid);
 117
 118 /*
 119  * tree logging is a special write ahead log used to make sure that
 120  * fsyncs and O_SYNCs can happen without doing full tree commits.
 121  *
 122  * Full tree commits are expensive because they require commonly
 123  * modified blocks to be recowed, creating many dirty pages in the
 124  * extent tree and a 4x-6x higher write load than ext3.
 125  *
 126  * Instead of doing a tree commit on every fsync, we use the
 127  * key ranges and transaction ids to find items for a given file or directory
 128  * that have changed in this transaction.  Those items are copied into
 129  * a special tree (one per subvolume root), that tree is written to disk
 130  * and then the fsync is considered complete.
 131  *
 132  * After a crash, items are copied out of the log-tree back into the
 133  * subvolume tree.  Any file data extents found are recorded in the extent
 134  * allocation tree, and the log-tree freed.
 135  *
 136  * The log tree is read three times: once to pin down all the extents it is
 137  * using in ram, once to create all the inodes logged in the tree,
 138  * and once to do all the other items.
 139  */
 140
 141 static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
 142 {
 143         unsigned int nofs_flag;
 144         struct btrfs_inode *inode;
 145
 146         /* Only meant to be called for subvolume roots and not for log roots. */
 147         ASSERT(is_fstree(btrfs_root_id(root)));
 148
 149         /*
 150          * We're holding a transaction handle whether we are logging or
 151          * replaying a log tree, so we must make sure NOFS semantics apply
 152          * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL
 153          * to allocate an inode, which can recurse back into the filesystem and
 154          * attempt a transaction commit, resulting in a deadlock.
 155          */
 156         nofs_flag = memalloc_nofs_save();
 157         inode = btrfs_iget(objectid, root);
 158         memalloc_nofs_restore(nofs_flag);
 159
 160         return inode;
 161 }
 162
 163 /*
 164  * start a sub transaction and setup the log tree
 165  * this increments the log tree writer count to make the people
 166  * syncing the tree wait for us to finish
 167  */
 168 static int start_log_trans(struct btrfs_trans_handle *trans,
 169                            struct btrfs_root *root,
 170                            struct btrfs_log_ctx *ctx)
 171 {
 172         struct btrfs_fs_info *fs_info = root->fs_info;
 173         struct btrfs_root *tree_root = fs_info->tree_root;
 174         const bool zoned = btrfs_is_zoned(fs_info);
 175         int ret = 0;
 176         bool created = false;
 177
 178         /*
 179          * First check if the log root tree was already created. If not, create
 180          * it before locking the root's log_mutex, just to keep lockdep happy.
 181          */
 182         if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
 183                 mutex_lock(&tree_root->log_mutex);
 184                 if (!fs_info->log_root_tree) {
 185                         ret = btrfs_init_log_root_tree(trans, fs_info);
 186                         if (!ret) {
 187                                 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
 188                                 created = true;
 189                         }
 190                 }
 191                 mutex_unlock(&tree_root->log_mutex);
 192                 if (ret)
 193                         return ret;
 194         }
 195
 196         mutex_lock(&root->log_mutex);
 197
 198 again:
 199         if (root->log_root) {
 200                 int index = (root->log_transid + 1) % 2;
 201
 202                 if (btrfs_need_log_full_commit(trans)) {
 203                         ret = BTRFS_LOG_FORCE_COMMIT;
 204                         goto out;
 205                 }
 206
 207                 if (zoned && atomic_read(&root->log_commit[index])) {
 208                         wait_log_commit(root, root->log_transid - 1);
 209                         goto again;
 210                 }
 211
 212                 if (!root->log_start_pid) {
 213                         clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 214                         root->log_start_pid = current->pid;
 215                 } else if (root->log_start_pid != current->pid) {
 216                         set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 217                 }
 218         } else {
 219                 /*
 220                  * This means fs_info->log_root_tree was already created
 221                  * for some other FS trees. Do the full commit not to mix
 222                  * nodes from multiple log transactions to do sequential
 223                  * writing.
 224                  */
 225                 if (zoned && !created) {
 226                         ret = BTRFS_LOG_FORCE_COMMIT;
 227                         goto out;
 228                 }
 229
 230                 ret = btrfs_add_log_tree(trans, root);
 231                 if (ret)
 232                         goto out;
 233
 234                 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
 235                 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 236                 root->log_start_pid = current->pid;
 237         }
 238
 239         atomic_inc(&root->log_writers);
 240         if (!ctx->logging_new_name) {
 241                 int index = root->log_transid % 2;
 242                 list_add_tail(&ctx->list, &root->log_ctxs[index]);
 243                 ctx->log_transid = root->log_transid;
 244         }
 245
 246 out:
 247         mutex_unlock(&root->log_mutex);
 248         return ret;
 249 }
 250
 251 /*
 252  * returns 0 if there was a log transaction running and we were able
 253  * to join, or returns -ENOENT if there were not transactions
 254  * in progress
 255  */
 256 static int join_running_log_trans(struct btrfs_root *root)
 257 {
 258         const bool zoned = btrfs_is_zoned(root->fs_info);
 259         int ret = -ENOENT;
 260
 261         if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
 262                 return ret;
 263
 264         mutex_lock(&root->log_mutex);
 265 again:
 266         if (root->log_root) {
 267                 int index = (root->log_transid + 1) % 2;
 268
 269                 ret = 0;
 270                 if (zoned && atomic_read(&root->log_commit[index])) {
 271                         wait_log_commit(root, root->log_transid - 1);
 272                         goto again;
 273                 }
 274                 atomic_inc(&root->log_writers);
 275         }
 276         mutex_unlock(&root->log_mutex);
 277         return ret;
 278 }
 279
 280 /*
 281  * This either makes the current running log transaction wait
 282  * until you call btrfs_end_log_trans() or it makes any future
 283  * log transactions wait until you call btrfs_end_log_trans()
 284  */
 285 void btrfs_pin_log_trans(struct btrfs_root *root)
 286 {
 287         atomic_inc(&root->log_writers);
 288 }
 289
 290 /*
 291  * indicate we're done making changes to the log tree
 292  * and wake up anyone waiting to do a sync
 293  */
 294 void btrfs_end_log_trans(struct btrfs_root *root)
 295 {
 296         if (atomic_dec_and_test(&root->log_writers)) {
 297                 /* atomic_dec_and_test implies a barrier */
 298                 cond_wake_up_nomb(&root->log_writer_wait);
 299         }
 300 }
 301
 302 /*
 303  * the walk control struct is used to pass state down the chain when
 304  * processing the log tree.  The stage field tells us which part
 305  * of the log tree processing we are currently doing.  The others
 306  * are state fields used for that specific part
 307  */
 308 struct walk_control {
 309         /* should we free the extent on disk when done?  This is used
 310          * at transaction commit time while freeing a log tree
 311          */
 312         int free;
 313
 314         /* pin only walk, we record which extents on disk belong to the
 315          * log trees
 316          */
 317         int pin;
 318
 319         /* what stage of the replay code we're currently in */
 320         int stage;
 321
 322         /*
 323          * Ignore any items from the inode currently being processed. Needs
 324          * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
 325          * the LOG_WALK_REPLAY_INODES stage.
 326          */
 327         bool ignore_cur_inode;
 328
 329         /* the root we are currently replaying */
 330         struct btrfs_root *replay_dest;
 331
 332         /* the trans handle for the current replay */
 333         struct btrfs_trans_handle *trans;
 334
 335         /* the function that gets used to process blocks we find in the
 336          * tree.  Note the extent_buffer might not be up to date when it is
 337          * passed in, and it must be checked or read if you need the data
 338          * inside it
 339          */
 340         int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
 341                             struct walk_control *wc, u64 gen, int level);
 342 };
 343
 344 /*
 345  * process_func used to pin down extents, write them or wait on them
 346  */
 347 static int process_one_buffer(struct btrfs_root *log,
 348                               struct extent_buffer *eb,
 349                               struct walk_control *wc, u64 gen, int level)
 350 {
 351         struct btrfs_fs_info *fs_info = log->fs_info;
 352         int ret = 0;
 353
 354         /*
 355          * If this fs is mixed then we need to be able to process the leaves to
 356          * pin down any logged extents, so we have to read the block.
 357          */
 358         if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
 359                 struct btrfs_tree_parent_check check = {
 360                         .level = level,
 361                         .transid = gen
 362                 };
 363
 364                 ret = btrfs_read_extent_buffer(eb, &check);
 365                 if (ret)
 366                         return ret;
 367         }
 368
 369         if (wc->pin) {
 370                 ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
 371                 if (ret)
 372                         return ret;
 373
 374                 if (btrfs_buffer_uptodate(eb, gen, 0) &&
 375                     btrfs_header_level(eb) == 0)
 376                         ret = btrfs_exclude_logged_extents(eb);
 377         }
 378         return ret;
 379 }
 380
 381 /*
 382  * Item overwrite used by log replay. The given eb, slot and key all refer to
 383  * the source data we are copying out.
 384  *
 385  * The given root is for the tree we are copying into, and path is a scratch
 386  * path for use in this function (it should be released on entry and will be
 387  * released on exit).
 388  *
 389  * If the key is already in the destination tree the existing item is
 390  * overwritten.  If the existing item isn't big enough, it is extended.
 391  * If it is too large, it is truncated.
 392  *
 393  * If the key isn't in the destination yet, a new item is inserted.
 394  */
 395 static int overwrite_item(struct btrfs_trans_handle *trans,
 396                           struct btrfs_root *root,
 397                           struct btrfs_path *path,
 398                           struct extent_buffer *eb, int slot,
 399                           struct btrfs_key *key)
 400 {
 401         int ret;
 402         u32 item_size;
 403         u64 saved_i_size = 0;
 404         int save_old_i_size = 0;
 405         unsigned long src_ptr;
 406         unsigned long dst_ptr;
 407         struct extent_buffer *dst_eb;
 408         int dst_slot;
 409         bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
 410
 411         /*
 412          * This is only used during log replay, so the root is always from a
 413          * fs/subvolume tree. In case we ever need to support a log root, then
 414          * we'll have to clone the leaf in the path, release the path and use
 415          * the leaf before writing into the log tree. See the comments at
 416          * copy_items() for more details.
 417          */
 418         ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
 419
 420         item_size = btrfs_item_size(eb, slot);
 421         src_ptr = btrfs_item_ptr_offset(eb, slot);
 422
 423         /* Look for the key in the destination tree. */
 424         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 425         if (ret < 0)
 426                 return ret;
 427
 428         dst_eb = path->nodes[0];
 429         dst_slot = path->slots[0];
 430
 431         if (ret == 0) {
 432                 char *src_copy;
 433                 const u32 dst_size = btrfs_item_size(dst_eb, dst_slot);
 434
 435                 if (dst_size != item_size)
 436                         goto insert;
 437
 438                 if (item_size == 0) {
 439                         btrfs_release_path(path);
 440                         return 0;
 441                 }
 442                 src_copy = kmalloc(item_size, GFP_NOFS);
 443                 if (!src_copy) {
 444                         btrfs_release_path(path);
 445                         return -ENOMEM;
 446                 }
 447
 448                 read_extent_buffer(eb, src_copy, src_ptr, item_size);
 449                 dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
 450                 ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);
 451
 452                 kfree(src_copy);
 453                 /*
 454                  * they have the same contents, just return, this saves
 455                  * us from cowing blocks in the destination tree and doing
 456                  * extra writes that may not have been done by a previous
 457                  * sync
 458                  */
 459                 if (ret == 0) {
 460                         btrfs_release_path(path);
 461                         return 0;
 462                 }
 463
 464                 /*
 465                  * We need to load the old nbytes into the inode so when we
 466                  * replay the extents we've logged we get the right nbytes.
 467                  */
 468                 if (inode_item) {
 469                         struct btrfs_inode_item *item;
 470                         u64 nbytes;
 471                         u32 mode;
 472
 473                         item = btrfs_item_ptr(dst_eb, dst_slot,
 474                                               struct btrfs_inode_item);
 475                         nbytes = btrfs_inode_nbytes(dst_eb, item);
 476                         item = btrfs_item_ptr(eb, slot,
 477                                               struct btrfs_inode_item);
 478                         btrfs_set_inode_nbytes(eb, item, nbytes);
 479
 480                         /*
 481                          * If this is a directory we need to reset the i_size to
 482                          * 0 so that we can set it up properly when replaying
 483                          * the rest of the items in this log.
 484                          */
 485                         mode = btrfs_inode_mode(eb, item);
 486                         if (S_ISDIR(mode))
 487                                 btrfs_set_inode_size(eb, item, 0);
 488                 }
 489         } else if (inode_item) {
 490                 struct btrfs_inode_item *item;
 491                 u32 mode;
 492
 493                 /*
 494                  * New inode, set nbytes to 0 so that the nbytes comes out
 495                  * properly when we replay the extents.
 496                  */
 497                 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
 498                 btrfs_set_inode_nbytes(eb, item, 0);
 499
 500                 /*
 501                  * If this is a directory we need to reset the i_size to 0 so
 502                  * that we can set it up properly when replaying the rest of
 503                  * the items in this log.
 504                  */
 505                 mode = btrfs_inode_mode(eb, item);
 506                 if (S_ISDIR(mode))
 507                         btrfs_set_inode_size(eb, item, 0);
 508         }
 509 insert:
 510         btrfs_release_path(path);
 511         /* try to insert the key into the destination tree */
 512         path->skip_release_on_error = 1;
 513         ret = btrfs_insert_empty_item(trans, root, path,
 514                                       key, item_size);
 515         path->skip_release_on_error = 0;
 516
 517         dst_eb = path->nodes[0];
 518         dst_slot = path->slots[0];
 519
 520         /* make sure any existing item is the correct size */
 521         if (ret == -EEXIST || ret == -EOVERFLOW) {
 522                 const u32 found_size = btrfs_item_size(dst_eb, dst_slot);
 523
 524                 if (found_size > item_size)
 525                         btrfs_truncate_item(trans, path, item_size, 1);
 526                 else if (found_size < item_size)
 527                         btrfs_extend_item(trans, path, item_size - found_size);
 528         } else if (ret) {
 529                 return ret;
 530         }
 531         dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
 532
 533         /* don't overwrite an existing inode if the generation number
 534          * was logged as zero.  This is done when the tree logging code
 535          * is just logging an inode to make sure it exists after recovery.
 536          *
 537          * Also, don't overwrite i_size on directories during replay.
 538          * log replay inserts and removes directory items based on the
 539          * state of the tree found in the subvolume, and i_size is modified
 540          * as it goes
 541          */
 542         if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
 543                 struct btrfs_inode_item *src_item;
 544                 struct btrfs_inode_item *dst_item;
 545
 546                 src_item = (struct btrfs_inode_item *)src_ptr;
 547                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 548
 549                 if (btrfs_inode_generation(eb, src_item) == 0) {
 550                         const u64 ino_size = btrfs_inode_size(eb, src_item);
 551
 552                         /*
 553                          * For regular files an ino_size == 0 is used only when
 554                          * logging that an inode exists, as part of a directory
 555                          * fsync, and the inode wasn't fsynced before. In this
 556                          * case don't set the size of the inode in the fs/subvol
 557                          * tree, otherwise we would be throwing valid data away.
 558                          */
 559                         if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
 560                             S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
 561                             ino_size != 0)
 562                                 btrfs_set_inode_size(dst_eb, dst_item, ino_size);
 563                         goto no_copy;
 564                 }
 565
 566                 if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
 567                     S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) {
 568                         save_old_i_size = 1;
 569                         saved_i_size = btrfs_inode_size(dst_eb, dst_item);
 570                 }
 571         }
 572
 573         copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size);
 574
 575         if (save_old_i_size) {
 576                 struct btrfs_inode_item *dst_item;
 577
 578                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 579                 btrfs_set_inode_size(dst_eb, dst_item, saved_i_size);
 580         }
 581
 582         /* make sure the generation is filled in */
 583         if (key->type == BTRFS_INODE_ITEM_KEY) {
 584                 struct btrfs_inode_item *dst_item;
 585
 586                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 587                 if (btrfs_inode_generation(dst_eb, dst_item) == 0)
 588                         btrfs_set_inode_generation(dst_eb, dst_item, trans->transid);
 589         }
 590 no_copy:
 591         btrfs_release_path(path);
 592         return 0;
 593 }
 594
 595 static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
 596                                struct fscrypt_str *name)
 597 {
 598         char *buf;
 599
 600         buf = kmalloc(len, GFP_NOFS);
 601         if (!buf)
 602                 return -ENOMEM;
 603
 604         read_extent_buffer(eb, buf, (unsigned long)start, len);
 605         name->name = buf;
 606         name->len = len;
 607         return 0;
 608 }
 609
 610 /* replays a single extent in 'eb' at 'slot' with 'key' into the
 611  * subvolume 'root'.  path is released on entry and should be released
 612  * on exit.
 613  *
 614  * extents in the log tree have not been allocated out of the extent
 615  * tree yet.  So, this completes the allocation, taking a reference
 616  * as required if the extent already exists or creating a new extent
 617  * if it isn't in the extent allocation tree yet.
 618  *
 619  * The extent is inserted into the file, dropping any existing extents
 620  * from the file that overlap the new one.
 621  */
 622 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 623                                       struct btrfs_root *root,
 624                                       struct btrfs_path *path,
 625                                       struct extent_buffer *eb, int slot,
 626                                       struct btrfs_key *key)
 627 {
 628         struct btrfs_drop_extents_args drop_args = { 0 };
 629         struct btrfs_fs_info *fs_info = root->fs_info;
 630         int found_type;
 631         u64 extent_end;
 632         u64 start = key->offset;
 633         u64 nbytes = 0;
 634         struct btrfs_file_extent_item *item;
 635         struct btrfs_inode *inode = NULL;
 636         unsigned long size;
 637         int ret = 0;
 638
 639         item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 640         found_type = btrfs_file_extent_type(eb, item);
 641
 642         if (found_type == BTRFS_FILE_EXTENT_REG ||
 643             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 644                 nbytes = btrfs_file_extent_num_bytes(eb, item);
 645                 extent_end = start + nbytes;
 646
 647                 /*
 648                  * We don't add to the inodes nbytes if we are prealloc or a
 649                  * hole.
 650                  */
 651                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
 652                         nbytes = 0;
 653         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 654                 size = btrfs_file_extent_ram_bytes(eb, item);
 655                 nbytes = btrfs_file_extent_ram_bytes(eb, item);
 656                 extent_end = ALIGN(start + size,
 657                                    fs_info->sectorsize);
 658         } else {
 659                 btrfs_err(fs_info,
 660                   "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
 661                           found_type, btrfs_root_id(root), key->objectid, key->offset);
 662                 return -EUCLEAN;
 663         }
 664
 665         inode = btrfs_iget_logging(key->objectid, root);
 666         if (IS_ERR(inode))
 667                 return PTR_ERR(inode);
 668
 669         /*
 670          * first check to see if we already have this extent in the
 671          * file.  This must be done before the btrfs_drop_extents run
 672          * so we don't try to drop this extent.
 673          */
 674         ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0);
 675
 676         if (ret == 0 &&
 677             (found_type == BTRFS_FILE_EXTENT_REG ||
 678              found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
 679                 struct btrfs_file_extent_item existing;
 680                 unsigned long ptr;
 681
 682                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 683                 read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing));
 684
 685                 /*
 686                  * we already have a pointer to this exact extent,
 687                  * we don't have to do anything
 688                  */
 689                 if (memcmp_extent_buffer(eb, &existing, (unsigned long)item,
 690                                          sizeof(existing)) == 0) {
 691                         btrfs_release_path(path);
 692                         goto out;
 693                 }
 694         }
 695         btrfs_release_path(path);
 696
 697         /* drop any overlapping extents */
 698         drop_args.start = start;
 699         drop_args.end = extent_end;
 700         drop_args.drop_cache = true;
 701         ret = btrfs_drop_extents(trans, root, inode, &drop_args);
 702         if (ret)
 703                 goto out;
 704
 705         if (found_type == BTRFS_FILE_EXTENT_REG ||
 706             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 707                 u64 offset;
 708                 unsigned long dest_offset;
 709                 struct btrfs_key ins;
 710
 711                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
 712                     btrfs_fs_incompat(fs_info, NO_HOLES))
 713                         goto update_inode;
 714
 715                 ret = btrfs_insert_empty_item(trans, root, path, key,
 716                                               sizeof(*item));
 717                 if (ret)
 718                         goto out;
 719                 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
 720                                                     path->slots[0]);
 721                 copy_extent_buffer(path->nodes[0], eb, dest_offset,
 722                                 (unsigned long)item,  sizeof(*item));
 723
 724                 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 725                 ins.type = BTRFS_EXTENT_ITEM_KEY;
 726                 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 727                 offset = key->offset - btrfs_file_extent_offset(eb, item);
 728
 729                 /*
 730                  * Manually record dirty extent, as here we did a shallow
 731                  * file extent item copy and skip normal backref update,
 732                  * but modifying extent tree all by ourselves.
 733                  * So need to manually record dirty extent for qgroup,
 734                  * as the owner of the file extent changed from log tree
 735                  * (doesn't affect qgroup) to fs/file tree(affects qgroup)
 736                  */
 737                 ret = btrfs_qgroup_trace_extent(trans,
 738                                 btrfs_file_extent_disk_bytenr(eb, item),
 739                                 btrfs_file_extent_disk_num_bytes(eb, item));
 740                 if (ret < 0)
 741                         goto out;
 742
 743                 if (ins.objectid > 0) {
 744                         u64 csum_start;
 745                         u64 csum_end;
 746                         LIST_HEAD(ordered_sums);
 747
 748                         /*
 749                          * is this extent already allocated in the extent
 750                          * allocation tree?  If so, just add a reference
 751                          */
 752                         ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
 753                                                 ins.offset);
 754                         if (ret < 0) {
 755                                 goto out;
 756                         } else if (ret == 0) {
 757                                 struct btrfs_ref ref = {
 758                                         .action = BTRFS_ADD_DELAYED_REF,
 759                                         .bytenr = ins.objectid,
 760                                         .num_bytes = ins.offset,
 761                                         .owning_root = btrfs_root_id(root),
 762                                         .ref_root = btrfs_root_id(root),
 763                                 };
 764                                 btrfs_init_data_ref(&ref, key->objectid, offset,
 765                                                     0, false);
 766                                 ret = btrfs_inc_extent_ref(trans, &ref);
 767                                 if (ret)
 768                                         goto out;
 769                         } else {
 770                                 /*
 771                                  * insert the extent pointer in the extent
 772                                  * allocation tree
 773                                  */
 774                                 ret = btrfs_alloc_logged_file_extent(trans,
 775                                                 btrfs_root_id(root),
 776                                                 key->objectid, offset, &ins);
 777                                 if (ret)
 778                                         goto out;
 779                         }
 780                         btrfs_release_path(path);
 781
 782                         if (btrfs_file_extent_compression(eb, item)) {
 783                                 csum_start = ins.objectid;
 784                                 csum_end = csum_start + ins.offset;
 785                         } else {
 786                                 csum_start = ins.objectid +
 787                                         btrfs_file_extent_offset(eb, item);
 788                                 csum_end = csum_start +
 789                                         btrfs_file_extent_num_bytes(eb, item);
 790                         }
 791
 792                         ret = btrfs_lookup_csums_list(root->log_root,
 793                                                 csum_start, csum_end - 1,
 794                                                 &ordered_sums, false);
 795                         if (ret < 0)
 796                                 goto out;
 797                         ret = 0;
 798                         /*
 799                          * Now delete all existing csums in the csum root that
 800                          * cover our range. We do this because we can have an
 801                          * extent that is completely referenced by one file
 802                          * extent item and partially referenced by another
 803                          * file extent item (like after using the clone or
 804                          * extent_same ioctls). In this case if we end up doing
 805                          * the replay of the one that partially references the
 806                          * extent first, and we do not do the csum deletion
 807                          * below, we can get 2 csum items in the csum tree that
 808                          * overlap each other. For example, imagine our log has
 809                          * the two following file extent items:
 810                          *
 811                          * key (257 EXTENT_DATA 409600)
 812                          *     extent data disk byte 12845056 nr 102400
 813                          *     extent data offset 20480 nr 20480 ram 102400
 814                          *
 815                          * key (257 EXTENT_DATA 819200)
 816                          *     extent data disk byte 12845056 nr 102400
 817                          *     extent data offset 0 nr 102400 ram 102400
 818                          *
 819                          * Where the second one fully references the 100K extent
 820                          * that starts at disk byte 12845056, and the log tree
 821                          * has a single csum item that covers the entire range
 822                          * of the extent:
 823                          *
 824                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 825                          *
 826                          * After the first file extent item is replayed, the
 827                          * csum tree gets the following csum item:
 828                          *
 829                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 830                          *
 831                          * Which covers the 20K sub-range starting at offset 20K
 832                          * of our extent. Now when we replay the second file
 833                          * extent item, if we do not delete existing csum items
 834                          * that cover any of its blocks, we end up getting two
 835                          * csum items in our csum tree that overlap each other:
 836                          *
 837                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 838                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 839                          *
 840                          * Which is a problem, because after this anyone trying
 841                          * to look up the checksum of any block of our
 842                          * extent starting at an offset of 40K or higher, will
 843                          * end up looking at the second csum item only, which
 844                          * does not contain the checksum for any block starting
 845                          * at offset 40K or higher of our extent.
 846                          */
 847                         while (!list_empty(&ordered_sums)) {
 848                                 struct btrfs_ordered_sum *sums;
 849                                 struct btrfs_root *csum_root;
 850
 851                                 sums = list_first_entry(&ordered_sums,
 852                                                         struct btrfs_ordered_sum,
 853                                                         list);
 854                                 csum_root = btrfs_csum_root(fs_info,
 855                                                             sums->logical);
 856                                 if (!ret)
 857                                         ret = btrfs_del_csums(trans, csum_root,
 858                                                               sums->logical,
 859                                                               sums->len);
 860                                 if (!ret)
 861                                         ret = btrfs_csum_file_blocks(trans,
 862                                                                      csum_root,
 863                                                                      sums);
 864                                 list_del(&sums->list);
 865                                 kfree(sums);
 866                         }
 867                         if (ret)
 868                                 goto out;
 869                 } else {
 870                         btrfs_release_path(path);
 871                 }
 872         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 873                 /* inline extents are easy, we just overwrite them */
 874                 ret = overwrite_item(trans, root, path, eb, slot, key);
 875                 if (ret)
 876                         goto out;
 877         }
 878
 879         ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
 880         if (ret)
 881                 goto out;
 882
 883 update_inode:
 884         btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
 885         ret = btrfs_update_inode(trans, inode);
 886 out:
 887         iput(&inode->vfs_inode);
 888         return ret;
 889 }
 890
 891 static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
 892                                        struct btrfs_inode *dir,
 893                                        struct btrfs_inode *inode,
 894                                        const struct fscrypt_str *name)
 895 {
 896         int ret;
 897
 898         ret = btrfs_unlink_inode(trans, dir, inode, name);
 899         if (ret)
 900                 return ret;
 901         /*
 902          * Whenever we need to check if a name exists or not, we check the
 903          * fs/subvolume tree. So after an unlink we must run delayed items, so
 904          * that future checks for a name during log replay see that the name
 905          * does not exists anymore.
 906          */
 907         return btrfs_run_delayed_items(trans);
 908 }
 909
 910 /*
 911  * when cleaning up conflicts between the directory names in the
 912  * subvolume, directory names in the log and directory names in the
 913  * inode back references, we may have to unlink inodes from directories.
 914  *
 915  * This is a helper function to do the unlink of a specific directory
 916  * item
 917  */
 918 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 919                                       struct btrfs_path *path,
 920                                       struct btrfs_inode *dir,
 921                                       struct btrfs_dir_item *di)
 922 {
 923         struct btrfs_root *root = dir->root;
 924         struct btrfs_inode *inode;
 925         struct fscrypt_str name;
 926         struct extent_buffer *leaf;
 927         struct btrfs_key location;
 928         int ret;
 929
 930         leaf = path->nodes[0];
 931
 932         btrfs_dir_item_key_to_cpu(leaf, di, &location);
 933         ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
 934         if (ret)
 935                 return -ENOMEM;
 936
 937         btrfs_release_path(path);
 938
 939         inode = btrfs_iget_logging(location.objectid, root);
 940         if (IS_ERR(inode)) {
 941                 ret = PTR_ERR(inode);
 942                 inode = NULL;
 943                 goto out;
 944         }
 945
 946         ret = link_to_fixup_dir(trans, root, path, location.objectid);
 947         if (ret)
 948                 goto out;
 949
 950         ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
 951 out:
 952         kfree(name.name);
 953         if (inode)
 954                 iput(&inode->vfs_inode);
 955         return ret;
 956 }
 957
 958 /*
 959  * See if a given name and sequence number found in an inode back reference are
 960  * already in a directory and correctly point to this inode.
 961  *
 962  * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
 963  * exists.
 964  */
 965 static noinline int inode_in_dir(struct btrfs_root *root,
 966                                  struct btrfs_path *path,
 967                                  u64 dirid, u64 objectid, u64 index,
 968                                  struct fscrypt_str *name)
 969 {
 970         struct btrfs_dir_item *di;
 971         struct btrfs_key location;
 972         int ret = 0;
 973
 974         di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
 975                                          index, name, 0);
 976         if (IS_ERR(di)) {
 977                 ret = PTR_ERR(di);
 978                 goto out;
 979         } else if (di) {
 980                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 981                 if (location.objectid != objectid)
 982                         goto out;
 983         } else {
 984                 goto out;
 985         }
 986
 987         btrfs_release_path(path);
 988         di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
 989         if (IS_ERR(di)) {
 990                 ret = PTR_ERR(di);
 991                 goto out;
 992         } else if (di) {
 993                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 994                 if (location.objectid == objectid)
 995                         ret = 1;
 996         }
 997 out:
 998         btrfs_release_path(path);
 999         return ret;
1000 }
1001
1002 /*
1003  * helper function to check a log tree for a named back reference in
1004  * an inode.  This is used to decide if a back reference that is
1005  * found in the subvolume conflicts with what we find in the log.
1006  *
1007  * inode backreferences may have multiple refs in a single item,
1008  * during replay we process one reference at a time, and we don't
1009  * want to delete valid links to a file from the subvolume if that
1010  * link is also in the log.
1011  */
1012 static noinline int backref_in_log(struct btrfs_root *log,
1013                                    struct btrfs_key *key,
1014                                    u64 ref_objectid,
1015                                    const struct fscrypt_str *name)
1016 {
1017         struct btrfs_path *path;
1018         int ret;
1019
1020         path = btrfs_alloc_path();
1021         if (!path)
1022                 return -ENOMEM;
1023
1024         ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
1025         if (ret < 0) {
1026                 goto out;
1027         } else if (ret == 1) {
1028                 ret = 0;
1029                 goto out;
1030         }
1031
1032         if (key->type == BTRFS_INODE_EXTREF_KEY)
1033                 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1034                                                        path->slots[0],
1035                                                        ref_objectid, name);
1036         else
1037                 ret = !!btrfs_find_name_in_backref(path->nodes[0],
1038                                                    path->slots[0], name);
1039 out:
1040         btrfs_free_path(path);
1041         return ret;
1042 }
1043
1044 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
1045                                   struct btrfs_root *root,
1046                                   struct btrfs_path *path,
1047                                   struct btrfs_root *log_root,
1048                                   struct btrfs_inode *dir,
1049                                   struct btrfs_inode *inode,
1050                                   u64 inode_objectid, u64 parent_objectid,
1051                                   u64 ref_index, struct fscrypt_str *name)
1052 {
1053         int ret;
1054         struct extent_buffer *leaf;
1055         struct btrfs_dir_item *di;
1056         struct btrfs_key search_key;
1057         struct btrfs_inode_extref *extref;
1058
1059 again:
1060         /* Search old style refs */
1061         search_key.objectid = inode_objectid;
1062         search_key.type = BTRFS_INODE_REF_KEY;
1063         search_key.offset = parent_objectid;
1064         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1065         if (ret < 0) {
1066                 return ret;
1067         } else if (ret == 0) {
1068                 struct btrfs_inode_ref *victim_ref;
1069                 unsigned long ptr;
1070                 unsigned long ptr_end;
1071
1072                 leaf = path->nodes[0];
1073
1074                 /* are we trying to overwrite a back ref for the root directory
1075                  * if so, just jump out, we're done
1076                  */
1077                 if (search_key.objectid == search_key.offset)
1078                         return 1;
1079
1080                 /* check all the names in this back reference to see
1081                  * if they are in the log.  if so, we allow them to stay
1082                  * otherwise they must be unlinked as a conflict
1083                  */
1084                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1085                 ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
1086                 while (ptr < ptr_end) {
1087                         struct fscrypt_str victim_name;
1088
1089                         victim_ref = (struct btrfs_inode_ref *)ptr;
1090                         ret = read_alloc_one_name(leaf, (victim_ref + 1),
1091                                  btrfs_inode_ref_name_len(leaf, victim_ref),
1092                                  &victim_name);
1093                         if (ret)
1094                                 return ret;
1095
1096                         ret = backref_in_log(log_root, &search_key,
1097                                              parent_objectid, &victim_name);
1098                         if (ret < 0) {
1099                                 kfree(victim_name.name);
1100                                 return ret;
1101                         } else if (!ret) {
1102                                 inc_nlink(&inode->vfs_inode);
1103                                 btrfs_release_path(path);
1104
1105                                 ret = unlink_inode_for_log_replay(trans, dir, inode,
1106                                                 &victim_name);
1107                                 kfree(victim_name.name);
1108                                 if (ret)
1109                                         return ret;
1110                                 goto again;
1111                         }
1112                         kfree(victim_name.name);
1113
1114                         ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
1115                 }
1116         }
1117         btrfs_release_path(path);
1118
1119         /* Same search but for extended refs */
1120         extref = btrfs_lookup_inode_extref(NULL, root, path, name,
1121                                            inode_objectid, parent_objectid, 0,
1122                                            0);
1123         if (IS_ERR(extref)) {
1124                 return PTR_ERR(extref);
1125         } else if (extref) {
1126                 u32 item_size;
1127                 u32 cur_offset = 0;
1128                 unsigned long base;
1129                 struct btrfs_inode *victim_parent;
1130
1131                 leaf = path->nodes[0];
1132
1133                 item_size = btrfs_item_size(leaf, path->slots[0]);
1134                 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1135
1136                 while (cur_offset < item_size) {
1137                         struct fscrypt_str victim_name;
1138
1139                         extref = (struct btrfs_inode_extref *)(base + cur_offset);
1140                         victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
1141
1142                         if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1143                                 goto next;
1144
1145                         ret = read_alloc_one_name(leaf, &extref->name,
1146                                                   victim_name.len, &victim_name);
1147                         if (ret)
1148                                 return ret;
1149
1150                         search_key.objectid = inode_objectid;
1151                         search_key.type = BTRFS_INODE_EXTREF_KEY;
1152                         search_key.offset = btrfs_extref_hash(parent_objectid,
1153                                                               victim_name.name,
1154                                                               victim_name.len);
1155                         ret = backref_in_log(log_root, &search_key,
1156                                              parent_objectid, &victim_name);
1157                         if (ret < 0) {
1158                                 kfree(victim_name.name);
1159                                 return ret;
1160                         } else if (!ret) {
1161                                 victim_parent = btrfs_iget_logging(parent_objectid, root);
1162                                 if (IS_ERR(victim_parent)) {
1163                                         ret = PTR_ERR(victim_parent);
1164                                 } else {
1165                                         inc_nlink(&inode->vfs_inode);
1166                                         btrfs_release_path(path);
1167
1168                                         ret = unlink_inode_for_log_replay(trans,
1169                                                         victim_parent,
1170                                                         inode, &victim_name);
1171                                         iput(&victim_parent->vfs_inode);
1172                                 }
1173                                 kfree(victim_name.name);
1174                                 if (ret)
1175                                         return ret;
1176                                 goto again;
1177                         }
1178                         kfree(victim_name.name);
1179 next:
1180                         cur_offset += victim_name.len + sizeof(*extref);
1181                 }
1182         }
1183         btrfs_release_path(path);
1184
1185         /* look for a conflicting sequence number */
1186         di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1187                                          ref_index, name, 0);
1188         if (IS_ERR(di)) {
1189                 return PTR_ERR(di);
1190         } else if (di) {
1191                 ret = drop_one_dir_item(trans, path, dir, di);
1192                 if (ret)
1193                         return ret;
1194         }
1195         btrfs_release_path(path);
1196
1197         /* look for a conflicting name */
1198         di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
1199         if (IS_ERR(di)) {
1200                 return PTR_ERR(di);
1201         } else if (di) {
1202                 ret = drop_one_dir_item(trans, path, dir, di);
1203                 if (ret)
1204                         return ret;
1205         }
1206         btrfs_release_path(path);
1207
1208         return 0;
1209 }
1210
1211 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1212                              struct fscrypt_str *name, u64 *index,
1213                              u64 *parent_objectid)
1214 {
1215         struct btrfs_inode_extref *extref;
1216         int ret;
1217
1218         extref = (struct btrfs_inode_extref *)ref_ptr;
1219
1220         ret = read_alloc_one_name(eb, &extref->name,
1221                                   btrfs_inode_extref_name_len(eb, extref), name);
1222         if (ret)
1223                 return ret;
1224
1225         if (index)
1226                 *index = btrfs_inode_extref_index(eb, extref);
1227         if (parent_objectid)
1228                 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1229
1230         return 0;
1231 }
1232
1233 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1234                           struct fscrypt_str *name, u64 *index)
1235 {
1236         struct btrfs_inode_ref *ref;
1237         int ret;
1238
1239         ref = (struct btrfs_inode_ref *)ref_ptr;
1240
1241         ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
1242                                   name);
1243         if (ret)
1244                 return ret;
1245
1246         if (index)
1247                 *index = btrfs_inode_ref_index(eb, ref);
1248
1249         return 0;
1250 }
1251
1252 /*
1253  * Take an inode reference item from the log tree and iterate all names from the
1254  * inode reference item in the subvolume tree with the same key (if it exists).
1255  * For any name that is not in the inode reference item from the log tree, do a
1256  * proper unlink of that name (that is, remove its entry from the inode
1257  * reference item and both dir index keys).
1258  */
1259 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
1260                                  struct btrfs_root *root,
1261                                  struct btrfs_path *path,
1262                                  struct btrfs_inode *inode,
1263                                  struct extent_buffer *log_eb,
1264                                  int log_slot,
1265                                  struct btrfs_key *key)
1266 {
1267         int ret;
1268         unsigned long ref_ptr;
1269         unsigned long ref_end;
1270         struct extent_buffer *eb;
1271
1272 again:
1273         btrfs_release_path(path);
1274         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1275         if (ret > 0) {
1276                 ret = 0;
1277                 goto out;
1278         }
1279         if (ret < 0)
1280                 goto out;
1281
1282         eb = path->nodes[0];
1283         ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
1284         ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
1285         while (ref_ptr < ref_end) {
1286                 struct fscrypt_str name;
1287                 u64 parent_id;
1288
1289                 if (key->type == BTRFS_INODE_EXTREF_KEY) {
1290                         ret = extref_get_fields(eb, ref_ptr, &name,
1291                                                 NULL, &parent_id);
1292                 } else {
1293                         parent_id = key->offset;
1294                         ret = ref_get_fields(eb, ref_ptr, &name, NULL);
1295                 }
1296                 if (ret)
1297                         goto out;
1298
1299                 if (key->type == BTRFS_INODE_EXTREF_KEY)
1300                         ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
1301                                                                parent_id, &name);
1302                 else
1303                         ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
1304
1305                 if (!ret) {
1306                         struct btrfs_inode *dir;
1307
1308                         btrfs_release_path(path);
1309                         dir = btrfs_iget_logging(parent_id, root);
1310                         if (IS_ERR(dir)) {
1311                                 ret = PTR_ERR(dir);
1312                                 kfree(name.name);
1313                                 goto out;
1314                         }
1315                         ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
1316                         kfree(name.name);
1317                         iput(&dir->vfs_inode);
1318                         if (ret)
1319                                 goto out;
1320                         goto again;
1321                 }
1322
1323                 kfree(name.name);
1324                 ref_ptr += name.len;
1325                 if (key->type == BTRFS_INODE_EXTREF_KEY)
1326                         ref_ptr += sizeof(struct btrfs_inode_extref);
1327                 else
1328                         ref_ptr += sizeof(struct btrfs_inode_ref);
1329         }
1330         ret = 0;
1331  out:
1332         btrfs_release_path(path);
1333         return ret;
1334 }
1335
1336 /*
1337  * replay one inode back reference item found in the log tree.
1338  * eb, slot and key refer to the buffer and key found in the log tree.
1339  * root is the destination we are replaying into, and path is for temp
1340  * use by this function.  (it should be released on return).
1341  */
1342 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1343                                   struct btrfs_root *root,
1344                                   struct btrfs_root *log,
1345                                   struct btrfs_path *path,
1346                                   struct extent_buffer *eb, int slot,
1347                                   struct btrfs_key *key)
1348 {
1349         struct btrfs_inode *dir = NULL;
1350         struct btrfs_inode *inode = NULL;
1351         unsigned long ref_ptr;
1352         unsigned long ref_end;
1353         struct fscrypt_str name = { 0 };
1354         int ret;
1355         int log_ref_ver = 0;
1356         u64 parent_objectid;
1357         u64 inode_objectid;
1358         u64 ref_index = 0;
1359         int ref_struct_size;
1360
1361         ref_ptr = btrfs_item_ptr_offset(eb, slot);
1362         ref_end = ref_ptr + btrfs_item_size(eb, slot);
1363
1364         if (key->type == BTRFS_INODE_EXTREF_KEY) {
1365                 struct btrfs_inode_extref *r;
1366
1367                 ref_struct_size = sizeof(struct btrfs_inode_extref);
1368                 log_ref_ver = 1;
1369                 r = (struct btrfs_inode_extref *)ref_ptr;
1370                 parent_objectid = btrfs_inode_extref_parent(eb, r);
1371         } else {
1372                 ref_struct_size = sizeof(struct btrfs_inode_ref);
1373                 parent_objectid = key->offset;
1374         }
1375         inode_objectid = key->objectid;
1376
1377         /*
1378          * it is possible that we didn't log all the parent directories
1379          * for a given inode.  If we don't find the dir, just don't
1380          * copy the back ref in.  The link count fixup code will take
1381          * care of the rest
1382          */
1383         dir = btrfs_iget_logging(parent_objectid, root);
1384         if (IS_ERR(dir)) {
1385                 ret = PTR_ERR(dir);
1386                 dir = NULL;
1387                 goto out;
1388         }
1389
1390         inode = btrfs_iget_logging(inode_objectid, root);
1391         if (IS_ERR(inode)) {
1392                 ret = PTR_ERR(inode);
1393                 inode = NULL;
1394                 goto out;
1395         }
1396
1397         while (ref_ptr < ref_end) {
1398                 if (log_ref_ver) {
1399                         ret = extref_get_fields(eb, ref_ptr, &name,
1400                                                 &ref_index, &parent_objectid);
1401                         /*
1402                          * parent object can change from one array
1403                          * item to another.
1404                          */
1405                         if (!dir) {
1406                                 dir = btrfs_iget_logging(parent_objectid, root);
1407                                 if (IS_ERR(dir)) {
1408                                         ret = PTR_ERR(dir);
1409                                         dir = NULL;
1410                                         goto out;
1411                                 }
1412                         }
1413                 } else {
1414                         ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
1415                 }
1416                 if (ret)
1417                         goto out;
1418
1419                 ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1420                                    ref_index, &name);
1421                 if (ret < 0) {
1422                         goto out;
1423                 } else if (ret == 0) {
1424                         /*
1425                          * look for a conflicting back reference in the
1426                          * metadata. if we find one we have to unlink that name
1427                          * of the file before we add our new link.  Later on, we
1428                          * overwrite any existing back reference, and we don't
1429                          * want to create dangling pointers in the directory.
1430                          */
1431                         ret = __add_inode_ref(trans, root, path, log, dir, inode,
1432                                               inode_objectid, parent_objectid,
1433                                               ref_index, &name);
1434                         if (ret) {
1435                                 if (ret == 1)
1436                                         ret = 0;
1437                                 goto out;
1438                         }
1439
1440                         /* insert our name */
1441                         ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
1442                         if (ret)
1443                                 goto out;
1444
1445                         ret = btrfs_update_inode(trans, inode);
1446                         if (ret)
1447                                 goto out;
1448                 }
1449                 /* Else, ret == 1, we already have a perfect match, we're done. */
1450
1451                 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
1452                 kfree(name.name);
1453                 name.name = NULL;
1454                 if (log_ref_ver) {
1455                         iput(&dir->vfs_inode);
1456                         dir = NULL;
1457                 }
1458         }
1459
1460         /*
1461          * Before we overwrite the inode reference item in the subvolume tree
1462          * with the item from the log tree, we must unlink all names from the
1463          * parent directory that are in the subvolume's tree inode reference
1464          * item, otherwise we end up with an inconsistent subvolume tree where
1465          * dir index entries exist for a name but there is no inode reference
1466          * item with the same name.
1467          */
1468         ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key);
1469         if (ret)
1470                 goto out;
1471
1472         /* finally write the back reference in the inode */
1473         ret = overwrite_item(trans, root, path, eb, slot, key);
1474 out:
1475         btrfs_release_path(path);
1476         kfree(name.name);
1477         if (dir)
1478                 iput(&dir->vfs_inode);
1479         if (inode)
1480                 iput(&inode->vfs_inode);
1481         return ret;
1482 }
1483
1484 static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
1485 {
1486         int ret = 0;
1487         int name_len;
1488         unsigned int nlink = 0;
1489         u32 item_size;
1490         u32 cur_offset = 0;
1491         u64 inode_objectid = btrfs_ino(inode);
1492         u64 offset = 0;
1493         unsigned long ptr;
1494         struct btrfs_inode_extref *extref;
1495         struct extent_buffer *leaf;
1496
1497         while (1) {
1498                 ret = btrfs_find_one_extref(inode->root, inode_objectid, offset,
1499                                             path, &extref, &offset);
1500                 if (ret)
1501                         break;
1502
1503                 leaf = path->nodes[0];
1504                 item_size = btrfs_item_size(leaf, path->slots[0]);
1505                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1506                 cur_offset = 0;
1507
1508                 while (cur_offset < item_size) {
1509                         extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1510                         name_len = btrfs_inode_extref_name_len(leaf, extref);
1511
1512                         nlink++;
1513
1514                         cur_offset += name_len + sizeof(*extref);
1515                 }
1516
1517                 offset++;
1518                 btrfs_release_path(path);
1519         }
1520         btrfs_release_path(path);
1521
1522         if (ret < 0 && ret != -ENOENT)
1523                 return ret;
1524         return nlink;
1525 }
1526
1527 static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
1528 {
1529         int ret;
1530         struct btrfs_key key;
1531         unsigned int nlink = 0;
1532         unsigned long ptr;
1533         unsigned long ptr_end;
1534         int name_len;
1535         u64 ino = btrfs_ino(inode);
1536
1537         key.objectid = ino;
1538         key.type = BTRFS_INODE_REF_KEY;
1539         key.offset = (u64)-1;
1540
1541         while (1) {
1542                 ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
1543                 if (ret < 0)
1544                         break;
1545                 if (ret > 0) {
1546                         if (path->slots[0] == 0)
1547                                 break;
1548                         path->slots[0]--;
1549                 }
1550 process_slot:
1551                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1552                                       path->slots[0]);
1553                 if (key.objectid != ino ||
1554                     key.type != BTRFS_INODE_REF_KEY)
1555                         break;
1556                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1557                 ptr_end = ptr + btrfs_item_size(path->nodes[0],
1558                                                    path->slots[0]);
1559                 while (ptr < ptr_end) {
1560                         struct btrfs_inode_ref *ref;
1561
1562                         ref = (struct btrfs_inode_ref *)ptr;
1563                         name_len = btrfs_inode_ref_name_len(path->nodes[0],
1564                                                             ref);
1565                         ptr = (unsigned long)(ref + 1) + name_len;
1566                         nlink++;
1567                 }
1568
1569                 if (key.offset == 0)
1570                         break;
1571                 if (path->slots[0] > 0) {
1572                         path->slots[0]--;
1573                         goto process_slot;
1574                 }
1575                 key.offset--;
1576                 btrfs_release_path(path);
1577         }
1578         btrfs_release_path(path);
1579
1580         return nlink;
1581 }
1582
1583 /*
1584  * There are a few corners where the link count of the file can't
1585  * be properly maintained during replay.  So, instead of adding
1586  * lots of complexity to the log code, we just scan the backrefs
1587  * for any file that has been through replay.
1588  *
1589  * The scan will update the link count on the inode to reflect the
1590  * number of back refs found.  If it goes down to zero, the iput
1591  * will free the inode.
1592  */
1593 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1594                                            struct btrfs_inode *inode)
1595 {
1596         struct btrfs_root *root = inode->root;
1597         struct btrfs_path *path;
1598         int ret;
1599         u64 nlink = 0;
1600         const u64 ino = btrfs_ino(inode);
1601
1602         path = btrfs_alloc_path();
1603         if (!path)
1604                 return -ENOMEM;
1605
1606         ret = count_inode_refs(inode, path);
1607         if (ret < 0)
1608                 goto out;
1609
1610         nlink = ret;
1611
1612         ret = count_inode_extrefs(inode, path);
1613         if (ret < 0)
1614                 goto out;
1615
1616         nlink += ret;
1617
1618         ret = 0;
1619
1620         if (nlink != inode->vfs_inode.i_nlink) {
1621                 set_nlink(&inode->vfs_inode, nlink);
1622                 ret = btrfs_update_inode(trans, inode);
1623                 if (ret)
1624                         goto out;
1625         }
1626         if (S_ISDIR(inode->vfs_inode.i_mode))
1627                 inode->index_cnt = (u64)-1;
1628
1629         if (inode->vfs_inode.i_nlink == 0) {
1630                 if (S_ISDIR(inode->vfs_inode.i_mode)) {
1631                         ret = replay_dir_deletes(trans, root, NULL, path,
1632                                                  ino, 1);
1633                         if (ret)
1634                                 goto out;
1635                 }
1636                 ret = btrfs_insert_orphan_item(trans, root, ino);
1637                 if (ret == -EEXIST)
1638                         ret = 0;
1639         }
1640
1641 out:
1642         btrfs_free_path(path);
1643         return ret;
1644 }
1645
1646 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1647                                             struct btrfs_root *root,
1648                                             struct btrfs_path *path)
1649 {
1650         int ret;
1651         struct btrfs_key key;
1652
1653         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1654         key.type = BTRFS_ORPHAN_ITEM_KEY;
1655         key.offset = (u64)-1;
1656         while (1) {
1657                 struct btrfs_inode *inode;
1658
1659                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1660                 if (ret < 0)
1661                         break;
1662
1663                 if (ret == 1) {
1664                         ret = 0;
1665                         if (path->slots[0] == 0)
1666                                 break;
1667                         path->slots[0]--;
1668                 }
1669
1670                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1671                 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1672                     key.type != BTRFS_ORPHAN_ITEM_KEY)
1673                         break;
1674
1675                 ret = btrfs_del_item(trans, root, path);
1676                 if (ret)
1677                         break;
1678
1679                 btrfs_release_path(path);
1680                 inode = btrfs_iget_logging(key.offset, root);
1681                 if (IS_ERR(inode)) {
1682                         ret = PTR_ERR(inode);
1683                         break;
1684                 }
1685
1686                 ret = fixup_inode_link_count(trans, inode);
1687                 iput(&inode->vfs_inode);
1688                 if (ret)
1689                         break;
1690
1691                 /*
1692                  * fixup on a directory may create new entries,
1693                  * make sure we always look for the highset possible
1694                  * offset
1695                  */
1696                 key.offset = (u64)-1;
1697         }
1698         btrfs_release_path(path);
1699         return ret;
1700 }
1701
1702
1703 /*
1704  * record a given inode in the fixup dir so we can check its link
1705  * count when replay is done.  The link count is incremented here
1706  * so the inode won't go away until we check it
1707  */
1708 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1709                                       struct btrfs_root *root,
1710                                       struct btrfs_path *path,
1711                                       u64 objectid)
1712 {
1713         struct btrfs_key key;
1714         int ret = 0;
1715         struct btrfs_inode *inode;
1716         struct inode *vfs_inode;
1717
1718         inode = btrfs_iget_logging(objectid, root);
1719         if (IS_ERR(inode))
1720                 return PTR_ERR(inode);
1721
1722         vfs_inode = &inode->vfs_inode;
1723         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1724         key.type = BTRFS_ORPHAN_ITEM_KEY;
1725         key.offset = objectid;
1726
1727         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1728
1729         btrfs_release_path(path);
1730         if (ret == 0) {
1731                 if (!vfs_inode->i_nlink)
1732                         set_nlink(vfs_inode, 1);
1733                 else
1734                         inc_nlink(vfs_inode);
1735                 ret = btrfs_update_inode(trans, inode);
1736         } else if (ret == -EEXIST) {
1737                 ret = 0;
1738         }
1739         iput(vfs_inode);
1740
1741         return ret;
1742 }
1743
1744 /*
1745  * when replaying the log for a directory, we only insert names
1746  * for inodes that actually exist.  This means an fsync on a directory
1747  * does not implicitly fsync all the new files in it
1748  */
1749 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1750                                     struct btrfs_root *root,
1751                                     u64 dirid, u64 index,
1752                                     const struct fscrypt_str *name,
1753                                     struct btrfs_key *location)
1754 {
1755         struct btrfs_inode *inode;
1756         struct btrfs_inode *dir;
1757         int ret;
1758
1759         inode = btrfs_iget_logging(location->objectid, root);
1760         if (IS_ERR(inode))
1761                 return PTR_ERR(inode);
1762
1763         dir = btrfs_iget_logging(dirid, root);
1764         if (IS_ERR(dir)) {
1765                 iput(&inode->vfs_inode);
1766                 return PTR_ERR(dir);
1767         }
1768
1769         ret = btrfs_add_link(trans, dir, inode, name, 1, index);
1770
1771         /* FIXME, put inode into FIXUP list */
1772
1773         iput(&inode->vfs_inode);
1774         iput(&dir->vfs_inode);
1775         return ret;
1776 }
1777
1778 static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
1779                                         struct btrfs_inode *dir,
1780                                         struct btrfs_path *path,
1781                                         struct btrfs_dir_item *dst_di,
1782                                         const struct btrfs_key *log_key,
1783                                         u8 log_flags,
1784                                         bool exists)
1785 {
1786         struct btrfs_key found_key;
1787
1788         btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1789         /* The existing dentry points to the same inode, don't delete it. */
1790         if (found_key.objectid == log_key->objectid &&
1791             found_key.type == log_key->type &&
1792             found_key.offset == log_key->offset &&
1793             btrfs_dir_flags(path->nodes[0], dst_di) == log_flags)
1794                 return 1;
1795
1796         /*
1797          * Don't drop the conflicting directory entry if the inode for the new
1798          * entry doesn't exist.
1799          */
1800         if (!exists)
1801                 return 0;
1802
1803         return drop_one_dir_item(trans, path, dir, dst_di);
1804 }
1805
1806 /*
1807  * take a single entry in a log directory item and replay it into
1808  * the subvolume.
1809  *
1810  * if a conflicting item exists in the subdirectory already,
1811  * the inode it points to is unlinked and put into the link count
1812  * fix up tree.
1813  *
1814  * If a name from the log points to a file or directory that does
1815  * not exist in the FS, it is skipped.  fsyncs on directories
1816  * do not force down inodes inside that directory, just changes to the
1817  * names or unlinks in a directory.
1818  *
1819  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1820  * non-existing inode) and 1 if the name was replayed.
1821  */
1822 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1823                                     struct btrfs_root *root,
1824                                     struct btrfs_path *path,
1825                                     struct extent_buffer *eb,
1826                                     struct btrfs_dir_item *di,
1827                                     struct btrfs_key *key)
1828 {
1829         struct fscrypt_str name = { 0 };
1830         struct btrfs_dir_item *dir_dst_di;
1831         struct btrfs_dir_item *index_dst_di;
1832         bool dir_dst_matches = false;
1833         bool index_dst_matches = false;
1834         struct btrfs_key log_key;
1835         struct btrfs_key search_key;
1836         struct btrfs_inode *dir;
1837         u8 log_flags;
1838         bool exists;
1839         int ret;
1840         bool update_size = true;
1841         bool name_added = false;
1842
1843         dir = btrfs_iget_logging(key->objectid, root);
1844         if (IS_ERR(dir))
1845                 return PTR_ERR(dir);
1846
1847         ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
1848         if (ret)
1849                 goto out;
1850
1851         log_flags = btrfs_dir_flags(eb, di);
1852         btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1853         ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1854         btrfs_release_path(path);
1855         if (ret < 0)
1856                 goto out;
1857         exists = (ret == 0);
1858         ret = 0;
1859
1860         dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1861                                            &name, 1);
1862         if (IS_ERR(dir_dst_di)) {
1863                 ret = PTR_ERR(dir_dst_di);
1864                 goto out;
1865         } else if (dir_dst_di) {
1866                 ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di,
1867                                                    &log_key, log_flags, exists);
1868                 if (ret < 0)
1869                         goto out;
1870                 dir_dst_matches = (ret == 1);
1871         }
1872
1873         btrfs_release_path(path);
1874
1875         index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1876                                                    key->objectid, key->offset,
1877                                                    &name, 1);
1878         if (IS_ERR(index_dst_di)) {
1879                 ret = PTR_ERR(index_dst_di);
1880                 goto out;
1881         } else if (index_dst_di) {
1882                 ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di,
1883                                                    &log_key, log_flags, exists);
1884                 if (ret < 0)
1885                         goto out;
1886                 index_dst_matches = (ret == 1);
1887         }
1888
1889         btrfs_release_path(path);
1890
1891         if (dir_dst_matches && index_dst_matches) {
1892                 ret = 0;
1893                 update_size = false;
1894                 goto out;
1895         }
1896
1897         /*
1898          * Check if the inode reference exists in the log for the given name,
1899          * inode and parent inode
1900          */
1901         search_key.objectid = log_key.objectid;
1902         search_key.type = BTRFS_INODE_REF_KEY;
1903         search_key.offset = key->objectid;
1904         ret = backref_in_log(root->log_root, &search_key, 0, &name);
1905         if (ret < 0) {
1906                 goto out;
1907         } else if (ret) {
1908                 /* The dentry will be added later. */
1909                 ret = 0;
1910                 update_size = false;
1911                 goto out;
1912         }
1913
1914         search_key.objectid = log_key.objectid;
1915         search_key.type = BTRFS_INODE_EXTREF_KEY;
1916         search_key.offset = key->objectid;
1917         ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
1918         if (ret < 0) {
1919                 goto out;
1920         } else if (ret) {
1921                 /* The dentry will be added later. */
1922                 ret = 0;
1923                 update_size = false;
1924                 goto out;
1925         }
1926         btrfs_release_path(path);
1927         ret = insert_one_name(trans, root, key->objectid, key->offset,
1928                               &name, &log_key);
1929         if (ret && ret != -ENOENT && ret != -EEXIST)
1930                 goto out;
1931         if (!ret)
1932                 name_added = true;
1933         update_size = false;
1934         ret = 0;
1935
1936 out:
1937         if (!ret && update_size) {
1938                 btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
1939                 ret = btrfs_update_inode(trans, dir);
1940         }
1941         kfree(name.name);
1942         iput(&dir->vfs_inode);
1943         if (!ret && name_added)
1944                 ret = 1;
1945         return ret;
1946 }
1947
1948 /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
1949 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1950                                         struct btrfs_root *root,
1951                                         struct btrfs_path *path,
1952                                         struct extent_buffer *eb, int slot,
1953                                         struct btrfs_key *key)
1954 {
1955         int ret;
1956         struct btrfs_dir_item *di;
1957
1958         /* We only log dir index keys, which only contain a single dir item. */
1959         ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
1960
1961         di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
1962         ret = replay_one_name(trans, root, path, eb, di, key);
1963         if (ret < 0)
1964                 return ret;
1965
1966         /*
1967          * If this entry refers to a non-directory (directories can not have a
1968          * link count > 1) and it was added in the transaction that was not
1969          * committed, make sure we fixup the link count of the inode the entry
1970          * points to. Otherwise something like the following would result in a
1971          * directory pointing to an inode with a wrong link that does not account
1972          * for this dir entry:
1973          *
1974          * mkdir testdir
1975          * touch testdir/foo
1976          * touch testdir/bar
1977          * sync
1978          *
1979          * ln testdir/bar testdir/bar_link
1980          * ln testdir/foo testdir/foo_link
1981          * xfs_io -c "fsync" testdir/bar
1982          *
1983          * <power failure>
1984          *
1985          * mount fs, log replay happens
1986          *
1987          * File foo would remain with a link count of 1 when it has two entries
1988          * pointing to it in the directory testdir. This would make it impossible
1989          * to ever delete the parent directory has it would result in stale
1990          * dentries that can never be deleted.
1991          */
1992         if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
1993                 struct btrfs_path *fixup_path;
1994                 struct btrfs_key di_key;
1995
1996                 fixup_path = btrfs_alloc_path();
1997                 if (!fixup_path)
1998                         return -ENOMEM;
1999
2000                 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2001                 ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
2002                 btrfs_free_path(fixup_path);
2003         }
2004
2005         return ret;
2006 }
2007
2008 /*
2009  * directory replay has two parts.  There are the standard directory
2010  * items in the log copied from the subvolume, and range items
2011  * created in the log while the subvolume was logged.
2012  *
2013  * The range items tell us which parts of the key space the log
2014  * is authoritative for.  During replay, if a key in the subvolume
2015  * directory is in a logged range item, but not actually in the log
2016  * that means it was deleted from the directory before the fsync
2017  * and should be removed.
2018  */
2019 static noinline int find_dir_range(struct btrfs_root *root,
2020                                    struct btrfs_path *path,
2021                                    u64 dirid,
2022                                    u64 *start_ret, u64 *end_ret)
2023 {
2024         struct btrfs_key key;
2025         u64 found_end;
2026         struct btrfs_dir_log_item *item;
2027         int ret;
2028         int nritems;
2029
2030         if (*start_ret == (u64)-1)
2031                 return 1;
2032
2033         key.objectid = dirid;
2034         key.type = BTRFS_DIR_LOG_INDEX_KEY;
2035         key.offset = *start_ret;
2036
2037         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2038         if (ret < 0)
2039                 goto out;
2040         if (ret > 0) {
2041                 if (path->slots[0] == 0)
2042                         goto out;
2043                 path->slots[0]--;
2044         }
2045         if (ret != 0)
2046                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2047
2048         if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2049                 ret = 1;
2050                 goto next;
2051         }
2052         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2053                               struct btrfs_dir_log_item);
2054         found_end = btrfs_dir_log_end(path->nodes[0], item);
2055
2056         if (*start_ret >= key.offset && *start_ret <= found_end) {
2057                 ret = 0;
2058                 *start_ret = key.offset;
2059                 *end_ret = found_end;
2060                 goto out;
2061         }
2062         ret = 1;
2063 next:
2064         /* check the next slot in the tree to see if it is a valid item */
2065         nritems = btrfs_header_nritems(path->nodes[0]);
2066         path->slots[0]++;
2067         if (path->slots[0] >= nritems) {
2068                 ret = btrfs_next_leaf(root, path);
2069                 if (ret)
2070                         goto out;
2071         }
2072
2073         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2074
2075         if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2076                 ret = 1;
2077                 goto out;
2078         }
2079         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2080                               struct btrfs_dir_log_item);
2081         found_end = btrfs_dir_log_end(path->nodes[0], item);
2082         *start_ret = key.offset;
2083         *end_ret = found_end;
2084         ret = 0;
2085 out:
2086         btrfs_release_path(path);
2087         return ret;
2088 }
2089
2090 /*
2091  * this looks for a given directory item in the log.  If the directory
2092  * item is not in the log, the item is removed and the inode it points
2093  * to is unlinked
2094  */
2095 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2096                                       struct btrfs_root *log,
2097                                       struct btrfs_path *path,
2098                                       struct btrfs_path *log_path,
2099                                       struct btrfs_inode *dir,
2100                                       struct btrfs_key *dir_key)
2101 {
2102         struct btrfs_root *root = dir->root;
2103         int ret;
2104         struct extent_buffer *eb;
2105         int slot;
2106         struct btrfs_dir_item *di;
2107         struct fscrypt_str name = { 0 };
2108         struct btrfs_inode *inode = NULL;
2109         struct btrfs_key location;
2110
2111         /*
2112          * Currently we only log dir index keys. Even if we replay a log created
2113          * by an older kernel that logged both dir index and dir item keys, all
2114          * we need to do is process the dir index keys, we (and our caller) can
2115          * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
2116          */
2117         ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
2118
2119         eb = path->nodes[0];
2120         slot = path->slots[0];
2121         di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2122         ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
2123         if (ret)
2124                 goto out;
2125
2126         if (log) {
2127                 struct btrfs_dir_item *log_di;
2128
2129                 log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
2130                                                      dir_key->objectid,
2131                                                      dir_key->offset, &name, 0);
2132                 if (IS_ERR(log_di)) {
2133                         ret = PTR_ERR(log_di);
2134                         goto out;
2135                 } else if (log_di) {
2136                         /* The dentry exists in the log, we have nothing to do. */
2137                         ret = 0;
2138                         goto out;
2139                 }
2140         }
2141
2142         btrfs_dir_item_key_to_cpu(eb, di, &location);
2143         btrfs_release_path(path);
2144         btrfs_release_path(log_path);
2145         inode = btrfs_iget_logging(location.objectid, root);
2146         if (IS_ERR(inode)) {
2147                 ret = PTR_ERR(inode);
2148                 inode = NULL;
2149                 goto out;
2150         }
2151
2152         ret = link_to_fixup_dir(trans, root, path, location.objectid);
2153         if (ret)
2154                 goto out;
2155
2156         inc_nlink(&inode->vfs_inode);
2157         ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
2158         /*
2159          * Unlike dir item keys, dir index keys can only have one name (entry) in
2160          * them, as there are no key collisions since each key has a unique offset
2161          * (an index number), so we're done.
2162          */
2163 out:
2164         btrfs_release_path(path);
2165         btrfs_release_path(log_path);
2166         kfree(name.name);
2167         if (inode)
2168                 iput(&inode->vfs_inode);
2169         return ret;
2170 }
2171
2172 static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2173                               struct btrfs_root *root,
2174                               struct btrfs_root *log,
2175                               struct btrfs_path *path,
2176                               const u64 ino)
2177 {
2178         struct btrfs_key search_key;
2179         struct btrfs_path *log_path;
2180         int i;
2181         int nritems;
2182         int ret;
2183
2184         log_path = btrfs_alloc_path();
2185         if (!log_path)
2186                 return -ENOMEM;
2187
2188         search_key.objectid = ino;
2189         search_key.type = BTRFS_XATTR_ITEM_KEY;
2190         search_key.offset = 0;
2191 again:
2192         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2193         if (ret < 0)
2194                 goto out;
2195 process_leaf:
2196         nritems = btrfs_header_nritems(path->nodes[0]);
2197         for (i = path->slots[0]; i < nritems; i++) {
2198                 struct btrfs_key key;
2199                 struct btrfs_dir_item *di;
2200                 struct btrfs_dir_item *log_di;
2201                 u32 total_size;
2202                 u32 cur;
2203
2204                 btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2205                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2206                         ret = 0;
2207                         goto out;
2208                 }
2209
2210                 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2211                 total_size = btrfs_item_size(path->nodes[0], i);
2212                 cur = 0;
2213                 while (cur < total_size) {
2214                         u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2215                         u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2216                         u32 this_len = sizeof(*di) + name_len + data_len;
2217                         char *name;
2218
2219                         name = kmalloc(name_len, GFP_NOFS);
2220                         if (!name) {
2221                                 ret = -ENOMEM;
2222                                 goto out;
2223                         }
2224                         read_extent_buffer(path->nodes[0], name,
2225                                            (unsigned long)(di + 1), name_len);
2226
2227                         log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2228                                                     name, name_len, 0);
2229                         btrfs_release_path(log_path);
2230                         if (!log_di) {
2231                                 /* Doesn't exist in log tree, so delete it. */
2232                                 btrfs_release_path(path);
2233                                 di = btrfs_lookup_xattr(trans, root, path, ino,
2234                                                         name, name_len, -1);
2235                                 kfree(name);
2236                                 if (IS_ERR(di)) {
2237                                         ret = PTR_ERR(di);
2238                                         goto out;
2239                                 }
2240                                 ASSERT(di);
2241                                 ret = btrfs_delete_one_dir_name(trans, root,
2242                                                                 path, di);
2243                                 if (ret)
2244                                         goto out;
2245                                 btrfs_release_path(path);
2246                                 search_key = key;
2247                                 goto again;
2248                         }
2249                         kfree(name);
2250                         if (IS_ERR(log_di)) {
2251                                 ret = PTR_ERR(log_di);
2252                                 goto out;
2253                         }
2254                         cur += this_len;
2255                         di = (struct btrfs_dir_item *)((char *)di + this_len);
2256                 }
2257         }
2258         ret = btrfs_next_leaf(root, path);
2259         if (ret > 0)
2260                 ret = 0;
2261         else if (ret == 0)
2262                 goto process_leaf;
2263 out:
2264         btrfs_free_path(log_path);
2265         btrfs_release_path(path);
2266         return ret;
2267 }
2268
2269
2270 /*
2271  * deletion replay happens before we copy any new directory items
2272  * out of the log or out of backreferences from inodes.  It
2273  * scans the log to find ranges of keys that log is authoritative for,
2274  * and then scans the directory to find items in those ranges that are
2275  * not present in the log.
2276  *
2277  * Anything we don't find in the log is unlinked and removed from the
2278  * directory.
2279  */
2280 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2281                                        struct btrfs_root *root,
2282                                        struct btrfs_root *log,
2283                                        struct btrfs_path *path,
2284                                        u64 dirid, int del_all)
2285 {
2286         u64 range_start;
2287         u64 range_end;
2288         int ret = 0;
2289         struct btrfs_key dir_key;
2290         struct btrfs_key found_key;
2291         struct btrfs_path *log_path;
2292         struct btrfs_inode *dir;
2293
2294         dir_key.objectid = dirid;
2295         dir_key.type = BTRFS_DIR_INDEX_KEY;
2296         log_path = btrfs_alloc_path();
2297         if (!log_path)
2298                 return -ENOMEM;
2299
2300         dir = btrfs_iget_logging(dirid, root);
2301         /*
2302          * It isn't an error if the inode isn't there, that can happen because
2303          * we replay the deletes before we copy in the inode item from the log.
2304          */
2305         if (IS_ERR(dir)) {
2306                 btrfs_free_path(log_path);
2307                 ret = PTR_ERR(dir);
2308                 if (ret == -ENOENT)
2309                         ret = 0;
2310                 return ret;
2311         }
2312
2313         range_start = 0;
2314         range_end = 0;
2315         while (1) {
2316                 if (del_all)
2317                         range_end = (u64)-1;
2318                 else {
2319                         ret = find_dir_range(log, path, dirid,
2320                                              &range_start, &range_end);
2321                         if (ret < 0)
2322                                 goto out;
2323                         else if (ret > 0)
2324                                 break;
2325                 }
2326
2327                 dir_key.offset = range_start;
2328                 while (1) {
2329                         int nritems;
2330                         ret = btrfs_search_slot(NULL, root, &dir_key, path,
2331                                                 0, 0);
2332                         if (ret < 0)
2333                                 goto out;
2334
2335                         nritems = btrfs_header_nritems(path->nodes[0]);
2336                         if (path->slots[0] >= nritems) {
2337                                 ret = btrfs_next_leaf(root, path);
2338                                 if (ret == 1)
2339                                         break;
2340                                 else if (ret < 0)
2341                                         goto out;
2342                         }
2343                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2344                                               path->slots[0]);
2345                         if (found_key.objectid != dirid ||
2346                             found_key.type != dir_key.type) {
2347                                 ret = 0;
2348                                 goto out;
2349                         }
2350
2351                         if (found_key.offset > range_end)
2352                                 break;
2353
2354                         ret = check_item_in_log(trans, log, path,
2355                                                 log_path, dir,
2356                                                 &found_key);
2357                         if (ret)
2358                                 goto out;
2359                         if (found_key.offset == (u64)-1)
2360                                 break;
2361                         dir_key.offset = found_key.offset + 1;
2362                 }
2363                 btrfs_release_path(path);
2364                 if (range_end == (u64)-1)
2365                         break;
2366                 range_start = range_end + 1;
2367         }
2368         ret = 0;
2369 out:
2370         btrfs_release_path(path);
2371         btrfs_free_path(log_path);
2372         iput(&dir->vfs_inode);
2373         return ret;
2374 }
2375
2376 /*
2377  * the process_func used to replay items from the log tree.  This
2378  * gets called in two different stages.  The first stage just looks
2379  * for inodes and makes sure they are all copied into the subvolume.
2380  *
2381  * The second stage copies all the other item types from the log into
2382  * the subvolume.  The two stage approach is slower, but gets rid of
2383  * lots of complexity around inodes referencing other inodes that exist
2384  * only in the log (references come from either directory items or inode
2385  * back refs).
2386  */
2387 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2388                              struct walk_control *wc, u64 gen, int level)
2389 {
2390         int nritems;
2391         struct btrfs_tree_parent_check check = {
2392                 .transid = gen,
2393                 .level = level
2394         };
2395         struct btrfs_path *path;
2396         struct btrfs_root *root = wc->replay_dest;
2397         struct btrfs_key key;
2398         int i;
2399         int ret;
2400
2401         ret = btrfs_read_extent_buffer(eb, &check);
2402         if (ret)
2403                 return ret;
2404
2405         level = btrfs_header_level(eb);
2406
2407         if (level != 0)
2408                 return 0;
2409
2410         path = btrfs_alloc_path();
2411         if (!path)
2412                 return -ENOMEM;
2413
2414         nritems = btrfs_header_nritems(eb);
2415         for (i = 0; i < nritems; i++) {
2416                 btrfs_item_key_to_cpu(eb, &key, i);
2417
2418                 /* inode keys are done during the first stage */
2419                 if (key.type == BTRFS_INODE_ITEM_KEY &&
2420                     wc->stage == LOG_WALK_REPLAY_INODES) {
2421                         struct btrfs_inode_item *inode_item;
2422                         u32 mode;
2423
2424                         inode_item = btrfs_item_ptr(eb, i,
2425                                             struct btrfs_inode_item);
2426                         /*
2427                          * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2428                          * and never got linked before the fsync, skip it, as
2429                          * replaying it is pointless since it would be deleted
2430                          * later. We skip logging tmpfiles, but it's always
2431                          * possible we are replaying a log created with a kernel
2432                          * that used to log tmpfiles.
2433                          */
2434                         if (btrfs_inode_nlink(eb, inode_item) == 0) {
2435                                 wc->ignore_cur_inode = true;
2436                                 continue;
2437                         } else {
2438                                 wc->ignore_cur_inode = false;
2439                         }
2440                         ret = replay_xattr_deletes(wc->trans, root, log,
2441                                                    path, key.objectid);
2442                         if (ret)
2443                                 break;
2444                         mode = btrfs_inode_mode(eb, inode_item);
2445                         if (S_ISDIR(mode)) {
2446                                 ret = replay_dir_deletes(wc->trans,
2447                                          root, log, path, key.objectid, 0);
2448                                 if (ret)
2449                                         break;
2450                         }
2451                         ret = overwrite_item(wc->trans, root, path,
2452                                              eb, i, &key);
2453                         if (ret)
2454                                 break;
2455
2456                         /*
2457                          * Before replaying extents, truncate the inode to its
2458                          * size. We need to do it now and not after log replay
2459                          * because before an fsync we can have prealloc extents
2460                          * added beyond the inode's i_size. If we did it after,
2461                          * through orphan cleanup for example, we would drop
2462                          * those prealloc extents just after replaying them.
2463                          */
2464                         if (S_ISREG(mode)) {
2465                                 struct btrfs_drop_extents_args drop_args = { 0 };
2466                                 struct btrfs_inode *inode;
2467                                 u64 from;
2468
2469                                 inode = btrfs_iget_logging(key.objectid, root);
2470                                 if (IS_ERR(inode)) {
2471                                         ret = PTR_ERR(inode);
2472                                         break;
2473                                 }
2474                                 from = ALIGN(i_size_read(&inode->vfs_inode),
2475                                              root->fs_info->sectorsize);
2476                                 drop_args.start = from;
2477                                 drop_args.end = (u64)-1;
2478                                 drop_args.drop_cache = true;
2479                                 ret = btrfs_drop_extents(wc->trans, root, inode,
2480                                                          &drop_args);
2481                                 if (!ret) {
2482                                         inode_sub_bytes(&inode->vfs_inode,
2483                                                         drop_args.bytes_found);
2484                                         /* Update the inode's nbytes. */
2485                                         ret = btrfs_update_inode(wc->trans, inode);
2486                                 }
2487                                 iput(&inode->vfs_inode);
2488                                 if (ret)
2489                                         break;
2490                         }
2491
2492                         ret = link_to_fixup_dir(wc->trans, root,
2493                                                 path, key.objectid);
2494                         if (ret)
2495                                 break;
2496                 }
2497
2498                 if (wc->ignore_cur_inode)
2499                         continue;
2500
2501                 if (key.type == BTRFS_DIR_INDEX_KEY &&
2502                     wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2503                         ret = replay_one_dir_item(wc->trans, root, path,
2504                                                   eb, i, &key);
2505                         if (ret)
2506                                 break;
2507                 }
2508
2509                 if (wc->stage < LOG_WALK_REPLAY_ALL)
2510                         continue;
2511
2512                 /* these keys are simply copied */
2513                 if (key.type == BTRFS_XATTR_ITEM_KEY) {
2514                         ret = overwrite_item(wc->trans, root, path,
2515                                              eb, i, &key);
2516                         if (ret)
2517                                 break;
2518                 } else if (key.type == BTRFS_INODE_REF_KEY ||
2519                            key.type == BTRFS_INODE_EXTREF_KEY) {
2520                         ret = add_inode_ref(wc->trans, root, log, path,
2521                                             eb, i, &key);
2522                         if (ret && ret != -ENOENT)
2523                                 break;
2524                         ret = 0;
2525                 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2526                         ret = replay_one_extent(wc->trans, root, path,
2527                                                 eb, i, &key);
2528                         if (ret)
2529                                 break;
2530                 }
2531                 /*
2532                  * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
2533                  * BTRFS_DIR_INDEX_KEY items which we use to derive the
2534                  * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
2535                  * older kernel with such keys, ignore them.
2536                  */
2537         }
2538         btrfs_free_path(path);
2539         return ret;
2540 }
2541
2542 /*
2543  * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2544  */
2545 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
2546 {
2547         struct btrfs_block_group *cache;
2548
2549         cache = btrfs_lookup_block_group(fs_info, start);
2550         if (!cache) {
2551                 btrfs_err(fs_info, "unable to find block group for %llu", start);
2552                 return;
2553         }
2554
2555         spin_lock(&cache->space_info->lock);
2556         spin_lock(&cache->lock);
2557         cache->reserved -= fs_info->nodesize;
2558         cache->space_info->bytes_reserved -= fs_info->nodesize;
2559         spin_unlock(&cache->lock);
2560         spin_unlock(&cache->space_info->lock);
2561
2562         btrfs_put_block_group(cache);
2563 }
2564
2565 static int clean_log_buffer(struct btrfs_trans_handle *trans,
2566                             struct extent_buffer *eb)
2567 {
2568         int ret;
2569
2570         btrfs_tree_lock(eb);
2571         btrfs_clear_buffer_dirty(trans, eb);
2572         wait_on_extent_buffer_writeback(eb);
2573         btrfs_tree_unlock(eb);
2574
2575         if (trans) {
2576                 ret = btrfs_pin_reserved_extent(trans, eb);
2577                 if (ret)
2578                         return ret;
2579         } else {
2580                 unaccount_log_buffer(eb->fs_info, eb->start);
2581         }
2582
2583         return 0;
2584 }
2585
2586 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2587                                    struct btrfs_root *root,
2588                                    struct btrfs_path *path, int *level,
2589                                    struct walk_control *wc)
2590 {
2591         struct btrfs_fs_info *fs_info = root->fs_info;
2592         u64 bytenr;
2593         u64 ptr_gen;
2594         struct extent_buffer *next;
2595         struct extent_buffer *cur;
2596         int ret = 0;
2597
2598         while (*level > 0) {
2599                 struct btrfs_tree_parent_check check = { 0 };
2600
2601                 cur = path->nodes[*level];
2602
2603                 WARN_ON(btrfs_header_level(cur) != *level);
2604
2605                 if (path->slots[*level] >=
2606                     btrfs_header_nritems(cur))
2607                         break;
2608
2609                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2610                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2611                 check.transid = ptr_gen;
2612                 check.level = *level - 1;
2613                 check.has_first_key = true;
2614                 btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
2615
2616                 next = btrfs_find_create_tree_block(fs_info, bytenr,
2617                                                     btrfs_header_owner(cur),
2618                                                     *level - 1);
2619                 if (IS_ERR(next))
2620                         return PTR_ERR(next);
2621
2622                 if (*level == 1) {
2623                         ret = wc->process_func(root, next, wc, ptr_gen,
2624                                                *level - 1);
2625                         if (ret) {
2626                                 free_extent_buffer(next);
2627                                 return ret;
2628                         }
2629
2630                         path->slots[*level]++;
2631                         if (wc->free) {
2632                                 ret = btrfs_read_extent_buffer(next, &check);
2633                                 if (ret) {
2634                                         free_extent_buffer(next);
2635                                         return ret;
2636                                 }
2637
2638                                 ret = clean_log_buffer(trans, next);
2639                                 if (ret) {
2640                                         free_extent_buffer(next);
2641                                         return ret;
2642                                 }
2643                         }
2644                         free_extent_buffer(next);
2645                         continue;
2646                 }
2647                 ret = btrfs_read_extent_buffer(next, &check);
2648                 if (ret) {
2649                         free_extent_buffer(next);
2650                         return ret;
2651                 }
2652
2653                 if (path->nodes[*level-1])
2654                         free_extent_buffer(path->nodes[*level-1]);
2655                 path->nodes[*level-1] = next;
2656                 *level = btrfs_header_level(next);
2657                 path->slots[*level] = 0;
2658                 cond_resched();
2659         }
2660         path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2661
2662         cond_resched();
2663         return 0;
2664 }
2665
2666 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2667                                  struct btrfs_root *root,
2668                                  struct btrfs_path *path, int *level,
2669                                  struct walk_control *wc)
2670 {
2671         int i;
2672         int slot;
2673         int ret;
2674
2675         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2676                 slot = path->slots[i];
2677                 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2678                         path->slots[i]++;
2679                         *level = i;
2680                         WARN_ON(*level == 0);
2681                         return 0;
2682                 } else {
2683                         ret = wc->process_func(root, path->nodes[*level], wc,
2684                                  btrfs_header_generation(path->nodes[*level]),
2685                                  *level);
2686                         if (ret)
2687                                 return ret;
2688
2689                         if (wc->free) {
2690                                 ret = clean_log_buffer(trans, path->nodes[*level]);
2691                                 if (ret)
2692                                         return ret;
2693                         }
2694                         free_extent_buffer(path->nodes[*level]);
2695                         path->nodes[*level] = NULL;
2696                         *level = i + 1;
2697                 }
2698         }
2699         return 1;
2700 }
2701
2702 /*
2703  * drop the reference count on the tree rooted at 'snap'.  This traverses
2704  * the tree freeing any blocks that have a ref count of zero after being
2705  * decremented.
2706  */
2707 static int walk_log_tree(struct btrfs_trans_handle *trans,
2708                          struct btrfs_root *log, struct walk_control *wc)
2709 {
2710         int ret = 0;
2711         int wret;
2712         int level;
2713         struct btrfs_path *path;
2714         int orig_level;
2715
2716         path = btrfs_alloc_path();
2717         if (!path)
2718                 return -ENOMEM;
2719
2720         level = btrfs_header_level(log->node);
2721         orig_level = level;
2722         path->nodes[level] = log->node;
2723         atomic_inc(&log->node->refs);
2724         path->slots[level] = 0;
2725
2726         while (1) {
2727                 wret = walk_down_log_tree(trans, log, path, &level, wc);
2728                 if (wret > 0)
2729                         break;
2730                 if (wret < 0) {
2731                         ret = wret;
2732                         goto out;
2733                 }
2734
2735                 wret = walk_up_log_tree(trans, log, path, &level, wc);
2736                 if (wret > 0)
2737                         break;
2738                 if (wret < 0) {
2739                         ret = wret;
2740                         goto out;
2741                 }
2742         }
2743
2744         /* was the root node processed? if not, catch it here */
2745         if (path->nodes[orig_level]) {
2746                 ret = wc->process_func(log, path->nodes[orig_level], wc,
2747                          btrfs_header_generation(path->nodes[orig_level]),
2748                          orig_level);
2749                 if (ret)
2750                         goto out;
2751                 if (wc->free)
2752                         ret = clean_log_buffer(trans, path->nodes[orig_level]);
2753         }
2754
2755 out:
2756         btrfs_free_path(path);
2757         return ret;
2758 }
2759
2760 /*
2761  * helper function to update the item for a given subvolumes log root
2762  * in the tree of log roots
2763  */
2764 static int update_log_root(struct btrfs_trans_handle *trans,
2765                            struct btrfs_root *log,
2766                            struct btrfs_root_item *root_item)
2767 {
2768         struct btrfs_fs_info *fs_info = log->fs_info;
2769         int ret;
2770
2771         if (log->log_transid == 1) {
2772                 /* insert root item on the first sync */
2773                 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2774                                 &log->root_key, root_item);
2775         } else {
2776                 ret = btrfs_update_root(trans, fs_info->log_root_tree,
2777                                 &log->root_key, root_item);
2778         }
2779         return ret;
2780 }
2781
2782 static void wait_log_commit(struct btrfs_root *root, int transid)
2783 {
2784         DEFINE_WAIT(wait);
2785         int index = transid % 2;
2786
2787         /*
2788          * we only allow two pending log transactions at a time,
2789          * so we know that if ours is more than 2 older than the
2790          * current transaction, we're done
2791          */
2792         for (;;) {
2793                 prepare_to_wait(&root->log_commit_wait[index],
2794                                 &wait, TASK_UNINTERRUPTIBLE);
2795
2796                 if (!(root->log_transid_committed < transid &&
2797                       atomic_read(&root->log_commit[index])))
2798                         break;
2799
2800                 mutex_unlock(&root->log_mutex);
2801                 schedule();
2802                 mutex_lock(&root->log_mutex);
2803         }
2804         finish_wait(&root->log_commit_wait[index], &wait);
2805 }
2806
2807 static void wait_for_writer(struct btrfs_root *root)
2808 {
2809         DEFINE_WAIT(wait);
2810
2811         for (;;) {
2812                 prepare_to_wait(&root->log_writer_wait, &wait,
2813                                 TASK_UNINTERRUPTIBLE);
2814                 if (!atomic_read(&root->log_writers))
2815                         break;
2816
2817                 mutex_unlock(&root->log_mutex);
2818                 schedule();
2819                 mutex_lock(&root->log_mutex);
2820         }
2821         finish_wait(&root->log_writer_wait, &wait);
2822 }
2823
2824 void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode)
2825 {
2826         ctx->log_ret = 0;
2827         ctx->log_transid = 0;
2828         ctx->log_new_dentries = false;
2829         ctx->logging_new_name = false;
2830         ctx->logging_new_delayed_dentries = false;
2831         ctx->logged_before = false;
2832         ctx->inode = inode;
2833         INIT_LIST_HEAD(&ctx->list);
2834         INIT_LIST_HEAD(&ctx->ordered_extents);
2835         INIT_LIST_HEAD(&ctx->conflict_inodes);
2836         ctx->num_conflict_inodes = 0;
2837         ctx->logging_conflict_inodes = false;
2838         ctx->scratch_eb = NULL;
2839 }
2840
2841 void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
2842 {
2843         struct btrfs_inode *inode = ctx->inode;
2844
2845         if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
2846             !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
2847                 return;
2848
2849         /*
2850          * Don't care about allocation failure. This is just for optimization,
2851          * if we fail to allocate here, we will try again later if needed.
2852          */
2853         ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0);
2854 }
2855
2856 void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
2857 {
2858         struct btrfs_ordered_extent *ordered;
2859         struct btrfs_ordered_extent *tmp;
2860
2861         btrfs_assert_inode_locked(ctx->inode);
2862
2863         list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
2864                 list_del_init(&ordered->log_list);
2865                 btrfs_put_ordered_extent(ordered);
2866         }
2867 }
2868
2869
2870 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2871                                         struct btrfs_log_ctx *ctx)
2872 {
2873         mutex_lock(&root->log_mutex);
2874         list_del_init(&ctx->list);
2875         mutex_unlock(&root->log_mutex);
2876 }
2877
2878 /*
2879  * Invoked in log mutex context, or be sure there is no other task which
2880  * can access the list.
2881  */
2882 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2883                                              int index, int error)
2884 {
2885         struct btrfs_log_ctx *ctx;
2886         struct btrfs_log_ctx *safe;
2887
2888         list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2889                 list_del_init(&ctx->list);
2890                 ctx->log_ret = error;
2891         }
2892 }
2893
2894 /*
2895  * Sends a given tree log down to the disk and updates the super blocks to
2896  * record it.  When this call is done, you know that any inodes previously
2897  * logged are safely on disk only if it returns 0.
2898  *
2899  * Any other return value means you need to call btrfs_commit_transaction.
2900  * Some of the edge cases for fsyncing directories that have had unlinks
2901  * or renames done in the past mean that sometimes the only safe
2902  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2903  * that has happened.
2904  */
2905 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2906                    struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2907 {
2908         int index1;
2909         int index2;
2910         int mark;
2911         int ret;
2912         struct btrfs_fs_info *fs_info = root->fs_info;
2913         struct btrfs_root *log = root->log_root;
2914         struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2915         struct btrfs_root_item new_root_item;
2916         int log_transid = 0;
2917         struct btrfs_log_ctx root_log_ctx;
2918         struct blk_plug plug;
2919         u64 log_root_start;
2920         u64 log_root_level;
2921
2922         mutex_lock(&root->log_mutex);
2923         log_transid = ctx->log_transid;
2924         if (root->log_transid_committed >= log_transid) {
2925                 mutex_unlock(&root->log_mutex);
2926                 return ctx->log_ret;
2927         }
2928
2929         index1 = log_transid % 2;
2930         if (atomic_read(&root->log_commit[index1])) {
2931                 wait_log_commit(root, log_transid);
2932                 mutex_unlock(&root->log_mutex);
2933                 return ctx->log_ret;
2934         }
2935         ASSERT(log_transid == root->log_transid);
2936         atomic_set(&root->log_commit[index1], 1);
2937
2938         /* wait for previous tree log sync to complete */
2939         if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2940                 wait_log_commit(root, log_transid - 1);
2941
2942         while (1) {
2943                 int batch = atomic_read(&root->log_batch);
2944                 /* when we're on an ssd, just kick the log commit out */
2945                 if (!btrfs_test_opt(fs_info, SSD) &&
2946                     test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2947                         mutex_unlock(&root->log_mutex);
2948                         schedule_timeout_uninterruptible(1);
2949                         mutex_lock(&root->log_mutex);
2950                 }
2951                 wait_for_writer(root);
2952                 if (batch == atomic_read(&root->log_batch))
2953                         break;
2954         }
2955
2956         /* bail out if we need to do a full commit */
2957         if (btrfs_need_log_full_commit(trans)) {
2958                 ret = BTRFS_LOG_FORCE_COMMIT;
2959                 mutex_unlock(&root->log_mutex);
2960                 goto out;
2961         }
2962
2963         if (log_transid % 2 == 0)
2964                 mark = EXTENT_DIRTY;
2965         else
2966                 mark = EXTENT_NEW;
2967
2968         /* we start IO on  all the marked extents here, but we don't actually
2969          * wait for them until later.
2970          */
2971         blk_start_plug(&plug);
2972         ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
2973         /*
2974          * -EAGAIN happens when someone, e.g., a concurrent transaction
2975          *  commit, writes a dirty extent in this tree-log commit. This
2976          *  concurrent write will create a hole writing out the extents,
2977          *  and we cannot proceed on a zoned filesystem, requiring
2978          *  sequential writing. While we can bail out to a full commit
2979          *  here, but we can continue hoping the concurrent writing fills
2980          *  the hole.
2981          */
2982         if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
2983                 ret = 0;
2984         if (ret) {
2985                 blk_finish_plug(&plug);
2986                 btrfs_set_log_full_commit(trans);
2987                 mutex_unlock(&root->log_mutex);
2988                 goto out;
2989         }
2990
2991         /*
2992          * We _must_ update under the root->log_mutex in order to make sure we
2993          * have a consistent view of the log root we are trying to commit at
2994          * this moment.
2995          *
2996          * We _must_ copy this into a local copy, because we are not holding the
2997          * log_root_tree->log_mutex yet.  This is important because when we
2998          * commit the log_root_tree we must have a consistent view of the
2999          * log_root_tree when we update the super block to point at the
3000          * log_root_tree bytenr.  If we update the log_root_tree here we'll race
3001          * with the commit and possibly point at the new block which we may not
3002          * have written out.
3003          */
3004         btrfs_set_root_node(&log->root_item, log->node);
3005         memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3006
3007         btrfs_set_root_log_transid(root, root->log_transid + 1);
3008         log->log_transid = root->log_transid;
3009         root->log_start_pid = 0;
3010         /*
3011          * IO has been started, blocks of the log tree have WRITTEN flag set
3012          * in their headers. new modifications of the log will be written to
3013          * new positions. so it's safe to allow log writers to go in.
3014          */
3015         mutex_unlock(&root->log_mutex);
3016
3017         if (btrfs_is_zoned(fs_info)) {
3018                 mutex_lock(&fs_info->tree_root->log_mutex);
3019                 if (!log_root_tree->node) {
3020                         ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
3021                         if (ret) {
3022                                 mutex_unlock(&fs_info->tree_root->log_mutex);
3023                                 blk_finish_plug(&plug);
3024                                 goto out;
3025                         }
3026                 }
3027                 mutex_unlock(&fs_info->tree_root->log_mutex);
3028         }
3029
3030         btrfs_init_log_ctx(&root_log_ctx, NULL);
3031
3032         mutex_lock(&log_root_tree->log_mutex);
3033
3034         index2 = log_root_tree->log_transid % 2;
3035         list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3036         root_log_ctx.log_transid = log_root_tree->log_transid;
3037
3038         /*
3039          * Now we are safe to update the log_root_tree because we're under the
3040          * log_mutex, and we're a current writer so we're holding the commit
3041          * open until we drop the log_mutex.
3042          */
3043         ret = update_log_root(trans, log, &new_root_item);
3044         if (ret) {
3045                 list_del_init(&root_log_ctx.list);
3046                 blk_finish_plug(&plug);
3047                 btrfs_set_log_full_commit(trans);
3048                 if (ret != -ENOSPC)
3049                         btrfs_err(fs_info,
3050                                   "failed to update log for root %llu ret %d",
3051                                   btrfs_root_id(root), ret);
3052                 btrfs_wait_tree_log_extents(log, mark);
3053                 mutex_unlock(&log_root_tree->log_mutex);
3054                 goto out;
3055         }
3056
3057         if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3058                 blk_finish_plug(&plug);
3059                 list_del_init(&root_log_ctx.list);
3060                 mutex_unlock(&log_root_tree->log_mutex);
3061                 ret = root_log_ctx.log_ret;
3062                 goto out;
3063         }
3064
3065         if (atomic_read(&log_root_tree->log_commit[index2])) {
3066                 blk_finish_plug(&plug);
3067                 ret = btrfs_wait_tree_log_extents(log, mark);
3068                 wait_log_commit(log_root_tree,
3069                                 root_log_ctx.log_transid);
3070                 mutex_unlock(&log_root_tree->log_mutex);
3071                 if (!ret)
3072                         ret = root_log_ctx.log_ret;
3073                 goto out;
3074         }
3075         ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3076         atomic_set(&log_root_tree->log_commit[index2], 1);
3077
3078         if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3079                 wait_log_commit(log_root_tree,
3080                                 root_log_ctx.log_transid - 1);
3081         }
3082
3083         /*
3084          * now that we've moved on to the tree of log tree roots,
3085          * check the full commit flag again
3086          */
3087         if (btrfs_need_log_full_commit(trans)) {
3088                 blk_finish_plug(&plug);
3089                 btrfs_wait_tree_log_extents(log, mark);
3090                 mutex_unlock(&log_root_tree->log_mutex);
3091                 ret = BTRFS_LOG_FORCE_COMMIT;
3092                 goto out_wake_log_root;
3093         }
3094
3095         ret = btrfs_write_marked_extents(fs_info,
3096                                          &log_root_tree->dirty_log_pages,
3097                                          EXTENT_DIRTY | EXTENT_NEW);
3098         blk_finish_plug(&plug);
3099         /*
3100          * As described above, -EAGAIN indicates a hole in the extents. We
3101          * cannot wait for these write outs since the waiting would cause a
3102          * deadlock. Bail out to the full commit instead.
3103          */
3104         if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3105                 btrfs_set_log_full_commit(trans);
3106                 btrfs_wait_tree_log_extents(log, mark);
3107                 mutex_unlock(&log_root_tree->log_mutex);
3108                 goto out_wake_log_root;
3109         } else if (ret) {
3110                 btrfs_set_log_full_commit(trans);
3111                 mutex_unlock(&log_root_tree->log_mutex);
3112                 goto out_wake_log_root;
3113         }
3114         ret = btrfs_wait_tree_log_extents(log, mark);
3115         if (!ret)
3116                 ret = btrfs_wait_tree_log_extents(log_root_tree,
3117                                                   EXTENT_NEW | EXTENT_DIRTY);
3118         if (ret) {
3119                 btrfs_set_log_full_commit(trans);
3120                 mutex_unlock(&log_root_tree->log_mutex);
3121                 goto out_wake_log_root;
3122         }
3123
3124         log_root_start = log_root_tree->node->start;
3125         log_root_level = btrfs_header_level(log_root_tree->node);
3126         log_root_tree->log_transid++;
3127         mutex_unlock(&log_root_tree->log_mutex);
3128
3129         /*
3130          * Here we are guaranteed that nobody is going to write the superblock
3131          * for the current transaction before us and that we do not write
3132          * our superblock before the previous transaction finishes its commit
3133          * and writes its superblock, because:
3134          *
3135          * 1) We are holding a handle on the current transaction, so nobody
3136          *    can commit it until we release the handle;
3137          *
3138          * 2) Before writing our superblock we acquire the tree_log_mutex, so
3139          *    if the previous transaction is still committing, and hasn't yet
3140          *    written its superblock, we wait for it to do it, because a
3141          *    transaction commit acquires the tree_log_mutex when the commit
3142          *    begins and releases it only after writing its superblock.
3143          */
3144         mutex_lock(&fs_info->tree_log_mutex);
3145
3146         /*
3147          * The previous transaction writeout phase could have failed, and thus
3148          * marked the fs in an error state.  We must not commit here, as we
3149          * could have updated our generation in the super_for_commit and
3150          * writing the super here would result in transid mismatches.  If there
3151          * is an error here just bail.
3152          */
3153         if (BTRFS_FS_ERROR(fs_info)) {
3154                 ret = -EIO;
3155                 btrfs_set_log_full_commit(trans);
3156                 btrfs_abort_transaction(trans, ret);
3157                 mutex_unlock(&fs_info->tree_log_mutex);
3158                 goto out_wake_log_root;
3159         }
3160
3161         btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
3162         btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3163         ret = write_all_supers(fs_info, 1);
3164         mutex_unlock(&fs_info->tree_log_mutex);
3165         if (ret) {
3166                 btrfs_set_log_full_commit(trans);
3167                 btrfs_abort_transaction(trans, ret);
3168                 goto out_wake_log_root;
3169         }
3170
3171         /*
3172          * We know there can only be one task here, since we have not yet set
3173          * root->log_commit[index1] to 0 and any task attempting to sync the
3174          * log must wait for the previous log transaction to commit if it's
3175          * still in progress or wait for the current log transaction commit if
3176          * someone else already started it. We use <= and not < because the
3177          * first log transaction has an ID of 0.
3178          */
3179         ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid);
3180         btrfs_set_root_last_log_commit(root, log_transid);
3181
3182 out_wake_log_root:
3183         mutex_lock(&log_root_tree->log_mutex);
3184         btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3185
3186         log_root_tree->log_transid_committed++;
3187         atomic_set(&log_root_tree->log_commit[index2], 0);
3188         mutex_unlock(&log_root_tree->log_mutex);
3189
3190         /*
3191          * The barrier before waitqueue_active (in cond_wake_up) is needed so
3192          * all the updates above are seen by the woken threads. It might not be
3193          * necessary, but proving that seems to be hard.
3194          */
3195         cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3196 out:
3197         mutex_lock(&root->log_mutex);
3198         btrfs_remove_all_log_ctxs(root, index1, ret);
3199         root->log_transid_committed++;
3200         atomic_set(&root->log_commit[index1], 0);
3201         mutex_unlock(&root->log_mutex);
3202
3203         /*
3204          * The barrier before waitqueue_active (in cond_wake_up) is needed so
3205          * all the updates above are seen by the woken threads. It might not be
3206          * necessary, but proving that seems to be hard.
3207          */
3208         cond_wake_up(&root->log_commit_wait[index1]);
3209         return ret;
3210 }
3211
3212 static void free_log_tree(struct btrfs_trans_handle *trans,
3213                           struct btrfs_root *log)
3214 {
3215         int ret;
3216         struct walk_control wc = {
3217                 .free = 1,
3218                 .process_func = process_one_buffer
3219         };
3220
3221         if (log->node) {
3222                 ret = walk_log_tree(trans, log, &wc);
3223                 if (ret) {
3224                         /*
3225                          * We weren't able to traverse the entire log tree, the
3226                          * typical scenario is getting an -EIO when reading an
3227                          * extent buffer of the tree, due to a previous writeback
3228                          * failure of it.
3229                          */
3230                         set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
3231                                 &log->fs_info->fs_state);
3232
3233                         /*
3234                          * Some extent buffers of the log tree may still be dirty
3235                          * and not yet written back to storage, because we may
3236                          * have updates to a log tree without syncing a log tree,
3237                          * such as during rename and link operations. So flush
3238                          * them out and wait for their writeback to complete, so
3239                          * that we properly cleanup their state and pages.
3240                          */
3241                         btrfs_write_marked_extents(log->fs_info,
3242                                                    &log->dirty_log_pages,
3243                                                    EXTENT_DIRTY | EXTENT_NEW);
3244                         btrfs_wait_tree_log_extents(log,
3245                                                     EXTENT_DIRTY | EXTENT_NEW);
3246
3247                         if (trans)
3248                                 btrfs_abort_transaction(trans, ret);
3249                         else
3250                                 btrfs_handle_fs_error(log->fs_info, ret, NULL);
3251                 }
3252         }
3253
3254         btrfs_extent_io_tree_release(&log->dirty_log_pages);
3255         btrfs_extent_io_tree_release(&log->log_csum_range);
3256
3257         btrfs_put_root(log);
3258 }
3259
3260 /*
3261  * free all the extents used by the tree log.  This should be called
3262  * at commit time of the full transaction
3263  */
3264 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3265 {
3266         if (root->log_root) {
3267                 free_log_tree(trans, root->log_root);
3268                 root->log_root = NULL;
3269                 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3270         }
3271         return 0;
3272 }
3273
3274 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3275                              struct btrfs_fs_info *fs_info)
3276 {
3277         if (fs_info->log_root_tree) {
3278                 free_log_tree(trans, fs_info->log_root_tree);
3279                 fs_info->log_root_tree = NULL;
3280                 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
3281         }
3282         return 0;
3283 }
3284
3285 /*
3286  * Check if an inode was logged in the current transaction. This correctly deals
3287  * with the case where the inode was logged but has a logged_trans of 0, which
3288  * happens if the inode is evicted and loaded again, as logged_trans is an in
3289  * memory only field (not persisted).
3290  *
3291  * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
3292  * and < 0 on error.
3293  */
3294 static int inode_logged(const struct btrfs_trans_handle *trans,
3295                         struct btrfs_inode *inode,
3296                         struct btrfs_path *path_in)
3297 {
3298         struct btrfs_path *path = path_in;
3299         struct btrfs_key key;
3300         int ret;
3301
3302         if (inode->logged_trans == trans->transid)
3303                 return 1;
3304
3305         /*
3306          * If logged_trans is not 0, then we know the inode logged was not logged
3307          * in this transaction, so we can return false right away.
3308          */
3309         if (inode->logged_trans > 0)
3310                 return 0;
3311
3312         /*
3313          * If no log tree was created for this root in this transaction, then
3314          * the inode can not have been logged in this transaction. In that case
3315          * set logged_trans to anything greater than 0 and less than the current
3316          * transaction's ID, to avoid the search below in a future call in case
3317          * a log tree gets created after this.
3318          */
3319         if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
3320                 inode->logged_trans = trans->transid - 1;
3321                 return 0;
3322         }
3323
3324         /*
3325          * We have a log tree and the inode's logged_trans is 0. We can't tell
3326          * for sure if the inode was logged before in this transaction by looking
3327          * only at logged_trans. We could be pessimistic and assume it was, but
3328          * that can lead to unnecessarily logging an inode during rename and link
3329          * operations, and then further updating the log in followup rename and
3330          * link operations, specially if it's a directory, which adds latency
3331          * visible to applications doing a series of rename or link operations.
3332          *
3333          * A logged_trans of 0 here can mean several things:
3334          *
3335          * 1) The inode was never logged since the filesystem was mounted, and may
3336          *    or may have not been evicted and loaded again;
3337          *
3338          * 2) The inode was logged in a previous transaction, then evicted and
3339          *    then loaded again;
3340          *
3341          * 3) The inode was logged in the current transaction, then evicted and
3342          *    then loaded again.
3343          *
3344          * For cases 1) and 2) we don't want to return true, but we need to detect
3345          * case 3) and return true. So we do a search in the log root for the inode
3346          * item.
3347          */
3348         key.objectid = btrfs_ino(inode);
3349         key.type = BTRFS_INODE_ITEM_KEY;
3350         key.offset = 0;
3351
3352         if (!path) {
3353                 path = btrfs_alloc_path();
3354                 if (!path)
3355                         return -ENOMEM;
3356         }
3357
3358         ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
3359
3360         if (path_in)
3361                 btrfs_release_path(path);
3362         else
3363                 btrfs_free_path(path);
3364
3365         /*
3366          * Logging an inode always results in logging its inode item. So if we
3367          * did not find the item we know the inode was not logged for sure.
3368          */
3369         if (ret < 0) {
3370                 return ret;
3371         } else if (ret > 0) {
3372                 /*
3373                  * Set logged_trans to a value greater than 0 and less then the
3374                  * current transaction to avoid doing the search in future calls.
3375                  */
3376                 inode->logged_trans = trans->transid - 1;
3377                 return 0;
3378         }
3379
3380         /*
3381          * The inode was previously logged and then evicted, set logged_trans to
3382          * the current transacion's ID, to avoid future tree searches as long as
3383          * the inode is not evicted again.
3384          */
3385         inode->logged_trans = trans->transid;
3386
3387         /*
3388          * If it's a directory, then we must set last_dir_index_offset to the
3389          * maximum possible value, so that the next attempt to log the inode does
3390          * not skip checking if dir index keys found in modified subvolume tree
3391          * leaves have been logged before, otherwise it would result in attempts
3392          * to insert duplicate dir index keys in the log tree. This must be done
3393          * because last_dir_index_offset is an in-memory only field, not persisted
3394          * in the inode item or any other on-disk structure, so its value is lost
3395          * once the inode is evicted.
3396          */
3397         if (S_ISDIR(inode->vfs_inode.i_mode))
3398                 inode->last_dir_index_offset = (u64)-1;
3399
3400         return 1;
3401 }
3402
3403 /*
3404  * Delete a directory entry from the log if it exists.
3405  *
3406  * Returns < 0 on error
3407  *           1 if the entry does not exists
3408  *           0 if the entry existed and was successfully deleted
3409  */
3410 static int del_logged_dentry(struct btrfs_trans_handle *trans,
3411                              struct btrfs_root *log,
3412                              struct btrfs_path *path,
3413                              u64 dir_ino,
3414                              const struct fscrypt_str *name,
3415                              u64 index)
3416 {
3417         struct btrfs_dir_item *di;
3418
3419         /*
3420          * We only log dir index items of a directory, so we don't need to look
3421          * for dir item keys.
3422          */
3423         di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3424                                          index, name, -1);
3425         if (IS_ERR(di))
3426                 return PTR_ERR(di);
3427         else if (!di)
3428                 return 1;
3429
3430         /*
3431          * We do not need to update the size field of the directory's
3432          * inode item because on log replay we update the field to reflect
3433          * all existing entries in the directory (see overwrite_item()).
3434          */
3435         return btrfs_delete_one_dir_name(trans, log, path, di);
3436 }
3437
3438 /*
3439  * If both a file and directory are logged, and unlinks or renames are
3440  * mixed in, we have a few interesting corners:
3441  *
3442  * create file X in dir Y
3443  * link file X to X.link in dir Y
3444  * fsync file X
3445  * unlink file X but leave X.link
3446  * fsync dir Y
3447  *
3448  * After a crash we would expect only X.link to exist.  But file X
3449  * didn't get fsync'd again so the log has back refs for X and X.link.
3450  *
3451  * We solve this by removing directory entries and inode backrefs from the
3452  * log when a file that was logged in the current transaction is
3453  * unlinked.  Any later fsync will include the updated log entries, and
3454  * we'll be able to reconstruct the proper directory items from backrefs.
3455  *
3456  * This optimizations allows us to avoid relogging the entire inode
3457  * or the entire directory.
3458  */
3459 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3460                                   struct btrfs_root *root,
3461                                   const struct fscrypt_str *name,
3462                                   struct btrfs_inode *dir, u64 index)
3463 {
3464         struct btrfs_path *path;
3465         int ret;
3466
3467         ret = inode_logged(trans, dir, NULL);
3468         if (ret == 0)
3469                 return;
3470         else if (ret < 0) {
3471                 btrfs_set_log_full_commit(trans);
3472                 return;
3473         }
3474
3475         ret = join_running_log_trans(root);
3476         if (ret)
3477                 return;
3478
3479         mutex_lock(&dir->log_mutex);
3480
3481         path = btrfs_alloc_path();
3482         if (!path) {
3483                 ret = -ENOMEM;
3484                 goto out_unlock;
3485         }
3486
3487         ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
3488                                 name, index);
3489         btrfs_free_path(path);
3490 out_unlock:
3491         mutex_unlock(&dir->log_mutex);
3492         if (ret < 0)
3493                 btrfs_set_log_full_commit(trans);
3494         btrfs_end_log_trans(root);
3495 }
3496
3497 /* see comments for btrfs_del_dir_entries_in_log */
3498 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3499                                 struct btrfs_root *root,
3500                                 const struct fscrypt_str *name,
3501                                 struct btrfs_inode *inode, u64 dirid)
3502 {
3503         struct btrfs_root *log;
3504         u64 index;
3505         int ret;
3506
3507         ret = inode_logged(trans, inode, NULL);
3508         if (ret == 0)
3509                 return;
3510         else if (ret < 0) {
3511                 btrfs_set_log_full_commit(trans);
3512                 return;
3513         }
3514
3515         ret = join_running_log_trans(root);
3516         if (ret)
3517                 return;
3518         log = root->log_root;
3519         mutex_lock(&inode->log_mutex);
3520
3521         ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode),
3522                                   dirid, &index);
3523         mutex_unlock(&inode->log_mutex);
3524         if (ret < 0 && ret != -ENOENT)
3525                 btrfs_set_log_full_commit(trans);
3526         btrfs_end_log_trans(root);
3527 }
3528
3529 /*
3530  * creates a range item in the log for 'dirid'.  first_offset and
3531  * last_offset tell us which parts of the key space the log should
3532  * be considered authoritative for.
3533  */
3534 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3535                                        struct btrfs_root *log,
3536                                        struct btrfs_path *path,
3537                                        u64 dirid,
3538                                        u64 first_offset, u64 last_offset)
3539 {
3540         int ret;
3541         struct btrfs_key key;
3542         struct btrfs_dir_log_item *item;
3543
3544         key.objectid = dirid;
3545         key.type = BTRFS_DIR_LOG_INDEX_KEY;
3546         key.offset = first_offset;
3547         ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3548         /*
3549          * -EEXIST is fine and can happen sporadically when we are logging a
3550          * directory and have concurrent insertions in the subvolume's tree for
3551          * items from other inodes and that result in pushing off some dir items
3552          * from one leaf to another in order to accommodate for the new items.
3553          * This results in logging the same dir index range key.
3554          */
3555         if (ret && ret != -EEXIST)
3556                 return ret;
3557
3558         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3559                               struct btrfs_dir_log_item);
3560         if (ret == -EEXIST) {
3561                 const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
3562
3563                 /*
3564                  * btrfs_del_dir_entries_in_log() might have been called during
3565                  * an unlink between the initial insertion of this key and the
3566                  * current update, or we might be logging a single entry deletion
3567                  * during a rename, so set the new last_offset to the max value.
3568                  */
3569                 last_offset = max(last_offset, curr_end);
3570         }
3571         btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3572         btrfs_release_path(path);
3573         return 0;
3574 }
3575
3576 static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
3577                                  struct btrfs_inode *inode,
3578                                  struct extent_buffer *src,
3579                                  struct btrfs_path *dst_path,
3580                                  int start_slot,
3581                                  int count)
3582 {
3583         struct btrfs_root *log = inode->root->log_root;
3584         char *ins_data = NULL;
3585         struct btrfs_item_batch batch;
3586         struct extent_buffer *dst;
3587         unsigned long src_offset;
3588         unsigned long dst_offset;
3589         u64 last_index;
3590         struct btrfs_key key;
3591         u32 item_size;
3592         int ret;
3593         int i;
3594
3595         ASSERT(count > 0);
3596         batch.nr = count;
3597
3598         if (count == 1) {
3599                 btrfs_item_key_to_cpu(src, &key, start_slot);
3600                 item_size = btrfs_item_size(src, start_slot);
3601                 batch.keys = &key;
3602                 batch.data_sizes = &item_size;
3603                 batch.total_data_size = item_size;
3604         } else {
3605                 struct btrfs_key *ins_keys;
3606                 u32 *ins_sizes;
3607
3608                 ins_data = kmalloc(count * sizeof(u32) +
3609                                    count * sizeof(struct btrfs_key), GFP_NOFS);
3610                 if (!ins_data)
3611                         return -ENOMEM;
3612
3613                 ins_sizes = (u32 *)ins_data;
3614                 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
3615                 batch.keys = ins_keys;
3616                 batch.data_sizes = ins_sizes;
3617                 batch.total_data_size = 0;
3618
3619                 for (i = 0; i < count; i++) {
3620                         const int slot = start_slot + i;
3621
3622                         btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
3623                         ins_sizes[i] = btrfs_item_size(src, slot);
3624                         batch.total_data_size += ins_sizes[i];
3625                 }
3626         }
3627
3628         ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
3629         if (ret)
3630                 goto out;
3631
3632         dst = dst_path->nodes[0];
3633         /*
3634          * Copy all the items in bulk, in a single copy operation. Item data is
3635          * organized such that it's placed at the end of a leaf and from right
3636          * to left. For example, the data for the second item ends at an offset
3637          * that matches the offset where the data for the first item starts, the
3638          * data for the third item ends at an offset that matches the offset
3639          * where the data of the second items starts, and so on.
3640          * Therefore our source and destination start offsets for copy match the
3641          * offsets of the last items (highest slots).
3642          */
3643         dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
3644         src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
3645         copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
3646         btrfs_release_path(dst_path);
3647
3648         last_index = batch.keys[count - 1].offset;
3649         ASSERT(last_index > inode->last_dir_index_offset);
3650
3651         /*
3652          * If for some unexpected reason the last item's index is not greater
3653          * than the last index we logged, warn and force a transaction commit.
3654          */
3655         if (WARN_ON(last_index <= inode->last_dir_index_offset))
3656                 ret = BTRFS_LOG_FORCE_COMMIT;
3657         else
3658                 inode->last_dir_index_offset = last_index;
3659
3660         if (btrfs_get_first_dir_index_to_log(inode) == 0)
3661                 btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
3662 out:
3663         kfree(ins_data);
3664
3665         return ret;
3666 }
3667
3668 static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
3669 {
3670         const int slot = path->slots[0];
3671
3672         if (ctx->scratch_eb) {
3673                 copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]);
3674         } else {
3675                 ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]);
3676                 if (!ctx->scratch_eb)
3677                         return -ENOMEM;
3678         }
3679
3680         btrfs_release_path(path);
3681         path->nodes[0] = ctx->scratch_eb;
3682         path->slots[0] = slot;
3683         /*
3684          * Add extra ref to scratch eb so that it is not freed when callers
3685          * release the path, so we can reuse it later if needed.
3686          */
3687         atomic_inc(&ctx->scratch_eb->refs);
3688
3689         return 0;
3690 }
3691
3692 static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
3693                                   struct btrfs_inode *inode,
3694                                   struct btrfs_path *path,
3695                                   struct btrfs_path *dst_path,
3696                                   struct btrfs_log_ctx *ctx,
3697                                   u64 *last_old_dentry_offset)
3698 {
3699         struct btrfs_root *log = inode->root->log_root;
3700         struct extent_buffer *src;
3701         const int nritems = btrfs_header_nritems(path->nodes[0]);
3702         const u64 ino = btrfs_ino(inode);
3703         bool last_found = false;
3704         int batch_start = 0;
3705         int batch_size = 0;
3706         int ret;
3707
3708         /*
3709          * We need to clone the leaf, release the read lock on it, and use the
3710          * clone before modifying the log tree. See the comment at copy_items()
3711          * about why we need to do this.
3712          */
3713         ret = clone_leaf(path, ctx);
3714         if (ret < 0)
3715                 return ret;
3716
3717         src = path->nodes[0];
3718
3719         for (int i = path->slots[0]; i < nritems; i++) {
3720                 struct btrfs_dir_item *di;
3721                 struct btrfs_key key;
3722                 int ret;
3723
3724                 btrfs_item_key_to_cpu(src, &key, i);
3725
3726                 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
3727                         last_found = true;
3728                         break;
3729                 }
3730
3731                 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3732
3733                 /*
3734                  * Skip ranges of items that consist only of dir item keys created
3735                  * in past transactions. However if we find a gap, we must log a
3736                  * dir index range item for that gap, so that index keys in that
3737                  * gap are deleted during log replay.
3738                  */
3739                 if (btrfs_dir_transid(src, di) < trans->transid) {
3740                         if (key.offset > *last_old_dentry_offset + 1) {
3741                                 ret = insert_dir_log_key(trans, log, dst_path,
3742                                                  ino, *last_old_dentry_offset + 1,
3743                                                  key.offset - 1);
3744                                 if (ret < 0)
3745                                         return ret;
3746                         }
3747
3748                         *last_old_dentry_offset = key.offset;
3749                         continue;
3750                 }
3751
3752                 /* If we logged this dir index item before, we can skip it. */
3753                 if (key.offset <= inode->last_dir_index_offset)
3754                         continue;
3755
3756                 /*
3757                  * We must make sure that when we log a directory entry, the
3758                  * corresponding inode, after log replay, has a matching link
3759                  * count. For example:
3760                  *
3761                  * touch foo
3762                  * mkdir mydir
3763                  * sync
3764                  * ln foo mydir/bar
3765                  * xfs_io -c "fsync" mydir
3766                  * <crash>
3767                  * <mount fs and log replay>
3768                  *
3769                  * Would result in a fsync log that when replayed, our file inode
3770                  * would have a link count of 1, but we get two directory entries
3771                  * pointing to the same inode. After removing one of the names,
3772                  * it would not be possible to remove the other name, which
3773                  * resulted always in stale file handle errors, and would not be
3774                  * possible to rmdir the parent directory, since its i_size could
3775                  * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
3776                  * resulting in -ENOTEMPTY errors.
3777                  */
3778                 if (!ctx->log_new_dentries) {
3779                         struct btrfs_key di_key;
3780
3781                         btrfs_dir_item_key_to_cpu(src, di, &di_key);
3782                         if (di_key.type != BTRFS_ROOT_ITEM_KEY)
3783                                 ctx->log_new_dentries = true;
3784                 }
3785
3786                 if (batch_size == 0)
3787                         batch_start = i;
3788                 batch_size++;
3789         }
3790
3791         if (batch_size > 0) {
3792                 int ret;
3793
3794                 ret = flush_dir_items_batch(trans, inode, src, dst_path,
3795                                             batch_start, batch_size);
3796                 if (ret < 0)
3797                         return ret;
3798         }
3799
3800         return last_found ? 1 : 0;
3801 }
3802
/*
 * Log all the dir index items included in the current transaction for a
 * given directory, starting at index @min_offset. This also creates the
 * range items in the log tree required to replay anything deleted before
 * the fsync.
 *
 * @trans:           transaction handle
 * @inode:           the directory inode being logged
 * @path:            path used for searching the directory's root
 * @dst_path:        path passed to process_dir_items_leaf() for the
 *                   insertions into the log tree
 * @ctx:             log context, passed to process_dir_items_leaf()
 * @min_offset:      first dir index offset to consider
 * @last_offset_ret: set, only when 0 is returned, to the last offset of the
 *                   range handled by this call. The caller keeps calling
 *                   with this value plus 1 until it gets (u64)-1.
 *
 * Returns 0 on success, otherwise the error returned by the helper that
 * failed.
 */
static noinline int log_dir_items(struct btrfs_trans_handle *trans,
                          struct btrfs_inode *inode,
                          struct btrfs_path *path,
                          struct btrfs_path *dst_path,
                          struct btrfs_log_ctx *ctx,
                          u64 min_offset, u64 *last_offset_ret)
{
        struct btrfs_key min_key;
        struct btrfs_root *root = inode->root;
        struct btrfs_root *log = root->log_root;
        int ret;
        /* The range item we log at the end starts right after this offset. */
        u64 last_old_dentry_offset = min_offset - 1;
        /* The range item we log at the end finishes at this offset. */
        u64 last_offset = (u64)-1;
        u64 ino = btrfs_ino(inode);

        min_key.objectid = ino;
        min_key.type = BTRFS_DIR_INDEX_KEY;
        min_key.offset = min_offset;

        ret = btrfs_search_forward(root, &min_key, path, trans->transid);

        /*
         * We didn't find anything from this transaction, see if there
         * is anything at all.
         */
        if (ret != 0 || min_key.objectid != ino ||
            min_key.type != BTRFS_DIR_INDEX_KEY) {
                min_key.objectid = ino;
                min_key.type = BTRFS_DIR_INDEX_KEY;
                min_key.offset = (u64)-1;
                btrfs_release_path(path);
                ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
                if (ret < 0) {
                        btrfs_release_path(path);
                        return ret;
                }
                ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);

                /*
                 * If ret == 0 there are items for this type, create a range
                 * to tell us the last key of this type. Otherwise, there are
                 * no items in this directory after *min_offset, and we create
                 * a range to indicate that.
                 */
                if (ret == 0) {
                        struct btrfs_key tmp;

                        btrfs_item_key_to_cpu(path->nodes[0], &tmp,
                                              path->slots[0]);
                        if (tmp.type == BTRFS_DIR_INDEX_KEY)
                                last_old_dentry_offset = tmp.offset;
                } else if (ret > 0) {
                        /* No previous item found, that is not an error. */
                        ret = 0;
                }

                goto done;
        }

        /* Go backward to find any previous key. */
        ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
        if (ret == 0) {
                struct btrfs_key tmp;

                btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
                /*
                 * The dir index key before the first one we found that needs to
                 * be logged might be in a previous leaf, and there might be a
                 * gap between these keys, meaning that we had deletions that
                 * happened. So the key range item we log (key type
                 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
                 * previous key's offset plus 1, so that those deletes are replayed.
                 */
                if (tmp.type == BTRFS_DIR_INDEX_KEY)
                        last_old_dentry_offset = tmp.offset;
        } else if (ret < 0) {
                goto done;
        }

        btrfs_release_path(path);

        /*
         * Find the first key from this transaction again or the one we were at
         * in the loop below in case we had to reschedule. We may be logging the
         * directory without holding its VFS lock, which happens when logging new
         * dentries (through log_new_dir_dentries()) or in some cases when we
         * need to log the parent directory of an inode. This means a dir index
         * key might be deleted from the inode's root, and therefore we may not
         * find it anymore. If we can't find it, just move to the next key. We
         * can not bail out and ignore, because if we do that we will simply
         * not log dir index keys that come after the one that was just deleted
         * and we can end up logging a dir index range that ends at (u64)-1
         * (@last_offset is initialized to that), resulting in removing dir
         * entries we should not remove at log replay time.
         */
search:
        ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
        if (ret > 0) {
                ret = btrfs_next_item(root, path);
                if (ret > 0) {
                        /* There are no more keys in the inode's root. */
                        ret = 0;
                        goto done;
                }
        }
        if (ret < 0)
                goto done;

        /*
         * We have a block from this transaction, log every item in it
         * from our directory.
         */
        while (1) {
                ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
                                             &last_old_dentry_offset);
                if (ret != 0) {
                        /*
                         * A value > 0 means an item that is not a dir index
                         * key of our directory was found, so we are done and
                         * it is not an error.
                         */
                        if (ret > 0)
                                ret = 0;
                        goto done;
                }
                /*
                 * NOTE(review): presumably this moves the slot past the last
                 * item so that btrfs_next_leaf() continues from the end of
                 * the leaf we just processed - confirm.
                 */
                path->slots[0] = btrfs_header_nritems(path->nodes[0]);

                /*
                 * Look ahead to the next item and see if it is also
                 * from this directory and from this transaction.
                 */
                ret = btrfs_next_leaf(root, path);
                if (ret) {
                        if (ret == 1) {
                                /* No next leaf, the range goes up to the end. */
                                last_offset = (u64)-1;
                                ret = 0;
                        }
                        goto done;
                }
                btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
                if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
                        last_offset = (u64)-1;
                        goto done;
                }
                if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
                        /*
                         * The next leaf was not changed in the current transaction
                         * and has at least one dir index key.
                         * We check for the next key because there might have been
                         * one or more deletions between the last key we logged and
                         * that next key. So the key range item we log (key type
                         * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
                         * offset minus 1, so that those deletes are replayed.
                         */
                        last_offset = min_key.offset - 1;
                        goto done;
                }
                if (need_resched()) {
                        /*
                         * Release the path before rescheduling and then search
                         * again for @min_key, which was set to the first key of
                         * the next leaf above.
                         */
                        btrfs_release_path(path);
                        cond_resched();
                        goto search;
                }
        }
done:
        btrfs_release_path(path);
        btrfs_release_path(dst_path);

        if (ret == 0) {
                *last_offset_ret = last_offset;
                /*
                 * In case the leaf was changed in the current transaction but
                 * all its dir items are from a past transaction, the last item
                 * in the leaf is a dir item and there's no gap between that last
                 * dir item and the first one on the next leaf (which did not
                 * change in the current transaction), then we don't need to log
                 * a range, last_old_dentry_offset is == to last_offset.
                 */
                ASSERT(last_old_dentry_offset <= last_offset);
                if (last_old_dentry_offset < last_offset)
                        ret = insert_dir_log_key(trans, log, path, ino,
                                                 last_old_dentry_offset + 1,
                                                 last_offset);
        }

        return ret;
}
3987
3988 /*
3989  * If the inode was logged before and it was evicted, then its
3990  * last_dir_index_offset is (u64)-1, so we don't the value of the last index
3991  * key offset. If that's the case, search for it and update the inode. This
3992  * is to avoid lookups in the log tree every time we try to insert a dir index
3993  * key from a leaf changed in the current transaction, and to allow us to always
3994  * do batch insertions of dir index keys.
3995  */
3996 static int update_last_dir_index_offset(struct btrfs_inode *inode,
3997                                         struct btrfs_path *path,
3998                                         const struct btrfs_log_ctx *ctx)
3999 {
4000         const u64 ino = btrfs_ino(inode);
4001         struct btrfs_key key;
4002         int ret;
4003
4004         lockdep_assert_held(&inode->log_mutex);
4005
4006         if (inode->last_dir_index_offset != (u64)-1)
4007                 return 0;
4008
4009         if (!ctx->logged_before) {
4010                 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
4011                 return 0;
4012         }
4013
4014         key.objectid = ino;
4015         key.type = BTRFS_DIR_INDEX_KEY;
4016         key.offset = (u64)-1;
4017
4018         ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
4019         /*
4020          * An error happened or we actually have an index key with an offset
4021          * value of (u64)-1. Bail out, we're done.
4022          */
4023         if (ret <= 0)
4024                 goto out;
4025
4026         ret = 0;
4027         inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
4028
4029         /*
4030          * No dir index items, bail out and leave last_dir_index_offset with
4031          * the value right before the first valid index value.
4032          */
4033         if (path->slots[0] == 0)
4034                 goto out;
4035
4036         /*
4037          * btrfs_search_slot() left us at one slot beyond the slot with the last
4038          * index key, or beyond the last key of the directory that is not an
4039          * index key. If we have an index key before, set last_dir_index_offset
4040          * to its offset value, otherwise leave it with a value right before the
4041          * first valid index value, as it means we have an empty directory.
4042          */
4043         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
4044         if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
4045                 inode->last_dir_index_offset = key.offset;
4046
4047 out:
4048         btrfs_release_path(path);
4049
4050         return ret;
4051 }
4052
4053 /*
4054  * logging directories is very similar to logging inodes, We find all the items
4055  * from the current transaction and write them to the log.
4056  *
4057  * The recovery code scans the directory in the subvolume, and if it finds a
4058  * key in the range logged that is not present in the log tree, then it means
4059  * that dir entry was unlinked during the transaction.
4060  *
4061  * In order for that scan to work, we must include one key smaller than
4062  * the smallest logged by this transaction and one key larger than the largest
4063  * key logged by this transaction.
4064  */
4065 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
4066                           struct btrfs_inode *inode,
4067                           struct btrfs_path *path,
4068                           struct btrfs_path *dst_path,
4069                           struct btrfs_log_ctx *ctx)
4070 {
4071         u64 min_key;
4072         u64 max_key;
4073         int ret;
4074
4075         ret = update_last_dir_index_offset(inode, path, ctx);
4076         if (ret)
4077                 return ret;
4078
4079         min_key = BTRFS_DIR_START_INDEX;
4080         max_key = 0;
4081
4082         while (1) {
4083                 ret = log_dir_items(trans, inode, path, dst_path,
4084                                 ctx, min_key, &max_key);
4085                 if (ret)
4086                         return ret;
4087                 if (max_key == (u64)-1)
4088                         break;
4089                 min_key = max_key + 1;
4090         }
4091
4092         return 0;
4093 }
4094
4095 /*
4096  * a helper function to drop items from the log before we relog an
4097  * inode.  max_key_type indicates the highest item type to remove.
4098  * This cannot be run for file data extents because it does not
4099  * free the extents they point to.
4100  */
4101 static int drop_inode_items(struct btrfs_trans_handle *trans,
4102                                   struct btrfs_root *log,
4103                                   struct btrfs_path *path,
4104                                   struct btrfs_inode *inode,
4105                                   int max_key_type)
4106 {
4107         int ret;
4108         struct btrfs_key key;
4109         struct btrfs_key found_key;
4110         int start_slot;
4111
4112         key.objectid = btrfs_ino(inode);
4113         key.type = max_key_type;
4114         key.offset = (u64)-1;
4115
4116         while (1) {
4117                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
4118                 if (ret < 0) {
4119                         break;
4120                 } else if (ret > 0) {
4121                         if (path->slots[0] == 0)
4122                                 break;
4123                         path->slots[0]--;
4124                 }
4125
4126                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4127                                       path->slots[0]);
4128
4129                 if (found_key.objectid != key.objectid)
4130                         break;
4131
4132                 found_key.offset = 0;
4133                 found_key.type = 0;
4134                 ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot);
4135                 if (ret < 0)
4136                         break;
4137
4138                 ret = btrfs_del_items(trans, log, path, start_slot,
4139                                       path->slots[0] - start_slot + 1);
4140                 /*
4141                  * If start slot isn't 0 then we don't need to re-search, we've
4142                  * found the last guy with the objectid in this tree.
4143                  */
4144                 if (ret || start_slot != 0)
4145                         break;
4146                 btrfs_release_path(path);
4147         }
4148         btrfs_release_path(path);
4149         if (ret > 0)
4150                 ret = 0;
4151         return ret;
4152 }
4153
4154 static int truncate_inode_items(struct btrfs_trans_handle *trans,
4155                                 struct btrfs_root *log_root,
4156                                 struct btrfs_inode *inode,
4157                                 u64 new_size, u32 min_type)
4158 {
4159         struct btrfs_truncate_control control = {
4160                 .new_size = new_size,
4161                 .ino = btrfs_ino(inode),
4162                 .min_type = min_type,
4163                 .skip_ref_updates = true,
4164         };
4165
4166         return btrfs_truncate_inode_items(trans, log_root, &control);
4167 }
4168
4169 static void fill_inode_item(struct btrfs_trans_handle *trans,
4170                             struct extent_buffer *leaf,
4171                             struct btrfs_inode_item *item,
4172                             struct inode *inode, int log_inode_only,
4173                             u64 logged_isize)
4174 {
4175         struct btrfs_map_token token;
4176         u64 flags;
4177
4178         btrfs_init_map_token(&token, leaf);
4179
4180         if (log_inode_only) {
4181                 /* set the generation to zero so the recover code
4182                  * can tell the difference between an logging
4183                  * just to say 'this inode exists' and a logging
4184                  * to say 'update this inode with these values'
4185                  */
4186                 btrfs_set_token_inode_generation(&token, item, 0);
4187                 btrfs_set_token_inode_size(&token, item, logged_isize);
4188         } else {
4189                 btrfs_set_token_inode_generation(&token, item,
4190                                                  BTRFS_I(inode)->generation);
4191                 btrfs_set_token_inode_size(&token, item, inode->i_size);
4192         }
4193
4194         btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
4195         btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
4196         btrfs_set_token_inode_mode(&token, item, inode->i_mode);
4197         btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
4198
4199         btrfs_set_token_timespec_sec(&token, &item->atime,
4200                                      inode_get_atime_sec(inode));
4201         btrfs_set_token_timespec_nsec(&token, &item->atime,
4202                                       inode_get_atime_nsec(inode));
4203
4204         btrfs_set_token_timespec_sec(&token, &item->mtime,
4205                                      inode_get_mtime_sec(inode));
4206         btrfs_set_token_timespec_nsec(&token, &item->mtime,
4207                                       inode_get_mtime_nsec(inode));
4208
4209         btrfs_set_token_timespec_sec(&token, &item->ctime,
4210                                      inode_get_ctime_sec(inode));
4211         btrfs_set_token_timespec_nsec(&token, &item->ctime,
4212                                       inode_get_ctime_nsec(inode));
4213
4214         /*
4215          * We do not need to set the nbytes field, in fact during a fast fsync
4216          * its value may not even be correct, since a fast fsync does not wait
4217          * for ordered extent completion, which is where we update nbytes, it
4218          * only waits for writeback to complete. During log replay as we find
4219          * file extent items and replay them, we adjust the nbytes field of the
4220          * inode item in subvolume tree as needed (see overwrite_item()).
4221          */
4222
4223         btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
4224         btrfs_set_token_inode_transid(&token, item, trans->transid);
4225         btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
4226         flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4227                                           BTRFS_I(inode)->ro_flags);
4228         btrfs_set_token_inode_flags(&token, item, flags);
4229         btrfs_set_token_inode_block_group(&token, item, 0);
4230 }
4231
4232 static int log_inode_item(struct btrfs_trans_handle *trans,
4233                           struct btrfs_root *log, struct btrfs_path *path,
4234                           struct btrfs_inode *inode, bool inode_item_dropped)
4235 {
4236         struct btrfs_inode_item *inode_item;
4237         struct btrfs_key key;
4238         int ret;
4239
4240         btrfs_get_inode_key(inode, &key);
4241         /*
4242          * If we are doing a fast fsync and the inode was logged before in the
4243          * current transaction, then we know the inode was previously logged and
4244          * it exists in the log tree. For performance reasons, in this case use
4245          * btrfs_search_slot() directly with ins_len set to 0 so that we never
4246          * attempt a write lock on the leaf's parent, which adds unnecessary lock
4247          * contention in case there are concurrent fsyncs for other inodes of the
4248          * same subvolume. Using btrfs_insert_empty_item() when the inode item
4249          * already exists can also result in unnecessarily splitting a leaf.
4250          */
4251         if (!inode_item_dropped && inode->logged_trans == trans->transid) {
4252                 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
4253                 ASSERT(ret <= 0);
4254                 if (ret > 0)
4255                         ret = -ENOENT;
4256         } else {
4257                 /*
4258                  * This means it is the first fsync in the current transaction,
4259                  * so the inode item is not in the log and we need to insert it.
4260                  * We can never get -EEXIST because we are only called for a fast
4261                  * fsync and in case an inode eviction happens after the inode was
4262                  * logged before in the current transaction, when we load again
4263                  * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
4264                  * flags and set ->logged_trans to 0.
4265                  */
4266                 ret = btrfs_insert_empty_item(trans, log, path, &key,
4267                                               sizeof(*inode_item));
4268                 ASSERT(ret != -EEXIST);
4269         }
4270         if (ret)
4271                 return ret;
4272         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4273                                     struct btrfs_inode_item);
4274         fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
4275                         0, 0);
4276         btrfs_release_path(path);
4277         return 0;
4278 }
4279
4280 static int log_csums(struct btrfs_trans_handle *trans,
4281                      struct btrfs_inode *inode,
4282                      struct btrfs_root *log_root,
4283                      struct btrfs_ordered_sum *sums)
4284 {
4285         const u64 lock_end = sums->logical + sums->len - 1;
4286         struct extent_state *cached_state = NULL;
4287         int ret;
4288
4289         /*
4290          * If this inode was not used for reflink operations in the current
4291          * transaction with new extents, then do the fast path, no need to
4292          * worry about logging checksum items with overlapping ranges.
4293          */
4294         if (inode->last_reflink_trans < trans->transid)
4295                 return btrfs_csum_file_blocks(trans, log_root, sums);
4296
4297         /*
4298          * Serialize logging for checksums. This is to avoid racing with the
4299          * same checksum being logged by another task that is logging another
4300          * file which happens to refer to the same extent as well. Such races
4301          * can leave checksum items in the log with overlapping ranges.
4302          */
4303         ret = btrfs_lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
4304                                 &cached_state);
4305         if (ret)
4306                 return ret;
4307         /*
4308          * Due to extent cloning, we might have logged a csum item that covers a
4309          * subrange of a cloned extent, and later we can end up logging a csum
4310          * item for a larger subrange of the same extent or the entire range.
4311          * This would leave csum items in the log tree that cover the same range
4312          * and break the searches for checksums in the log tree, resulting in
4313          * some checksums missing in the fs/subvolume tree. So just delete (or
4314          * trim and adjust) any existing csum items in the log for this range.
4315          */
4316         ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len);
4317         if (!ret)
4318                 ret = btrfs_csum_file_blocks(trans, log_root, sums);
4319
4320         btrfs_unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
4321                             &cached_state);
4322
4323         return ret;
4324 }
4325
4326 static noinline int copy_items(struct btrfs_trans_handle *trans,
4327                                struct btrfs_inode *inode,
4328                                struct btrfs_path *dst_path,
4329                                struct btrfs_path *src_path,
4330                                int start_slot, int nr, int inode_only,
4331                                u64 logged_isize, struct btrfs_log_ctx *ctx)
4332 {
4333         struct btrfs_root *log = inode->root->log_root;
4334         struct btrfs_file_extent_item *extent;
4335         struct extent_buffer *src;
4336         int ret;
4337         struct btrfs_key *ins_keys;
4338         u32 *ins_sizes;
4339         struct btrfs_item_batch batch;
4340         char *ins_data;
4341         int dst_index;
4342         const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
4343         const u64 i_size = i_size_read(&inode->vfs_inode);
4344
4345         /*
4346          * To keep lockdep happy and avoid deadlocks, clone the source leaf and
4347          * use the clone. This is because otherwise we would be changing the log
4348          * tree, to insert items from the subvolume tree or insert csum items,
4349          * while holding a read lock on a leaf from the subvolume tree, which
4350          * creates a nasty lock dependency when COWing log tree nodes/leaves:
4351          *
4352          * 1) Modifying the log tree triggers an extent buffer allocation while
4353          *    holding a write lock on a parent extent buffer from the log tree.
4354          *    Allocating the pages for an extent buffer, or the extent buffer
4355          *    struct, can trigger inode eviction and finally the inode eviction
4356          *    will trigger a release/remove of a delayed node, which requires
4357          *    taking the delayed node's mutex;
4358          *
4359          * 2) Allocating a metadata extent for a log tree can trigger the async
4360          *    reclaim thread and make us wait for it to release enough space and
4361          *    unblock our reservation ticket. The reclaim thread can start
4362          *    flushing delayed items, and that in turn results in the need to
4363          *    lock delayed node mutexes and in the need to write lock extent
4364          *    buffers of a subvolume tree - all this while holding a write lock
4365          *    on the parent extent buffer in the log tree.
4366          *
4367          * So one task in scenario 1) running in parallel with another task in
4368          * scenario 2) could lead to a deadlock, one wanting to lock a delayed
4369          * node mutex while having a read lock on a leaf from the subvolume,
4370          * while the other is holding the delayed node's mutex and wants to
4371          * write lock the same subvolume leaf for flushing delayed items.
4372          */
4373         ret = clone_leaf(src_path, ctx);
4374         if (ret < 0)
4375                 return ret;
4376
4377         src = src_path->nodes[0];
4378
4379         ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
4380                            nr * sizeof(u32), GFP_NOFS);
4381         if (!ins_data)
4382                 return -ENOMEM;
4383
4384         ins_sizes = (u32 *)ins_data;
4385         ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4386         batch.keys = ins_keys;
4387         batch.data_sizes = ins_sizes;
4388         batch.total_data_size = 0;
4389         batch.nr = 0;
4390
4391         dst_index = 0;
4392         for (int i = 0; i < nr; i++) {
4393                 const int src_slot = start_slot + i;
4394                 struct btrfs_root *csum_root;
4395                 struct btrfs_ordered_sum *sums;
4396                 struct btrfs_ordered_sum *sums_next;
4397                 LIST_HEAD(ordered_sums);
4398                 u64 disk_bytenr;
4399                 u64 disk_num_bytes;
4400                 u64 extent_offset;
4401                 u64 extent_num_bytes;
4402                 bool is_old_extent;
4403
4404                 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
4405
4406                 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
4407                         goto add_to_batch;
4408
4409                 extent = btrfs_item_ptr(src, src_slot,
4410                                         struct btrfs_file_extent_item);
4411
4412                 is_old_extent = (btrfs_file_extent_generation(src, extent) <
4413                                  trans->transid);
4414
4415                 /*
4416                  * Don't copy extents from past generations. That would make us
4417                  * log a lot more metadata for common cases like doing only a
4418                  * few random writes into a file and then fsync it for the first
4419                  * time or after the full sync flag is set on the inode. We can
4420                  * get leaves full of extent items, most of which are from past
4421                  * generations, so we can skip them - as long as the inode has
4422                  * not been the target of a reflink operation in this transaction,
4423                  * as in that case it might have had file extent items with old
4424                  * generations copied into it. We also must always log prealloc
4425                  * extents that start at or beyond eof, otherwise we would lose
4426                  * them on log replay.
4427                  */
4428                 if (is_old_extent &&
4429                     ins_keys[dst_index].offset < i_size &&
4430                     inode->last_reflink_trans < trans->transid)
4431                         continue;
4432
4433                 if (skip_csum)
4434                         goto add_to_batch;
4435
4436                 /* Only regular extents have checksums. */
4437                 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
4438                         goto add_to_batch;
4439
4440                 /*
4441                  * If it's an extent created in a past transaction, then its
4442                  * checksums are already accessible from the committed csum tree,
4443                  * no need to log them.
4444                  */
4445                 if (is_old_extent)
4446                         goto add_to_batch;
4447
4448                 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
4449                 /* If it's an explicit hole, there are no checksums. */
4450                 if (disk_bytenr == 0)
4451                         goto add_to_batch;
4452
4453                 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
4454
4455                 if (btrfs_file_extent_compression(src, extent)) {
4456                         extent_offset = 0;
4457                         extent_num_bytes = disk_num_bytes;
4458                 } else {
4459                         extent_offset = btrfs_file_extent_offset(src, extent);
4460                         extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
4461                 }
4462
4463                 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
4464                 disk_bytenr += extent_offset;
4465                 ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
4466                                               disk_bytenr + extent_num_bytes - 1,
4467                                               &ordered_sums, false);
4468                 if (ret < 0)
4469                         goto out;
4470                 ret = 0;
4471
4472                 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
4473                         if (!ret)
4474                                 ret = log_csums(trans, inode, log, sums);
4475                         list_del(&sums->list);
4476                         kfree(sums);
4477                 }
4478                 if (ret)
4479                         goto out;
4480
4481 add_to_batch:
4482                 ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
4483                 batch.total_data_size += ins_sizes[dst_index];
4484                 batch.nr++;
4485                 dst_index++;
4486         }
4487
4488         /*
4489          * We have a leaf full of old extent items that don't need to be logged,
4490          * so we don't need to do anything.
4491          */
4492         if (batch.nr == 0)
4493                 goto out;
4494
4495         ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
4496         if (ret)
4497                 goto out;
4498
4499         dst_index = 0;
4500         for (int i = 0; i < nr; i++) {
4501                 const int src_slot = start_slot + i;
4502                 const int dst_slot = dst_path->slots[0] + dst_index;
4503                 struct btrfs_key key;
4504                 unsigned long src_offset;
4505                 unsigned long dst_offset;
4506
4507                 /*
4508                  * We're done, all the remaining items in the source leaf
4509                  * correspond to old file extent items.
4510                  */
4511                 if (dst_index >= batch.nr)
4512                         break;
4513
4514                 btrfs_item_key_to_cpu(src, &key, src_slot);
4515
4516                 if (key.type != BTRFS_EXTENT_DATA_KEY)
4517                         goto copy_item;
4518
4519                 extent = btrfs_item_ptr(src, src_slot,
4520                                         struct btrfs_file_extent_item);
4521
4522                 /* See the comment in the previous loop, same logic. */
4523                 if (btrfs_file_extent_generation(src, extent) < trans->transid &&
4524                     key.offset < i_size &&
4525                     inode->last_reflink_trans < trans->transid)
4526                         continue;
4527
4528 copy_item:
4529                 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
4530                 src_offset = btrfs_item_ptr_offset(src, src_slot);
4531
4532                 if (key.type == BTRFS_INODE_ITEM_KEY) {
4533                         struct btrfs_inode_item *inode_item;
4534
4535                         inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
4536                                                     struct btrfs_inode_item);
4537                         fill_inode_item(trans, dst_path->nodes[0], inode_item,
4538                                         &inode->vfs_inode,
4539                                         inode_only == LOG_INODE_EXISTS,
4540                                         logged_isize);
4541                 } else {
4542                         copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4543                                            src_offset, ins_sizes[dst_index]);
4544                 }
4545
4546                 dst_index++;
4547         }
4548
4549         btrfs_release_path(dst_path);
4550 out:
4551         kfree(ins_data);
4552
4553         return ret;
4554 }
4555
4556 static int extent_cmp(void *priv, const struct list_head *a,
4557                       const struct list_head *b)
4558 {
4559         const struct extent_map *em1, *em2;
4560
4561         em1 = list_entry(a, struct extent_map, list);
4562         em2 = list_entry(b, struct extent_map, list);
4563
4564         if (em1->start < em2->start)
4565                 return -1;
4566         else if (em1->start > em2->start)
4567                 return 1;
4568         return 0;
4569 }
4570
4571 static int log_extent_csums(struct btrfs_trans_handle *trans,
4572                             struct btrfs_inode *inode,
4573                             struct btrfs_root *log_root,
4574                             const struct extent_map *em,
4575                             struct btrfs_log_ctx *ctx)
4576 {
4577         struct btrfs_ordered_extent *ordered;
4578         struct btrfs_root *csum_root;
4579         u64 block_start;
4580         u64 csum_offset;
4581         u64 csum_len;
4582         u64 mod_start = em->start;
4583         u64 mod_len = em->len;
4584         LIST_HEAD(ordered_sums);
4585         int ret = 0;
4586
4587         if (inode->flags & BTRFS_INODE_NODATASUM ||
4588             (em->flags & EXTENT_FLAG_PREALLOC) ||
4589             em->disk_bytenr == EXTENT_MAP_HOLE)
4590                 return 0;
4591
4592         list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4593                 const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4594                 const u64 mod_end = mod_start + mod_len;
4595                 struct btrfs_ordered_sum *sums;
4596
4597                 if (mod_len == 0)
4598                         break;
4599
4600                 if (ordered_end <= mod_start)
4601                         continue;
4602                 if (mod_end <= ordered->file_offset)
4603                         break;
4604
4605                 /*
4606                  * We are going to copy all the csums on this ordered extent, so
4607                  * go ahead and adjust mod_start and mod_len in case this ordered
4608                  * extent has already been logged.
4609                  */
4610                 if (ordered->file_offset > mod_start) {
4611                         if (ordered_end >= mod_end)
4612                                 mod_len = ordered->file_offset - mod_start;
4613                         /*
4614                          * If we have this case
4615                          *
4616                          * |--------- logged extent ---------|
4617                          *       |----- ordered extent ----|
4618                          *
4619                          * Just don't mess with mod_start and mod_len, we'll
4620                          * just end up logging more csums than we need and it
4621                          * will be ok.
4622                          */
4623                 } else {
4624                         if (ordered_end < mod_end) {
4625                                 mod_len = mod_end - ordered_end;
4626                                 mod_start = ordered_end;
4627                         } else {
4628                                 mod_len = 0;
4629                         }
4630                 }
4631
4632                 /*
4633                  * To keep us from looping for the above case of an ordered
4634                  * extent that falls inside of the logged extent.
4635                  */
4636                 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4637                         continue;
4638
4639                 list_for_each_entry(sums, &ordered->list, list) {
4640                         ret = log_csums(trans, inode, log_root, sums);
4641                         if (ret)
4642                                 return ret;
4643                 }
4644         }
4645
4646         /* We're done, found all csums in the ordered extents. */
4647         if (mod_len == 0)
4648                 return 0;
4649
4650         /* If we're compressed we have to save the entire range of csums. */
4651         if (btrfs_extent_map_is_compressed(em)) {
4652                 csum_offset = 0;
4653                 csum_len = em->disk_num_bytes;
4654         } else {
4655                 csum_offset = mod_start - em->start;
4656                 csum_len = mod_len;
4657         }
4658
4659         /* block start is already adjusted for the file extent offset. */
4660         block_start = btrfs_extent_map_block_start(em);
4661         csum_root = btrfs_csum_root(trans->fs_info, block_start);
4662         ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset,
4663                                       block_start + csum_offset + csum_len - 1,
4664                                       &ordered_sums, false);
4665         if (ret < 0)
4666                 return ret;
4667         ret = 0;
4668
4669         while (!list_empty(&ordered_sums)) {
4670                 struct btrfs_ordered_sum *sums = list_first_entry(&ordered_sums,
4671                                                                   struct btrfs_ordered_sum,
4672                                                                   list);
4673                 if (!ret)
4674                         ret = log_csums(trans, inode, log_root, sums);
4675                 list_del(&sums->list);
4676                 kfree(sums);
4677         }
4678
4679         return ret;
4680 }
4681
4682 static int log_one_extent(struct btrfs_trans_handle *trans,
4683                           struct btrfs_inode *inode,
4684                           const struct extent_map *em,
4685                           struct btrfs_path *path,
4686                           struct btrfs_log_ctx *ctx)
4687 {
4688         struct btrfs_drop_extents_args drop_args = { 0 };
4689         struct btrfs_root *log = inode->root->log_root;
4690         struct btrfs_file_extent_item fi = { 0 };
4691         struct extent_buffer *leaf;
4692         struct btrfs_key key;
4693         enum btrfs_compression_type compress_type;
4694         u64 extent_offset = em->offset;
4695         u64 block_start = btrfs_extent_map_block_start(em);
4696         u64 block_len;
4697         int ret;
4698
4699         btrfs_set_stack_file_extent_generation(&fi, trans->transid);
4700         if (em->flags & EXTENT_FLAG_PREALLOC)
4701                 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
4702         else
4703                 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
4704
4705         block_len = em->disk_num_bytes;
4706         compress_type = btrfs_extent_map_compression(em);
4707         if (compress_type != BTRFS_COMPRESS_NONE) {
4708                 btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start);
4709                 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4710         } else if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
4711                 btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start - extent_offset);
4712                 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4713         }
4714
4715         btrfs_set_stack_file_extent_offset(&fi, extent_offset);
4716         btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
4717         btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
4718         btrfs_set_stack_file_extent_compression(&fi, compress_type);
4719
4720         ret = log_extent_csums(trans, inode, log, em, ctx);
4721         if (ret)
4722                 return ret;
4723
4724         /*
4725          * If this is the first time we are logging the inode in the current
4726          * transaction, we can avoid btrfs_drop_extents(), which is expensive
4727          * because it does a deletion search, which always acquires write locks
4728          * for extent buffers at levels 2, 1 and 0. This not only wastes time
4729          * but also adds significant contention in a log tree, since log trees
4730          * are small, with a root at level 2 or 3 at most, due to their short
4731          * life span.
4732          */
4733         if (ctx->logged_before) {
4734                 drop_args.path = path;
4735                 drop_args.start = em->start;
4736                 drop_args.end = em->start + em->len;
4737                 drop_args.replace_extent = true;
4738                 drop_args.extent_item_size = sizeof(fi);
4739                 ret = btrfs_drop_extents(trans, log, inode, &drop_args);
4740                 if (ret)
4741                         return ret;
4742         }
4743
4744         if (!drop_args.extent_inserted) {
4745                 key.objectid = btrfs_ino(inode);
4746                 key.type = BTRFS_EXTENT_DATA_KEY;
4747                 key.offset = em->start;
4748
4749                 ret = btrfs_insert_empty_item(trans, log, path, &key,
4750                                               sizeof(fi));
4751                 if (ret)
4752                         return ret;
4753         }
4754         leaf = path->nodes[0];
4755         write_extent_buffer(leaf, &fi,
4756                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4757                             sizeof(fi));
4758
4759         btrfs_release_path(path);
4760
4761         return ret;
4762 }
4763
4764 /*
4765  * Log all prealloc extents beyond the inode's i_size to make sure we do not
4766  * lose them after doing a full/fast fsync and replaying the log. We scan the
4767  * subvolume's root instead of iterating the inode's extent map tree because
4768  * otherwise we can log incorrect extent items based on extent map conversion.
4769  * That can happen due to the fact that extent maps are merged when they
4770  * are not in the extent map tree's list of modified extents.
4771  */
4772 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4773                                       struct btrfs_inode *inode,
4774                                       struct btrfs_path *path,
4775                                       struct btrfs_log_ctx *ctx)
4776 {
4777         struct btrfs_root *root = inode->root;
4778         struct btrfs_key key;
4779         const u64 i_size = i_size_read(&inode->vfs_inode);
4780         const u64 ino = btrfs_ino(inode);
4781         struct btrfs_path *dst_path = NULL;
4782         bool dropped_extents = false;
4783         u64 truncate_offset = i_size;
4784         struct extent_buffer *leaf;
4785         int slot;
4786         int ins_nr = 0;
4787         int start_slot = 0;
4788         int ret;
4789
4790         if (!(inode->flags & BTRFS_INODE_PREALLOC))
4791                 return 0;
4792
4793         key.objectid = ino;
4794         key.type = BTRFS_EXTENT_DATA_KEY;
4795         key.offset = i_size;
4796         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4797         if (ret < 0)
4798                 goto out;
4799
4800         /*
4801          * We must check if there is a prealloc extent that starts before the
4802          * i_size and crosses the i_size boundary. This is to ensure later we
4803          * truncate down to the end of that extent and not to the i_size, as
4804          * otherwise we end up losing part of the prealloc extent after a log
4805          * replay and with an implicit hole if there is another prealloc extent
4806          * that starts at an offset beyond i_size.
4807          */
4808         ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4809         if (ret < 0)
4810                 goto out;
4811
4812         if (ret == 0) {
4813                 struct btrfs_file_extent_item *ei;
4814
4815                 leaf = path->nodes[0];
4816                 slot = path->slots[0];
4817                 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4818
4819                 if (btrfs_file_extent_type(leaf, ei) ==
4820                     BTRFS_FILE_EXTENT_PREALLOC) {
4821                         u64 extent_end;
4822
4823                         btrfs_item_key_to_cpu(leaf, &key, slot);
4824                         extent_end = key.offset +
4825                                 btrfs_file_extent_num_bytes(leaf, ei);
4826
4827                         if (extent_end > i_size)
4828                                 truncate_offset = extent_end;
4829                 }
4830         } else {
4831                 ret = 0;
4832         }
4833
4834         while (true) {
4835                 leaf = path->nodes[0];
4836                 slot = path->slots[0];
4837
4838                 if (slot >= btrfs_header_nritems(leaf)) {
4839                         if (ins_nr > 0) {
4840                                 ret = copy_items(trans, inode, dst_path, path,
4841                                                  start_slot, ins_nr, 1, 0, ctx);
4842                                 if (ret < 0)
4843                                         goto out;
4844                                 ins_nr = 0;
4845                         }
4846                         ret = btrfs_next_leaf(root, path);
4847                         if (ret < 0)
4848                                 goto out;
4849                         if (ret > 0) {
4850                                 ret = 0;
4851                                 break;
4852                         }
4853                         continue;
4854                 }
4855
4856                 btrfs_item_key_to_cpu(leaf, &key, slot);
4857                 if (key.objectid > ino)
4858                         break;
4859                 if (WARN_ON_ONCE(key.objectid < ino) ||
4860                     key.type < BTRFS_EXTENT_DATA_KEY ||
4861                     key.offset < i_size) {
4862                         path->slots[0]++;
4863                         continue;
4864                 }
4865                 /*
4866                  * Avoid overlapping items in the log tree. The first time we
4867                  * get here, get rid of everything from a past fsync. After
4868                  * that, if the current extent starts before the end of the last
4869                  * extent we copied, truncate the last one. This can happen if
4870                  * an ordered extent completion modifies the subvolume tree
4871                  * while btrfs_next_leaf() has the tree unlocked.
4872                  */
4873                 if (!dropped_extents || key.offset < truncate_offset) {
4874                         ret = truncate_inode_items(trans, root->log_root, inode,
4875                                                    min(key.offset, truncate_offset),
4876                                                    BTRFS_EXTENT_DATA_KEY);
4877                         if (ret)
4878                                 goto out;
4879                         dropped_extents = true;
4880                 }
4881                 truncate_offset = btrfs_file_extent_end(path);
4882                 if (ins_nr == 0)
4883                         start_slot = slot;
4884                 ins_nr++;
4885                 path->slots[0]++;
4886                 if (!dst_path) {
4887                         dst_path = btrfs_alloc_path();
4888                         if (!dst_path) {
4889                                 ret = -ENOMEM;
4890                                 goto out;
4891                         }
4892                 }
4893         }
4894         if (ins_nr > 0)
4895                 ret = copy_items(trans, inode, dst_path, path,
4896                                  start_slot, ins_nr, 1, 0, ctx);
4897 out:
4898         btrfs_release_path(path);
4899         btrfs_free_path(dst_path);
4900         return ret;
4901 }
4902
4903 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4904                                      struct btrfs_inode *inode,
4905                                      struct btrfs_path *path,
4906                                      struct btrfs_log_ctx *ctx)
4907 {
4908         struct btrfs_ordered_extent *ordered;
4909         struct btrfs_ordered_extent *tmp;
4910         struct extent_map *em, *n;
4911         LIST_HEAD(extents);
4912         struct extent_map_tree *tree = &inode->extent_tree;
4913         int ret = 0;
4914         int num = 0;
4915
4916         write_lock(&tree->lock);
4917
4918         list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4919                 list_del_init(&em->list);
4920                 /*
4921                  * Just an arbitrary number, this can be really CPU intensive
4922                  * once we start getting a lot of extents, and really once we
4923                  * have a bunch of extents we just want to commit since it will
4924                  * be faster.
4925                  */
4926                 if (++num > 32768) {
4927                         list_del_init(&tree->modified_extents);
4928                         ret = -EFBIG;
4929                         goto process;
4930                 }
4931
4932                 if (em->generation < trans->transid)
4933                         continue;
4934
4935                 /* We log prealloc extents beyond eof later. */
4936                 if ((em->flags & EXTENT_FLAG_PREALLOC) &&
4937                     em->start >= i_size_read(&inode->vfs_inode))
4938                         continue;
4939
4940                 /* Need a ref to keep it from getting evicted from cache */
4941                 refcount_inc(&em->refs);
4942                 em->flags |= EXTENT_FLAG_LOGGING;
4943                 list_add_tail(&em->list, &extents);
4944                 num++;
4945         }
4946
4947         list_sort(NULL, &extents, extent_cmp);
4948 process:
4949         while (!list_empty(&extents)) {
4950                 em = list_first_entry(&extents, struct extent_map, list);
4951
4952                 list_del_init(&em->list);
4953
4954                 /*
4955                  * If we had an error we just need to delete everybody from our
4956                  * private list.
4957                  */
4958                 if (ret) {
4959                         btrfs_clear_em_logging(inode, em);
4960                         btrfs_free_extent_map(em);
4961                         continue;
4962                 }
4963
4964                 write_unlock(&tree->lock);
4965
4966                 ret = log_one_extent(trans, inode, em, path, ctx);
4967                 write_lock(&tree->lock);
4968                 btrfs_clear_em_logging(inode, em);
4969                 btrfs_free_extent_map(em);
4970         }
4971         WARN_ON(!list_empty(&extents));
4972         write_unlock(&tree->lock);
4973
4974         if (!ret)
4975                 ret = btrfs_log_prealloc_extents(trans, inode, path, ctx);
4976         if (ret)
4977                 return ret;
4978
4979         /*
4980          * We have logged all extents successfully, now make sure the commit of
4981          * the current transaction waits for the ordered extents to complete
4982          * before it commits and wipes out the log trees, otherwise we would
4983          * lose data if an ordered extents completes after the transaction
4984          * commits and a power failure happens after the transaction commit.
4985          */
4986         list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4987                 list_del_init(&ordered->log_list);
4988                 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4989
4990                 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4991                         spin_lock_irq(&inode->ordered_tree_lock);
4992                         if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4993                                 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4994                                 atomic_inc(&trans->transaction->pending_ordered);
4995                         }
4996                         spin_unlock_irq(&inode->ordered_tree_lock);
4997                 }
4998                 btrfs_put_ordered_extent(ordered);
4999         }
5000
5001         return 0;
5002 }
5003
5004 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
5005                              struct btrfs_path *path, u64 *size_ret)
5006 {
5007         struct btrfs_key key;
5008         int ret;
5009
5010         key.objectid = btrfs_ino(inode);
5011         key.type = BTRFS_INODE_ITEM_KEY;
5012         key.offset = 0;
5013
5014         ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
5015         if (ret < 0) {
5016                 return ret;
5017         } else if (ret > 0) {
5018                 *size_ret = 0;
5019         } else {
5020                 struct btrfs_inode_item *item;
5021
5022                 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5023                                       struct btrfs_inode_item);
5024                 *size_ret = btrfs_inode_size(path->nodes[0], item);
5025                 /*
5026                  * If the in-memory inode's i_size is smaller then the inode
5027                  * size stored in the btree, return the inode's i_size, so
5028                  * that we get a correct inode size after replaying the log
5029                  * when before a power failure we had a shrinking truncate
5030                  * followed by addition of a new name (rename / new hard link).
5031                  * Otherwise return the inode size from the btree, to avoid
5032                  * data loss when replaying a log due to previously doing a
5033                  * write that expands the inode's size and logging a new name
5034                  * immediately after.
5035                  */
5036                 if (*size_ret > inode->vfs_inode.i_size)
5037                         *size_ret = inode->vfs_inode.i_size;
5038         }
5039
5040         btrfs_release_path(path);
5041         return 0;
5042 }
5043
5044 /*
5045  * At the moment we always log all xattrs. This is to figure out at log replay
5046  * time which xattrs must have their deletion replayed. If a xattr is missing
5047  * in the log tree and exists in the fs/subvol tree, we delete it. This is
5048  * because if a xattr is deleted, the inode is fsynced and a power failure
5049  * happens, causing the log to be replayed the next time the fs is mounted,
5050  * we want the xattr to not exist anymore (same behaviour as other filesystems
5051  * with a journal, ext3/4, xfs, f2fs, etc).
5052  */
5053 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
5054                                 struct btrfs_inode *inode,
5055                                 struct btrfs_path *path,
5056                                 struct btrfs_path *dst_path,
5057                                 struct btrfs_log_ctx *ctx)
5058 {
5059         struct btrfs_root *root = inode->root;
5060         int ret;
5061         struct btrfs_key key;
5062         const u64 ino = btrfs_ino(inode);
5063         int ins_nr = 0;
5064         int start_slot = 0;
5065         bool found_xattrs = false;
5066
5067         if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
5068                 return 0;
5069
5070         key.objectid = ino;
5071         key.type = BTRFS_XATTR_ITEM_KEY;
5072         key.offset = 0;
5073
5074         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5075         if (ret < 0)
5076                 return ret;
5077
5078         while (true) {
5079                 int slot = path->slots[0];
5080                 struct extent_buffer *leaf = path->nodes[0];
5081                 int nritems = btrfs_header_nritems(leaf);
5082
5083                 if (slot >= nritems) {
5084                         if (ins_nr > 0) {
5085                                 ret = copy_items(trans, inode, dst_path, path,
5086                                                  start_slot, ins_nr, 1, 0, ctx);
5087                                 if (ret < 0)
5088                                         return ret;
5089                                 ins_nr = 0;
5090                         }
5091                         ret = btrfs_next_leaf(root, path);
5092                         if (ret < 0)
5093                                 return ret;
5094                         else if (ret > 0)
5095                                 break;
5096                         continue;
5097                 }
5098
5099                 btrfs_item_key_to_cpu(leaf, &key, slot);
5100                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
5101                         break;
5102
5103                 if (ins_nr == 0)
5104                         start_slot = slot;
5105                 ins_nr++;
5106                 path->slots[0]++;
5107                 found_xattrs = true;
5108                 cond_resched();
5109         }
5110         if (ins_nr > 0) {
5111                 ret = copy_items(trans, inode, dst_path, path,
5112                                  start_slot, ins_nr, 1, 0, ctx);
5113                 if (ret < 0)
5114                         return ret;
5115         }
5116
5117         if (!found_xattrs)
5118                 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
5119
5120         return 0;
5121 }
5122
5123 /*
5124  * When using the NO_HOLES feature if we punched a hole that causes the
5125  * deletion of entire leafs or all the extent items of the first leaf (the one
5126  * that contains the inode item and references) we may end up not processing
5127  * any extents, because there are no leafs with a generation matching the
5128  * current transaction that have extent items for our inode. So we need to find
5129  * if any holes exist and then log them. We also need to log holes after any
5130  * truncate operation that changes the inode's size.
5131  */
5132 static int btrfs_log_holes(struct btrfs_trans_handle *trans,
5133                            struct btrfs_inode *inode,
5134                            struct btrfs_path *path)
5135 {
5136         struct btrfs_root *root = inode->root;
5137         struct btrfs_fs_info *fs_info = root->fs_info;
5138         struct btrfs_key key;
5139         const u64 ino = btrfs_ino(inode);
5140         const u64 i_size = i_size_read(&inode->vfs_inode);
5141         u64 prev_extent_end = 0;
5142         int ret;
5143
5144         if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
5145                 return 0;
5146
5147         key.objectid = ino;
5148         key.type = BTRFS_EXTENT_DATA_KEY;
5149         key.offset = 0;
5150
5151         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5152         if (ret < 0)
5153                 return ret;
5154
5155         while (true) {
5156                 struct extent_buffer *leaf = path->nodes[0];
5157
5158                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5159                         ret = btrfs_next_leaf(root, path);
5160                         if (ret < 0)
5161                                 return ret;
5162                         if (ret > 0) {
5163                                 ret = 0;
5164                                 break;
5165                         }
5166                         leaf = path->nodes[0];
5167                 }
5168
5169                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5170                 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
5171                         break;
5172
5173                 /* We have a hole, log it. */
5174                 if (prev_extent_end < key.offset) {
5175                         const u64 hole_len = key.offset - prev_extent_end;
5176
5177                         /*
5178                          * Release the path to avoid deadlocks with other code
5179                          * paths that search the root while holding locks on
5180                          * leafs from the log root.
5181                          */
5182                         btrfs_release_path(path);
5183                         ret = btrfs_insert_hole_extent(trans, root->log_root,
5184                                                        ino, prev_extent_end,
5185                                                        hole_len);
5186                         if (ret < 0)
5187                                 return ret;
5188
5189                         /*
5190                          * Search for the same key again in the root. Since it's
5191                          * an extent item and we are holding the inode lock, the
5192                          * key must still exist. If it doesn't just emit warning
5193                          * and return an error to fall back to a transaction
5194                          * commit.
5195                          */
5196                         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5197                         if (ret < 0)
5198                                 return ret;
5199                         if (WARN_ON(ret > 0))
5200                                 return -ENOENT;
5201                         leaf = path->nodes[0];
5202                 }
5203
5204                 prev_extent_end = btrfs_file_extent_end(path);
5205                 path->slots[0]++;
5206                 cond_resched();
5207         }
5208
5209         if (prev_extent_end < i_size) {
5210                 u64 hole_len;
5211
5212                 btrfs_release_path(path);
5213                 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
5214                 ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
5215                                                prev_extent_end, hole_len);
5216                 if (ret < 0)
5217                         return ret;
5218         }
5219
5220         return 0;
5221 }
5222
5223 /*
5224  * When we are logging a new inode X, check if it doesn't have a reference that
5225  * matches the reference from some other inode Y created in a past transaction
5226  * and that was renamed in the current transaction. If we don't do this, then at
5227  * log replay time we can lose inode Y (and all its files if it's a directory):
5228  *
5229  * mkdir /mnt/x
5230  * echo "hello world" > /mnt/x/foobar
5231  * sync
5232  * mv /mnt/x /mnt/y
5233  * mkdir /mnt/x                 # or touch /mnt/x
5234  * xfs_io -c fsync /mnt/x
5235  * <power fail>
5236  * mount fs, trigger log replay
5237  *
5238  * After the log replay procedure, we would lose the first directory and all its
5239  * files (file foobar).
5240  * For the case where inode Y is not a directory we simply end up losing it:
5241  *
5242  * echo "123" > /mnt/foo
5243  * sync
5244  * mv /mnt/foo /mnt/bar
5245  * echo "abc" > /mnt/foo
5246  * xfs_io -c fsync /mnt/foo
5247  * <power fail>
5248  *
5249  * We also need this for cases where a snapshot entry is replaced by some other
5250  * entry (file or directory) otherwise we end up with an unreplayable log due to
5251  * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
5252  * if it were a regular entry:
5253  *
5254  * mkdir /mnt/x
5255  * btrfs subvolume snapshot /mnt /mnt/x/snap
5256  * btrfs subvolume delete /mnt/x/snap
5257  * rmdir /mnt/x
5258  * mkdir /mnt/x
5259  * fsync /mnt/x or fsync some new file inside it
5260  * <power fail>
5261  *
5262  * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
5263  * the same transaction.
5264  */
5265 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
5266                                          const int slot,
5267                                          const struct btrfs_key *key,
5268                                          struct btrfs_inode *inode,
5269                                          u64 *other_ino, u64 *other_parent)
5270 {
5271         int ret;
5272         struct btrfs_path *search_path;
5273         char *name = NULL;
5274         u32 name_len = 0;
5275         u32 item_size = btrfs_item_size(eb, slot);
5276         u32 cur_offset = 0;
5277         unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
5278
5279         search_path = btrfs_alloc_path();
5280         if (!search_path)
5281                 return -ENOMEM;
5282         search_path->search_commit_root = 1;
5283         search_path->skip_locking = 1;
5284
5285         while (cur_offset < item_size) {
5286                 u64 parent;
5287                 u32 this_name_len;
5288                 u32 this_len;
5289                 unsigned long name_ptr;
5290                 struct btrfs_dir_item *di;
5291                 struct fscrypt_str name_str;
5292
5293                 if (key->type == BTRFS_INODE_REF_KEY) {
5294                         struct btrfs_inode_ref *iref;
5295
5296                         iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
5297                         parent = key->offset;
5298                         this_name_len = btrfs_inode_ref_name_len(eb, iref);
5299                         name_ptr = (unsigned long)(iref + 1);
5300                         this_len = sizeof(*iref) + this_name_len;
5301                 } else {
5302                         struct btrfs_inode_extref *extref;
5303
5304                         extref = (struct btrfs_inode_extref *)(ptr +
5305                                                                cur_offset);
5306                         parent = btrfs_inode_extref_parent(eb, extref);
5307                         this_name_len = btrfs_inode_extref_name_len(eb, extref);
5308                         name_ptr = (unsigned long)&extref->name;
5309                         this_len = sizeof(*extref) + this_name_len;
5310                 }
5311
5312                 if (this_name_len > name_len) {
5313                         char *new_name;
5314
5315                         new_name = krealloc(name, this_name_len, GFP_NOFS);
5316                         if (!new_name) {
5317                                 ret = -ENOMEM;
5318                                 goto out;
5319                         }
5320                         name_len = this_name_len;
5321                         name = new_name;
5322                 }
5323
5324                 read_extent_buffer(eb, name, name_ptr, this_name_len);
5325
5326                 name_str.name = name;
5327                 name_str.len = this_name_len;
5328                 di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
5329                                 parent, &name_str, 0);
5330                 if (di && !IS_ERR(di)) {
5331                         struct btrfs_key di_key;
5332
5333                         btrfs_dir_item_key_to_cpu(search_path->nodes[0],
5334                                                   di, &di_key);
5335                         if (di_key.type == BTRFS_INODE_ITEM_KEY) {
5336                                 if (di_key.objectid != key->objectid) {
5337                                         ret = 1;
5338                                         *other_ino = di_key.objectid;
5339                                         *other_parent = parent;
5340                                 } else {
5341                                         ret = 0;
5342                                 }
5343                         } else {
5344                                 ret = -EAGAIN;
5345                         }
5346                         goto out;
5347                 } else if (IS_ERR(di)) {
5348                         ret = PTR_ERR(di);
5349                         goto out;
5350                 }
5351                 btrfs_release_path(search_path);
5352
5353                 cur_offset += this_len;
5354         }
5355         ret = 0;
5356 out:
5357         btrfs_free_path(search_path);
5358         kfree(name);
5359         return ret;
5360 }
5361
5362 /*
5363  * Check if we need to log an inode. This is used in contexts where while
5364  * logging an inode we need to log another inode (either that it exists or in
5365  * full mode). This is used instead of btrfs_inode_in_log() because the later
5366  * requires the inode to be in the log and have the log transaction committed,
5367  * while here we do not care if the log transaction was already committed - our
5368  * caller will commit the log later - and we want to avoid logging an inode
5369  * multiple times when multiple tasks have joined the same log transaction.
5370  */
5371 static bool need_log_inode(const struct btrfs_trans_handle *trans,
5372                            struct btrfs_inode *inode)
5373 {
5374         /*
5375          * If a directory was not modified, no dentries added or removed, we can
5376          * and should avoid logging it.
5377          */
5378         if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5379                 return false;
5380
5381         /*
5382          * If this inode does not have new/updated/deleted xattrs since the last
5383          * time it was logged and is flagged as logged in the current transaction,
5384          * we can skip logging it. As for new/deleted names, those are updated in
5385          * the log by link/unlink/rename operations.
5386          * In case the inode was logged and then evicted and reloaded, its
5387          * logged_trans will be 0, in which case we have to fully log it since
5388          * logged_trans is a transient field, not persisted.
5389          */
5390         if (inode_logged(trans, inode, NULL) == 1 &&
5391             !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5392                 return false;
5393
5394         return true;
5395 }
5396
5397 struct btrfs_dir_list {
5398         u64 ino;
5399         struct list_head list;
5400 };
5401
5402 /*
5403  * Log the inodes of the new dentries of a directory.
5404  * See process_dir_items_leaf() for details about why it is needed.
5405  * This is a recursive operation - if an existing dentry corresponds to a
5406  * directory, that directory's new entries are logged too (same behaviour as
5407  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5408  * the dentries point to we do not acquire their VFS lock, otherwise lockdep
5409  * complains about the following circular lock dependency / possible deadlock:
5410  *
5411  *        CPU0                                        CPU1
5412  *        ----                                        ----
5413  * lock(&type->i_mutex_dir_key#3/2);
5414  *                                            lock(sb_internal#2);
5415  *                                            lock(&type->i_mutex_dir_key#3/2);
5416  * lock(&sb->s_type->i_mutex_key#14);
5417  *
5418  * Where sb_internal is the lock (a counter that works as a lock) acquired by
5419  * sb_start_intwrite() in btrfs_start_transaction().
5420  * Not acquiring the VFS lock of the inodes is still safe because:
5421  *
5422  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5423  *    that while logging the inode new references (names) are added or removed
5424  *    from the inode, leaving the logged inode item with a link count that does
5425  *    not match the number of logged inode reference items. This is fine because
5426  *    at log replay time we compute the real number of links and correct the
5427  *    link count in the inode item (see replay_one_buffer() and
5428  *    link_to_fixup_dir());
5429  *
5430  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5431  *    while logging the inode's items new index items (key type
5432  *    BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
5433  *    has a size that doesn't match the sum of the lengths of all the logged
5434  *    names - this is ok, not a problem, because at log replay time we set the
5435  *    directory's i_size to the correct value (see replay_one_name() and
5436  *    overwrite_item()).
5437  */
5438 static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5439                                 struct btrfs_inode *start_inode,
5440                                 struct btrfs_log_ctx *ctx)
5441 {
5442         struct btrfs_root *root = start_inode->root;
5443         struct btrfs_path *path;
5444         LIST_HEAD(dir_list);
5445         struct btrfs_dir_list *dir_elem;
5446         u64 ino = btrfs_ino(start_inode);
5447         struct btrfs_inode *curr_inode = start_inode;
5448         int ret = 0;
5449
5450         /*
5451          * If we are logging a new name, as part of a link or rename operation,
5452          * don't bother logging new dentries, as we just want to log the names
5453          * of an inode and that any new parents exist.
5454          */
5455         if (ctx->logging_new_name)
5456                 return 0;
5457
5458         path = btrfs_alloc_path();
5459         if (!path)
5460                 return -ENOMEM;
5461
5462         /* Pairs with btrfs_add_delayed_iput below. */
5463         ihold(&curr_inode->vfs_inode);
5464
5465         while (true) {
5466                 struct btrfs_key key;
5467                 struct btrfs_key found_key;
5468                 u64 next_index;
5469                 bool continue_curr_inode = true;
5470                 int iter_ret;
5471
5472                 key.objectid = ino;
5473                 key.type = BTRFS_DIR_INDEX_KEY;
5474                 key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
5475                 next_index = key.offset;
5476 again:
5477                 btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) {
5478                         struct extent_buffer *leaf = path->nodes[0];
5479                         struct btrfs_dir_item *di;
5480                         struct btrfs_key di_key;
5481                         struct btrfs_inode *di_inode;
5482                         int log_mode = LOG_INODE_EXISTS;
5483                         int type;
5484
5485                         if (found_key.objectid != ino ||
5486                             found_key.type != BTRFS_DIR_INDEX_KEY) {
5487                                 continue_curr_inode = false;
5488                                 break;
5489                         }
5490
5491                         next_index = found_key.offset + 1;
5492
5493                         di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
5494                         type = btrfs_dir_ftype(leaf, di);
5495                         if (btrfs_dir_transid(leaf, di) < trans->transid)
5496                                 continue;
5497                         btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5498                         if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5499                                 continue;
5500
5501                         btrfs_release_path(path);
5502                         di_inode = btrfs_iget_logging(di_key.objectid, root);
5503                         if (IS_ERR(di_inode)) {
5504                                 ret = PTR_ERR(di_inode);
5505                                 goto out;
5506                         }
5507
5508                         if (!need_log_inode(trans, di_inode)) {
5509                                 btrfs_add_delayed_iput(di_inode);
5510                                 break;
5511                         }
5512
5513                         ctx->log_new_dentries = false;
5514                         if (type == BTRFS_FT_DIR)
5515                                 log_mode = LOG_INODE_ALL;
5516                         ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
5517                         btrfs_add_delayed_iput(di_inode);
5518                         if (ret)
5519                                 goto out;
5520                         if (ctx->log_new_dentries) {
5521                                 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5522                                 if (!dir_elem) {
5523                                         ret = -ENOMEM;
5524                                         goto out;
5525                                 }
5526                                 dir_elem->ino = di_key.objectid;
5527                                 list_add_tail(&dir_elem->list, &dir_list);
5528                         }
5529                         break;
5530                 }
5531
5532                 btrfs_release_path(path);
5533
5534                 if (iter_ret < 0) {
5535                         ret = iter_ret;
5536                         goto out;
5537                 } else if (iter_ret > 0) {
5538                         continue_curr_inode = false;
5539                 } else {
5540                         key = found_key;
5541                 }
5542
5543                 if (continue_curr_inode && key.offset < (u64)-1) {
5544                         key.offset++;
5545                         goto again;
5546                 }
5547
5548                 btrfs_set_first_dir_index_to_log(curr_inode, next_index);
5549
5550                 if (list_empty(&dir_list))
5551                         break;
5552
5553                 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
5554                 ino = dir_elem->ino;
5555                 list_del(&dir_elem->list);
5556                 kfree(dir_elem);
5557
5558                 btrfs_add_delayed_iput(curr_inode);
5559
5560                 curr_inode = btrfs_iget_logging(ino, root);
5561                 if (IS_ERR(curr_inode)) {
5562                         ret = PTR_ERR(curr_inode);
5563                         curr_inode = NULL;
5564                         break;
5565                 }
5566         }
5567 out:
5568         btrfs_free_path(path);
5569         if (curr_inode)
5570                 btrfs_add_delayed_iput(curr_inode);
5571
5572         if (ret) {
5573                 struct btrfs_dir_list *next;
5574
5575                 list_for_each_entry_safe(dir_elem, next, &dir_list, list)
5576                         kfree(dir_elem);
5577         }
5578
5579         return ret;
5580 }
5581
5582 struct btrfs_ino_list {
5583         u64 ino;
5584         u64 parent;
5585         struct list_head list;
5586 };
5587
5588 static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
5589 {
5590         struct btrfs_ino_list *curr;
5591         struct btrfs_ino_list *next;
5592
5593         list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
5594                 list_del(&curr->list);
5595                 kfree(curr);
5596         }
5597 }
5598
5599 static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
5600                                     struct btrfs_path *path)
5601 {
5602         struct btrfs_key key;
5603         int ret;
5604
5605         key.objectid = ino;
5606         key.type = BTRFS_INODE_ITEM_KEY;
5607         key.offset = 0;
5608
5609         path->search_commit_root = 1;
5610         path->skip_locking = 1;
5611
5612         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5613         if (WARN_ON_ONCE(ret > 0)) {
5614                 /*
5615                  * We have previously found the inode through the commit root
5616                  * so this should not happen. If it does, just error out and
5617                  * fallback to a transaction commit.
5618                  */
5619                 ret = -ENOENT;
5620         } else if (ret == 0) {
5621                 struct btrfs_inode_item *item;
5622
5623                 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5624                                       struct btrfs_inode_item);
5625                 if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
5626                         ret = 1;
5627         }
5628
5629         btrfs_release_path(path);
5630         path->search_commit_root = 0;
5631         path->skip_locking = 0;
5632
5633         return ret;
5634 }
5635
5636 static int add_conflicting_inode(struct btrfs_trans_handle *trans,
5637                                  struct btrfs_root *root,
5638                                  struct btrfs_path *path,
5639                                  u64 ino, u64 parent,
5640                                  struct btrfs_log_ctx *ctx)
5641 {
5642         struct btrfs_ino_list *ino_elem;
5643         struct btrfs_inode *inode;
5644
5645         /*
5646          * It's rare to have a lot of conflicting inodes, in practice it is not
5647          * common to have more than 1 or 2. We don't want to collect too many,
5648          * as we could end up logging too many inodes (even if only in
5649          * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
5650          * commits.
5651          */
5652         if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
5653                 return BTRFS_LOG_FORCE_COMMIT;
5654
5655         inode = btrfs_iget_logging(ino, root);
5656         /*
5657          * If the other inode that had a conflicting dir entry was deleted in
5658          * the current transaction then we either:
5659          *
5660          * 1) Log the parent directory (later after adding it to the list) if
5661          *    the inode is a directory. This is because it may be a deleted
5662          *    subvolume/snapshot or it may be a regular directory that had
5663          *    deleted subvolumes/snapshots (or subdirectories that had them),
5664          *    and at the moment we can't deal with dropping subvolumes/snapshots
5665          *    during log replay. So we just log the parent, which will result in
5666          *    a fallback to a transaction commit if we are dealing with those
5667          *    cases (last_unlink_trans will match the current transaction);
5668          *
5669          * 2) Do nothing if it's not a directory. During log replay we simply
5670          *    unlink the conflicting dentry from the parent directory and then
5671          *    add the dentry for our inode. Like this we can avoid logging the
5672          *    parent directory (and maybe fallback to a transaction commit in
5673          *    case it has a last_unlink_trans == trans->transid, due to moving
5674          *    some inode from it to some other directory).
5675          */
5676         if (IS_ERR(inode)) {
5677                 int ret = PTR_ERR(inode);
5678
5679                 if (ret != -ENOENT)
5680                         return ret;
5681
5682                 ret = conflicting_inode_is_dir(root, ino, path);
5683                 /* Not a directory or we got an error. */
5684                 if (ret <= 0)
5685                         return ret;
5686
5687                 /* Conflicting inode is a directory, so we'll log its parent. */
5688                 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5689                 if (!ino_elem)
5690                         return -ENOMEM;
5691                 ino_elem->ino = ino;
5692                 ino_elem->parent = parent;
5693                 list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
5694                 ctx->num_conflict_inodes++;
5695
5696                 return 0;
5697         }
5698
5699         /*
5700          * If the inode was already logged skip it - otherwise we can hit an
5701          * infinite loop. Example:
5702          *
5703          * From the commit root (previous transaction) we have the following
5704          * inodes:
5705          *
5706          * inode 257 a directory
5707          * inode 258 with references "zz" and "zz_link" on inode 257
5708          * inode 259 with reference "a" on inode 257
5709          *
5710          * And in the current (uncommitted) transaction we have:
5711          *
5712          * inode 257 a directory, unchanged
5713          * inode 258 with references "a" and "a2" on inode 257
5714          * inode 259 with reference "zz_link" on inode 257
5715          * inode 261 with reference "zz" on inode 257
5716          *
5717          * When logging inode 261 the following infinite loop could
5718          * happen if we don't skip already logged inodes:
5719          *
5720          * - we detect inode 258 as a conflicting inode, with inode 261
5721          *   on reference "zz", and log it;
5722          *
5723          * - we detect inode 259 as a conflicting inode, with inode 258
5724          *   on reference "a", and log it;
5725          *
5726          * - we detect inode 258 as a conflicting inode, with inode 259
5727          *   on reference "zz_link", and log it - again! After this we
5728          *   repeat the above steps forever.
5729          *
5730          * Here we can use need_log_inode() because we only need to log the
5731          * inode in LOG_INODE_EXISTS mode and rename operations update the log,
5732          * so that the log ends up with the new name and without the old name.
5733          */
5734         if (!need_log_inode(trans, inode)) {
5735                 btrfs_add_delayed_iput(inode);
5736                 return 0;
5737         }
5738
5739         btrfs_add_delayed_iput(inode);
5740
5741         ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5742         if (!ino_elem)
5743                 return -ENOMEM;
5744         ino_elem->ino = ino;
5745         ino_elem->parent = parent;
5746         list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
5747         ctx->num_conflict_inodes++;
5748
5749         return 0;
5750 }
5751
5752 static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
5753                                   struct btrfs_root *root,
5754                                   struct btrfs_log_ctx *ctx)
5755 {
5756         int ret = 0;
5757
5758         /*
5759          * Conflicting inodes are logged by the first call to btrfs_log_inode(),
5760          * otherwise we could have unbounded recursion of btrfs_log_inode()
5761          * calls. This check guarantees we can have only 1 level of recursion.
5762          */
5763         if (ctx->logging_conflict_inodes)
5764                 return 0;
5765
5766         ctx->logging_conflict_inodes = true;
5767
5768         /*
5769          * New conflicting inodes may be found and added to the list while we
5770          * are logging a conflicting inode, so keep iterating while the list is
5771          * not empty.
5772          */
5773         while (!list_empty(&ctx->conflict_inodes)) {
5774                 struct btrfs_ino_list *curr;
5775                 struct btrfs_inode *inode;
5776                 u64 ino;
5777                 u64 parent;
5778
5779                 curr = list_first_entry(&ctx->conflict_inodes,
5780                                         struct btrfs_ino_list, list);
5781                 ino = curr->ino;
5782                 parent = curr->parent;
5783                 list_del(&curr->list);
5784                 kfree(curr);
5785
5786                 inode = btrfs_iget_logging(ino, root);
5787                 /*
5788                  * If the other inode that had a conflicting dir entry was
5789                  * deleted in the current transaction, we need to log its parent
5790                  * directory. See the comment at add_conflicting_inode().
5791                  */
5792                 if (IS_ERR(inode)) {
5793                         ret = PTR_ERR(inode);
5794                         if (ret != -ENOENT)
5795                                 break;
5796
5797                         inode = btrfs_iget_logging(parent, root);
5798                         if (IS_ERR(inode)) {
5799                                 ret = PTR_ERR(inode);
5800                                 break;
5801                         }
5802
5803                         /*
5804                          * Always log the directory, we cannot make this
5805                          * conditional on need_log_inode() because the directory
5806                          * might have been logged in LOG_INODE_EXISTS mode or
5807                          * the dir index of the conflicting inode is not in a
5808                          * dir index key range logged for the directory. So we
5809                          * must make sure the deletion is recorded.
5810                          */
5811                         ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx);
5812                         btrfs_add_delayed_iput(inode);
5813                         if (ret)
5814                                 break;
5815                         continue;
5816                 }
5817
5818                 /*
5819                  * Here we can use need_log_inode() because we only need to log
5820                  * the inode in LOG_INODE_EXISTS mode and rename operations
5821                  * update the log, so that the log ends up with the new name and
5822                  * without the old name.
5823                  *
5824                  * We did this check at add_conflicting_inode(), but here we do
5825                  * it again because if some other task logged the inode after
5826                  * that, we can avoid doing it again.
5827                  */
5828                 if (!need_log_inode(trans, inode)) {
5829                         btrfs_add_delayed_iput(inode);
5830                         continue;
5831                 }
5832
5833                 /*
5834                  * We are safe logging the other inode without acquiring its
5835                  * lock as long as we log with the LOG_INODE_EXISTS mode. We
5836                  * are safe against concurrent renames of the other inode as
5837                  * well because during a rename we pin the log and update the
5838                  * log with the new name before we unpin it.
5839                  */
5840                 ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
5841                 btrfs_add_delayed_iput(inode);
5842                 if (ret)
5843                         break;
5844         }
5845
5846         ctx->logging_conflict_inodes = false;
5847         if (ret)
5848                 free_conflicting_inodes(ctx);
5849
5850         return ret;
5851 }
5852 /* Walk @inode's items from @min_key up to @max_key, passing runs of adjacent slots to copy_items(). */
5853 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5854                                    struct btrfs_inode *inode,
5855                                    struct btrfs_key *min_key,
5856                                    const struct btrfs_key *max_key,
5857                                    struct btrfs_path *path,
5858                                    struct btrfs_path *dst_path,
5859                                    const u64 logged_isize,
5860                                    const int inode_only,
5861                                    struct btrfs_log_ctx *ctx,
5862                                    bool *need_log_inode_item)
5863 {
5864         const u64 i_size = i_size_read(&inode->vfs_inode);
5865         struct btrfs_root *root = inode->root;
5866         int ins_start_slot = 0;
5867         int ins_nr = 0;
5868         int ret;
5869
5870         while (1) {
5871                 ret = btrfs_search_forward(root, min_key, path, trans->transid);
5872                 if (ret < 0)
5873                         return ret;
5874                 if (ret > 0) { /* presumably nothing more to find; not an error */
5875                         ret = 0;
5876                         break;
5877                 }
5878 again:
5879                 /* Note, ins_nr might be > 0 here, that pending run is copied after the loop */
5880                 if (min_key->objectid != max_key->objectid)
5881                         break;
5882                 if (min_key->type > max_key->type)
5883                         break;
5884                 /* Special cases first; any other item joins the pending run further down. */
5885                 if (min_key->type == BTRFS_INODE_ITEM_KEY) {
5886                         *need_log_inode_item = false;
5887                 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
5888                            min_key->offset >= i_size) {
5889                         /*
5890                          * Extents at and beyond eof are logged with
5891                          * btrfs_log_prealloc_extents().
5892                          * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
5893                          * and no keys greater than that, so bail out.
5894                          */
5895                         break;
5896                 } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
5897                             min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5898                            (inode->generation == trans->transid ||
5899                             ctx->logging_conflict_inodes)) {
5900                         u64 other_ino = 0;
5901                         u64 other_parent = 0;
5902
5903                         ret = btrfs_check_ref_name_override(path->nodes[0],
5904                                         path->slots[0], min_key, inode,
5905                                         &other_ino, &other_parent);
5906                         if (ret < 0) {
5907                                 return ret;
5908                         } else if (ret > 0 &&
5909                                    other_ino != btrfs_ino(ctx->inode)) {
5910                                 if (ins_nr > 0) { /* include this ref item in what is copied now */
5911                                         ins_nr++;
5912                                 } else {
5913                                         ins_nr = 1;
5914                                         ins_start_slot = path->slots[0];
5915                                 }
5916                                 ret = copy_items(trans, inode, dst_path, path,
5917                                                  ins_start_slot, ins_nr,
5918                                                  inode_only, logged_isize, ctx);
5919                                 if (ret < 0)
5920                                         return ret;
5921                                 ins_nr = 0;
5922                                 /* Release the path first, add_conflicting_inode() is handed the same path. */
5923                                 btrfs_release_path(path);
5924                                 ret = add_conflicting_inode(trans, root, path,
5925                                                             other_ino,
5926                                                             other_parent, ctx);
5927                                 if (ret)
5928                                         return ret;
5929                                 goto next_key;
5930                         }
5931                 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5932                         /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
5933                         if (ins_nr == 0)
5934                                 goto next_slot;
5935                         ret = copy_items(trans, inode, dst_path, path,
5936                                          ins_start_slot,
5937                                          ins_nr, inode_only, logged_isize, ctx);
5938                         if (ret < 0)
5939                                 return ret;
5940                         ins_nr = 0;
5941                         goto next_slot;
5942                 }
5943                 /* Grow the pending run if this slot is adjacent to it, or start one if there is none. */
5944                 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5945                         ins_nr++;
5946                         goto next_slot;
5947                 } else if (!ins_nr) {
5948                         ins_start_slot = path->slots[0];
5949                         ins_nr = 1;
5950                         goto next_slot;
5951                 }
5952                 /* Not adjacent: copy the pending run, then start a new one at this slot. */
5953                 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5954                                  ins_nr, inode_only, logged_isize, ctx);
5955                 if (ret < 0)
5956                         return ret;
5957                 ins_nr = 1;
5958                 ins_start_slot = path->slots[0];
5959 next_slot:
5960                 path->slots[0]++;
5961                 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5962                         btrfs_item_key_to_cpu(path->nodes[0], min_key,
5963                                               path->slots[0]);
5964                         goto again;
5965                 }
5966                 if (ins_nr) {
5967                         ret = copy_items(trans, inode, dst_path, path,
5968                                          ins_start_slot, ins_nr, inode_only,
5969                                          logged_isize, ctx);
5970                         if (ret < 0)
5971                                 return ret;
5972                         ins_nr = 0;
5973                 }
5974                 btrfs_release_path(path);
5975 next_key: /* bump @min_key (offset first, then type) for the next search */
5976                 if (min_key->offset < (u64)-1) {
5977                         min_key->offset++;
5978                 } else if (min_key->type < max_key->type) {
5979                         min_key->type++;
5980                         min_key->offset = 0;
5981                 } else {
5982                         break;
5983                 }
5984
5985                 /*
5986                  * We may process many leaves full of items for our inode, so
5987                  * avoid monopolizing a cpu for too long by rescheduling while
5988                  * not holding locks on any tree.
5989                  */
5990                 cond_resched();
5991         }
5992         if (ins_nr) {
5993                 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5994                                  ins_nr, inode_only, logged_isize, ctx);
5995                 if (ret)
5996                         return ret;
5997         }
5998
5999         if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
6000                 /*
6001                  * Release the path because otherwise we might attempt to double
6002                  * lock the same leaf with btrfs_log_prealloc_extents() below.
6003                  */
6004                 btrfs_release_path(path);
6005                 ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx);
6006         }
6007
6008         return ret;
6009 }
6010
6011 static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
6012                                       struct btrfs_root *log,
6013                                       struct btrfs_path *path,
6014                                       const struct btrfs_item_batch *batch,
6015                                       const struct btrfs_delayed_item *first_item)
6016 {
6017         const struct btrfs_delayed_item *item = first_item;
6018         int err;
6019
6020         /* A single call inserts all the (still empty) items of the batch. */
6021         err = btrfs_insert_empty_items(trans, log, path, batch);
6022         if (err)
6023                 return err;
6024         /* Fill the items, from path->slots[0] on, with consecutive delayed items. */
6025         for (int n = 0; n < batch->nr; n++) {
6026                 struct extent_buffer *leaf = path->nodes[0];
6027                 const unsigned long dst = (unsigned long)
6028                         btrfs_item_ptr(leaf, path->slots[0], char);
6029
6030                 write_extent_buffer(leaf, &item->data, dst, item->data_len);
6031                 item = list_next_entry(item, log_list);
6032                 path->slots[0]++;
6033         }
6034         btrfs_release_path(path);
6035
6036         return 0;
6037 }
6038
6039 static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
6040                                        struct btrfs_inode *inode,
6041                                        struct btrfs_path *path,
6042                                        const struct list_head *delayed_ins_list,
6043                                        struct btrfs_log_ctx *ctx)
6044 {
6045         /*
6046          * Add to the log tree, in batches, the dir index items of
6047          * @delayed_ins_list that were not logged yet. A batch is capped at 195
6048          * items so that its keys and sizes (4095 bytes) fit in a single 4K page.
6049          */
6050         const int max_batch_size = 195;
6051         const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
6052         const u64 ino = btrfs_ino(inode);
6053         struct btrfs_root *log = inode->root->log_root;
6054         struct btrfs_item_batch batch = { .nr = 0, .total_data_size = 0 };
6055         const struct btrfs_delayed_item *batch_start = NULL;
6056         const struct btrfs_delayed_item *item;
6057         struct btrfs_key *keys;
6058         u32 *sizes;
6059         char *buf;
6060         u64 batch_bytes = 0;
6061         int ret;
6062
6063         /* Dir index items are about to be added to the log tree. */
6064         lockdep_assert_held(&inode->log_mutex);
6065
6066         /*
6067          * The delayed items were collected before index keys were copied from
6068          * the subvolume tree to the log tree. Some or all of them may have been
6069          * flushed right after that, and so may already have reached the log
6070          * through that copy. They are sorted by index number, so skip ahead to
6071          * the first one that was not logged yet.
6072          */
6073         list_for_each_entry(item, delayed_ins_list, log_list) {
6074                 if (item->index <= inode->last_dir_index_offset)
6075                         continue;
6076                 batch_start = item;
6077                 break;
6078         }
6079
6080         /* Either the list is empty or every delayed item was logged already. */
6081         if (batch_start == NULL)
6082                 return 0;
6083
6084         /* One allocation holds the sizes array followed by the keys array. */
6085         buf = kmalloc(max_batch_size * sizeof(u32) +
6086                       max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
6087         if (buf == NULL)
6088                 return -ENOMEM;
6089         sizes = (u32 *)buf;
6090         keys = (struct btrfs_key *)(buf + max_batch_size * sizeof(u32));
6091         batch.data_sizes = sizes;
6092         batch.keys = keys;
6093
6094         for (item = batch_start;
6095              !list_entry_is_head(item, delayed_ins_list, log_list);
6096              item = list_next_entry(item, log_list)) {
6097                 const u32 item_bytes = item->data_len + sizeof(struct btrfs_item);
6098                 const bool batch_full = (batch.nr == max_batch_size);
6099
6100                 /* Flush the pending batch when this item would not fit in it. */
6101                 if (batch_bytes + item_bytes > leaf_data_size || batch_full) {
6102                         ret = insert_delayed_items_batch(trans, log, path,
6103                                                          &batch, batch_start);
6104                         if (ret)
6105                                 goto out;
6106                         batch.nr = 0;
6107                         batch.total_data_size = 0;
6108                         batch_bytes = 0;
6109                         batch_start = item;
6110                 }
6111
6112                 /* batch.nr doubles as the index of the next free batch entry. */
6113                 sizes[batch.nr] = item->data_len;
6114                 keys[batch.nr].objectid = ino;
6115                 keys[batch.nr].type = BTRFS_DIR_INDEX_KEY;
6116                 keys[batch.nr].offset = item->index;
6117                 batch.nr++;
6118                 batch.total_data_size += item->data_len;
6119                 batch_bytes += item_bytes;
6120         }
6121
6122         ASSERT(batch.nr >= 1);
6123         ret = insert_delayed_items_batch(trans, log, path, &batch, batch_start);
6124
6125         item = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
6126                                log_list);
6127         inode->last_dir_index_offset = item->index;
6128 out:
6129         kfree(buf);
6130
6131         return ret;
6132 }
6133
6134 static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
6135                                       struct btrfs_inode *inode,
6136                                       struct btrfs_path *path,
6137                                       const struct list_head *delayed_del_list,
6138                                       struct btrfs_log_ctx *ctx)
6139 {
6140         /*
6141          * Log one dir range item for each run of consecutive indexes found in
6142          * @delayed_del_list. log_delayed_deletion_items() picks this variant
6143          * when ctx->logged_before is not set.
6144          */
6145         struct btrfs_root *log = inode->root->log_root;
6146         const u64 ino = btrfs_ino(inode);
6147         const struct btrfs_delayed_item *item;
6148
6149         item = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6150                                 log_list);
6151         while (!list_entry_is_head(item, delayed_del_list, log_list)) {
6152                 const u64 first_dir_index = item->index;
6153                 u64 last_dir_index = first_dir_index;
6154                 int ret;
6155
6156                 /*
6157                  * Extend the range for as long as the following items have
6158                  * consecutive indexes, so that a single range item covers
6159                  * all of them instead of one range item per dir index item.
6160                  */
6161                 item = list_next_entry(item, log_list);
6162                 while (!list_entry_is_head(item, delayed_del_list, log_list) &&
6163                        item->index == last_dir_index + 1) {
6164                         last_dir_index = item->index;
6165                         item = list_next_entry(item, log_list);
6166                 }
6167                 ASSERT(last_dir_index >= first_dir_index);
6168
6169                 ret = insert_dir_log_key(trans, log, path, ino,
6170                                          first_dir_index, last_dir_index);
6171                 if (ret)
6172                         return ret;
6173         }
6174
6175         return 0;
6176 }
6177
6178 static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
6179                                         struct btrfs_inode *inode,
6180                                         struct btrfs_path *path,
6181                                         const struct list_head *delayed_del_list,
6182                                         const struct btrfs_delayed_item *first,
6183                                         const struct btrfs_delayed_item **last_ret)
6184 {
6185         struct extent_buffer *leaf = path->nodes[0];
6186         const int first_slot = path->slots[0];
6187         const int last_slot = btrfs_header_nritems(leaf) - 1;
6188         const u64 ino = btrfs_ino(inode);
6189         const struct btrfs_delayed_item *item;
6190         int nr = 1;
6191
6192         /* Count the slots after path->slots[0] that match the next delayed items. */
6193         for (item = list_next_entry(first, log_list);
6194              first_slot + nr < last_slot &&
6195              !list_entry_is_head(item, delayed_del_list, log_list);
6196              item = list_next_entry(item, log_list)) {
6197                 struct btrfs_key key;
6198
6199                 btrfs_item_key_to_cpu(leaf, &key, first_slot + nr);
6200                 if (!(key.objectid == ino &&
6201                       key.type == BTRFS_DIR_INDEX_KEY &&
6202                       key.offset == item->index))
6203                         break;
6204                 nr++;
6205                 *last_ret = item;
6206         }
6207
6208         return btrfs_del_items(trans, inode->root->log_root, path,
6209                                first_slot, nr);
6210 }
6211
6212 static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
6213                                              struct btrfs_inode *inode,
6214                                              struct btrfs_path *path,
6215                                              const struct list_head *delayed_del_list,
6216                                              struct btrfs_log_ctx *ctx)
6217 {
6218         struct btrfs_root *log = inode->root->log_root;
6219         const struct btrfs_delayed_item *curr;
6220         const struct btrfs_delayed_item *last;
6221         u64 last_range_start = 0;
6222         u64 last_range_end = 0;
6223         struct btrfs_key key = {
6224                 .objectid = btrfs_ino(inode),
6225                 .type = BTRFS_DIR_INDEX_KEY,
6226         };
6227
6228         for (curr = list_first_entry(delayed_del_list,
6229                                      struct btrfs_delayed_item, log_list);
6230              !list_entry_is_head(curr, delayed_del_list, log_list);
6231              curr = list_next_entry(last, log_list)) {
6232                 u64 first_dir_index = curr->index;
6233                 u64 last_dir_index;
6234                 int ret;
6235
6236                 /* Only batch_delete_dir_index_items() moves @last past @curr. */
6237                 last = curr;
6238                 key.offset = curr->index;
6239                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
6240                 if (ret < 0)
6241                         return ret;
6242
6243                 if (ret == 0) {
6244                         /*
6245                          * The dir index item is in the log. Delete it, along
6246                          * with any following ones that are also in the list. A
6247                          * range item logging their range exists in that case,
6248                          * so none has to be added or updated.
6249                          */
6250                         ret = batch_delete_dir_index_items(trans, inode, path,
6251                                                            delayed_del_list, curr,
6252                                                            &last);
6253                         if (ret)
6254                                 return ret;
6255                         btrfs_release_path(path);
6256                         continue;
6257                 }
6258                 btrfs_release_path(path);
6259
6260                 /* Nothing was deleted from the log, so log a dir range item. */
6261                 last_dir_index = last->index;
6262                 ASSERT(last_dir_index >= first_dir_index);
6263                 /*
6264                  * When this range begins right after the end of the previous
6265                  * one, reuse the previous range item by making it end where
6266                  * this range ends. That saves leaf space, as no new range item
6267                  * is added.
6268                  */
6269                 if (last_range_end != 0 && first_dir_index == last_range_end + 1)
6270                         first_dir_index = last_range_start;
6271
6272                 ret = insert_dir_log_key(trans, log, path, key.objectid,
6273                                          first_dir_index, last_dir_index);
6274                 if (ret)
6275                         return ret;
6276
6277                 last_range_start = first_dir_index;
6278                 last_range_end = last_dir_index;
6279         }
6280
6281         return 0;
6282 }
6283
6284 static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
6285                                       struct btrfs_inode *inode,
6286                                       struct btrfs_path *path,
6287                                       const struct list_head *delayed_del_list,
6288                                       struct btrfs_log_ctx *ctx)
6289 {
6290         int ret = 0;
6291
6292         /* Dir index items get deleted from the log tree, or range items added. */
6293         lockdep_assert_held(&inode->log_mutex);
6294
6295         if (!list_empty(delayed_del_list)) {
6296                 if (ctx->logged_before)
6297                         ret = log_delayed_deletions_incremental(trans, inode, path,
6298                                                                 delayed_del_list, ctx);
6299                 else
6300                         ret = log_delayed_deletions_full(trans, inode, path,
6301                                                          delayed_del_list, ctx);
6302         }
6303
6304         return ret;
6305 }
6306
6307 /*
6308  * Counterpart of log_new_dir_dentries() that walks the delayed insertion items
6309  * of a directory rather than the subvolume tree.
6310  */
6311 static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
6312                                     struct btrfs_inode *inode,
6313                                     const struct list_head *delayed_ins_list,
6314                                     struct btrfs_log_ctx *ctx)
6315 {
6316         const bool saved_log_new_dentries = ctx->log_new_dentries;
6317         struct btrfs_delayed_item *item;
6318         int ret = 0;
6319
6320         /*
6321          * The log mutex is not needed here, and not holding it avoids potential
6322          * deadlocks or lockdep annotations caused by nesting delayed inode
6323          * mutexes and log mutexes.
6324          */
6325         lockdep_assert_not_held(&inode->log_mutex);
6326
6327         ASSERT(!ctx->logging_new_delayed_dentries);
6328         ctx->logging_new_delayed_dentries = true;
6329
6330         list_for_each_entry(item, delayed_ins_list, log_list) {
6331                 struct btrfs_dir_item *di = (struct btrfs_dir_item *)item->data;
6332                 struct btrfs_inode *child;
6333                 struct btrfs_key location;
6334
6335                 btrfs_disk_key_to_cpu(&location, &di->location);
6336                 /* Entries whose location is a root item are not logged here. */
6337                 if (location.type == BTRFS_ROOT_ITEM_KEY)
6338                         continue;
6339
6340                 child = btrfs_iget_logging(location.objectid, inode->root);
6341                 if (IS_ERR(child)) {
6342                         ret = PTR_ERR(child);
6343                         break;
6344                 }
6345
6346                 /*
6347                  * Directories are logged with LOG_INODE_ALL, anything else with
6348                  * LOG_INODE_EXISTS. Nothing is logged if need_log_inode() says no.
6349                  */
6350                 if (need_log_inode(trans, child)) {
6351                         const int log_mode =
6352                                 (btrfs_stack_dir_ftype(di) == BTRFS_FT_DIR) ?
6353                                 LOG_INODE_ALL : LOG_INODE_EXISTS;
6354
6355                         /* Cleared so the check below only sees what this call set. */
6356                         ctx->log_new_dentries = false;
6357                         ret = btrfs_log_inode(trans, child, log_mode, ctx);
6358                         if (ret == 0 && ctx->log_new_dentries)
6359                                 ret = log_new_dir_dentries(trans, child, ctx);
6360                 }
6361                 btrfs_add_delayed_iput(child);
6362
6363                 if (ret)
6364                         break;
6365         }
6366
6367         /* Put back the value @ctx had on entry. */
6368         ctx->log_new_dentries = saved_log_new_dentries;
6369         ctx->logging_new_delayed_dentries = false;
6370
6371         return ret;
6372 }
6373
6374 /* log a single inode in the tree log.
6375  * At least one parent directory for this inode must exist in the tree
6376  * or be logged already.
6377  *
6378  * Any items from this inode changed by the current transaction are copied
6379  * to the log tree.  An extra reference is taken on any extents in this
6380  * file, allowing us to avoid a whole pile of corner cases around logging
6381  * blocks that have been removed from the tree.
6382  *
6383  * See LOG_INODE_ALL and related defines for a description of what inode_only
6384  * does.
6385  *
6386  * This handles both files and directories.
6387  */
6388 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
6389                            struct btrfs_inode *inode,
6390                            int inode_only,
6391                            struct btrfs_log_ctx *ctx)
6392 {
6393         struct btrfs_path *path;
6394         struct btrfs_path *dst_path;
6395         struct btrfs_key min_key;
6396         struct btrfs_key max_key;
6397         struct btrfs_root *log = inode->root->log_root;
6398         int ret;
6399         bool fast_search = false;
6400         u64 ino = btrfs_ino(inode);
6401         struct extent_map_tree *em_tree = &inode->extent_tree;
6402         u64 logged_isize = 0;
6403         bool need_log_inode_item = true;
6404         bool xattrs_logged = false;
6405         bool inode_item_dropped = true;
6406         bool full_dir_logging = false;
6407         LIST_HEAD(delayed_ins_list);
6408         LIST_HEAD(delayed_del_list);
6409
6410         path = btrfs_alloc_path();
6411         if (!path)
6412                 return -ENOMEM;
6413         dst_path = btrfs_alloc_path();
6414         if (!dst_path) {
6415                 btrfs_free_path(path);
6416                 return -ENOMEM;
6417         }
6418
6419         min_key.objectid = ino;
6420         min_key.type = BTRFS_INODE_ITEM_KEY;
6421         min_key.offset = 0;
6422
6423         max_key.objectid = ino;
6424
6425
6426         /* today the code can only do partial logging of directories */
6427         if (S_ISDIR(inode->vfs_inode.i_mode) ||
6428             (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6429                        &inode->runtime_flags) &&
6430              inode_only >= LOG_INODE_EXISTS))
6431                 max_key.type = BTRFS_XATTR_ITEM_KEY;
6432         else
6433                 max_key.type = (u8)-1;
6434         max_key.offset = (u64)-1;
6435
6436         if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
6437                 full_dir_logging = true;
6438
6439         /*
6440          * If we are logging a directory while we are logging dentries of the
6441          * delayed items of some other inode, then we need to flush the delayed
6442          * items of this directory and not log the delayed items directly. This
6443          * is to prevent more than one level of recursion into btrfs_log_inode()
6444          * by having something like this:
6445          *
6446          *     $ mkdir -p a/b/c/d/e/f/g/h/...
6447          *     $ xfs_io -c "fsync" a
6448          *
6449          * Where all directories in the path did not exist before and are
6450          * created in the current transaction.
6451          * So in such a case we directly log the delayed items of the main
6452          * directory ("a") without flushing them first, while for each of its
6453          * subdirectories we flush their delayed items before logging them.
6454          * This prevents a potential unbounded recursion like this:
6455          *
6456          * btrfs_log_inode()
6457          *   log_new_delayed_dentries()
6458          *      btrfs_log_inode()
6459          *        log_new_delayed_dentries()
6460          *          btrfs_log_inode()
6461          *            log_new_delayed_dentries()
6462          *              (...)
6463          *
6464          * We have thresholds for the maximum number of delayed items to have in
6465          * memory, and once they are hit, the items are flushed asynchronously.
6466          * However the limit is quite high, so lets prevent deep levels of
6467          * recursion to happen by limiting the maximum depth to be 1.
6468          */
6469         if (full_dir_logging && ctx->logging_new_delayed_dentries) {
6470                 ret = btrfs_commit_inode_delayed_items(trans, inode);
6471                 if (ret)
6472                         goto out;
6473         }
6474
6475         mutex_lock(&inode->log_mutex);
6476
6477         /*
6478          * For symlinks, we must always log their content, which is stored in an
6479          * inline extent, otherwise we could end up with an empty symlink after
6480          * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
6481          * one attempts to create an empty symlink).
6482          * We don't need to worry about flushing delalloc, because we create the
6483          * inline extent when the symlink is created (we never have delalloc
6484          * for symlinks).
6485          */
6486         if (S_ISLNK(inode->vfs_inode.i_mode))
6487                 inode_only = LOG_INODE_ALL;
6488
6489         /*
6490          * Before logging the inode item, cache the value returned by
6491          * inode_logged(), because after that we have the need to figure out if
6492          * the inode was previously logged in this transaction.
6493          */
6494         ret = inode_logged(trans, inode, path);
6495         if (ret < 0)
6496                 goto out_unlock;
6497         ctx->logged_before = (ret == 1);
6498         ret = 0;
6499
6500         /*
6501          * This is for cases where logging a directory could result in losing
6502          * a file after replaying the log. For example, if we move a file from a
6503          * directory A to a directory B, then fsync directory A, we have no way
6504          * to know the file was moved from A to B, so logging just A would
6505          * result in losing the file after a log replay.
6506          */
6507         if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
6508                 ret = BTRFS_LOG_FORCE_COMMIT;
6509                 goto out_unlock;
6510         }
6511
6512         /*
6513          * a brute force approach to making sure we get the most uptodate
6514          * copies of everything.
6515          */
6516         if (S_ISDIR(inode->vfs_inode.i_mode)) {
6517                 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
6518                 if (ctx->logged_before)
6519                         ret = drop_inode_items(trans, log, path, inode,
6520                                                BTRFS_XATTR_ITEM_KEY);
6521         } else {
6522                 if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
6523                         /*
6524                          * Make sure the new inode item we write to the log has
6525                          * the same isize as the current one (if it exists).
6526                          * This is necessary to prevent data loss after log
6527                          * replay, and also to prevent doing a wrong expanding
6528                          * truncate - for e.g. create file, write 4K into offset
6529                          * 0, fsync, write 4K into offset 4096, add hard link,
6530                          * fsync some other file (to sync log), power fail - if
6531                          * we use the inode's current i_size, after log replay
6532                          * we get a 8Kb file, with the last 4Kb extent as a hole
6533                          * (zeroes), as if an expanding truncate happened,
6534                          * instead of getting a file of 4Kb only.
6535                          */
6536                         ret = logged_inode_size(log, inode, path, &logged_isize);
6537                         if (ret)
6538                                 goto out_unlock;
6539                 }
6540                 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6541                              &inode->runtime_flags)) {
6542                         if (inode_only == LOG_INODE_EXISTS) {
6543                                 max_key.type = BTRFS_XATTR_ITEM_KEY;
6544                                 if (ctx->logged_before)
6545                                         ret = drop_inode_items(trans, log, path,
6546                                                                inode, max_key.type);
6547                         } else {
6548                                 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6549                                           &inode->runtime_flags);
6550                                 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6551                                           &inode->runtime_flags);
6552                                 if (ctx->logged_before)
6553                                         ret = truncate_inode_items(trans, log,
6554                                                                    inode, 0, 0);
6555                         }
6556                 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6557                                               &inode->runtime_flags) ||
6558                            inode_only == LOG_INODE_EXISTS) {
6559                         if (inode_only == LOG_INODE_ALL)
6560                                 fast_search = true;
6561                         max_key.type = BTRFS_XATTR_ITEM_KEY;
6562                         if (ctx->logged_before)
6563                                 ret = drop_inode_items(trans, log, path, inode,
6564                                                        max_key.type);
6565                 } else {
6566                         if (inode_only == LOG_INODE_ALL)
6567                                 fast_search = true;
6568                         inode_item_dropped = false;
6569                         goto log_extents;
6570                 }
6571
6572         }
6573         if (ret)
6574                 goto out_unlock;
6575
6576         /*
6577          * If we are logging a directory in full mode, collect the delayed items
6578          * before iterating the subvolume tree, so that we don't miss any new
6579          * dir index items in case they get flushed while or right after we are
6580          * iterating the subvolume tree.
6581          */
6582         if (full_dir_logging && !ctx->logging_new_delayed_dentries)
6583                 btrfs_log_get_delayed_items(inode, &delayed_ins_list,
6584                                             &delayed_del_list);
6585
6586         /*
6587          * If we are fsyncing a file with 0 hard links, then commit the delayed
6588          * inode because the last inode ref (or extref) item may still be in the
6589          * subvolume tree and if we log it the file will still exist after a log
6590          * replay. So commit the delayed inode to delete that last ref and we
6591          * skip logging it.
6592          */
6593         if (inode->vfs_inode.i_nlink == 0) {
6594                 ret = btrfs_commit_inode_delayed_inode(inode);
6595                 if (ret)
6596                         goto out_unlock;
6597         }
6598
6599         ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
6600                                       path, dst_path, logged_isize,
6601                                       inode_only, ctx,
6602                                       &need_log_inode_item);
6603         if (ret)
6604                 goto out_unlock;
6605
6606         btrfs_release_path(path);
6607         btrfs_release_path(dst_path);
6608         ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
6609         if (ret)
6610                 goto out_unlock;
6611         xattrs_logged = true;
6612         if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
6613                 btrfs_release_path(path);
6614                 btrfs_release_path(dst_path);
6615                 ret = btrfs_log_holes(trans, inode, path);
6616                 if (ret)
6617                         goto out_unlock;
6618         }
6619 log_extents:
6620         btrfs_release_path(path);
6621         btrfs_release_path(dst_path);
6622         if (need_log_inode_item) {
6623                 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
6624                 if (ret)
6625                         goto out_unlock;
6626                 /*
6627                  * If we are doing a fast fsync and the inode was logged before
6628                  * in this transaction, we don't need to log the xattrs because
6629                  * they were logged before. If xattrs were added, changed or
6630                  * deleted since the last time we logged the inode, then we have
6631                  * already logged them because the inode had the runtime flag
6632                  * BTRFS_INODE_COPY_EVERYTHING set.
6633                  */
6634                 if (!xattrs_logged && inode->logged_trans < trans->transid) {
6635                         ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
6636                         if (ret)
6637                                 goto out_unlock;
6638                         btrfs_release_path(path);
6639                 }
6640         }
6641         if (fast_search) {
6642                 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
6643                 if (ret)
6644                         goto out_unlock;
6645         } else if (inode_only == LOG_INODE_ALL) {
6646                 struct extent_map *em, *n;
6647
6648                 write_lock(&em_tree->lock);
6649                 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
6650                         list_del_init(&em->list);
6651                 write_unlock(&em_tree->lock);
6652         }
6653
6654         if (full_dir_logging) {
6655                 ret = log_directory_changes(trans, inode, path, dst_path, ctx);
6656                 if (ret)
6657                         goto out_unlock;
6658                 ret = log_delayed_insertion_items(trans, inode, path,
6659                                                   &delayed_ins_list, ctx);
6660                 if (ret)
6661                         goto out_unlock;
6662                 ret = log_delayed_deletion_items(trans, inode, path,
6663                                                  &delayed_del_list, ctx);
6664                 if (ret)
6665                         goto out_unlock;
6666         }
6667
6668         spin_lock(&inode->lock);
6669         inode->logged_trans = trans->transid;
6670         /*
6671          * Don't update last_log_commit if we logged that an inode exists.
6672          * We do this for three reasons:
6673          *
6674          * 1) We might have had buffered writes to this inode that were
6675          *    flushed and had their ordered extents completed in this
6676          *    transaction, but we did not previously log the inode with
6677          *    LOG_INODE_ALL. Later the inode was evicted and after that
6678          *    it was loaded again and this LOG_INODE_EXISTS log operation
6679          *    happened. We must make sure that if an explicit fsync against
6680          *    the inode is performed later, it logs the new extents, an
6681          *    updated inode item, etc, and syncs the log. The same logic
6682          *    applies to direct IO writes instead of buffered writes.
6683          *
6684          * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
6685          *    is logged with an i_size of 0 or whatever value was logged
6686          *    before. If later the i_size of the inode is increased by a
6687          *    truncate operation, the log is synced through an fsync of
6688          *    some other inode and then finally an explicit fsync against
6689          *    this inode is made, we must make sure this fsync logs the
6690          *    inode with the new i_size, the hole between old i_size and
6691          *    the new i_size, and syncs the log.
6692          *
6693          * 3) If we are logging that an ancestor inode exists as part of
6694          *    logging a new name from a link or rename operation, don't update
6695          *    its last_log_commit - otherwise if an explicit fsync is made
6696          *    against an ancestor, the fsync considers the inode in the log
6697          *    and doesn't sync the log, resulting in the ancestor missing after
6698          *    a power failure unless the log was synced as part of an fsync
6699          *    against any other unrelated inode.
6700          */
6701         if (inode_only != LOG_INODE_EXISTS)
6702                 inode->last_log_commit = inode->last_sub_trans;
6703         spin_unlock(&inode->lock);
6704
6705         /*
6706          * Reset the last_reflink_trans so that the next fsync does not need to
6707          * go through the slower path when logging extents and their checksums.
6708          */
6709         if (inode_only == LOG_INODE_ALL)
6710                 inode->last_reflink_trans = 0;
6711
6712 out_unlock:
6713         mutex_unlock(&inode->log_mutex);
6714 out:
6715         btrfs_free_path(path);
6716         btrfs_free_path(dst_path);
6717
6718         if (ret)
6719                 free_conflicting_inodes(ctx);
6720         else
6721                 ret = log_conflicting_inodes(trans, inode->root, ctx);
6722
6723         if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
6724                 if (!ret)
6725                         ret = log_new_delayed_dentries(trans, inode,
6726                                                        &delayed_ins_list, ctx);
6727
6728                 btrfs_log_put_delayed_items(inode, &delayed_ins_list,
6729                                             &delayed_del_list);
6730         }
6731
6732         return ret;
6733 }
6734
6735 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
6736                                  struct btrfs_inode *inode,
6737                                  struct btrfs_log_ctx *ctx)
6738 {
6739         int ret;
6740         struct btrfs_path *path;
6741         struct btrfs_key key;
6742         struct btrfs_root *root = inode->root;
6743         const u64 ino = btrfs_ino(inode);
6744
6745         path = btrfs_alloc_path();
6746         if (!path)
6747                 return -ENOMEM;
6748         path->skip_locking = 1;
6749         path->search_commit_root = 1;
6750
6751         key.objectid = ino;
6752         key.type = BTRFS_INODE_REF_KEY;
6753         key.offset = 0;
6754         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6755         if (ret < 0)
6756                 goto out;
6757
6758         while (true) {
6759                 struct extent_buffer *leaf = path->nodes[0];
6760                 int slot = path->slots[0];
6761                 u32 cur_offset = 0;
6762                 u32 item_size;
6763                 unsigned long ptr;
6764
6765                 if (slot >= btrfs_header_nritems(leaf)) {
6766                         ret = btrfs_next_leaf(root, path);
6767                         if (ret < 0)
6768                                 goto out;
6769                         else if (ret > 0)
6770                                 break;
6771                         continue;
6772                 }
6773
6774                 btrfs_item_key_to_cpu(leaf, &key, slot);
6775                 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
6776                 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
6777                         break;
6778
6779                 item_size = btrfs_item_size(leaf, slot);
6780                 ptr = btrfs_item_ptr_offset(leaf, slot);
6781                 while (cur_offset < item_size) {
6782                         struct btrfs_key inode_key;
6783                         struct btrfs_inode *dir_inode;
6784
6785                         inode_key.type = BTRFS_INODE_ITEM_KEY;
6786                         inode_key.offset = 0;
6787
6788                         if (key.type == BTRFS_INODE_EXTREF_KEY) {
6789                                 struct btrfs_inode_extref *extref;
6790
6791                                 extref = (struct btrfs_inode_extref *)
6792                                         (ptr + cur_offset);
6793                                 inode_key.objectid = btrfs_inode_extref_parent(
6794                                         leaf, extref);
6795                                 cur_offset += sizeof(*extref);
6796                                 cur_offset += btrfs_inode_extref_name_len(leaf,
6797                                         extref);
6798                         } else {
6799                                 inode_key.objectid = key.offset;
6800                                 cur_offset = item_size;
6801                         }
6802
6803                         dir_inode = btrfs_iget_logging(inode_key.objectid, root);
6804                         /*
6805                          * If the parent inode was deleted, return an error to
6806                          * fallback to a transaction commit. This is to prevent
6807                          * getting an inode that was moved from one parent A to
6808                          * a parent B, got its former parent A deleted and then
6809                          * it got fsync'ed, from existing at both parents after
6810                          * a log replay (and the old parent still existing).
6811                          * Example:
6812                          *
6813                          * mkdir /mnt/A
6814                          * mkdir /mnt/B
6815                          * touch /mnt/B/bar
6816                          * sync
6817                          * mv /mnt/B/bar /mnt/A/bar
6818                          * mv -T /mnt/A /mnt/B
6819                          * fsync /mnt/B/bar
6820                          * <power fail>
6821                          *
6822                          * If we ignore the old parent B which got deleted,
6823                          * after a log replay we would have file bar linked
6824                          * at both parents and the old parent B would still
6825                          * exist.
6826                          */
6827                         if (IS_ERR(dir_inode)) {
6828                                 ret = PTR_ERR(dir_inode);
6829                                 goto out;
6830                         }
6831
6832                         if (!need_log_inode(trans, dir_inode)) {
6833                                 btrfs_add_delayed_iput(dir_inode);
6834                                 continue;
6835                         }
6836
6837                         ctx->log_new_dentries = false;
6838                         ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx);
6839                         if (!ret && ctx->log_new_dentries)
6840                                 ret = log_new_dir_dentries(trans, dir_inode, ctx);
6841                         btrfs_add_delayed_iput(dir_inode);
6842                         if (ret)
6843                                 goto out;
6844                 }
6845                 path->slots[0]++;
6846         }
6847         ret = 0;
6848 out:
6849         btrfs_free_path(path);
6850         return ret;
6851 }
6852
6853 static int log_new_ancestors(struct btrfs_trans_handle *trans,
6854                              struct btrfs_root *root,
6855                              struct btrfs_path *path,
6856                              struct btrfs_log_ctx *ctx)
6857 {
6858         struct btrfs_key found_key;
6859
6860         btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
6861
6862         while (true) {
6863                 struct extent_buffer *leaf;
6864                 int slot;
6865                 struct btrfs_key search_key;
6866                 struct btrfs_inode *inode;
6867                 u64 ino;
6868                 int ret = 0;
6869
6870                 btrfs_release_path(path);
6871
6872                 ino = found_key.offset;
6873
6874                 search_key.objectid = found_key.offset;
6875                 search_key.type = BTRFS_INODE_ITEM_KEY;
6876                 search_key.offset = 0;
6877                 inode = btrfs_iget_logging(ino, root);
6878                 if (IS_ERR(inode))
6879                         return PTR_ERR(inode);
6880
6881                 if (inode->generation >= trans->transid &&
6882                     need_log_inode(trans, inode))
6883                         ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
6884                 btrfs_add_delayed_iput(inode);
6885                 if (ret)
6886                         return ret;
6887
6888                 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
6889                         break;
6890
6891                 search_key.type = BTRFS_INODE_REF_KEY;
6892                 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6893                 if (ret < 0)
6894                         return ret;
6895
6896                 leaf = path->nodes[0];
6897                 slot = path->slots[0];
6898                 if (slot >= btrfs_header_nritems(leaf)) {
6899                         ret = btrfs_next_leaf(root, path);
6900                         if (ret < 0)
6901                                 return ret;
6902                         else if (ret > 0)
6903                                 return -ENOENT;
6904                         leaf = path->nodes[0];
6905                         slot = path->slots[0];
6906                 }
6907
6908                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
6909                 if (found_key.objectid != search_key.objectid ||
6910                     found_key.type != BTRFS_INODE_REF_KEY)
6911                         return -ENOENT;
6912         }
6913         return 0;
6914 }
6915
6916 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
6917                                   struct btrfs_inode *inode,
6918                                   struct dentry *parent,
6919                                   struct btrfs_log_ctx *ctx)
6920 {
6921         struct btrfs_root *root = inode->root;
6922         struct dentry *old_parent = NULL;
6923         struct super_block *sb = inode->vfs_inode.i_sb;
6924         int ret = 0;
6925
6926         while (true) {
6927                 if (!parent || d_really_is_negative(parent) ||
6928                     sb != parent->d_sb)
6929                         break;
6930
6931                 inode = BTRFS_I(d_inode(parent));
6932                 if (root != inode->root)
6933                         break;
6934
6935                 if (inode->generation >= trans->transid &&
6936                     need_log_inode(trans, inode)) {
6937                         ret = btrfs_log_inode(trans, inode,
6938                                               LOG_INODE_EXISTS, ctx);
6939                         if (ret)
6940                                 break;
6941                 }
6942                 if (IS_ROOT(parent))
6943                         break;
6944
6945                 parent = dget_parent(parent);
6946                 dput(old_parent);
6947                 old_parent = parent;
6948         }
6949         dput(old_parent);
6950
6951         return ret;
6952 }
6953
6954 static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
6955                                  struct btrfs_inode *inode,
6956                                  struct dentry *parent,
6957                                  struct btrfs_log_ctx *ctx)
6958 {
6959         struct btrfs_root *root = inode->root;
6960         const u64 ino = btrfs_ino(inode);
6961         struct btrfs_path *path;
6962         struct btrfs_key search_key;
6963         int ret;
6964
6965         /*
6966          * For a single hard link case, go through a fast path that does not
6967          * need to iterate the fs/subvolume tree.
6968          */
6969         if (inode->vfs_inode.i_nlink < 2)
6970                 return log_new_ancestors_fast(trans, inode, parent, ctx);
6971
6972         path = btrfs_alloc_path();
6973         if (!path)
6974                 return -ENOMEM;
6975
6976         search_key.objectid = ino;
6977         search_key.type = BTRFS_INODE_REF_KEY;
6978         search_key.offset = 0;
6979 again:
6980         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6981         if (ret < 0)
6982                 goto out;
6983         if (ret == 0)
6984                 path->slots[0]++;
6985
6986         while (true) {
6987                 struct extent_buffer *leaf = path->nodes[0];
6988                 int slot = path->slots[0];
6989                 struct btrfs_key found_key;
6990
6991                 if (slot >= btrfs_header_nritems(leaf)) {
6992                         ret = btrfs_next_leaf(root, path);
6993                         if (ret < 0)
6994                                 goto out;
6995                         else if (ret > 0)
6996                                 break;
6997                         continue;
6998                 }
6999
7000                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7001                 if (found_key.objectid != ino ||
7002                     found_key.type > BTRFS_INODE_EXTREF_KEY)
7003                         break;
7004
7005                 /*
7006                  * Don't deal with extended references because they are rare
7007                  * cases and too complex to deal with (we would need to keep
7008                  * track of which subitem we are processing for each item in
7009                  * this loop, etc). So just return some error to fallback to
7010                  * a transaction commit.
7011                  */
7012                 if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
7013                         ret = -EMLINK;
7014                         goto out;
7015                 }
7016
7017                 /*
7018                  * Logging ancestors needs to do more searches on the fs/subvol
7019                  * tree, so it releases the path as needed to avoid deadlocks.
7020                  * Keep track of the last inode ref key and resume from that key
7021                  * after logging all new ancestors for the current hard link.
7022                  */
7023                 memcpy(&search_key, &found_key, sizeof(search_key));
7024
7025                 ret = log_new_ancestors(trans, root, path, ctx);
7026                 if (ret)
7027                         goto out;
7028                 btrfs_release_path(path);
7029                 goto again;
7030         }
7031         ret = 0;
7032 out:
7033         btrfs_free_path(path);
7034         return ret;
7035 }
7036
7037 /*
7038  * helper function around btrfs_log_inode to make sure newly created
7039  * parent directories also end up in the log.  A minimal inode and backref
7040  * only logging is done of any parent directories that are older than
7041  * the last committed transaction
7042  */
7043 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
7044                                   struct btrfs_inode *inode,
7045                                   struct dentry *parent,
7046                                   int inode_only,
7047                                   struct btrfs_log_ctx *ctx)
7048 {
7049         struct btrfs_root *root = inode->root;
7050         struct btrfs_fs_info *fs_info = root->fs_info;
7051         int ret = 0;
7052         bool log_dentries;
7053
7054         if (btrfs_test_opt(fs_info, NOTREELOG))
7055                 return BTRFS_LOG_FORCE_COMMIT;
7056
7057         if (btrfs_root_refs(&root->root_item) == 0)
7058                 return BTRFS_LOG_FORCE_COMMIT;
7059
7060         /*
7061          * If we're logging an inode from a subvolume created in the current
7062          * transaction we must force a commit since the root is not persisted.
7063          */
7064         if (btrfs_root_generation(&root->root_item) == trans->transid)
7065                 return BTRFS_LOG_FORCE_COMMIT;
7066
7067         /* Skip already logged inodes and without new extents. */
7068         if (btrfs_inode_in_log(inode, trans->transid) &&
7069             list_empty(&ctx->ordered_extents))
7070                 return BTRFS_NO_LOG_SYNC;
7071
7072         ret = start_log_trans(trans, root, ctx);
7073         if (ret)
7074                 return ret;
7075
7076         ret = btrfs_log_inode(trans, inode, inode_only, ctx);
7077         if (ret)
7078                 goto end_trans;
7079
7080         /*
7081          * for regular files, if its inode is already on disk, we don't
7082          * have to worry about the parents at all.  This is because
7083          * we can use the last_unlink_trans field to record renames
7084          * and other fun in this file.
7085          */
7086         if (S_ISREG(inode->vfs_inode.i_mode) &&
7087             inode->generation < trans->transid &&
7088             inode->last_unlink_trans < trans->transid) {
7089                 ret = 0;
7090                 goto end_trans;
7091         }
7092
7093         /*
7094          * Track if we need to log dentries because ctx->log_new_dentries can
7095          * be modified in the call chains below.
7096          */
7097         log_dentries = ctx->log_new_dentries;
7098
7099         /*
7100          * On unlink we must make sure all our current and old parent directory
7101          * inodes are fully logged. This is to prevent leaving dangling
7102          * directory index entries in directories that were our parents but are
7103          * not anymore. Not doing this results in old parent directory being
7104          * impossible to delete after log replay (rmdir will always fail with
7105          * error -ENOTEMPTY).
7106          *
7107          * Example 1:
7108          *
7109          * mkdir testdir
7110          * touch testdir/foo
7111          * ln testdir/foo testdir/bar
7112          * sync
7113          * unlink testdir/bar
7114          * xfs_io -c fsync testdir/foo
7115          * <power failure>
7116          * mount fs, triggers log replay
7117          *
7118          * If we don't log the parent directory (testdir), after log replay the
7119          * directory still has an entry pointing to the file inode using the bar
7120          * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
7121          * the file inode has a link count of 1.
7122          *
7123          * Example 2:
7124          *
7125          * mkdir testdir
7126          * touch foo
7127          * ln foo testdir/foo2
7128          * ln foo testdir/foo3
7129          * sync
7130          * unlink testdir/foo3
7131          * xfs_io -c fsync foo
7132          * <power failure>
7133          * mount fs, triggers log replay
7134          *
7135          * Similar as the first example, after log replay the parent directory
7136          * testdir still has an entry pointing to the inode file with name foo3
7137          * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
7138          * and has a link count of 2.
7139          */
7140         if (inode->last_unlink_trans >= trans->transid) {
7141                 ret = btrfs_log_all_parents(trans, inode, ctx);
7142                 if (ret)
7143                         goto end_trans;
7144         }
7145
7146         ret = log_all_new_ancestors(trans, inode, parent, ctx);
7147         if (ret)
7148                 goto end_trans;
7149
7150         if (log_dentries)
7151                 ret = log_new_dir_dentries(trans, inode, ctx);
7152 end_trans:
7153         if (ret < 0) {
7154                 btrfs_set_log_full_commit(trans);
7155                 ret = BTRFS_LOG_FORCE_COMMIT;
7156         }
7157
7158         if (ret)
7159                 btrfs_remove_log_ctx(root, ctx);
7160         btrfs_end_log_trans(root);
7161
7162         return ret;
7163 }
7164
7165 /*
7166  * it is not safe to log dentry if the chunk root has added new
7167  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
7168  * If this returns 1, you must commit the transaction to safely get your
7169  * data on disk.
7170  */
7171 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
7172                           struct dentry *dentry,
7173                           struct btrfs_log_ctx *ctx)
7174 {
7175         struct dentry *parent = dget_parent(dentry);
7176         int ret;
7177
7178         ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
7179                                      LOG_INODE_ALL, ctx);
7180         dput(parent);
7181
7182         return ret;
7183 }
7184
/*
 * should be called during mount to recover and replay any log trees
 * from the FS
 */
7189 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
7190 {
7191         int ret;
7192         struct btrfs_path *path;
7193         struct btrfs_trans_handle *trans;
7194         struct btrfs_key key;
7195         struct btrfs_key found_key;
7196         struct btrfs_root *log;
7197         struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
7198         struct walk_control wc = {
7199                 .process_func = process_one_buffer,
7200                 .stage = LOG_WALK_PIN_ONLY,
7201         };
7202
7203         path = btrfs_alloc_path();
7204         if (!path)
7205                 return -ENOMEM;
7206
7207         set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7208
7209         trans = btrfs_start_transaction(fs_info->tree_root, 0);
7210         if (IS_ERR(trans)) {
7211                 ret = PTR_ERR(trans);
7212                 goto error;
7213         }
7214
7215         wc.trans = trans;
7216         wc.pin = 1;
7217
7218         ret = walk_log_tree(trans, log_root_tree, &wc);
7219         if (ret) {
7220                 btrfs_abort_transaction(trans, ret);
7221                 goto error;
7222         }
7223
7224 again:
7225         key.objectid = BTRFS_TREE_LOG_OBJECTID;
7226         key.type = BTRFS_ROOT_ITEM_KEY;
7227         key.offset = (u64)-1;
7228
7229         while (1) {
7230                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
7231
7232                 if (ret < 0) {
7233                         btrfs_abort_transaction(trans, ret);
7234                         goto error;
7235                 }
7236                 if (ret > 0) {
7237                         if (path->slots[0] == 0)
7238                                 break;
7239                         path->slots[0]--;
7240                 }
7241                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
7242                                       path->slots[0]);
7243                 btrfs_release_path(path);
7244                 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
7245                         break;
7246
7247                 log = btrfs_read_tree_root(log_root_tree, &found_key);
7248                 if (IS_ERR(log)) {
7249                         ret = PTR_ERR(log);
7250                         btrfs_abort_transaction(trans, ret);
7251                         goto error;
7252                 }
7253
7254                 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
7255                                                    true);
7256                 if (IS_ERR(wc.replay_dest)) {
7257                         ret = PTR_ERR(wc.replay_dest);
7258
7259                         /*
7260                          * We didn't find the subvol, likely because it was
7261                          * deleted.  This is ok, simply skip this log and go to
7262                          * the next one.
7263                          *
7264                          * We need to exclude the root because we can't have
7265                          * other log replays overwriting this log as we'll read
7266                          * it back in a few more times.  This will keep our
7267                          * block from being modified, and we'll just bail for
7268                          * each subsequent pass.
7269                          */
7270                         if (ret == -ENOENT)
7271                                 ret = btrfs_pin_extent_for_log_replay(trans, log->node);
7272                         btrfs_put_root(log);
7273
7274                         if (!ret)
7275                                 goto next;
7276                         btrfs_abort_transaction(trans, ret);
7277                         goto error;
7278                 }
7279
7280                 wc.replay_dest->log_root = log;
7281                 ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
7282                 if (ret)
7283                         /* The loop needs to continue due to the root refs */
7284                         btrfs_abort_transaction(trans, ret);
7285                 else
7286                         ret = walk_log_tree(trans, log, &wc);
7287
7288                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
7289                         ret = fixup_inode_link_counts(trans, wc.replay_dest,
7290                                                       path);
7291                         if (ret)
7292                                 btrfs_abort_transaction(trans, ret);
7293                 }
7294
7295                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
7296                         struct btrfs_root *root = wc.replay_dest;
7297
7298                         btrfs_release_path(path);
7299
7300                         /*
7301                          * We have just replayed everything, and the highest
7302                          * objectid of fs roots probably has changed in case
7303                          * some inode_item's got replayed.
7304                          *
7305                          * root->objectid_mutex is not acquired as log replay
7306                          * could only happen during mount.
7307                          */
7308                         ret = btrfs_init_root_free_objectid(root);
7309                         if (ret)
7310                                 btrfs_abort_transaction(trans, ret);
7311                 }
7312
7313                 wc.replay_dest->log_root = NULL;
7314                 btrfs_put_root(wc.replay_dest);
7315                 btrfs_put_root(log);
7316
7317                 if (ret)
7318                         goto error;
7319 next:
7320                 if (found_key.offset == 0)
7321                         break;
7322                 key.offset = found_key.offset - 1;
7323         }
7324         btrfs_release_path(path);
7325
7326         /* step one is to pin it all, step two is to replay just inodes */
7327         if (wc.pin) {
7328                 wc.pin = 0;
7329                 wc.process_func = replay_one_buffer;
7330                 wc.stage = LOG_WALK_REPLAY_INODES;
7331                 goto again;
7332         }
7333         /* step three is to replay everything */
7334         if (wc.stage < LOG_WALK_REPLAY_ALL) {
7335                 wc.stage++;
7336                 goto again;
7337         }
7338
7339         btrfs_free_path(path);
7340
7341         /* step 4: commit the transaction, which also unpins the blocks */
7342         ret = btrfs_commit_transaction(trans);
7343         if (ret)
7344                 return ret;
7345
7346         log_root_tree->log_root = NULL;
7347         clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7348         btrfs_put_root(log_root_tree);
7349
7350         return 0;
7351 error:
7352         if (wc.trans)
7353                 btrfs_end_transaction(wc.trans);
7354         clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7355         btrfs_free_path(path);
7356         return ret;
7357 }
7358
7359 /*
7360  * there are some corner cases where we want to force a full
7361  * commit instead of allowing a directory to be logged.
7362  *
7363  * They revolve around files there were unlinked from the directory, and
7364  * this function updates the parent directory so that a full commit is
7365  * properly done if it is fsync'd later after the unlinks are done.
7366  *
7367  * Must be called before the unlink operations (updates to the subvolume tree,
7368  * inodes, etc) are done.
7369  */
7370 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
7371                              struct btrfs_inode *dir, struct btrfs_inode *inode,
7372                              bool for_rename)
7373 {
7374         /*
7375          * when we're logging a file, if it hasn't been renamed
7376          * or unlinked, and its inode is fully committed on disk,
7377          * we don't have to worry about walking up the directory chain
7378          * to log its parents.
7379          *
7380          * So, we use the last_unlink_trans field to put this transid
7381          * into the file.  When the file is logged we check it and
7382          * don't log the parents if the file is fully on disk.
7383          */
7384         mutex_lock(&inode->log_mutex);
7385         inode->last_unlink_trans = trans->transid;
7386         mutex_unlock(&inode->log_mutex);
7387
7388         if (!for_rename)
7389                 return;
7390
7391         /*
7392          * If this directory was already logged, any new names will be logged
7393          * with btrfs_log_new_name() and old names will be deleted from the log
7394          * tree with btrfs_del_dir_entries_in_log() or with
7395          * btrfs_del_inode_ref_in_log().
7396          */
7397         if (inode_logged(trans, dir, NULL) == 1)
7398                 return;
7399
7400         /*
7401          * If the inode we're about to unlink was logged before, the log will be
7402          * properly updated with the new name with btrfs_log_new_name() and the
7403          * old name removed with btrfs_del_dir_entries_in_log() or with
7404          * btrfs_del_inode_ref_in_log().
7405          */
7406         if (inode_logged(trans, inode, NULL) == 1)
7407                 return;
7408
7409         /*
7410          * when renaming files across directories, if the directory
7411          * there we're unlinking from gets fsync'd later on, there's
7412          * no way to find the destination directory later and fsync it
7413          * properly.  So, we have to be conservative and force commits
7414          * so the new name gets discovered.
7415          */
7416         mutex_lock(&dir->log_mutex);
7417         dir->last_unlink_trans = trans->transid;
7418         mutex_unlock(&dir->log_mutex);
7419 }
7420
7421 /*
7422  * Make sure that if someone attempts to fsync the parent directory of a deleted
7423  * snapshot, it ends up triggering a transaction commit. This is to guarantee
7424  * that after replaying the log tree of the parent directory's root we will not
7425  * see the snapshot anymore and at log replay time we will not see any log tree
7426  * corresponding to the deleted snapshot's root, which could lead to replaying
7427  * it after replaying the log tree of the parent directory (which would replay
7428  * the snapshot delete operation).
7429  *
7430  * Must be called before the actual snapshot destroy operation (updates to the
7431  * parent root and tree of tree roots trees, etc) are done.
7432  */
7433 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
7434                                    struct btrfs_inode *dir)
7435 {
7436         mutex_lock(&dir->log_mutex);
7437         dir->last_unlink_trans = trans->transid;
7438         mutex_unlock(&dir->log_mutex);
7439 }
7440
7441 /*
7442  * Call this when creating a subvolume in a directory.
7443  * Because we don't commit a transaction when creating a subvolume, we can't
7444  * allow the directory pointing to the subvolume to be logged with an entry that
7445  * points to an unpersisted root if we are still in the transaction used to
7446  * create the subvolume, so make any attempt to log the directory to result in a
7447  * full log sync.
7448  * Also we don't need to worry with renames, since btrfs_rename() marks the log
7449  * for full commit when renaming a subvolume.
7450  *
7451  * Must be called before creating the subvolume entry in its parent directory.
7452  */
7453 void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
7454                                 struct btrfs_inode *dir)
7455 {
7456         mutex_lock(&dir->log_mutex);
7457         dir->last_unlink_trans = trans->transid;
7458         mutex_unlock(&dir->log_mutex);
7459 }
7460
/*
 * Update the log after adding a new name for an inode.
 *
 * @trans:              Transaction handle.
 * @old_dentry:         The dentry associated with the old name and the old
 *                      parent directory.
 * @old_dir:            The inode of the previous parent directory for the case
 *                      of a rename. For a link operation, it must be NULL.
 * @old_dir_index:      The index number associated with the old name, meaningful
 *                      only for rename operations (when @old_dir is not NULL).
 *                      Ignored for link operations.
 * @parent:             The dentry associated with the directory under which the
 *                      new name is located.
 *
 * Call this after adding a new name for an inode, as a result of a link or
 * rename operation, and it will properly update the log to reflect the new name.
 *
 * Nothing is returned. If any step fails, the log is marked for a full commit
 * with btrfs_set_log_full_commit() instead of reporting the error to the
 * caller. When neither the inode nor (for a rename) the old directory was
 * logged, the function returns early without touching the log.
 */
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
                        struct dentry *old_dentry, struct btrfs_inode *old_dir,
                        u64 old_dir_index, struct dentry *parent)
{
        struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
        struct btrfs_root *root = inode->root;
        struct btrfs_log_ctx ctx;
        /* Set once join_running_log_trans() succeeded; undone at 'out'. */
        bool log_pinned = false;
        int ret;

        /*
         * this will force the logging code to walk the dentry chain
         * up for the file
         */
        if (!S_ISDIR(inode->vfs_inode.i_mode))
                inode->last_unlink_trans = trans->transid;

        /*
         * if this inode hasn't been logged and directory we're renaming it
         * from hasn't been logged, we don't need to log it
         */
        ret = inode_logged(trans, inode, NULL);
        if (ret < 0) {
                goto out;
        } else if (ret == 0) {
                if (!old_dir)
                        return;
                /*
                 * If the inode was not logged and we are doing a rename (old_dir is not
                 * NULL), check if old_dir was logged - if it was not we can return and
                 * do nothing.
                 */
                ret = inode_logged(trans, old_dir, NULL);
                if (ret < 0)
                        goto out;
                else if (ret == 0)
                        return;
        }
        /*
         * Drop the positive result of inode_logged(); from here on ret only
         * carries errors.
         */
        ret = 0;

        /*
         * If we are doing a rename (old_dir is not NULL) from a directory that
         * was previously logged, make sure that on log replay we get the old
         * dir entry deleted. This is needed because we will also log the new
         * name of the renamed inode, so we need to make sure that after log
         * replay we don't end up with both the new and old dir entries existing.
         */
        if (old_dir && old_dir->logged_trans == trans->transid) {
                struct btrfs_root *log = old_dir->root->log_root;
                struct btrfs_path *path;
                struct fscrypt_name fname;

                ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);

                ret = fscrypt_setup_filename(&old_dir->vfs_inode,
                                             &old_dentry->d_name, 0, &fname);
                if (ret)
                        goto out;
                /*
                 * We have two inodes to update in the log, the old directory and
                 * the inode that got renamed, so we must pin the log to prevent
                 * anyone from syncing the log until we have updated both inodes
                 * in the log.
                 */
                ret = join_running_log_trans(root);
                /*
                 * At least one of the inodes was logged before, so this should
                 * not fail, but if it does, it's not serious, just bail out and
                 * mark the log for a full commit.
                 */
                if (WARN_ON_ONCE(ret < 0)) {
                        fscrypt_free_filename(&fname);
                        goto out;
                }

                log_pinned = true;

                path = btrfs_alloc_path();
                if (!path) {
                        ret = -ENOMEM;
                        fscrypt_free_filename(&fname);
                        goto out;
                }

                /*
                 * Other concurrent task might be logging the old directory,
                 * as it can be triggered when logging other inode that had or
                 * still has a dentry in the old directory. We lock the old
                 * directory's log_mutex to ensure the deletion of the old
                 * name is persisted, because during directory logging we
                 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
                 * the old name's dir index item is in the delayed items, so
                 * it could be missed by an in progress directory logging.
                 */
                mutex_lock(&old_dir->log_mutex);
                ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
                                        &fname.disk_name, old_dir_index);
                if (ret > 0) {
                        /*
                         * The dentry does not exist in the log, so record its
                         * deletion.
                         */
                        btrfs_release_path(path);
                        ret = insert_dir_log_key(trans, log, path,
                                                 btrfs_ino(old_dir),
                                                 old_dir_index, old_dir_index);
                }
                mutex_unlock(&old_dir->log_mutex);

                /* Release the path and the name before acting on ret. */
                btrfs_free_path(path);
                fscrypt_free_filename(&fname);
                if (ret < 0)
                        goto out;
        }

        /* Log the inode under its new name, with LOG_INODE_EXISTS. */
        btrfs_init_log_ctx(&ctx, inode);
        ctx.logging_new_name = true;
        btrfs_init_log_ctx_scratch_eb(&ctx);
        /*
         * We don't care about the return value. If we fail to log the new name
         * then we know the next attempt to sync the log will fallback to a full
         * transaction commit (due to a call to btrfs_set_log_full_commit()), so
         * we don't need to worry about getting a log committed that has an
         * inconsistent state after a rename operation.
         */
        btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
        free_extent_buffer(ctx.scratch_eb);
        ASSERT(list_empty(&ctx.conflict_inodes));
out:
        /*
         * If an error happened mark the log for a full commit because it's not
         * consistent and up to date or we couldn't find out if one of the
         * inodes was logged before in this transaction. Do it before unpinning
         * the log, to avoid any races with someone else trying to commit it.
         */
        if (ret < 0)
                btrfs_set_log_full_commit(trans);
        if (log_pinned)
                btrfs_end_log_trans(root);
}
7618