fs/btrfs/tree-log.c

   1 /*
   2  * Copyright (C) 2008 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  */
  18
  19 #include <linux/sched.h>
  20 #include <linux/slab.h>
  21 #include <linux/blkdev.h>
  22 #include <linux/list_sort.h>
  23 #include "tree-log.h"
  24 #include "disk-io.h"
  25 #include "locking.h"
  26 #include "print-tree.h"
  27 #include "backref.h"
  28 #include "hash.h"
  29 #include "compression.h"
  30 #include "qgroup.h"
  31
  32 /* magic values for the inode_only field in btrfs_log_inode:
  33  *
  34  * LOG_INODE_ALL means to log everything
  35  * LOG_INODE_EXISTS means to log just enough to recreate the inode
  36  * during log replay
  37  */
  38 #define LOG_INODE_ALL 0
  39 #define LOG_INODE_EXISTS 1
  40 #define LOG_OTHER_INODE 2
  41
  42 /*
  43  * directory trouble cases
  44  *
  45  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
  46  * log, we must force a full commit before doing an fsync of the directory
  47  * where the unlink was done.
  48  * ---> record transid of last unlink/rename per directory
  49  *
  50  * mkdir foo/some_dir
  51  * normal commit
  52  * rename foo/some_dir foo2/some_dir
  53  * mkdir foo/some_dir
  54  * fsync foo/some_dir/some_file
  55  *
  56  * The fsync above will unlink the original some_dir without recording
  57  * it in its new location (foo2).  After a crash, some_dir will be gone
  58  * unless the fsync of some_file forces a full commit
  59  *
  60  * 2) we must log any new names for any file or dir that is in the fsync
  61  * log. ---> check inode while renaming/linking.
  62  *
  63  * 2a) we must log any new names for any file or dir during rename
  64  * when the directory they are being removed from was logged.
  65  * ---> check inode and old parent dir during rename
  66  *
  67  *  2a is actually the more important variant.  With the extra logging
  68  *  a crash might unlink the old name without recreating the new one
  69  *
  70  * 3) after a crash, we must go through any directories with a link count
  71  * of zero and redo the rm -rf
  72  *
  73  * mkdir f1/foo
  74  * normal commit
  75  * rm -rf f1/foo
  76  * fsync(f1)
  77  *
  78  * The directory f1 was fully removed from the FS, but fsync was never
  79  * called on f1, only its parent dir.  After a crash the rm -rf must
  80  * be replayed.  This must be able to recurse down the entire
  81  * directory tree.  The inode link count fixup code takes care of the
  82  * ugly details.
  83  */
  84
  85 /*
  86  * stages for the tree walking.  The first
  87  * stage (0) is to only pin down the blocks we find
  88  * the second stage (1) is to make sure that all the inodes
  89  * we find in the log are created in the subvolume.
  90  *
  91  * The last stage is to deal with directories and links and extents
  92  * and all the other fun semantics
  93  */
  94 #define LOG_WALK_PIN_ONLY 0
  95 #define LOG_WALK_REPLAY_INODES 1
  96 #define LOG_WALK_REPLAY_DIR_INDEX 2
  97 #define LOG_WALK_REPLAY_ALL 3
  98
  99 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 100                            struct btrfs_root *root, struct inode *inode,
 101                            int inode_only,
 102                            const loff_t start,
 103                            const loff_t end,
 104                            struct btrfs_log_ctx *ctx);
 105 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 106                              struct btrfs_root *root,
 107                              struct btrfs_path *path, u64 objectid);
 108 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 109                                        struct btrfs_root *root,
 110                                        struct btrfs_root *log,
 111                                        struct btrfs_path *path,
 112                                        u64 dirid, int del_all);
 113
 114 /*
 115  * tree logging is a special write ahead log used to make sure that
 116  * fsyncs and O_SYNCs can happen without doing full tree commits.
 117  *
 118  * Full tree commits are expensive because they require commonly
 119  * modified blocks to be recowed, creating many dirty pages in the
 120  * extent tree an 4x-6x higher write load than ext3.
 121  *
 122  * Instead of doing a tree commit on every fsync, we use the
 123  * key ranges and transaction ids to find items for a given file or directory
 124  * that have changed in this transaction.  Those items are copied into
 125  * a special tree (one per subvolume root), that tree is written to disk
 126  * and then the fsync is considered complete.
 127  *
 128  * After a crash, items are copied out of the log-tree back into the
 129  * subvolume tree.  Any file data extents found are recorded in the extent
 130  * allocation tree, and the log-tree freed.
 131  *
 132  * The log tree is read three times, once to pin down all the extents it is
 133  * using in ram and once, once to create all the inodes logged in the tree
 134  * and once to do all the other items.
 135  */
 136
 137 /*
 138  * start a sub transaction and setup the log tree
 139  * this increments the log tree writer count to make the people
 140  * syncing the tree wait for us to finish
 141  */
 142 static int start_log_trans(struct btrfs_trans_handle *trans,
 143                            struct btrfs_root *root,
 144                            struct btrfs_log_ctx *ctx)
 145 {
 146         struct btrfs_fs_info *fs_info = root->fs_info;
 147         int ret = 0;
 148
 149         mutex_lock(&root->log_mutex);
 150
 151         if (root->log_root) {
 152                 if (btrfs_need_log_full_commit(fs_info, trans)) {
 153                         ret = -EAGAIN;
 154                         goto out;
 155                 }
 156
 157                 if (!root->log_start_pid) {
 158                         clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 159                         root->log_start_pid = current->pid;
 160                 } else if (root->log_start_pid != current->pid) {
 161                         set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 162                 }
 163         } else {
 164                 mutex_lock(&fs_info->tree_log_mutex);
 165                 if (!fs_info->log_root_tree)
 166                         ret = btrfs_init_log_root_tree(trans, fs_info);
 167                 mutex_unlock(&fs_info->tree_log_mutex);
 168                 if (ret)
 169                         goto out;
 170
 171                 ret = btrfs_add_log_tree(trans, root);
 172                 if (ret)
 173                         goto out;
 174
 175                 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 176                 root->log_start_pid = current->pid;
 177         }
 178
 179         atomic_inc(&root->log_batch);
 180         atomic_inc(&root->log_writers);
 181         if (ctx) {
 182                 int index = root->log_transid % 2;
 183                 list_add_tail(&ctx->list, &root->log_ctxs[index]);
 184                 ctx->log_transid = root->log_transid;
 185         }
 186
 187 out:
 188         mutex_unlock(&root->log_mutex);
 189         return ret;
 190 }
 191
 192 /*
 193  * returns 0 if there was a log transaction running and we were able
 194  * to join, or returns -ENOENT if there were not transactions
 195  * in progress
 196  */
 197 static int join_running_log_trans(struct btrfs_root *root)
 198 {
 199         int ret = -ENOENT;
 200
 201         smp_mb();
 202         if (!root->log_root)
 203                 return -ENOENT;
 204
 205         mutex_lock(&root->log_mutex);
 206         if (root->log_root) {
 207                 ret = 0;
 208                 atomic_inc(&root->log_writers);
 209         }
 210         mutex_unlock(&root->log_mutex);
 211         return ret;
 212 }
 213
 214 /*
 215  * This either makes the current running log transaction wait
 216  * until you call btrfs_end_log_trans() or it makes any future
 217  * log transactions wait until you call btrfs_end_log_trans()
 218  */
 219 int btrfs_pin_log_trans(struct btrfs_root *root)
 220 {
 221         int ret = -ENOENT;
 222
 223         mutex_lock(&root->log_mutex);
 224         atomic_inc(&root->log_writers);
 225         mutex_unlock(&root->log_mutex);
 226         return ret;
 227 }
 228
 229 /*
 230  * indicate we're done making changes to the log tree
 231  * and wake up anyone waiting to do a sync
 232  */
 233 void btrfs_end_log_trans(struct btrfs_root *root)
 234 {
 235         if (atomic_dec_and_test(&root->log_writers)) {
 236                 /*
 237                  * Implicit memory barrier after atomic_dec_and_test
 238                  */
 239                 if (waitqueue_active(&root->log_writer_wait))
 240                         wake_up(&root->log_writer_wait);
 241         }
 242 }
 243
 244
 245 /*
 246  * the walk control struct is used to pass state down the chain when
 247  * processing the log tree.  The stage field tells us which part
 248  * of the log tree processing we are currently doing.  The others
 249  * are state fields used for that specific part
 250  */
 251 struct walk_control {
 252         /* should we free the extent on disk when done?  This is used
 253          * at transaction commit time while freeing a log tree
 254          */
 255         int free;
 256
 257         /* should we write out the extent buffer?  This is used
 258          * while flushing the log tree to disk during a sync
 259          */
 260         int write;
 261
 262         /* should we wait for the extent buffer io to finish?  Also used
 263          * while flushing the log tree to disk for a sync
 264          */
 265         int wait;
 266
 267         /* pin only walk, we record which extents on disk belong to the
 268          * log trees
 269          */
 270         int pin;
 271
 272         /* what stage of the replay code we're currently in */
 273         int stage;
 274
 275         /* the root we are currently replaying */
 276         struct btrfs_root *replay_dest;
 277
 278         /* the trans handle for the current replay */
 279         struct btrfs_trans_handle *trans;
 280
 281         /* the function that gets used to process blocks we find in the
 282          * tree.  Note the extent_buffer might not be up to date when it is
 283          * passed in, and it must be checked or read if you need the data
 284          * inside it
 285          */
 286         int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
 287                             struct walk_control *wc, u64 gen);
 288 };
 289
 290 /*
 291  * process_func used to pin down extents, write them or wait on them
 292  */
 293 static int process_one_buffer(struct btrfs_root *log,
 294                               struct extent_buffer *eb,
 295                               struct walk_control *wc, u64 gen)
 296 {
 297         struct btrfs_fs_info *fs_info = log->fs_info;
 298         int ret = 0;
 299
 300         /*
 301          * If this fs is mixed then we need to be able to process the leaves to
 302          * pin down any logged extents, so we have to read the block.
 303          */
 304         if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
 305                 ret = btrfs_read_buffer(eb, gen);
 306                 if (ret)
 307                         return ret;
 308         }
 309
 310         if (wc->pin)
 311                 ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
 312                                                       eb->len);
 313
 314         if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
 315                 if (wc->pin && btrfs_header_level(eb) == 0)
 316                         ret = btrfs_exclude_logged_extents(fs_info, eb);
 317                 if (wc->write)
 318                         btrfs_write_tree_block(eb);
 319                 if (wc->wait)
 320                         btrfs_wait_tree_block_writeback(eb);
 321         }
 322         return ret;
 323 }
 324
 325 /*
 326  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 327  * to the src data we are copying out.
 328  *
 329  * root is the tree we are copying into, and path is a scratch
 330  * path for use in this function (it should be released on entry and
 331  * will be released on exit).
 332  *
 333  * If the key is already in the destination tree the existing item is
 334  * overwritten.  If the existing item isn't big enough, it is extended.
 335  * If it is too large, it is truncated.
 336  *
 337  * If the key isn't in the destination yet, a new item is inserted.
 338  */
 339 static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 340                                    struct btrfs_root *root,
 341                                    struct btrfs_path *path,
 342                                    struct extent_buffer *eb, int slot,
 343                                    struct btrfs_key *key)
 344 {
 345         struct btrfs_fs_info *fs_info = root->fs_info;
 346         int ret;
 347         u32 item_size;
 348         u64 saved_i_size = 0;
 349         int save_old_i_size = 0;
 350         unsigned long src_ptr;
 351         unsigned long dst_ptr;
 352         int overwrite_root = 0;
 353         bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
 354
 355         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 356                 overwrite_root = 1;
 357
 358         item_size = btrfs_item_size_nr(eb, slot);
 359         src_ptr = btrfs_item_ptr_offset(eb, slot);
 360
 361         /* look for the key in the destination tree */
 362         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 363         if (ret < 0)
 364                 return ret;
 365
 366         if (ret == 0) {
 367                 char *src_copy;
 368                 char *dst_copy;
 369                 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
 370                                                   path->slots[0]);
 371                 if (dst_size != item_size)
 372                         goto insert;
 373
 374                 if (item_size == 0) {
 375                         btrfs_release_path(path);
 376                         return 0;
 377                 }
 378                 dst_copy = kmalloc(item_size, GFP_NOFS);
 379                 src_copy = kmalloc(item_size, GFP_NOFS);
 380                 if (!dst_copy || !src_copy) {
 381                         btrfs_release_path(path);
 382                         kfree(dst_copy);
 383                         kfree(src_copy);
 384                         return -ENOMEM;
 385                 }
 386
 387                 read_extent_buffer(eb, src_copy, src_ptr, item_size);
 388
 389                 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 390                 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
 391                                    item_size);
 392                 ret = memcmp(dst_copy, src_copy, item_size);
 393
 394                 kfree(dst_copy);
 395                 kfree(src_copy);
 396                 /*
 397                  * they have the same contents, just return, this saves
 398                  * us from cowing blocks in the destination tree and doing
 399                  * extra writes that may not have been done by a previous
 400                  * sync
 401                  */
 402                 if (ret == 0) {
 403                         btrfs_release_path(path);
 404                         return 0;
 405                 }
 406
 407                 /*
 408                  * We need to load the old nbytes into the inode so when we
 409                  * replay the extents we've logged we get the right nbytes.
 410                  */
 411                 if (inode_item) {
 412                         struct btrfs_inode_item *item;
 413                         u64 nbytes;
 414                         u32 mode;
 415
 416                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 417                                               struct btrfs_inode_item);
 418                         nbytes = btrfs_inode_nbytes(path->nodes[0], item);
 419                         item = btrfs_item_ptr(eb, slot,
 420                                               struct btrfs_inode_item);
 421                         btrfs_set_inode_nbytes(eb, item, nbytes);
 422
 423                         /*
 424                          * If this is a directory we need to reset the i_size to
 425                          * 0 so that we can set it up properly when replaying
 426                          * the rest of the items in this log.
 427                          */
 428                         mode = btrfs_inode_mode(eb, item);
 429                         if (S_ISDIR(mode))
 430                                 btrfs_set_inode_size(eb, item, 0);
 431                 }
 432         } else if (inode_item) {
 433                 struct btrfs_inode_item *item;
 434                 u32 mode;
 435
 436                 /*
 437                  * New inode, set nbytes to 0 so that the nbytes comes out
 438                  * properly when we replay the extents.
 439                  */
 440                 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
 441                 btrfs_set_inode_nbytes(eb, item, 0);
 442
 443                 /*
 444                  * If this is a directory we need to reset the i_size to 0 so
 445                  * that we can set it up properly when replaying the rest of
 446                  * the items in this log.
 447                  */
 448                 mode = btrfs_inode_mode(eb, item);
 449                 if (S_ISDIR(mode))
 450                         btrfs_set_inode_size(eb, item, 0);
 451         }
 452 insert:
 453         btrfs_release_path(path);
 454         /* try to insert the key into the destination tree */
 455         path->skip_release_on_error = 1;
 456         ret = btrfs_insert_empty_item(trans, root, path,
 457                                       key, item_size);
 458         path->skip_release_on_error = 0;
 459
 460         /* make sure any existing item is the correct size */
 461         if (ret == -EEXIST || ret == -EOVERFLOW) {
 462                 u32 found_size;
 463                 found_size = btrfs_item_size_nr(path->nodes[0],
 464                                                 path->slots[0]);
 465                 if (found_size > item_size)
 466                         btrfs_truncate_item(fs_info, path, item_size, 1);
 467                 else if (found_size < item_size)
 468                         btrfs_extend_item(fs_info, path,
 469                                           item_size - found_size);
 470         } else if (ret) {
 471                 return ret;
 472         }
 473         dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
 474                                         path->slots[0]);
 475
 476         /* don't overwrite an existing inode if the generation number
 477          * was logged as zero.  This is done when the tree logging code
 478          * is just logging an inode to make sure it exists after recovery.
 479          *
 480          * Also, don't overwrite i_size on directories during replay.
 481          * log replay inserts and removes directory items based on the
 482          * state of the tree found in the subvolume, and i_size is modified
 483          * as it goes
 484          */
 485         if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
 486                 struct btrfs_inode_item *src_item;
 487                 struct btrfs_inode_item *dst_item;
 488
 489                 src_item = (struct btrfs_inode_item *)src_ptr;
 490                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 491
 492                 if (btrfs_inode_generation(eb, src_item) == 0) {
 493                         struct extent_buffer *dst_eb = path->nodes[0];
 494                         const u64 ino_size = btrfs_inode_size(eb, src_item);
 495
 496                         /*
 497                          * For regular files an ino_size == 0 is used only when
 498                          * logging that an inode exists, as part of a directory
 499                          * fsync, and the inode wasn't fsynced before. In this
 500                          * case don't set the size of the inode in the fs/subvol
 501                          * tree, otherwise we would be throwing valid data away.
 502                          */
 503                         if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
 504                             S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
 505                             ino_size != 0) {
 506                                 struct btrfs_map_token token;
 507
 508                                 btrfs_init_map_token(&token);
 509                                 btrfs_set_token_inode_size(dst_eb, dst_item,
 510                                                            ino_size, &token);
 511                         }
 512                         goto no_copy;
 513                 }
 514
 515                 if (overwrite_root &&
 516                     S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
 517                     S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
 518                         save_old_i_size = 1;
 519                         saved_i_size = btrfs_inode_size(path->nodes[0],
 520                                                         dst_item);
 521                 }
 522         }
 523
 524         copy_extent_buffer(path->nodes[0], eb, dst_ptr,
 525                            src_ptr, item_size);
 526
 527         if (save_old_i_size) {
 528                 struct btrfs_inode_item *dst_item;
 529                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 530                 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
 531         }
 532
 533         /* make sure the generation is filled in */
 534         if (key->type == BTRFS_INODE_ITEM_KEY) {
 535                 struct btrfs_inode_item *dst_item;
 536                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 537                 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
 538                         btrfs_set_inode_generation(path->nodes[0], dst_item,
 539                                                    trans->transid);
 540                 }
 541         }
 542 no_copy:
 543         btrfs_mark_buffer_dirty(path->nodes[0]);
 544         btrfs_release_path(path);
 545         return 0;
 546 }
 547
 548 /*
 549  * simple helper to read an inode off the disk from a given root
 550  * This can only be called for subvolume roots and not for the log
 551  */
 552 static noinline struct inode *read_one_inode(struct btrfs_root *root,
 553                                              u64 objectid)
 554 {
 555         struct btrfs_key key;
 556         struct inode *inode;
 557
 558         key.objectid = objectid;
 559         key.type = BTRFS_INODE_ITEM_KEY;
 560         key.offset = 0;
 561         inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
 562         if (IS_ERR(inode)) {
 563                 inode = NULL;
 564         } else if (is_bad_inode(inode)) {
 565                 iput(inode);
 566                 inode = NULL;
 567         }
 568         return inode;
 569 }
 570
 571 /* replays a single extent in 'eb' at 'slot' with 'key' into the
 572  * subvolume 'root'.  path is released on entry and should be released
 573  * on exit.
 574  *
 575  * extents in the log tree have not been allocated out of the extent
 576  * tree yet.  So, this completes the allocation, taking a reference
 577  * as required if the extent already exists or creating a new extent
 578  * if it isn't in the extent allocation tree yet.
 579  *
 580  * The extent is inserted into the file, dropping any existing extents
 581  * from the file that overlap the new one.
 582  */
 583 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 584                                       struct btrfs_root *root,
 585                                       struct btrfs_path *path,
 586                                       struct extent_buffer *eb, int slot,
 587                                       struct btrfs_key *key)
 588 {
 589         struct btrfs_fs_info *fs_info = root->fs_info;
 590         int found_type;
 591         u64 extent_end;
 592         u64 start = key->offset;
 593         u64 nbytes = 0;
 594         struct btrfs_file_extent_item *item;
 595         struct inode *inode = NULL;
 596         unsigned long size;
 597         int ret = 0;
 598
 599         item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 600         found_type = btrfs_file_extent_type(eb, item);
 601
 602         if (found_type == BTRFS_FILE_EXTENT_REG ||
 603             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 604                 nbytes = btrfs_file_extent_num_bytes(eb, item);
 605                 extent_end = start + nbytes;
 606
 607                 /*
 608                  * We don't add to the inodes nbytes if we are prealloc or a
 609                  * hole.
 610                  */
 611                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
 612                         nbytes = 0;
 613         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 614                 size = btrfs_file_extent_inline_len(eb, slot, item);
 615                 nbytes = btrfs_file_extent_ram_bytes(eb, item);
 616                 extent_end = ALIGN(start + size,
 617                                    fs_info->sectorsize);
 618         } else {
 619                 ret = 0;
 620                 goto out;
 621         }
 622
 623         inode = read_one_inode(root, key->objectid);
 624         if (!inode) {
 625                 ret = -EIO;
 626                 goto out;
 627         }
 628
 629         /*
 630          * first check to see if we already have this extent in the
 631          * file.  This must be done before the btrfs_drop_extents run
 632          * so we don't try to drop this extent.
 633          */
 634         ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(BTRFS_I(inode)),
 635                                        start, 0);
 636
 637         if (ret == 0 &&
 638             (found_type == BTRFS_FILE_EXTENT_REG ||
 639              found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
 640                 struct btrfs_file_extent_item cmp1;
 641                 struct btrfs_file_extent_item cmp2;
 642                 struct btrfs_file_extent_item *existing;
 643                 struct extent_buffer *leaf;
 644
 645                 leaf = path->nodes[0];
 646                 existing = btrfs_item_ptr(leaf, path->slots[0],
 647                                           struct btrfs_file_extent_item);
 648
 649                 read_extent_buffer(eb, &cmp1, (unsigned long)item,
 650                                    sizeof(cmp1));
 651                 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
 652                                    sizeof(cmp2));
 653
 654                 /*
 655                  * we already have a pointer to this exact extent,
 656                  * we don't have to do anything
 657                  */
 658                 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
 659                         btrfs_release_path(path);
 660                         goto out;
 661                 }
 662         }
 663         btrfs_release_path(path);
 664
 665         /* drop any overlapping extents */
 666         ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
 667         if (ret)
 668                 goto out;
 669
 670         if (found_type == BTRFS_FILE_EXTENT_REG ||
 671             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 672                 u64 offset;
 673                 unsigned long dest_offset;
 674                 struct btrfs_key ins;
 675
 676                 ret = btrfs_insert_empty_item(trans, root, path, key,
 677                                               sizeof(*item));
 678                 if (ret)
 679                         goto out;
 680                 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
 681                                                     path->slots[0]);
 682                 copy_extent_buffer(path->nodes[0], eb, dest_offset,
 683                                 (unsigned long)item,  sizeof(*item));
 684
 685                 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 686                 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 687                 ins.type = BTRFS_EXTENT_ITEM_KEY;
 688                 offset = key->offset - btrfs_file_extent_offset(eb, item);
 689
 690                 /*
 691                  * Manually record dirty extent, as here we did a shallow
 692                  * file extent item copy and skip normal backref update,
 693                  * but modifying extent tree all by ourselves.
 694                  * So need to manually record dirty extent for qgroup,
 695                  * as the owner of the file extent changed from log tree
 696                  * (doesn't affect qgroup) to fs/file tree(affects qgroup)
 697                  */
 698                 ret = btrfs_qgroup_trace_extent(trans, fs_info,
 699                                 btrfs_file_extent_disk_bytenr(eb, item),
 700                                 btrfs_file_extent_disk_num_bytes(eb, item),
 701                                 GFP_NOFS);
 702                 if (ret < 0)
 703                         goto out;
 704
 705                 if (ins.objectid > 0) {
 706                         u64 csum_start;
 707                         u64 csum_end;
 708                         LIST_HEAD(ordered_sums);
 709                         /*
 710                          * is this extent already allocated in the extent
 711                          * allocation tree?  If so, just add a reference
 712                          */
 713                         ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
 714                                                 ins.offset);
 715                         if (ret == 0) {
 716                                 ret = btrfs_inc_extent_ref(trans, fs_info,
 717                                                 ins.objectid, ins.offset,
 718                                                 0, root->root_key.objectid,
 719                                                 key->objectid, offset);
 720                                 if (ret)
 721                                         goto out;
 722                         } else {
 723                                 /*
 724                                  * insert the extent pointer in the extent
 725                                  * allocation tree
 726                                  */
 727                                 ret = btrfs_alloc_logged_file_extent(trans,
 728                                                 fs_info,
 729                                                 root->root_key.objectid,
 730                                                 key->objectid, offset, &ins);
 731                                 if (ret)
 732                                         goto out;
 733                         }
 734                         btrfs_release_path(path);
 735
 736                         if (btrfs_file_extent_compression(eb, item)) {
 737                                 csum_start = ins.objectid;
 738                                 csum_end = csum_start + ins.offset;
 739                         } else {
 740                                 csum_start = ins.objectid +
 741                                         btrfs_file_extent_offset(eb, item);
 742                                 csum_end = csum_start +
 743                                         btrfs_file_extent_num_bytes(eb, item);
 744                         }
 745
 746                         ret = btrfs_lookup_csums_range(root->log_root,
 747                                                 csum_start, csum_end - 1,
 748                                                 &ordered_sums, 0);
 749                         if (ret)
 750                                 goto out;
 751                         /*
 752                          * Now delete all existing cums in the csum root that
 753                          * cover our range. We do this because we can have an
 754                          * extent that is completely referenced by one file
 755                          * extent item and partially referenced by another
 756                          * file extent item (like after using the clone or
 757                          * extent_same ioctls). In this case if we end up doing
 758                          * the replay of the one that partially references the
 759                          * extent first, and we do not do the csum deletion
 760                          * below, we can get 2 csum items in the csum tree that
 761                          * overlap each other. For example, imagine our log has
 762                          * the two following file extent items:
 763                          *
 764                          * key (257 EXTENT_DATA 409600)
 765                          *     extent data disk byte 12845056 nr 102400
 766                          *     extent data offset 20480 nr 20480 ram 102400
 767                          *
 768                          * key (257 EXTENT_DATA 819200)
 769                          *     extent data disk byte 12845056 nr 102400
 770                          *     extent data offset 0 nr 102400 ram 102400
 771                          *
 772                          * Where the second one fully references the 100K extent
 773                          * that starts at disk byte 12845056, and the log tree
 774                          * has a single csum item that covers the entire range
 775                          * of the extent:
 776                          *
 777                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 778                          *
 779                          * After the first file extent item is replayed, the
 780                          * csum tree gets the following csum item:
 781                          *
 782                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 783                          *
 784                          * Which covers the 20K sub-range starting at offset 20K
 785                          * of our extent. Now when we replay the second file
 786                          * extent item, if we do not delete existing csum items
 787                          * that cover any of its blocks, we end up getting two
 788                          * csum items in our csum tree that overlap each other:
 789                          *
 790                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 791                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 792                          *
 793                          * Which is a problem, because after this anyone trying
 794                          * to lookup up for the checksum of any block of our
 795                          * extent starting at an offset of 40K or higher, will
 796                          * end up looking at the second csum item only, which
 797                          * does not contain the checksum for any block starting
 798                          * at offset 40K or higher of our extent.
 799                          */
 800                         while (!list_empty(&ordered_sums)) {
 801                                 struct btrfs_ordered_sum *sums;
 802                                 sums = list_entry(ordered_sums.next,
 803                                                 struct btrfs_ordered_sum,
 804                                                 list);
 805                                 if (!ret)
 806                                         ret = btrfs_del_csums(trans, fs_info,
 807                                                               sums->bytenr,
 808                                                               sums->len);
 809                                 if (!ret)
 810                                         ret = btrfs_csum_file_blocks(trans,
 811                                                 fs_info->csum_root, sums);
 812                                 list_del(&sums->list);
 813                                 kfree(sums);
 814                         }
 815                         if (ret)
 816                                 goto out;
 817                 } else {
 818                         btrfs_release_path(path);
 819                 }
 820         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 821                 /* inline extents are easy, we just overwrite them */
 822                 ret = overwrite_item(trans, root, path, eb, slot, key);
 823                 if (ret)
 824                         goto out;
 825         }
 826
 827         inode_add_bytes(inode, nbytes);
 828         ret = btrfs_update_inode(trans, root, inode);
 829 out:
 830         if (inode)
 831                 iput(inode);
 832         return ret;
 833 }
 834
 835 /*
 836  * when cleaning up conflicts between the directory names in the
 837  * subvolume, directory names in the log and directory names in the
 838  * inode back references, we may have to unlink inodes from directories.
 839  *
 840  * This is a helper function to do the unlink of a specific directory
 841  * item
 842  */
 843 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 844                                       struct btrfs_root *root,
 845                                       struct btrfs_path *path,
 846                                       struct inode *dir,
 847                                       struct btrfs_dir_item *di)
 848 {
 849         struct btrfs_fs_info *fs_info = root->fs_info;
 850         struct inode *inode;
 851         char *name;
 852         int name_len;
 853         struct extent_buffer *leaf;
 854         struct btrfs_key location;
 855         int ret;
 856
 857         leaf = path->nodes[0];
 858
 859         btrfs_dir_item_key_to_cpu(leaf, di, &location);
 860         name_len = btrfs_dir_name_len(leaf, di);
 861         name = kmalloc(name_len, GFP_NOFS);
 862         if (!name)
 863                 return -ENOMEM;
 864
 865         read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
 866         btrfs_release_path(path);
 867
 868         inode = read_one_inode(root, location.objectid);
 869         if (!inode) {
 870                 ret = -EIO;
 871                 goto out;
 872         }
 873
 874         ret = link_to_fixup_dir(trans, root, path, location.objectid);
 875         if (ret)
 876                 goto out;
 877
 878         ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(inode),
 879                         name, name_len);
 880         if (ret)
 881                 goto out;
 882         else
 883                 ret = btrfs_run_delayed_items(trans, fs_info);
 884 out:
 885         kfree(name);
 886         iput(inode);
 887         return ret;
 888 }
 889
 890 /*
 891  * helper function to see if a given name and sequence number found
 892  * in an inode back reference are already in a directory and correctly
 893  * point to this inode
 894  */
 895 static noinline int inode_in_dir(struct btrfs_root *root,
 896                                  struct btrfs_path *path,
 897                                  u64 dirid, u64 objectid, u64 index,
 898                                  const char *name, int name_len)
 899 {
 900         struct btrfs_dir_item *di;
 901         struct btrfs_key location;
 902         int match = 0;
 903
 904         di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
 905                                          index, name, name_len, 0);
 906         if (di && !IS_ERR(di)) {
 907                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 908                 if (location.objectid != objectid)
 909                         goto out;
 910         } else
 911                 goto out;
 912         btrfs_release_path(path);
 913
 914         di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
 915         if (di && !IS_ERR(di)) {
 916                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 917                 if (location.objectid != objectid)
 918                         goto out;
 919         } else
 920                 goto out;
 921         match = 1;
 922 out:
 923         btrfs_release_path(path);
 924         return match;
 925 }
 926
 927 /*
 928  * helper function to check a log tree for a named back reference in
 929  * an inode.  This is used to decide if a back reference that is
 930  * found in the subvolume conflicts with what we find in the log.
 931  *
 932  * inode backreferences may have multiple refs in a single item,
 933  * during replay we process one reference at a time, and we don't
 934  * want to delete valid links to a file from the subvolume if that
 935  * link is also in the log.
 936  */
 937 static noinline int backref_in_log(struct btrfs_root *log,
 938                                    struct btrfs_key *key,
 939                                    u64 ref_objectid,
 940                                    const char *name, int namelen)
 941 {
 942         struct btrfs_path *path;
 943         struct btrfs_inode_ref *ref;
 944         unsigned long ptr;
 945         unsigned long ptr_end;
 946         unsigned long name_ptr;
 947         int found_name_len;
 948         int item_size;
 949         int ret;
 950         int match = 0;
 951
 952         path = btrfs_alloc_path();
 953         if (!path)
 954                 return -ENOMEM;
 955
 956         ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
 957         if (ret != 0)
 958                 goto out;
 959
 960         ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 961
 962         if (key->type == BTRFS_INODE_EXTREF_KEY) {
 963                 if (btrfs_find_name_in_ext_backref(path, ref_objectid,
 964                                                    name, namelen, NULL))
 965                         match = 1;
 966
 967                 goto out;
 968         }
 969
 970         item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 971         ptr_end = ptr + item_size;
 972         while (ptr < ptr_end) {
 973                 ref = (struct btrfs_inode_ref *)ptr;
 974                 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
 975                 if (found_name_len == namelen) {
 976                         name_ptr = (unsigned long)(ref + 1);
 977                         ret = memcmp_extent_buffer(path->nodes[0], name,
 978                                                    name_ptr, namelen);
 979                         if (ret == 0) {
 980                                 match = 1;
 981                                 goto out;
 982                         }
 983                 }
 984                 ptr = (unsigned long)(ref + 1) + found_name_len;
 985         }
 986 out:
 987         btrfs_free_path(path);
 988         return match;
 989 }
 990
 991 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 992                                   struct btrfs_root *root,
 993                                   struct btrfs_path *path,
 994                                   struct btrfs_root *log_root,
 995                                   struct inode *dir, struct inode *inode,
 996                                   struct extent_buffer *eb,
 997                                   u64 inode_objectid, u64 parent_objectid,
 998                                   u64 ref_index, char *name, int namelen,
 999                                   int *search_done)
1000 {
1001         struct btrfs_fs_info *fs_info = root->fs_info;
1002         int ret;
1003         char *victim_name;
1004         int victim_name_len;
1005         struct extent_buffer *leaf;
1006         struct btrfs_dir_item *di;
1007         struct btrfs_key search_key;
1008         struct btrfs_inode_extref *extref;
1009
1010 again:
1011         /* Search old style refs */
1012         search_key.objectid = inode_objectid;
1013         search_key.type = BTRFS_INODE_REF_KEY;
1014         search_key.offset = parent_objectid;
1015         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1016         if (ret == 0) {
1017                 struct btrfs_inode_ref *victim_ref;
1018                 unsigned long ptr;
1019                 unsigned long ptr_end;
1020
1021                 leaf = path->nodes[0];
1022
1023                 /* are we trying to overwrite a back ref for the root directory
1024                  * if so, just jump out, we're done
1025                  */
1026                 if (search_key.objectid == search_key.offset)
1027                         return 1;
1028
1029                 /* check all the names in this back reference to see
1030                  * if they are in the log.  if so, we allow them to stay
1031                  * otherwise they must be unlinked as a conflict
1032                  */
1033                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1034                 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
1035                 while (ptr < ptr_end) {
1036                         victim_ref = (struct btrfs_inode_ref *)ptr;
1037                         victim_name_len = btrfs_inode_ref_name_len(leaf,
1038                                                                    victim_ref);
1039                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
1040                         if (!victim_name)
1041                                 return -ENOMEM;
1042
1043                         read_extent_buffer(leaf, victim_name,
1044                                            (unsigned long)(victim_ref + 1),
1045                                            victim_name_len);
1046
1047                         if (!backref_in_log(log_root, &search_key,
1048                                             parent_objectid,
1049                                             victim_name,
1050                                             victim_name_len)) {
1051                                 inc_nlink(inode);
1052                                 btrfs_release_path(path);
1053
1054                                 ret = btrfs_unlink_inode(trans, root,
1055                                                 BTRFS_I(dir), BTRFS_I(inode),
1056                                                 victim_name, victim_name_len);
1057                                 kfree(victim_name);
1058                                 if (ret)
1059                                         return ret;
1060                                 ret = btrfs_run_delayed_items(trans, fs_info);
1061                                 if (ret)
1062                                         return ret;
1063                                 *search_done = 1;
1064                                 goto again;
1065                         }
1066                         kfree(victim_name);
1067
1068                         ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
1069                 }
1070
1071                 /*
1072                  * NOTE: we have searched root tree and checked the
1073                  * corresponding ref, it does not need to check again.
1074                  */
1075                 *search_done = 1;
1076         }
1077         btrfs_release_path(path);
1078
1079         /* Same search but for extended refs */
1080         extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1081                                            inode_objectid, parent_objectid, 0,
1082                                            0);
1083         if (!IS_ERR_OR_NULL(extref)) {
1084                 u32 item_size;
1085                 u32 cur_offset = 0;
1086                 unsigned long base;
1087                 struct inode *victim_parent;
1088
1089                 leaf = path->nodes[0];
1090
1091                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1092                 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1093
1094                 while (cur_offset < item_size) {
1095                         extref = (struct btrfs_inode_extref *)(base + cur_offset);
1096
1097                         victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1098
1099                         if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1100                                 goto next;
1101
1102                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
1103                         if (!victim_name)
1104                                 return -ENOMEM;
1105                         read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1106                                            victim_name_len);
1107
1108                         search_key.objectid = inode_objectid;
1109                         search_key.type = BTRFS_INODE_EXTREF_KEY;
1110                         search_key.offset = btrfs_extref_hash(parent_objectid,
1111                                                               victim_name,
1112                                                               victim_name_len);
1113                         ret = 0;
1114                         if (!backref_in_log(log_root, &search_key,
1115                                             parent_objectid, victim_name,
1116                                             victim_name_len)) {
1117                                 ret = -ENOENT;
1118                                 victim_parent = read_one_inode(root,
1119                                                                parent_objectid);
1120                                 if (victim_parent) {
1121                                         inc_nlink(inode);
1122                                         btrfs_release_path(path);
1123
1124                                         ret = btrfs_unlink_inode(trans, root,
1125                                                         BTRFS_I(victim_parent),
1126                                                         BTRFS_I(inode),
1127                                                         victim_name,
1128                                                         victim_name_len);
1129                                         if (!ret)
1130                                                 ret = btrfs_run_delayed_items(
1131                                                                   trans,
1132                                                                   fs_info);
1133                                 }
1134                                 iput(victim_parent);
1135                                 kfree(victim_name);
1136                                 if (ret)
1137                                         return ret;
1138                                 *search_done = 1;
1139                                 goto again;
1140                         }
1141                         kfree(victim_name);
1142                         if (ret)
1143                                 return ret;
1144 next:
1145                         cur_offset += victim_name_len + sizeof(*extref);
1146                 }
1147                 *search_done = 1;
1148         }
1149         btrfs_release_path(path);
1150
1151         /* look for a conflicting sequence number */
1152         di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(BTRFS_I(dir)),
1153                                          ref_index, name, namelen, 0);
1154         if (di && !IS_ERR(di)) {
1155                 ret = drop_one_dir_item(trans, root, path, dir, di);
1156                 if (ret)
1157                         return ret;
1158         }
1159         btrfs_release_path(path);
1160
1161         /* look for a conflicing name */
1162         di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(BTRFS_I(dir)),
1163                                    name, namelen, 0);
1164         if (di && !IS_ERR(di)) {
1165                 ret = drop_one_dir_item(trans, root, path, dir, di);
1166                 if (ret)
1167                         return ret;
1168         }
1169         btrfs_release_path(path);
1170
1171         return 0;
1172 }
1173
1174 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1175                              u32 *namelen, char **name, u64 *index,
1176                              u64 *parent_objectid)
1177 {
1178         struct btrfs_inode_extref *extref;
1179
1180         extref = (struct btrfs_inode_extref *)ref_ptr;
1181
1182         *namelen = btrfs_inode_extref_name_len(eb, extref);
1183         *name = kmalloc(*namelen, GFP_NOFS);
1184         if (*name == NULL)
1185                 return -ENOMEM;
1186
1187         read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1188                            *namelen);
1189
1190         *index = btrfs_inode_extref_index(eb, extref);
1191         if (parent_objectid)
1192                 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1193
1194         return 0;
1195 }
1196
1197 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1198                           u32 *namelen, char **name, u64 *index)
1199 {
1200         struct btrfs_inode_ref *ref;
1201
1202         ref = (struct btrfs_inode_ref *)ref_ptr;
1203
1204         *namelen = btrfs_inode_ref_name_len(eb, ref);
1205         *name = kmalloc(*namelen, GFP_NOFS);
1206         if (*name == NULL)
1207                 return -ENOMEM;
1208
1209         read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1210
1211         *index = btrfs_inode_ref_index(eb, ref);
1212
1213         return 0;
1214 }
1215
1216 /*
1217  * replay one inode back reference item found in the log tree.
1218  * eb, slot and key refer to the buffer and key found in the log tree.
1219  * root is the destination we are replaying into, and path is for temp
1220  * use by this function.  (it should be released on return).
1221  */
1222 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1223                                   struct btrfs_root *root,
1224                                   struct btrfs_root *log,
1225                                   struct btrfs_path *path,
1226                                   struct extent_buffer *eb, int slot,
1227                                   struct btrfs_key *key)
1228 {
1229         struct inode *dir = NULL;
1230         struct inode *inode = NULL;
1231         unsigned long ref_ptr;
1232         unsigned long ref_end;
1233         char *name = NULL;
1234         int namelen;
1235         int ret;
1236         int search_done = 0;
1237         int log_ref_ver = 0;
1238         u64 parent_objectid;
1239         u64 inode_objectid;
1240         u64 ref_index = 0;
1241         int ref_struct_size;
1242
1243         ref_ptr = btrfs_item_ptr_offset(eb, slot);
1244         ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1245
1246         if (key->type == BTRFS_INODE_EXTREF_KEY) {
1247                 struct btrfs_inode_extref *r;
1248
1249                 ref_struct_size = sizeof(struct btrfs_inode_extref);
1250                 log_ref_ver = 1;
1251                 r = (struct btrfs_inode_extref *)ref_ptr;
1252                 parent_objectid = btrfs_inode_extref_parent(eb, r);
1253         } else {
1254                 ref_struct_size = sizeof(struct btrfs_inode_ref);
1255                 parent_objectid = key->offset;
1256         }
1257         inode_objectid = key->objectid;
1258
1259         /*
1260          * it is possible that we didn't log all the parent directories
1261          * for a given inode.  If we don't find the dir, just don't
1262          * copy the back ref in.  The link count fixup code will take
1263          * care of the rest
1264          */
1265         dir = read_one_inode(root, parent_objectid);
1266         if (!dir) {
1267                 ret = -ENOENT;
1268                 goto out;
1269         }
1270
1271         inode = read_one_inode(root, inode_objectid);
1272         if (!inode) {
1273                 ret = -EIO;
1274                 goto out;
1275         }
1276
1277         while (ref_ptr < ref_end) {
1278                 if (log_ref_ver) {
1279                         ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1280                                                 &ref_index, &parent_objectid);
1281                         /*
1282                          * parent object can change from one array
1283                          * item to another.
1284                          */
1285                         if (!dir)
1286                                 dir = read_one_inode(root, parent_objectid);
1287                         if (!dir) {
1288                                 ret = -ENOENT;
1289                                 goto out;
1290                         }
1291                 } else {
1292                         ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1293                                              &ref_index);
1294                 }
1295                 if (ret)
1296                         goto out;
1297
1298                 /* if we already have a perfect match, we're done */
1299                 if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), btrfs_ino(BTRFS_I(inode)),
1300                                   ref_index, name, namelen)) {
1301                         /*
1302                          * look for a conflicting back reference in the
1303                          * metadata. if we find one we have to unlink that name
1304                          * of the file before we add our new link.  Later on, we
1305                          * overwrite any existing back reference, and we don't
1306                          * want to create dangling pointers in the directory.
1307                          */
1308
1309                         if (!search_done) {
1310                                 ret = __add_inode_ref(trans, root, path, log,
1311                                                       dir, inode, eb,
1312                                                       inode_objectid,
1313                                                       parent_objectid,
1314                                                       ref_index, name, namelen,
1315                                                       &search_done);
1316                                 if (ret) {
1317                                         if (ret == 1)
1318                                                 ret = 0;
1319                                         goto out;
1320                                 }
1321                         }
1322
1323                         /* insert our name */
1324                         ret = btrfs_add_link(trans, dir, inode, name, namelen,
1325                                              0, ref_index);
1326                         if (ret)
1327                                 goto out;
1328
1329                         btrfs_update_inode(trans, root, inode);
1330                 }
1331
1332                 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1333                 kfree(name);
1334                 name = NULL;
1335                 if (log_ref_ver) {
1336                         iput(dir);
1337                         dir = NULL;
1338                 }
1339         }
1340
1341         /* finally write the back reference in the inode */
1342         ret = overwrite_item(trans, root, path, eb, slot, key);
1343 out:
1344         btrfs_release_path(path);
1345         kfree(name);
1346         iput(dir);
1347         iput(inode);
1348         return ret;
1349 }
1350
1351 static int insert_orphan_item(struct btrfs_trans_handle *trans,
1352                               struct btrfs_root *root, u64 ino)
1353 {
1354         int ret;
1355
1356         ret = btrfs_insert_orphan_item(trans, root, ino);
1357         if (ret == -EEXIST)
1358                 ret = 0;
1359
1360         return ret;
1361 }
1362
1363 static int count_inode_extrefs(struct btrfs_root *root,
1364                                struct inode *inode, struct btrfs_path *path)
1365 {
1366         int ret = 0;
1367         int name_len;
1368         unsigned int nlink = 0;
1369         u32 item_size;
1370         u32 cur_offset = 0;
1371         u64 inode_objectid = btrfs_ino(BTRFS_I(inode));
1372         u64 offset = 0;
1373         unsigned long ptr;
1374         struct btrfs_inode_extref *extref;
1375         struct extent_buffer *leaf;
1376
1377         while (1) {
1378                 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1379                                             &extref, &offset);
1380                 if (ret)
1381                         break;
1382
1383                 leaf = path->nodes[0];
1384                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1385                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1386                 cur_offset = 0;
1387
1388                 while (cur_offset < item_size) {
1389                         extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1390                         name_len = btrfs_inode_extref_name_len(leaf, extref);
1391
1392                         nlink++;
1393
1394                         cur_offset += name_len + sizeof(*extref);
1395                 }
1396
1397                 offset++;
1398                 btrfs_release_path(path);
1399         }
1400         btrfs_release_path(path);
1401
1402         if (ret < 0 && ret != -ENOENT)
1403                 return ret;
1404         return nlink;
1405 }
1406
1407 static int count_inode_refs(struct btrfs_root *root,
1408                                struct inode *inode, struct btrfs_path *path)
1409 {
1410         int ret;
1411         struct btrfs_key key;
1412         unsigned int nlink = 0;
1413         unsigned long ptr;
1414         unsigned long ptr_end;
1415         int name_len;
1416         u64 ino = btrfs_ino(BTRFS_I(inode));
1417
1418         key.objectid = ino;
1419         key.type = BTRFS_INODE_REF_KEY;
1420         key.offset = (u64)-1;
1421
1422         while (1) {
1423                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1424                 if (ret < 0)
1425                         break;
1426                 if (ret > 0) {
1427                         if (path->slots[0] == 0)
1428                                 break;
1429                         path->slots[0]--;
1430                 }
1431 process_slot:
1432                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1433                                       path->slots[0]);
1434                 if (key.objectid != ino ||
1435                     key.type != BTRFS_INODE_REF_KEY)
1436                         break;
1437                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1438                 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1439                                                    path->slots[0]);
1440                 while (ptr < ptr_end) {
1441                         struct btrfs_inode_ref *ref;
1442
1443                         ref = (struct btrfs_inode_ref *)ptr;
1444                         name_len = btrfs_inode_ref_name_len(path->nodes[0],
1445                                                             ref);
1446                         ptr = (unsigned long)(ref + 1) + name_len;
1447                         nlink++;
1448                 }
1449
1450                 if (key.offset == 0)
1451                         break;
1452                 if (path->slots[0] > 0) {
1453                         path->slots[0]--;
1454                         goto process_slot;
1455                 }
1456                 key.offset--;
1457                 btrfs_release_path(path);
1458         }
1459         btrfs_release_path(path);
1460
1461         return nlink;
1462 }
1463
1464 /*
1465  * There are a few corners where the link count of the file can't
1466  * be properly maintained during replay.  So, instead of adding
1467  * lots of complexity to the log code, we just scan the backrefs
1468  * for any file that has been through replay.
1469  *
1470  * The scan will update the link count on the inode to reflect the
1471  * number of back refs found.  If it goes down to zero, the iput
1472  * will free the inode.
1473  */
1474 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1475                                            struct btrfs_root *root,
1476                                            struct inode *inode)
1477 {
1478         struct btrfs_path *path;
1479         int ret;
1480         u64 nlink = 0;
1481         u64 ino = btrfs_ino(BTRFS_I(inode));
1482
1483         path = btrfs_alloc_path();
1484         if (!path)
1485                 return -ENOMEM;
1486
1487         ret = count_inode_refs(root, inode, path);
1488         if (ret < 0)
1489                 goto out;
1490
1491         nlink = ret;
1492
1493         ret = count_inode_extrefs(root, inode, path);
1494         if (ret < 0)
1495                 goto out;
1496
1497         nlink += ret;
1498
1499         ret = 0;
1500
1501         if (nlink != inode->i_nlink) {
1502                 set_nlink(inode, nlink);
1503                 btrfs_update_inode(trans, root, inode);
1504         }
1505         BTRFS_I(inode)->index_cnt = (u64)-1;
1506
1507         if (inode->i_nlink == 0) {
1508                 if (S_ISDIR(inode->i_mode)) {
1509                         ret = replay_dir_deletes(trans, root, NULL, path,
1510                                                  ino, 1);
1511                         if (ret)
1512                                 goto out;
1513                 }
1514                 ret = insert_orphan_item(trans, root, ino);
1515         }
1516
1517 out:
1518         btrfs_free_path(path);
1519         return ret;
1520 }
1521
1522 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1523                                             struct btrfs_root *root,
1524                                             struct btrfs_path *path)
1525 {
1526         int ret;
1527         struct btrfs_key key;
1528         struct inode *inode;
1529
1530         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1531         key.type = BTRFS_ORPHAN_ITEM_KEY;
1532         key.offset = (u64)-1;
1533         while (1) {
1534                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1535                 if (ret < 0)
1536                         break;
1537
1538                 if (ret == 1) {
1539                         if (path->slots[0] == 0)
1540                                 break;
1541                         path->slots[0]--;
1542                 }
1543
1544                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1545                 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1546                     key.type != BTRFS_ORPHAN_ITEM_KEY)
1547                         break;
1548
1549                 ret = btrfs_del_item(trans, root, path);
1550                 if (ret)
1551                         goto out;
1552
1553                 btrfs_release_path(path);
1554                 inode = read_one_inode(root, key.offset);
1555                 if (!inode)
1556                         return -EIO;
1557
1558                 ret = fixup_inode_link_count(trans, root, inode);
1559                 iput(inode);
1560                 if (ret)
1561                         goto out;
1562
1563                 /*
1564                  * fixup on a directory may create new entries,
1565                  * make sure we always look for the highset possible
1566                  * offset
1567                  */
1568                 key.offset = (u64)-1;
1569         }
1570         ret = 0;
1571 out:
1572         btrfs_release_path(path);
1573         return ret;
1574 }
1575
1576
1577 /*
1578  * record a given inode in the fixup dir so we can check its link
1579  * count when replay is done.  The link count is incremented here
1580  * so the inode won't go away until we check it
1581  */
1582 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1583                                       struct btrfs_root *root,
1584                                       struct btrfs_path *path,
1585                                       u64 objectid)
1586 {
1587         struct btrfs_key key;
1588         int ret = 0;
1589         struct inode *inode;
1590
1591         inode = read_one_inode(root, objectid);
1592         if (!inode)
1593                 return -EIO;
1594
1595         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1596         key.type = BTRFS_ORPHAN_ITEM_KEY;
1597         key.offset = objectid;
1598
1599         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1600
1601         btrfs_release_path(path);
1602         if (ret == 0) {
1603                 if (!inode->i_nlink)
1604                         set_nlink(inode, 1);
1605                 else
1606                         inc_nlink(inode);
1607                 ret = btrfs_update_inode(trans, root, inode);
1608         } else if (ret == -EEXIST) {
1609                 ret = 0;
1610         } else {
1611                 BUG(); /* Logic Error */
1612         }
1613         iput(inode);
1614
1615         return ret;
1616 }
1617
1618 /*
1619  * when replaying the log for a directory, we only insert names
1620  * for inodes that actually exist.  This means an fsync on a directory
1621  * does not implicitly fsync all the new files in it
1622  */
1623 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1624                                     struct btrfs_root *root,
1625                                     u64 dirid, u64 index,
1626                                     char *name, int name_len,
1627                                     struct btrfs_key *location)
1628 {
1629         struct inode *inode;
1630         struct inode *dir;
1631         int ret;
1632
1633         inode = read_one_inode(root, location->objectid);
1634         if (!inode)
1635                 return -ENOENT;
1636
1637         dir = read_one_inode(root, dirid);
1638         if (!dir) {
1639                 iput(inode);
1640                 return -EIO;
1641         }
1642
1643         ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1644
1645         /* FIXME, put inode into FIXUP list */
1646
1647         iput(inode);
1648         iput(dir);
1649         return ret;
1650 }
1651
1652 /*
1653  * Return true if an inode reference exists in the log for the given name,
1654  * inode and parent inode.
1655  */
1656 static bool name_in_log_ref(struct btrfs_root *log_root,
1657                             const char *name, const int name_len,
1658                             const u64 dirid, const u64 ino)
1659 {
1660         struct btrfs_key search_key;
1661
1662         search_key.objectid = ino;
1663         search_key.type = BTRFS_INODE_REF_KEY;
1664         search_key.offset = dirid;
1665         if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1666                 return true;
1667
1668         search_key.type = BTRFS_INODE_EXTREF_KEY;
1669         search_key.offset = btrfs_extref_hash(dirid, name, name_len);
1670         if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1671                 return true;
1672
1673         return false;
1674 }
1675
1676 /*
1677  * take a single entry in a log directory item and replay it into
1678  * the subvolume.
1679  *
1680  * if a conflicting item exists in the subdirectory already,
1681  * the inode it points to is unlinked and put into the link count
1682  * fix up tree.
1683  *
1684  * If a name from the log points to a file or directory that does
1685  * not exist in the FS, it is skipped.  fsyncs on directories
1686  * do not force down inodes inside that directory, just changes to the
1687  * names or unlinks in a directory.
1688  *
1689  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1690  * non-existing inode) and 1 if the name was replayed.
1691  */
1692 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1693                                     struct btrfs_root *root,
1694                                     struct btrfs_path *path,
1695                                     struct extent_buffer *eb,
1696                                     struct btrfs_dir_item *di,
1697                                     struct btrfs_key *key)
1698 {
1699         char *name;
1700         int name_len;
1701         struct btrfs_dir_item *dst_di;
1702         struct btrfs_key found_key;
1703         struct btrfs_key log_key;
1704         struct inode *dir;
1705         u8 log_type;
1706         int exists;
1707         int ret = 0;
1708         bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1709         bool name_added = false;
1710
1711         dir = read_one_inode(root, key->objectid);
1712         if (!dir)
1713                 return -EIO;
1714
1715         name_len = btrfs_dir_name_len(eb, di);
1716         name = kmalloc(name_len, GFP_NOFS);
1717         if (!name) {
1718                 ret = -ENOMEM;
1719                 goto out;
1720         }
1721
1722         log_type = btrfs_dir_type(eb, di);
1723         read_extent_buffer(eb, name, (unsigned long)(di + 1),
1724                    name_len);
1725
1726         btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1727         exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1728         if (exists == 0)
1729                 exists = 1;
1730         else
1731                 exists = 0;
1732         btrfs_release_path(path);
1733
1734         if (key->type == BTRFS_DIR_ITEM_KEY) {
1735                 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1736                                        name, name_len, 1);
1737         } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1738                 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1739                                                      key->objectid,
1740                                                      key->offset, name,
1741                                                      name_len, 1);
1742         } else {
1743                 /* Corruption */
1744                 ret = -EINVAL;
1745                 goto out;
1746         }
1747         if (IS_ERR_OR_NULL(dst_di)) {
1748                 /* we need a sequence number to insert, so we only
1749                  * do inserts for the BTRFS_DIR_INDEX_KEY types
1750                  */
1751                 if (key->type != BTRFS_DIR_INDEX_KEY)
1752                         goto out;
1753                 goto insert;
1754         }
1755
1756         btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1757         /* the existing item matches the logged item */
1758         if (found_key.objectid == log_key.objectid &&
1759             found_key.type == log_key.type &&
1760             found_key.offset == log_key.offset &&
1761             btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1762                 update_size = false;
1763                 goto out;
1764         }
1765
1766         /*
1767          * don't drop the conflicting directory entry if the inode
1768          * for the new entry doesn't exist
1769          */
1770         if (!exists)
1771                 goto out;
1772
1773         ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1774         if (ret)
1775                 goto out;
1776
1777         if (key->type == BTRFS_DIR_INDEX_KEY)
1778                 goto insert;
1779 out:
1780         btrfs_release_path(path);
1781         if (!ret && update_size) {
1782                 btrfs_i_size_write(dir, dir->i_size + name_len * 2);
1783                 ret = btrfs_update_inode(trans, root, dir);
1784         }
1785         kfree(name);
1786         iput(dir);
1787         if (!ret && name_added)
1788                 ret = 1;
1789         return ret;
1790
1791 insert:
1792         if (name_in_log_ref(root->log_root, name, name_len,
1793                             key->objectid, log_key.objectid)) {
1794                 /* The dentry will be added later. */
1795                 ret = 0;
1796                 update_size = false;
1797                 goto out;
1798         }
1799         btrfs_release_path(path);
1800         ret = insert_one_name(trans, root, key->objectid, key->offset,
1801                               name, name_len, &log_key);
1802         if (ret && ret != -ENOENT && ret != -EEXIST)
1803                 goto out;
1804         if (!ret)
1805                 name_added = true;
1806         update_size = false;
1807         ret = 0;
1808         goto out;
1809 }
1810
1811 /*
1812  * find all the names in a directory item and reconcile them into
1813  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1814  * one name in a directory item, but the same code gets used for
1815  * both directory index types
1816  */
1817 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1818                                         struct btrfs_root *root,
1819                                         struct btrfs_path *path,
1820                                         struct extent_buffer *eb, int slot,
1821                                         struct btrfs_key *key)
1822 {
1823         struct btrfs_fs_info *fs_info = root->fs_info;
1824         int ret = 0;
1825         u32 item_size = btrfs_item_size_nr(eb, slot);
1826         struct btrfs_dir_item *di;
1827         int name_len;
1828         unsigned long ptr;
1829         unsigned long ptr_end;
1830         struct btrfs_path *fixup_path = NULL;
1831
1832         ptr = btrfs_item_ptr_offset(eb, slot);
1833         ptr_end = ptr + item_size;
1834         while (ptr < ptr_end) {
1835                 di = (struct btrfs_dir_item *)ptr;
1836                 if (verify_dir_item(fs_info, eb, di))
1837                         return -EIO;
1838                 name_len = btrfs_dir_name_len(eb, di);
1839                 ret = replay_one_name(trans, root, path, eb, di, key);
1840                 if (ret < 0)
1841                         break;
1842                 ptr = (unsigned long)(di + 1);
1843                 ptr += name_len;
1844
1845                 /*
1846                  * If this entry refers to a non-directory (directories can not
1847                  * have a link count > 1) and it was added in the transaction
1848                  * that was not committed, make sure we fixup the link count of
1849                  * the inode it the entry points to. Otherwise something like
1850                  * the following would result in a directory pointing to an
1851                  * inode with a wrong link that does not account for this dir
1852                  * entry:
1853                  *
1854                  * mkdir testdir
1855                  * touch testdir/foo
1856                  * touch testdir/bar
1857                  * sync
1858                  *
1859                  * ln testdir/bar testdir/bar_link
1860                  * ln testdir/foo testdir/foo_link
1861                  * xfs_io -c "fsync" testdir/bar
1862                  *
1863                  * <power failure>
1864                  *
1865                  * mount fs, log replay happens
1866                  *
1867                  * File foo would remain with a link count of 1 when it has two
1868                  * entries pointing to it in the directory testdir. This would
1869                  * make it impossible to ever delete the parent directory has
1870                  * it would result in stale dentries that can never be deleted.
1871                  */
1872                 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
1873                         struct btrfs_key di_key;
1874
1875                         if (!fixup_path) {
1876                                 fixup_path = btrfs_alloc_path();
1877                                 if (!fixup_path) {
1878                                         ret = -ENOMEM;
1879                                         break;
1880                                 }
1881                         }
1882
1883                         btrfs_dir_item_key_to_cpu(eb, di, &di_key);
1884                         ret = link_to_fixup_dir(trans, root, fixup_path,
1885                                                 di_key.objectid);
1886                         if (ret)
1887                                 break;
1888                 }
1889                 ret = 0;
1890         }
1891         btrfs_free_path(fixup_path);
1892         return ret;
1893 }
1894
1895 /*
1896  * directory replay has two parts.  There are the standard directory
1897  * items in the log copied from the subvolume, and range items
1898  * created in the log while the subvolume was logged.
1899  *
1900  * The range items tell us which parts of the key space the log
1901  * is authoritative for.  During replay, if a key in the subvolume
1902  * directory is in a logged range item, but not actually in the log
1903  * that means it was deleted from the directory before the fsync
1904  * and should be removed.
1905  */
1906 static noinline int find_dir_range(struct btrfs_root *root,
1907                                    struct btrfs_path *path,
1908                                    u64 dirid, int key_type,
1909                                    u64 *start_ret, u64 *end_ret)
1910 {
1911         struct btrfs_key key;
1912         u64 found_end;
1913         struct btrfs_dir_log_item *item;
1914         int ret;
1915         int nritems;
1916
1917         if (*start_ret == (u64)-1)
1918                 return 1;
1919
1920         key.objectid = dirid;
1921         key.type = key_type;
1922         key.offset = *start_ret;
1923
1924         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1925         if (ret < 0)
1926                 goto out;
1927         if (ret > 0) {
1928                 if (path->slots[0] == 0)
1929                         goto out;
1930                 path->slots[0]--;
1931         }
1932         if (ret != 0)
1933                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1934
1935         if (key.type != key_type || key.objectid != dirid) {
1936                 ret = 1;
1937                 goto next;
1938         }
1939         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1940                               struct btrfs_dir_log_item);
1941         found_end = btrfs_dir_log_end(path->nodes[0], item);
1942
1943         if (*start_ret >= key.offset && *start_ret <= found_end) {
1944                 ret = 0;
1945                 *start_ret = key.offset;
1946                 *end_ret = found_end;
1947                 goto out;
1948         }
1949         ret = 1;
1950 next:
1951         /* check the next slot in the tree to see if it is a valid item */
1952         nritems = btrfs_header_nritems(path->nodes[0]);
1953         path->slots[0]++;
1954         if (path->slots[0] >= nritems) {
1955                 ret = btrfs_next_leaf(root, path);
1956                 if (ret)
1957                         goto out;
1958         }
1959
1960         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1961
1962         if (key.type != key_type || key.objectid != dirid) {
1963                 ret = 1;
1964                 goto out;
1965         }
1966         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1967                               struct btrfs_dir_log_item);
1968         found_end = btrfs_dir_log_end(path->nodes[0], item);
1969         *start_ret = key.offset;
1970         *end_ret = found_end;
1971         ret = 0;
1972 out:
1973         btrfs_release_path(path);
1974         return ret;
1975 }
1976
1977 /*
1978  * this looks for a given directory item in the log.  If the directory
1979  * item is not in the log, the item is removed and the inode it points
1980  * to is unlinked
1981  */
1982 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1983                                       struct btrfs_root *root,
1984                                       struct btrfs_root *log,
1985                                       struct btrfs_path *path,
1986                                       struct btrfs_path *log_path,
1987                                       struct inode *dir,
1988                                       struct btrfs_key *dir_key)
1989 {
1990         struct btrfs_fs_info *fs_info = root->fs_info;
1991         int ret;
1992         struct extent_buffer *eb;
1993         int slot;
1994         u32 item_size;
1995         struct btrfs_dir_item *di;
1996         struct btrfs_dir_item *log_di;
1997         int name_len;
1998         unsigned long ptr;
1999         unsigned long ptr_end;
2000         char *name;
2001         struct inode *inode;
2002         struct btrfs_key location;
2003
2004 again:
2005         eb = path->nodes[0];
2006         slot = path->slots[0];
2007         item_size = btrfs_item_size_nr(eb, slot);
2008         ptr = btrfs_item_ptr_offset(eb, slot);
2009         ptr_end = ptr + item_size;
2010         while (ptr < ptr_end) {
2011                 di = (struct btrfs_dir_item *)ptr;
2012                 if (verify_dir_item(fs_info, eb, di)) {
2013                         ret = -EIO;
2014                         goto out;
2015                 }
2016
2017                 name_len = btrfs_dir_name_len(eb, di);
2018                 name = kmalloc(name_len, GFP_NOFS);
2019                 if (!name) {
2020                         ret = -ENOMEM;
2021                         goto out;
2022                 }
2023                 read_extent_buffer(eb, name, (unsigned long)(di + 1),
2024                                   name_len);
2025                 log_di = NULL;
2026                 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2027                         log_di = btrfs_lookup_dir_item(trans, log, log_path,
2028                                                        dir_key->objectid,
2029                                                        name, name_len, 0);
2030                 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2031                         log_di = btrfs_lookup_dir_index_item(trans, log,
2032                                                      log_path,
2033                                                      dir_key->objectid,
2034                                                      dir_key->offset,
2035                                                      name, name_len, 0);
2036                 }
2037                 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
2038                         btrfs_dir_item_key_to_cpu(eb, di, &location);
2039                         btrfs_release_path(path);
2040                         btrfs_release_path(log_path);
2041                         inode = read_one_inode(root, location.objectid);
2042                         if (!inode) {
2043                                 kfree(name);
2044                                 return -EIO;
2045                         }
2046
2047                         ret = link_to_fixup_dir(trans, root,
2048                                                 path, location.objectid);
2049                         if (ret) {
2050                                 kfree(name);
2051                                 iput(inode);
2052                                 goto out;
2053                         }
2054
2055                         inc_nlink(inode);
2056                         ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
2057                                         BTRFS_I(inode), name, name_len);
2058                         if (!ret)
2059                                 ret = btrfs_run_delayed_items(trans, fs_info);
2060                         kfree(name);
2061                         iput(inode);
2062                         if (ret)
2063                                 goto out;
2064
2065                         /* there might still be more names under this key
2066                          * check and repeat if required
2067                          */
2068                         ret = btrfs_search_slot(NULL, root, dir_key, path,
2069                                                 0, 0);
2070                         if (ret == 0)
2071                                 goto again;
2072                         ret = 0;
2073                         goto out;
2074                 } else if (IS_ERR(log_di)) {
2075                         kfree(name);
2076                         return PTR_ERR(log_di);
2077                 }
2078                 btrfs_release_path(log_path);
2079                 kfree(name);
2080
2081                 ptr = (unsigned long)(di + 1);
2082                 ptr += name_len;
2083         }
2084         ret = 0;
2085 out:
2086         btrfs_release_path(path);
2087         btrfs_release_path(log_path);
2088         return ret;
2089 }
2090
2091 static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2092                               struct btrfs_root *root,
2093                               struct btrfs_root *log,
2094                               struct btrfs_path *path,
2095                               const u64 ino)
2096 {
2097         struct btrfs_key search_key;
2098         struct btrfs_path *log_path;
2099         int i;
2100         int nritems;
2101         int ret;
2102
2103         log_path = btrfs_alloc_path();
2104         if (!log_path)
2105                 return -ENOMEM;
2106
2107         search_key.objectid = ino;
2108         search_key.type = BTRFS_XATTR_ITEM_KEY;
2109         search_key.offset = 0;
2110 again:
2111         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2112         if (ret < 0)
2113                 goto out;
2114 process_leaf:
2115         nritems = btrfs_header_nritems(path->nodes[0]);
2116         for (i = path->slots[0]; i < nritems; i++) {
2117                 struct btrfs_key key;
2118                 struct btrfs_dir_item *di;
2119                 struct btrfs_dir_item *log_di;
2120                 u32 total_size;
2121                 u32 cur;
2122
2123                 btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2124                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2125                         ret = 0;
2126                         goto out;
2127                 }
2128
2129                 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2130                 total_size = btrfs_item_size_nr(path->nodes[0], i);
2131                 cur = 0;
2132                 while (cur < total_size) {
2133                         u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2134                         u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2135                         u32 this_len = sizeof(*di) + name_len + data_len;
2136                         char *name;
2137
2138                         name = kmalloc(name_len, GFP_NOFS);
2139                         if (!name) {
2140                                 ret = -ENOMEM;
2141                                 goto out;
2142                         }
2143                         read_extent_buffer(path->nodes[0], name,
2144                                            (unsigned long)(di + 1), name_len);
2145
2146                         log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2147                                                     name, name_len, 0);
2148                         btrfs_release_path(log_path);
2149                         if (!log_di) {
2150                                 /* Doesn't exist in log tree, so delete it. */
2151                                 btrfs_release_path(path);
2152                                 di = btrfs_lookup_xattr(trans, root, path, ino,
2153                                                         name, name_len, -1);
2154                                 kfree(name);
2155                                 if (IS_ERR(di)) {
2156                                         ret = PTR_ERR(di);
2157                                         goto out;
2158                                 }
2159                                 ASSERT(di);
2160                                 ret = btrfs_delete_one_dir_name(trans, root,
2161                                                                 path, di);
2162                                 if (ret)
2163                                         goto out;
2164                                 btrfs_release_path(path);
2165                                 search_key = key;
2166                                 goto again;
2167                         }
2168                         kfree(name);
2169                         if (IS_ERR(log_di)) {
2170                                 ret = PTR_ERR(log_di);
2171                                 goto out;
2172                         }
2173                         cur += this_len;
2174                         di = (struct btrfs_dir_item *)((char *)di + this_len);
2175                 }
2176         }
2177         ret = btrfs_next_leaf(root, path);
2178         if (ret > 0)
2179                 ret = 0;
2180         else if (ret == 0)
2181                 goto process_leaf;
2182 out:
2183         btrfs_free_path(log_path);
2184         btrfs_release_path(path);
2185         return ret;
2186 }
2187
2188
2189 /*
2190  * deletion replay happens before we copy any new directory items
2191  * out of the log or out of backreferences from inodes.  It
2192  * scans the log to find ranges of keys that log is authoritative for,
2193  * and then scans the directory to find items in those ranges that are
2194  * not present in the log.
2195  *
2196  * Anything we don't find in the log is unlinked and removed from the
2197  * directory.
2198  */
2199 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2200                                        struct btrfs_root *root,
2201                                        struct btrfs_root *log,
2202                                        struct btrfs_path *path,
2203                                        u64 dirid, int del_all)
2204 {
2205         u64 range_start;
2206         u64 range_end;
2207         int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2208         int ret = 0;
2209         struct btrfs_key dir_key;
2210         struct btrfs_key found_key;
2211         struct btrfs_path *log_path;
2212         struct inode *dir;
2213
2214         dir_key.objectid = dirid;
2215         dir_key.type = BTRFS_DIR_ITEM_KEY;
2216         log_path = btrfs_alloc_path();
2217         if (!log_path)
2218                 return -ENOMEM;
2219
2220         dir = read_one_inode(root, dirid);
2221         /* it isn't an error if the inode isn't there, that can happen
2222          * because we replay the deletes before we copy in the inode item
2223          * from the log
2224          */
2225         if (!dir) {
2226                 btrfs_free_path(log_path);
2227                 return 0;
2228         }
2229 again:
2230         range_start = 0;
2231         range_end = 0;
2232         while (1) {
2233                 if (del_all)
2234                         range_end = (u64)-1;
2235                 else {
2236                         ret = find_dir_range(log, path, dirid, key_type,
2237                                              &range_start, &range_end);
2238                         if (ret != 0)
2239                                 break;
2240                 }
2241
2242                 dir_key.offset = range_start;
2243                 while (1) {
2244                         int nritems;
2245                         ret = btrfs_search_slot(NULL, root, &dir_key, path,
2246                                                 0, 0);
2247                         if (ret < 0)
2248                                 goto out;
2249
2250                         nritems = btrfs_header_nritems(path->nodes[0]);
2251                         if (path->slots[0] >= nritems) {
2252                                 ret = btrfs_next_leaf(root, path);
2253                                 if (ret)
2254                                         break;
2255                         }
2256                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2257                                               path->slots[0]);
2258                         if (found_key.objectid != dirid ||
2259                             found_key.type != dir_key.type)
2260                                 goto next_type;
2261
2262                         if (found_key.offset > range_end)
2263                                 break;
2264
2265                         ret = check_item_in_log(trans, root, log, path,
2266                                                 log_path, dir,
2267                                                 &found_key);
2268                         if (ret)
2269                                 goto out;
2270                         if (found_key.offset == (u64)-1)
2271                                 break;
2272                         dir_key.offset = found_key.offset + 1;
2273                 }
2274                 btrfs_release_path(path);
2275                 if (range_end == (u64)-1)
2276                         break;
2277                 range_start = range_end + 1;
2278         }
2279
2280 next_type:
2281         ret = 0;
2282         if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2283                 key_type = BTRFS_DIR_LOG_INDEX_KEY;
2284                 dir_key.type = BTRFS_DIR_INDEX_KEY;
2285                 btrfs_release_path(path);
2286                 goto again;
2287         }
2288 out:
2289         btrfs_release_path(path);
2290         btrfs_free_path(log_path);
2291         iput(dir);
2292         return ret;
2293 }
2294
2295 /*
2296  * the process_func used to replay items from the log tree.  This
2297  * gets called in two different stages.  The first stage just looks
2298  * for inodes and makes sure they are all copied into the subvolume.
2299  *
2300  * The second stage copies all the other item types from the log into
2301  * the subvolume.  The two stage approach is slower, but gets rid of
2302  * lots of complexity around inodes referencing other inodes that exist
2303  * only in the log (references come from either directory items or inode
2304  * back refs).
2305  */
2306 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2307                              struct walk_control *wc, u64 gen)
2308 {
2309         int nritems;
2310         struct btrfs_path *path;
2311         struct btrfs_root *root = wc->replay_dest;
2312         struct btrfs_key key;
2313         int level;
2314         int i;
2315         int ret;
2316
2317         ret = btrfs_read_buffer(eb, gen);
2318         if (ret)
2319                 return ret;
2320
2321         level = btrfs_header_level(eb);
2322
2323         if (level != 0)
2324                 return 0;
2325
2326         path = btrfs_alloc_path();
2327         if (!path)
2328                 return -ENOMEM;
2329
2330         nritems = btrfs_header_nritems(eb);
2331         for (i = 0; i < nritems; i++) {
2332                 btrfs_item_key_to_cpu(eb, &key, i);
2333
2334                 /* inode keys are done during the first stage */
2335                 if (key.type == BTRFS_INODE_ITEM_KEY &&
2336                     wc->stage == LOG_WALK_REPLAY_INODES) {
2337                         struct btrfs_inode_item *inode_item;
2338                         u32 mode;
2339
2340                         inode_item = btrfs_item_ptr(eb, i,
2341                                             struct btrfs_inode_item);
2342                         ret = replay_xattr_deletes(wc->trans, root, log,
2343                                                    path, key.objectid);
2344                         if (ret)
2345                                 break;
2346                         mode = btrfs_inode_mode(eb, inode_item);
2347                         if (S_ISDIR(mode)) {
2348                                 ret = replay_dir_deletes(wc->trans,
2349                                          root, log, path, key.objectid, 0);
2350                                 if (ret)
2351                                         break;
2352                         }
2353                         ret = overwrite_item(wc->trans, root, path,
2354                                              eb, i, &key);
2355                         if (ret)
2356                                 break;
2357
2358                         /* for regular files, make sure corresponding
2359                          * orphan item exist. extents past the new EOF
2360                          * will be truncated later by orphan cleanup.
2361                          */
2362                         if (S_ISREG(mode)) {
2363                                 ret = insert_orphan_item(wc->trans, root,
2364                                                          key.objectid);
2365                                 if (ret)
2366                                         break;
2367                         }
2368
2369                         ret = link_to_fixup_dir(wc->trans, root,
2370                                                 path, key.objectid);
2371                         if (ret)
2372                                 break;
2373                 }
2374
2375                 if (key.type == BTRFS_DIR_INDEX_KEY &&
2376                     wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2377                         ret = replay_one_dir_item(wc->trans, root, path,
2378                                                   eb, i, &key);
2379                         if (ret)
2380                                 break;
2381                 }
2382
2383                 if (wc->stage < LOG_WALK_REPLAY_ALL)
2384                         continue;
2385
2386                 /* these keys are simply copied */
2387                 if (key.type == BTRFS_XATTR_ITEM_KEY) {
2388                         ret = overwrite_item(wc->trans, root, path,
2389                                              eb, i, &key);
2390                         if (ret)
2391                                 break;
2392                 } else if (key.type == BTRFS_INODE_REF_KEY ||
2393                            key.type == BTRFS_INODE_EXTREF_KEY) {
2394                         ret = add_inode_ref(wc->trans, root, log, path,
2395                                             eb, i, &key);
2396                         if (ret && ret != -ENOENT)
2397                                 break;
2398                         ret = 0;
2399                 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2400                         ret = replay_one_extent(wc->trans, root, path,
2401                                                 eb, i, &key);
2402                         if (ret)
2403                                 break;
2404                 } else if (key.type == BTRFS_DIR_ITEM_KEY) {
2405                         ret = replay_one_dir_item(wc->trans, root, path,
2406                                                   eb, i, &key);
2407                         if (ret)
2408                                 break;
2409                 }
2410         }
2411         btrfs_free_path(path);
2412         return ret;
2413 }
2414
2415 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2416                                    struct btrfs_root *root,
2417                                    struct btrfs_path *path, int *level,
2418                                    struct walk_control *wc)
2419 {
2420         struct btrfs_fs_info *fs_info = root->fs_info;
2421         u64 root_owner;
2422         u64 bytenr;
2423         u64 ptr_gen;
2424         struct extent_buffer *next;
2425         struct extent_buffer *cur;
2426         struct extent_buffer *parent;
2427         u32 blocksize;
2428         int ret = 0;
2429
2430         WARN_ON(*level < 0);
2431         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2432
2433         while (*level > 0) {
2434                 WARN_ON(*level < 0);
2435                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2436                 cur = path->nodes[*level];
2437
2438                 WARN_ON(btrfs_header_level(cur) != *level);
2439
2440                 if (path->slots[*level] >=
2441                     btrfs_header_nritems(cur))
2442                         break;
2443
2444                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2445                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2446                 blocksize = fs_info->nodesize;
2447
2448                 parent = path->nodes[*level];
2449                 root_owner = btrfs_header_owner(parent);
2450
2451                 next = btrfs_find_create_tree_block(fs_info, bytenr);
2452                 if (IS_ERR(next))
2453                         return PTR_ERR(next);
2454
2455                 if (*level == 1) {
2456                         ret = wc->process_func(root, next, wc, ptr_gen);
2457                         if (ret) {
2458                                 free_extent_buffer(next);
2459                                 return ret;
2460                         }
2461
2462                         path->slots[*level]++;
2463                         if (wc->free) {
2464                                 ret = btrfs_read_buffer(next, ptr_gen);
2465                                 if (ret) {
2466                                         free_extent_buffer(next);
2467                                         return ret;
2468                                 }
2469
2470                                 if (trans) {
2471                                         btrfs_tree_lock(next);
2472                                         btrfs_set_lock_blocking(next);
2473                                         clean_tree_block(trans, fs_info, next);
2474                                         btrfs_wait_tree_block_writeback(next);
2475                                         btrfs_tree_unlock(next);
2476                                 }
2477
2478                                 WARN_ON(root_owner !=
2479                                         BTRFS_TREE_LOG_OBJECTID);
2480                                 ret = btrfs_free_and_pin_reserved_extent(
2481                                                         fs_info, bytenr,
2482                                                         blocksize);
2483                                 if (ret) {
2484                                         free_extent_buffer(next);
2485                                         return ret;
2486                                 }
2487                         }
2488                         free_extent_buffer(next);
2489                         continue;
2490                 }
2491                 ret = btrfs_read_buffer(next, ptr_gen);
2492                 if (ret) {
2493                         free_extent_buffer(next);
2494                         return ret;
2495                 }
2496
2497                 WARN_ON(*level <= 0);
2498                 if (path->nodes[*level-1])
2499                         free_extent_buffer(path->nodes[*level-1]);
2500                 path->nodes[*level-1] = next;
2501                 *level = btrfs_header_level(next);
2502                 path->slots[*level] = 0;
2503                 cond_resched();
2504         }
2505         WARN_ON(*level < 0);
2506         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2507
2508         path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2509
2510         cond_resched();
2511         return 0;
2512 }
2513
2514 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2515                                  struct btrfs_root *root,
2516                                  struct btrfs_path *path, int *level,
2517                                  struct walk_control *wc)
2518 {
2519         struct btrfs_fs_info *fs_info = root->fs_info;
2520         u64 root_owner;
2521         int i;
2522         int slot;
2523         int ret;
2524
2525         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2526                 slot = path->slots[i];
2527                 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2528                         path->slots[i]++;
2529                         *level = i;
2530                         WARN_ON(*level == 0);
2531                         return 0;
2532                 } else {
2533                         struct extent_buffer *parent;
2534                         if (path->nodes[*level] == root->node)
2535                                 parent = path->nodes[*level];
2536                         else
2537                                 parent = path->nodes[*level + 1];
2538
2539                         root_owner = btrfs_header_owner(parent);
2540                         ret = wc->process_func(root, path->nodes[*level], wc,
2541                                  btrfs_header_generation(path->nodes[*level]));
2542                         if (ret)
2543                                 return ret;
2544
2545                         if (wc->free) {
2546                                 struct extent_buffer *next;
2547
2548                                 next = path->nodes[*level];
2549
2550                                 if (trans) {
2551                                         btrfs_tree_lock(next);
2552                                         btrfs_set_lock_blocking(next);
2553                                         clean_tree_block(trans, fs_info, next);
2554                                         btrfs_wait_tree_block_writeback(next);
2555                                         btrfs_tree_unlock(next);
2556                                 }
2557
2558                                 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2559                                 ret = btrfs_free_and_pin_reserved_extent(
2560                                                 fs_info,
2561                                                 path->nodes[*level]->start,
2562                                                 path->nodes[*level]->len);
2563                                 if (ret)
2564                                         return ret;
2565                         }
2566                         free_extent_buffer(path->nodes[*level]);
2567                         path->nodes[*level] = NULL;
2568                         *level = i + 1;
2569                 }
2570         }
2571         return 1;
2572 }
2573
2574 /*
2575  * drop the reference count on the tree rooted at 'snap'.  This traverses
2576  * the tree freeing any blocks that have a ref count of zero after being
2577  * decremented.
2578  */
2579 static int walk_log_tree(struct btrfs_trans_handle *trans,
2580                          struct btrfs_root *log, struct walk_control *wc)
2581 {
2582         struct btrfs_fs_info *fs_info = log->fs_info;
2583         int ret = 0;
2584         int wret;
2585         int level;
2586         struct btrfs_path *path;
2587         int orig_level;
2588
2589         path = btrfs_alloc_path();
2590         if (!path)
2591                 return -ENOMEM;
2592
2593         level = btrfs_header_level(log->node);
2594         orig_level = level;
2595         path->nodes[level] = log->node;
2596         extent_buffer_get(log->node);
2597         path->slots[level] = 0;
2598
2599         while (1) {
2600                 wret = walk_down_log_tree(trans, log, path, &level, wc);
2601                 if (wret > 0)
2602                         break;
2603                 if (wret < 0) {
2604                         ret = wret;
2605                         goto out;
2606                 }
2607
2608                 wret = walk_up_log_tree(trans, log, path, &level, wc);
2609                 if (wret > 0)
2610                         break;
2611                 if (wret < 0) {
2612                         ret = wret;
2613                         goto out;
2614                 }
2615         }
2616
2617         /* was the root node processed? if not, catch it here */
2618         if (path->nodes[orig_level]) {
2619                 ret = wc->process_func(log, path->nodes[orig_level], wc,
2620                          btrfs_header_generation(path->nodes[orig_level]));
2621                 if (ret)
2622                         goto out;
2623                 if (wc->free) {
2624                         struct extent_buffer *next;
2625
2626                         next = path->nodes[orig_level];
2627
2628                         if (trans) {
2629                                 btrfs_tree_lock(next);
2630                                 btrfs_set_lock_blocking(next);
2631                                 clean_tree_block(trans, fs_info, next);
2632                                 btrfs_wait_tree_block_writeback(next);
2633                                 btrfs_tree_unlock(next);
2634                         }
2635
2636                         WARN_ON(log->root_key.objectid !=
2637                                 BTRFS_TREE_LOG_OBJECTID);
2638                         ret = btrfs_free_and_pin_reserved_extent(fs_info,
2639                                                         next->start, next->len);
2640                         if (ret)
2641                                 goto out;
2642                 }
2643         }
2644
2645 out:
2646         btrfs_free_path(path);
2647         return ret;
2648 }
2649
2650 /*
2651  * helper function to update the item for a given subvolumes log root
2652  * in the tree of log roots
2653  */
2654 static int update_log_root(struct btrfs_trans_handle *trans,
2655                            struct btrfs_root *log)
2656 {
2657         struct btrfs_fs_info *fs_info = log->fs_info;
2658         int ret;
2659
2660         if (log->log_transid == 1) {
2661                 /* insert root item on the first sync */
2662                 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2663                                 &log->root_key, &log->root_item);
2664         } else {
2665                 ret = btrfs_update_root(trans, fs_info->log_root_tree,
2666                                 &log->root_key, &log->root_item);
2667         }
2668         return ret;
2669 }
2670
2671 static void wait_log_commit(struct btrfs_root *root, int transid)
2672 {
2673         DEFINE_WAIT(wait);
2674         int index = transid % 2;
2675
2676         /*
2677          * we only allow two pending log transactions at a time,
2678          * so we know that if ours is more than 2 older than the
2679          * current transaction, we're done
2680          */
2681         do {
2682                 prepare_to_wait(&root->log_commit_wait[index],
2683                                 &wait, TASK_UNINTERRUPTIBLE);
2684                 mutex_unlock(&root->log_mutex);
2685
2686                 if (root->log_transid_committed < transid &&
2687                     atomic_read(&root->log_commit[index]))
2688                         schedule();
2689
2690                 finish_wait(&root->log_commit_wait[index], &wait);
2691                 mutex_lock(&root->log_mutex);
2692         } while (root->log_transid_committed < transid &&
2693                  atomic_read(&root->log_commit[index]));
2694 }
2695
2696 static void wait_for_writer(struct btrfs_root *root)
2697 {
2698         DEFINE_WAIT(wait);
2699
2700         while (atomic_read(&root->log_writers)) {
2701                 prepare_to_wait(&root->log_writer_wait,
2702                                 &wait, TASK_UNINTERRUPTIBLE);
2703                 mutex_unlock(&root->log_mutex);
2704                 if (atomic_read(&root->log_writers))
2705                         schedule();
2706                 finish_wait(&root->log_writer_wait, &wait);
2707                 mutex_lock(&root->log_mutex);
2708         }
2709 }
2710
2711 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2712                                         struct btrfs_log_ctx *ctx)
2713 {
2714         if (!ctx)
2715                 return;
2716
2717         mutex_lock(&root->log_mutex);
2718         list_del_init(&ctx->list);
2719         mutex_unlock(&root->log_mutex);
2720 }
2721
2722 /*
2723  * Invoked in log mutex context, or be sure there is no other task which
2724  * can access the list.
2725  */
2726 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2727                                              int index, int error)
2728 {
2729         struct btrfs_log_ctx *ctx;
2730         struct btrfs_log_ctx *safe;
2731
2732         list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2733                 list_del_init(&ctx->list);
2734                 ctx->log_ret = error;
2735         }
2736
2737         INIT_LIST_HEAD(&root->log_ctxs[index]);
2738 }
2739
2740 /*
2741  * btrfs_sync_log does sends a given tree log down to the disk and
2742  * updates the super blocks to record it.  When this call is done,
2743  * you know that any inodes previously logged are safely on disk only
2744  * if it returns 0.
2745  *
2746  * Any other return value means you need to call btrfs_commit_transaction.
2747  * Some of the edge cases for fsyncing directories that have had unlinks
2748  * or renames done in the past mean that sometimes the only safe
2749  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2750  * that has happened.
2751  */
2752 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2753                    struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2754 {
2755         int index1;
2756         int index2;
2757         int mark;
2758         int ret;
2759         struct btrfs_fs_info *fs_info = root->fs_info;
2760         struct btrfs_root *log = root->log_root;
2761         struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2762         int log_transid = 0;
2763         struct btrfs_log_ctx root_log_ctx;
2764         struct blk_plug plug;
2765
2766         mutex_lock(&root->log_mutex);
2767         log_transid = ctx->log_transid;
2768         if (root->log_transid_committed >= log_transid) {
2769                 mutex_unlock(&root->log_mutex);
2770                 return ctx->log_ret;
2771         }
2772
2773         index1 = log_transid % 2;
2774         if (atomic_read(&root->log_commit[index1])) {
2775                 wait_log_commit(root, log_transid);
2776                 mutex_unlock(&root->log_mutex);
2777                 return ctx->log_ret;
2778         }
2779         ASSERT(log_transid == root->log_transid);
2780         atomic_set(&root->log_commit[index1], 1);
2781
2782         /* wait for previous tree log sync to complete */
2783         if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2784                 wait_log_commit(root, log_transid - 1);
2785
2786         while (1) {
2787                 int batch = atomic_read(&root->log_batch);
2788                 /* when we're on an ssd, just kick the log commit out */
2789                 if (!btrfs_test_opt(fs_info, SSD) &&
2790                     test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2791                         mutex_unlock(&root->log_mutex);
2792                         schedule_timeout_uninterruptible(1);
2793                         mutex_lock(&root->log_mutex);
2794                 }
2795                 wait_for_writer(root);
2796                 if (batch == atomic_read(&root->log_batch))
2797                         break;
2798         }
2799
2800         /* bail out if we need to do a full commit */
2801         if (btrfs_need_log_full_commit(fs_info, trans)) {
2802                 ret = -EAGAIN;
2803                 btrfs_free_logged_extents(log, log_transid);
2804                 mutex_unlock(&root->log_mutex);
2805                 goto out;
2806         }
2807
2808         if (log_transid % 2 == 0)
2809                 mark = EXTENT_DIRTY;
2810         else
2811                 mark = EXTENT_NEW;
2812
2813         /* we start IO on  all the marked extents here, but we don't actually
2814          * wait for them until later.
2815          */
2816         blk_start_plug(&plug);
2817         ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
2818         if (ret) {
2819                 blk_finish_plug(&plug);
2820                 btrfs_abort_transaction(trans, ret);
2821                 btrfs_free_logged_extents(log, log_transid);
2822                 btrfs_set_log_full_commit(fs_info, trans);
2823                 mutex_unlock(&root->log_mutex);
2824                 goto out;
2825         }
2826
2827         btrfs_set_root_node(&log->root_item, log->node);
2828
2829         root->log_transid++;
2830         log->log_transid = root->log_transid;
2831         root->log_start_pid = 0;
2832         /*
2833          * IO has been started, blocks of the log tree have WRITTEN flag set
2834          * in their headers. new modifications of the log will be written to
2835          * new positions. so it's safe to allow log writers to go in.
2836          */
2837         mutex_unlock(&root->log_mutex);
2838
2839         btrfs_init_log_ctx(&root_log_ctx, NULL);
2840
2841         mutex_lock(&log_root_tree->log_mutex);
2842         atomic_inc(&log_root_tree->log_batch);
2843         atomic_inc(&log_root_tree->log_writers);
2844
2845         index2 = log_root_tree->log_transid % 2;
2846         list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2847         root_log_ctx.log_transid = log_root_tree->log_transid;
2848
2849         mutex_unlock(&log_root_tree->log_mutex);
2850
2851         ret = update_log_root(trans, log);
2852
2853         mutex_lock(&log_root_tree->log_mutex);
2854         if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2855                 /*
2856                  * Implicit memory barrier after atomic_dec_and_test
2857                  */
2858                 if (waitqueue_active(&log_root_tree->log_writer_wait))
2859                         wake_up(&log_root_tree->log_writer_wait);
2860         }
2861
2862         if (ret) {
2863                 if (!list_empty(&root_log_ctx.list))
2864                         list_del_init(&root_log_ctx.list);
2865
2866                 blk_finish_plug(&plug);
2867                 btrfs_set_log_full_commit(fs_info, trans);
2868
2869                 if (ret != -ENOSPC) {
2870                         btrfs_abort_transaction(trans, ret);
2871                         mutex_unlock(&log_root_tree->log_mutex);
2872                         goto out;
2873                 }
2874                 btrfs_wait_tree_log_extents(log, mark);
2875                 btrfs_free_logged_extents(log, log_transid);
2876                 mutex_unlock(&log_root_tree->log_mutex);
2877                 ret = -EAGAIN;
2878                 goto out;
2879         }
2880
2881         if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
2882                 blk_finish_plug(&plug);
2883                 list_del_init(&root_log_ctx.list);
2884                 mutex_unlock(&log_root_tree->log_mutex);
2885                 ret = root_log_ctx.log_ret;
2886                 goto out;
2887         }
2888
2889         index2 = root_log_ctx.log_transid % 2;
2890         if (atomic_read(&log_root_tree->log_commit[index2])) {
2891                 blk_finish_plug(&plug);
2892                 ret = btrfs_wait_tree_log_extents(log, mark);
2893                 btrfs_wait_logged_extents(trans, log, log_transid);
2894                 wait_log_commit(log_root_tree,
2895                                 root_log_ctx.log_transid);
2896                 mutex_unlock(&log_root_tree->log_mutex);
2897                 if (!ret)
2898                         ret = root_log_ctx.log_ret;
2899                 goto out;
2900         }
2901         ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
2902         atomic_set(&log_root_tree->log_commit[index2], 1);
2903
2904         if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2905                 wait_log_commit(log_root_tree,
2906                                 root_log_ctx.log_transid - 1);
2907         }
2908
2909         wait_for_writer(log_root_tree);
2910
2911         /*
2912          * now that we've moved on to the tree of log tree roots,
2913          * check the full commit flag again
2914          */
2915         if (btrfs_need_log_full_commit(fs_info, trans)) {
2916                 blk_finish_plug(&plug);
2917                 btrfs_wait_tree_log_extents(log, mark);
2918                 btrfs_free_logged_extents(log, log_transid);
2919                 mutex_unlock(&log_root_tree->log_mutex);
2920                 ret = -EAGAIN;
2921                 goto out_wake_log_root;
2922         }
2923
2924         ret = btrfs_write_marked_extents(fs_info,
2925                                          &log_root_tree->dirty_log_pages,
2926                                          EXTENT_DIRTY | EXTENT_NEW);
2927         blk_finish_plug(&plug);
2928         if (ret) {
2929                 btrfs_set_log_full_commit(fs_info, trans);
2930                 btrfs_abort_transaction(trans, ret);
2931                 btrfs_free_logged_extents(log, log_transid);
2932                 mutex_unlock(&log_root_tree->log_mutex);
2933                 goto out_wake_log_root;
2934         }
2935         ret = btrfs_wait_tree_log_extents(log, mark);
2936         if (!ret)
2937                 ret = btrfs_wait_tree_log_extents(log_root_tree,
2938                                                   EXTENT_NEW | EXTENT_DIRTY);
2939         if (ret) {
2940                 btrfs_set_log_full_commit(fs_info, trans);
2941                 btrfs_free_logged_extents(log, log_transid);
2942                 mutex_unlock(&log_root_tree->log_mutex);
2943                 goto out_wake_log_root;
2944         }
2945         btrfs_wait_logged_extents(trans, log, log_transid);
2946
2947         btrfs_set_super_log_root(fs_info->super_for_commit,
2948                                  log_root_tree->node->start);
2949         btrfs_set_super_log_root_level(fs_info->super_for_commit,
2950                                        btrfs_header_level(log_root_tree->node));
2951
2952         log_root_tree->log_transid++;
2953         mutex_unlock(&log_root_tree->log_mutex);
2954
2955         /*
2956          * nobody else is going to jump in and write the the ctree
2957          * super here because the log_commit atomic below is protecting
2958          * us.  We must be called with a transaction handle pinning
2959          * the running transaction open, so a full commit can't hop
2960          * in and cause problems either.
2961          */
2962         ret = write_ctree_super(trans, fs_info, 1);
2963         if (ret) {
2964                 btrfs_set_log_full_commit(fs_info, trans);
2965                 btrfs_abort_transaction(trans, ret);
2966                 goto out_wake_log_root;
2967         }
2968
2969         mutex_lock(&root->log_mutex);
2970         if (root->last_log_commit < log_transid)
2971                 root->last_log_commit = log_transid;
2972         mutex_unlock(&root->log_mutex);
2973
2974 out_wake_log_root:
2975         mutex_lock(&log_root_tree->log_mutex);
2976         btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
2977
2978         log_root_tree->log_transid_committed++;
2979         atomic_set(&log_root_tree->log_commit[index2], 0);
2980         mutex_unlock(&log_root_tree->log_mutex);
2981
2982         /*
2983          * The barrier before waitqueue_active is implied by mutex_unlock
2984          */
2985         if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2986                 wake_up(&log_root_tree->log_commit_wait[index2]);
2987 out:
2988         mutex_lock(&root->log_mutex);
2989         btrfs_remove_all_log_ctxs(root, index1, ret);
2990         root->log_transid_committed++;
2991         atomic_set(&root->log_commit[index1], 0);
2992         mutex_unlock(&root->log_mutex);
2993
2994         /*
2995          * The barrier before waitqueue_active is implied by mutex_unlock
2996          */
2997         if (waitqueue_active(&root->log_commit_wait[index1]))
2998                 wake_up(&root->log_commit_wait[index1]);
2999         return ret;
3000 }
3001
3002 static void free_log_tree(struct btrfs_trans_handle *trans,
3003                           struct btrfs_root *log)
3004 {
3005         int ret;
3006         u64 start;
3007         u64 end;
3008         struct walk_control wc = {
3009                 .free = 1,
3010                 .process_func = process_one_buffer
3011         };
3012
3013         ret = walk_log_tree(trans, log, &wc);
3014         /* I don't think this can happen but just in case */
3015         if (ret)
3016                 btrfs_abort_transaction(trans, ret);
3017
3018         while (1) {
3019                 ret = find_first_extent_bit(&log->dirty_log_pages,
3020                                 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
3021                                 NULL);
3022                 if (ret)
3023                         break;
3024
3025                 clear_extent_bits(&log->dirty_log_pages, start, end,
3026                                   EXTENT_DIRTY | EXTENT_NEW);
3027         }
3028
3029         /*
3030          * We may have short-circuited the log tree with the full commit logic
3031          * and left ordered extents on our list, so clear these out to keep us
3032          * from leaking inodes and memory.
3033          */
3034         btrfs_free_logged_extents(log, 0);
3035         btrfs_free_logged_extents(log, 1);
3036
3037         free_extent_buffer(log->node);
3038         kfree(log);
3039 }
3040
3041 /*
3042  * free all the extents used by the tree log.  This should be called
3043  * at commit time of the full transaction
3044  */
3045 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3046 {
3047         if (root->log_root) {
3048                 free_log_tree(trans, root->log_root);
3049                 root->log_root = NULL;
3050         }
3051         return 0;
3052 }
3053
3054 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3055                              struct btrfs_fs_info *fs_info)
3056 {
3057         if (fs_info->log_root_tree) {
3058                 free_log_tree(trans, fs_info->log_root_tree);
3059                 fs_info->log_root_tree = NULL;
3060         }
3061         return 0;
3062 }
3063
3064 /*
3065  * If both a file and directory are logged, and unlinks or renames are
3066  * mixed in, we have a few interesting corners:
3067  *
3068  * create file X in dir Y
3069  * link file X to X.link in dir Y
3070  * fsync file X
3071  * unlink file X but leave X.link
3072  * fsync dir Y
3073  *
3074  * After a crash we would expect only X.link to exist.  But file X
3075  * didn't get fsync'd again so the log has back refs for X and X.link.
3076  *
3077  * We solve this by removing directory entries and inode backrefs from the
3078  * log when a file that was logged in the current transaction is
3079  * unlinked.  Any later fsync will include the updated log entries, and
3080  * we'll be able to reconstruct the proper directory items from backrefs.
3081  *
3082  * This optimizations allows us to avoid relogging the entire inode
3083  * or the entire directory.
3084  */
3085 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3086                                  struct btrfs_root *root,
3087                                  const char *name, int name_len,
3088                                  struct btrfs_inode *dir, u64 index)
3089 {
3090         struct btrfs_root *log;
3091         struct btrfs_dir_item *di;
3092         struct btrfs_path *path;
3093         int ret;
3094         int err = 0;
3095         int bytes_del = 0;
3096         u64 dir_ino = btrfs_ino(dir);
3097
3098         if (dir->logged_trans < trans->transid)
3099                 return 0;
3100
3101         ret = join_running_log_trans(root);
3102         if (ret)
3103                 return 0;
3104
3105         mutex_lock(&dir->log_mutex);
3106
3107         log = root->log_root;
3108         path = btrfs_alloc_path();
3109         if (!path) {
3110                 err = -ENOMEM;
3111                 goto out_unlock;
3112         }
3113
3114         di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3115                                    name, name_len, -1);
3116         if (IS_ERR(di)) {
3117                 err = PTR_ERR(di);
3118                 goto fail;
3119         }
3120         if (di) {
3121                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3122                 bytes_del += name_len;
3123                 if (ret) {
3124                         err = ret;
3125                         goto fail;
3126                 }
3127         }
3128         btrfs_release_path(path);
3129         di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3130                                          index, name, name_len, -1);
3131         if (IS_ERR(di)) {
3132                 err = PTR_ERR(di);
3133                 goto fail;
3134         }
3135         if (di) {
3136                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3137                 bytes_del += name_len;
3138                 if (ret) {
3139                         err = ret;
3140                         goto fail;
3141                 }
3142         }
3143
3144         /* update the directory size in the log to reflect the names
3145          * we have removed
3146          */
3147         if (bytes_del) {
3148                 struct btrfs_key key;
3149
3150                 key.objectid = dir_ino;
3151                 key.offset = 0;
3152                 key.type = BTRFS_INODE_ITEM_KEY;
3153                 btrfs_release_path(path);
3154
3155                 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
3156                 if (ret < 0) {
3157                         err = ret;
3158                         goto fail;
3159                 }
3160                 if (ret == 0) {
3161                         struct btrfs_inode_item *item;
3162                         u64 i_size;
3163
3164                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3165                                               struct btrfs_inode_item);
3166                         i_size = btrfs_inode_size(path->nodes[0], item);
3167                         if (i_size > bytes_del)
3168                                 i_size -= bytes_del;
3169                         else
3170                                 i_size = 0;
3171                         btrfs_set_inode_size(path->nodes[0], item, i_size);
3172                         btrfs_mark_buffer_dirty(path->nodes[0]);
3173                 } else
3174                         ret = 0;
3175                 btrfs_release_path(path);
3176         }
3177 fail:
3178         btrfs_free_path(path);
3179 out_unlock:
3180         mutex_unlock(&dir->log_mutex);
3181         if (ret == -ENOSPC) {
3182                 btrfs_set_log_full_commit(root->fs_info, trans);
3183                 ret = 0;
3184         } else if (ret < 0)
3185                 btrfs_abort_transaction(trans, ret);
3186
3187         btrfs_end_log_trans(root);
3188
3189         return err;
3190 }
3191
3192 /* see comments for btrfs_del_dir_entries_in_log */
3193 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3194                                struct btrfs_root *root,
3195                                const char *name, int name_len,
3196                                struct btrfs_inode *inode, u64 dirid)
3197 {
3198         struct btrfs_fs_info *fs_info = root->fs_info;
3199         struct btrfs_root *log;
3200         u64 index;
3201         int ret;
3202
3203         if (inode->logged_trans < trans->transid)
3204                 return 0;
3205
3206         ret = join_running_log_trans(root);
3207         if (ret)
3208                 return 0;
3209         log = root->log_root;
3210         mutex_lock(&inode->log_mutex);
3211
3212         ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3213                                   dirid, &index);
3214         mutex_unlock(&inode->log_mutex);
3215         if (ret == -ENOSPC) {
3216                 btrfs_set_log_full_commit(fs_info, trans);
3217                 ret = 0;
3218         } else if (ret < 0 && ret != -ENOENT)
3219                 btrfs_abort_transaction(trans, ret);
3220         btrfs_end_log_trans(root);
3221
3222         return ret;
3223 }
3224
3225 /*
3226  * creates a range item in the log for 'dirid'.  first_offset and
3227  * last_offset tell us which parts of the key space the log should
3228  * be considered authoritative for.
3229  */
3230 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3231                                        struct btrfs_root *log,
3232                                        struct btrfs_path *path,
3233                                        int key_type, u64 dirid,
3234                                        u64 first_offset, u64 last_offset)
3235 {
3236         int ret;
3237         struct btrfs_key key;
3238         struct btrfs_dir_log_item *item;
3239
3240         key.objectid = dirid;
3241         key.offset = first_offset;
3242         if (key_type == BTRFS_DIR_ITEM_KEY)
3243                 key.type = BTRFS_DIR_LOG_ITEM_KEY;
3244         else
3245                 key.type = BTRFS_DIR_LOG_INDEX_KEY;
3246         ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3247         if (ret)
3248                 return ret;
3249
3250         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3251                               struct btrfs_dir_log_item);
3252         btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3253         btrfs_mark_buffer_dirty(path->nodes[0]);
3254         btrfs_release_path(path);
3255         return 0;
3256 }
3257
3258 /*
3259  * log all the items included in the current transaction for a given
3260  * directory.  This also creates the range items in the log tree required
3261  * to replay anything deleted before the fsync
3262  */
3263 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3264                           struct btrfs_root *root, struct btrfs_inode *inode,
3265                           struct btrfs_path *path,
3266                           struct btrfs_path *dst_path, int key_type,
3267                           struct btrfs_log_ctx *ctx,
3268                           u64 min_offset, u64 *last_offset_ret)
3269 {
3270         struct btrfs_key min_key;
3271         struct btrfs_root *log = root->log_root;
3272         struct extent_buffer *src;
3273         int err = 0;
3274         int ret;
3275         int i;
3276         int nritems;
3277         u64 first_offset = min_offset;
3278         u64 last_offset = (u64)-1;
3279         u64 ino = btrfs_ino(inode);
3280
3281         log = root->log_root;
3282
3283         min_key.objectid = ino;
3284         min_key.type = key_type;
3285         min_key.offset = min_offset;
3286
3287         ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3288
3289         /*
3290          * we didn't find anything from this transaction, see if there
3291          * is anything at all
3292          */
3293         if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
3294                 min_key.objectid = ino;
3295                 min_key.type = key_type;
3296                 min_key.offset = (u64)-1;
3297                 btrfs_release_path(path);
3298                 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3299                 if (ret < 0) {
3300                         btrfs_release_path(path);
3301                         return ret;
3302                 }
3303                 ret = btrfs_previous_item(root, path, ino, key_type);
3304
3305                 /* if ret == 0 there are items for this type,
3306                  * create a range to tell us the last key of this type.
3307                  * otherwise, there are no items in this directory after
3308                  * *min_offset, and we create a range to indicate that.
3309                  */
3310                 if (ret == 0) {
3311                         struct btrfs_key tmp;
3312                         btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3313                                               path->slots[0]);
3314                         if (key_type == tmp.type)
3315                                 first_offset = max(min_offset, tmp.offset) + 1;
3316                 }
3317                 goto done;
3318         }
3319
3320         /* go backward to find any previous key */
3321         ret = btrfs_previous_item(root, path, ino, key_type);
3322         if (ret == 0) {
3323                 struct btrfs_key tmp;
3324                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3325                 if (key_type == tmp.type) {
3326                         first_offset = tmp.offset;
3327                         ret = overwrite_item(trans, log, dst_path,
3328                                              path->nodes[0], path->slots[0],
3329                                              &tmp);
3330                         if (ret) {
3331                                 err = ret;
3332                                 goto done;
3333                         }
3334                 }
3335         }
3336         btrfs_release_path(path);
3337
3338         /* find the first key from this transaction again */
3339         ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3340         if (WARN_ON(ret != 0))
3341                 goto done;
3342
3343         /*
3344          * we have a block from this transaction, log every item in it
3345          * from our directory
3346          */
3347         while (1) {
3348                 struct btrfs_key tmp;
3349                 src = path->nodes[0];
3350                 nritems = btrfs_header_nritems(src);
3351                 for (i = path->slots[0]; i < nritems; i++) {
3352                         struct btrfs_dir_item *di;
3353
3354                         btrfs_item_key_to_cpu(src, &min_key, i);
3355
3356                         if (min_key.objectid != ino || min_key.type != key_type)
3357                                 goto done;
3358                         ret = overwrite_item(trans, log, dst_path, src, i,
3359                                              &min_key);
3360                         if (ret) {
3361                                 err = ret;
3362                                 goto done;
3363                         }
3364
3365                         /*
3366                          * We must make sure that when we log a directory entry,
3367                          * the corresponding inode, after log replay, has a
3368                          * matching link count. For example:
3369                          *
3370                          * touch foo
3371                          * mkdir mydir
3372                          * sync
3373                          * ln foo mydir/bar
3374                          * xfs_io -c "fsync" mydir
3375                          * <crash>
3376                          * <mount fs and log replay>
3377                          *
3378                          * Would result in a fsync log that when replayed, our
3379                          * file inode would have a link count of 1, but we get
3380                          * two directory entries pointing to the same inode.
3381                          * After removing one of the names, it would not be
3382                          * possible to remove the other name, which resulted
3383                          * always in stale file handle errors, and would not
3384                          * be possible to rmdir the parent directory, since
3385                          * its i_size could never decrement to the value
3386                          * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3387                          */
3388                         di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3389                         btrfs_dir_item_key_to_cpu(src, di, &tmp);
3390                         if (ctx &&
3391                             (btrfs_dir_transid(src, di) == trans->transid ||
3392                              btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3393                             tmp.type != BTRFS_ROOT_ITEM_KEY)
3394                                 ctx->log_new_dentries = true;
3395                 }
3396                 path->slots[0] = nritems;
3397
3398                 /*
3399                  * look ahead to the next item and see if it is also
3400                  * from this directory and from this transaction
3401                  */
3402                 ret = btrfs_next_leaf(root, path);
3403                 if (ret == 1) {
3404                         last_offset = (u64)-1;
3405                         goto done;
3406                 }
3407                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3408                 if (tmp.objectid != ino || tmp.type != key_type) {
3409                         last_offset = (u64)-1;
3410                         goto done;
3411                 }
3412                 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3413                         ret = overwrite_item(trans, log, dst_path,
3414                                              path->nodes[0], path->slots[0],
3415                                              &tmp);
3416                         if (ret)
3417                                 err = ret;
3418                         else
3419                                 last_offset = tmp.offset;
3420                         goto done;
3421                 }
3422         }
3423 done:
3424         btrfs_release_path(path);
3425         btrfs_release_path(dst_path);
3426
3427         if (err == 0) {
3428                 *last_offset_ret = last_offset;
3429                 /*
3430                  * insert the log range keys to indicate where the log
3431                  * is valid
3432                  */
3433                 ret = insert_dir_log_key(trans, log, path, key_type,
3434                                          ino, first_offset, last_offset);
3435                 if (ret)
3436                         err = ret;
3437         }
3438         return err;
3439 }
3440
3441 /*
3442  * logging directories is very similar to logging inodes, We find all the items
3443  * from the current transaction and write them to the log.
3444  *
3445  * The recovery code scans the directory in the subvolume, and if it finds a
3446  * key in the range logged that is not present in the log tree, then it means
3447  * that dir entry was unlinked during the transaction.
3448  *
3449  * In order for that scan to work, we must include one key smaller than
3450  * the smallest logged by this transaction and one key larger than the largest
3451  * key logged by this transaction.
3452  */
3453 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3454                           struct btrfs_root *root, struct btrfs_inode *inode,
3455                           struct btrfs_path *path,
3456                           struct btrfs_path *dst_path,
3457                           struct btrfs_log_ctx *ctx)
3458 {
3459         u64 min_key;
3460         u64 max_key;
3461         int ret;
3462         int key_type = BTRFS_DIR_ITEM_KEY;
3463
3464 again:
3465         min_key = 0;
3466         max_key = 0;
3467         while (1) {
3468                 ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3469                                 ctx, min_key, &max_key);
3470                 if (ret)
3471                         return ret;
3472                 if (max_key == (u64)-1)
3473                         break;
3474                 min_key = max_key + 1;
3475         }
3476
3477         if (key_type == BTRFS_DIR_ITEM_KEY) {
3478                 key_type = BTRFS_DIR_INDEX_KEY;
3479                 goto again;
3480         }
3481         return 0;
3482 }
3483
3484 /*
3485  * a helper function to drop items from the log before we relog an
3486  * inode.  max_key_type indicates the highest item type to remove.
3487  * This cannot be run for file data extents because it does not
3488  * free the extents they point to.
3489  */
3490 static int drop_objectid_items(struct btrfs_trans_handle *trans,
3491                                   struct btrfs_root *log,
3492                                   struct btrfs_path *path,
3493                                   u64 objectid, int max_key_type)
3494 {
3495         int ret;
3496         struct btrfs_key key;
3497         struct btrfs_key found_key;
3498         int start_slot;
3499
3500         key.objectid = objectid;
3501         key.type = max_key_type;
3502         key.offset = (u64)-1;
3503
3504         while (1) {
3505                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3506                 BUG_ON(ret == 0); /* Logic error */
3507                 if (ret < 0)
3508                         break;
3509
3510                 if (path->slots[0] == 0)
3511                         break;
3512
3513                 path->slots[0]--;
3514                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3515                                       path->slots[0]);
3516
3517                 if (found_key.objectid != objectid)
3518                         break;
3519
3520                 found_key.offset = 0;
3521                 found_key.type = 0;
3522                 ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3523                                        &start_slot);
3524
3525                 ret = btrfs_del_items(trans, log, path, start_slot,
3526                                       path->slots[0] - start_slot + 1);
3527                 /*
3528                  * If start slot isn't 0 then we don't need to re-search, we've
3529                  * found the last guy with the objectid in this tree.
3530                  */
3531                 if (ret || start_slot != 0)
3532                         break;
3533                 btrfs_release_path(path);
3534         }
3535         btrfs_release_path(path);
3536         if (ret > 0)
3537                 ret = 0;
3538         return ret;
3539 }
3540
3541 static void fill_inode_item(struct btrfs_trans_handle *trans,
3542                             struct extent_buffer *leaf,
3543                             struct btrfs_inode_item *item,
3544                             struct inode *inode, int log_inode_only,
3545                             u64 logged_isize)
3546 {
3547         struct btrfs_map_token token;
3548
3549         btrfs_init_map_token(&token);
3550
3551         if (log_inode_only) {
3552                 /* set the generation to zero so the recover code
3553                  * can tell the difference between an logging
3554                  * just to say 'this inode exists' and a logging
3555                  * to say 'update this inode with these values'
3556                  */
3557                 btrfs_set_token_inode_generation(leaf, item, 0, &token);
3558                 btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3559         } else {
3560                 btrfs_set_token_inode_generation(leaf, item,
3561                                                  BTRFS_I(inode)->generation,
3562                                                  &token);
3563                 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3564         }
3565
3566         btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3567         btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3568         btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3569         btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3570
3571         btrfs_set_token_timespec_sec(leaf, &item->atime,
3572                                      inode->i_atime.tv_sec, &token);
3573         btrfs_set_token_timespec_nsec(leaf, &item->atime,
3574                                       inode->i_atime.tv_nsec, &token);
3575
3576         btrfs_set_token_timespec_sec(leaf, &item->mtime,
3577                                      inode->i_mtime.tv_sec, &token);
3578         btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3579                                       inode->i_mtime.tv_nsec, &token);
3580
3581         btrfs_set_token_timespec_sec(leaf, &item->ctime,
3582                                      inode->i_ctime.tv_sec, &token);
3583         btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3584                                       inode->i_ctime.tv_nsec, &token);
3585
3586         btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3587                                      &token);
3588
3589         btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3590         btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3591         btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3592         btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3593         btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3594 }
3595
3596 static int log_inode_item(struct btrfs_trans_handle *trans,
3597                           struct btrfs_root *log, struct btrfs_path *path,
3598                           struct inode *inode)
3599 {
3600         struct btrfs_inode_item *inode_item;
3601         int ret;
3602
3603         ret = btrfs_insert_empty_item(trans, log, path,
3604                                       &BTRFS_I(inode)->location,
3605                                       sizeof(*inode_item));
3606         if (ret && ret != -EEXIST)
3607                 return ret;
3608         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3609                                     struct btrfs_inode_item);
3610         fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
3611         btrfs_release_path(path);
3612         return 0;
3613 }
3614
3615 static noinline int copy_items(struct btrfs_trans_handle *trans,
3616                                struct btrfs_inode *inode,
3617                                struct btrfs_path *dst_path,
3618                                struct btrfs_path *src_path, u64 *last_extent,
3619                                int start_slot, int nr, int inode_only,
3620                                u64 logged_isize)
3621 {
3622         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
3623         unsigned long src_offset;
3624         unsigned long dst_offset;
3625         struct btrfs_root *log = inode->root->log_root;
3626         struct btrfs_file_extent_item *extent;
3627         struct btrfs_inode_item *inode_item;
3628         struct extent_buffer *src = src_path->nodes[0];
3629         struct btrfs_key first_key, last_key, key;
3630         int ret;
3631         struct btrfs_key *ins_keys;
3632         u32 *ins_sizes;
3633         char *ins_data;
3634         int i;
3635         struct list_head ordered_sums;
3636         int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
3637         bool has_extents = false;
3638         bool need_find_last_extent = true;
3639         bool done = false;
3640
3641         INIT_LIST_HEAD(&ordered_sums);
3642
3643         ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
3644                            nr * sizeof(u32), GFP_NOFS);
3645         if (!ins_data)
3646                 return -ENOMEM;
3647
3648         first_key.objectid = (u64)-1;
3649
3650         ins_sizes = (u32 *)ins_data;
3651         ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
3652
3653         for (i = 0; i < nr; i++) {
3654                 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
3655                 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
3656         }
3657         ret = btrfs_insert_empty_items(trans, log, dst_path,
3658                                        ins_keys, ins_sizes, nr);
3659         if (ret) {
3660                 kfree(ins_data);
3661                 return ret;
3662         }
3663
3664         for (i = 0; i < nr; i++, dst_path->slots[0]++) {
3665                 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
3666                                                    dst_path->slots[0]);
3667
3668                 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
3669
3670                 if ((i == (nr - 1)))
3671                         last_key = ins_keys[i];
3672
3673                 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
3674                         inode_item = btrfs_item_ptr(dst_path->nodes[0],
3675                                                     dst_path->slots[0],
3676                                                     struct btrfs_inode_item);
3677                         fill_inode_item(trans, dst_path->nodes[0], inode_item,
3678                                         &inode->vfs_inode, inode_only == LOG_INODE_EXISTS,
3679                                         logged_isize);
3680                 } else {
3681                         copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3682                                            src_offset, ins_sizes[i]);
3683                 }
3684
3685                 /*
3686                  * We set need_find_last_extent here in case we know we were
3687                  * processing other items and then walk into the first extent in
3688                  * the inode.  If we don't hit an extent then nothing changes,
3689                  * we'll do the last search the next time around.
3690                  */
3691                 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
3692                         has_extents = true;
3693                         if (first_key.objectid == (u64)-1)
3694                                 first_key = ins_keys[i];
3695                 } else {
3696                         need_find_last_extent = false;
3697                 }
3698
3699                 /* take a reference on file data extents so that truncates
3700                  * or deletes of this inode don't have to relog the inode
3701                  * again
3702                  */
3703                 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
3704                     !skip_csum) {
3705                         int found_type;
3706                         extent = btrfs_item_ptr(src, start_slot + i,
3707                                                 struct btrfs_file_extent_item);
3708
3709                         if (btrfs_file_extent_generation(src, extent) < trans->transid)
3710                                 continue;
3711
3712                         found_type = btrfs_file_extent_type(src, extent);
3713                         if (found_type == BTRFS_FILE_EXTENT_REG) {
3714                                 u64 ds, dl, cs, cl;
3715                                 ds = btrfs_file_extent_disk_bytenr(src,
3716                                                                 extent);
3717                                 /* ds == 0 is a hole */
3718                                 if (ds == 0)
3719                                         continue;
3720
3721                                 dl = btrfs_file_extent_disk_num_bytes(src,
3722                                                                 extent);
3723                                 cs = btrfs_file_extent_offset(src, extent);
3724                                 cl = btrfs_file_extent_num_bytes(src,
3725                                                                 extent);
3726                                 if (btrfs_file_extent_compression(src,
3727                                                                   extent)) {
3728                                         cs = 0;
3729                                         cl = dl;
3730                                 }
3731
3732                                 ret = btrfs_lookup_csums_range(
3733                                                 fs_info->csum_root,
3734                                                 ds + cs, ds + cs + cl - 1,
3735                                                 &ordered_sums, 0);
3736                                 if (ret) {
3737                                         btrfs_release_path(dst_path);
3738                                         kfree(ins_data);
3739                                         return ret;
3740                                 }
3741                         }
3742                 }
3743         }
3744
3745         btrfs_mark_buffer_dirty(dst_path->nodes[0]);
3746         btrfs_release_path(dst_path);
3747         kfree(ins_data);
3748
3749         /*
3750          * we have to do this after the loop above to avoid changing the
3751          * log tree while trying to change the log tree.
3752          */
3753         ret = 0;
3754         while (!list_empty(&ordered_sums)) {
3755                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3756                                                    struct btrfs_ordered_sum,
3757                                                    list);
3758                 if (!ret)
3759                         ret = btrfs_csum_file_blocks(trans, log, sums);
3760                 list_del(&sums->list);
3761                 kfree(sums);
3762         }
3763
3764         if (!has_extents)
3765                 return ret;
3766
3767         if (need_find_last_extent && *last_extent == first_key.offset) {
3768                 /*
3769                  * We don't have any leafs between our current one and the one
3770                  * we processed before that can have file extent items for our
3771                  * inode (and have a generation number smaller than our current
3772                  * transaction id).
3773                  */
3774                 need_find_last_extent = false;
3775         }
3776
3777         /*
3778          * Because we use btrfs_search_forward we could skip leaves that were
3779          * not modified and then assume *last_extent is valid when it really
3780          * isn't.  So back up to the previous leaf and read the end of the last
3781          * extent before we go and fill in holes.
3782          */
3783         if (need_find_last_extent) {
3784                 u64 len;
3785
3786                 ret = btrfs_prev_leaf(inode->root, src_path);
3787                 if (ret < 0)
3788                         return ret;
3789                 if (ret)
3790                         goto fill_holes;
3791                 if (src_path->slots[0])
3792                         src_path->slots[0]--;
3793                 src = src_path->nodes[0];
3794                 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
3795                 if (key.objectid != btrfs_ino(inode) ||
3796                     key.type != BTRFS_EXTENT_DATA_KEY)
3797                         goto fill_holes;
3798                 extent = btrfs_item_ptr(src, src_path->slots[0],
3799                                         struct btrfs_file_extent_item);
3800                 if (btrfs_file_extent_type(src, extent) ==
3801                     BTRFS_FILE_EXTENT_INLINE) {
3802                         len = btrfs_file_extent_inline_len(src,
3803                                                            src_path->slots[0],
3804                                                            extent);
3805                         *last_extent = ALIGN(key.offset + len,
3806                                              fs_info->sectorsize);
3807                 } else {
3808                         len = btrfs_file_extent_num_bytes(src, extent);
3809                         *last_extent = key.offset + len;
3810                 }
3811         }
3812 fill_holes:
3813         /* So we did prev_leaf, now we need to move to the next leaf, but a few
3814          * things could have happened
3815          *
3816          * 1) A merge could have happened, so we could currently be on a leaf
3817          * that holds what we were copying in the first place.
3818          * 2) A split could have happened, and now not all of the items we want
3819          * are on the same leaf.
3820          *
3821          * So we need to adjust how we search for holes, we need to drop the
3822          * path and re-search for the first extent key we found, and then walk
3823          * forward until we hit the last one we copied.
3824          */
3825         if (need_find_last_extent) {
3826                 /* btrfs_prev_leaf could return 1 without releasing the path */
3827                 btrfs_release_path(src_path);
3828                 ret = btrfs_search_slot(NULL, inode->root, &first_key, src_path, 0, 0);
3829                 if (ret < 0)
3830                         return ret;
3831                 ASSERT(ret == 0);
3832                 src = src_path->nodes[0];
3833                 i = src_path->slots[0];
3834         } else {
3835                 i = start_slot;
3836         }
3837
3838         /*
3839          * Ok so here we need to go through and fill in any holes we may have
3840          * to make sure that holes are punched for those areas in case they had
3841          * extents previously.
3842          */
3843         while (!done) {
3844                 u64 offset, len;
3845                 u64 extent_end;
3846
3847                 if (i >= btrfs_header_nritems(src_path->nodes[0])) {
3848                         ret = btrfs_next_leaf(inode->root, src_path);
3849                         if (ret < 0)
3850                                 return ret;
3851                         ASSERT(ret == 0);
3852                         src = src_path->nodes[0];
3853                         i = 0;
3854                 }
3855
3856                 btrfs_item_key_to_cpu(src, &key, i);
3857                 if (!btrfs_comp_cpu_keys(&key, &last_key))
3858                         done = true;
3859                 if (key.objectid != btrfs_ino(inode) ||
3860                     key.type != BTRFS_EXTENT_DATA_KEY) {
3861                         i++;
3862                         continue;
3863                 }
3864                 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
3865                 if (btrfs_file_extent_type(src, extent) ==
3866                     BTRFS_FILE_EXTENT_INLINE) {
3867                         len = btrfs_file_extent_inline_len(src, i, extent);
3868                         extent_end = ALIGN(key.offset + len,
3869                                            fs_info->sectorsize);
3870                 } else {
3871                         len = btrfs_file_extent_num_bytes(src, extent);
3872                         extent_end = key.offset + len;
3873                 }
3874                 i++;
3875
3876                 if (*last_extent == key.offset) {
3877                         *last_extent = extent_end;
3878                         continue;
3879                 }
3880                 offset = *last_extent;
3881                 len = key.offset - *last_extent;
3882                 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
3883                                                offset, 0, 0, len, 0, len, 0, 0, 0);
3884                 if (ret)
3885                         break;
3886                 *last_extent = extent_end;
3887         }
3888         /*
3889          * Need to let the callers know we dropped the path so they should
3890          * re-search.
3891          */
3892         if (!ret && need_find_last_extent)
3893                 ret = 1;
3894         return ret;
3895 }
3896
3897 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3898 {
3899         struct extent_map *em1, *em2;
3900
3901         em1 = list_entry(a, struct extent_map, list);
3902         em2 = list_entry(b, struct extent_map, list);
3903
3904         if (em1->start < em2->start)
3905                 return -1;
3906         else if (em1->start > em2->start)
3907                 return 1;
3908         return 0;
3909 }
3910
3911 static int wait_ordered_extents(struct btrfs_trans_handle *trans,
3912                                 struct inode *inode,
3913                                 struct btrfs_root *root,
3914                                 const struct extent_map *em,
3915                                 const struct list_head *logged_list,
3916                                 bool *ordered_io_error)
3917 {
3918         struct btrfs_fs_info *fs_info = root->fs_info;
3919         struct btrfs_ordered_extent *ordered;
3920         struct btrfs_root *log = root->log_root;
3921         u64 mod_start = em->mod_start;
3922         u64 mod_len = em->mod_len;
3923         const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3924         u64 csum_offset;
3925         u64 csum_len;
3926         LIST_HEAD(ordered_sums);
3927         int ret = 0;
3928
3929         *ordered_io_error = false;
3930
3931         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
3932             em->block_start == EXTENT_MAP_HOLE)
3933                 return 0;
3934
3935         /*
3936          * Wait far any ordered extent that covers our extent map. If it
3937          * finishes without an error, first check and see if our csums are on
3938          * our outstanding ordered extents.
3939          */
3940         list_for_each_entry(ordered, logged_list, log_list) {
3941                 struct btrfs_ordered_sum *sum;
3942
3943                 if (!mod_len)
3944                         break;
3945
3946                 if (ordered->file_offset + ordered->len <= mod_start ||
3947                     mod_start + mod_len <= ordered->file_offset)
3948                         continue;
3949
3950                 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
3951                     !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3952                     !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
3953                         const u64 start = ordered->file_offset;
3954                         const u64 end = ordered->file_offset + ordered->len - 1;
3955
3956                         WARN_ON(ordered->inode != inode);
3957                         filemap_fdatawrite_range(inode->i_mapping, start, end);
3958                 }
3959
3960                 wait_event(ordered->wait,
3961                            (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
3962                             test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
3963
3964                 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3965                         /*
3966                          * Clear the AS_EIO/AS_ENOSPC flags from the inode's
3967                          * i_mapping flags, so that the next fsync won't get
3968                          * an outdated io error too.
3969                          */
3970                         filemap_check_errors(inode->i_mapping);
3971                         *ordered_io_error = true;
3972                         break;
3973                 }
3974                 /*
3975                  * We are going to copy all the csums on this ordered extent, so
3976                  * go ahead and adjust mod_start and mod_len in case this
3977                  * ordered extent has already been logged.
3978                  */
3979                 if (ordered->file_offset > mod_start) {
3980                         if (ordered->file_offset + ordered->len >=
3981                             mod_start + mod_len)
3982                                 mod_len = ordered->file_offset - mod_start;
3983                         /*
3984                          * If we have this case
3985                          *
3986                          * |--------- logged extent ---------|
3987                          *       |----- ordered extent ----|
3988                          *
3989                          * Just don't mess with mod_start and mod_len, we'll
3990                          * just end up logging more csums than we need and it
3991                          * will be ok.
3992                          */
3993                 } else {
3994                         if (ordered->file_offset + ordered->len <
3995                             mod_start + mod_len) {
3996                                 mod_len = (mod_start + mod_len) -
3997                                         (ordered->file_offset + ordered->len);
3998                                 mod_start = ordered->file_offset +
3999                                         ordered->len;
4000                         } else {
4001                                 mod_len = 0;
4002                         }
4003                 }
4004
4005                 if (skip_csum)
4006                         continue;
4007
4008                 /*
4009                  * To keep us from looping for the above case of an ordered
4010                  * extent that falls inside of the logged extent.
4011                  */
4012                 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
4013                                      &ordered->flags))
4014                         continue;
4015
4016                 list_for_each_entry(sum, &ordered->list, list) {
4017                         ret = btrfs_csum_file_blocks(trans, log, sum);
4018                         if (ret)
4019                                 break;
4020                 }
4021         }
4022
4023         if (*ordered_io_error || !mod_len || ret || skip_csum)
4024                 return ret;
4025
4026         if (em->compress_type) {
4027                 csum_offset = 0;
4028                 csum_len = max(em->block_len, em->orig_block_len);
4029         } else {
4030                 csum_offset = mod_start - em->start;
4031                 csum_len = mod_len;
4032         }
4033
4034         /* block start is already adjusted for the file extent offset. */
4035         ret = btrfs_lookup_csums_range(fs_info->csum_root,
4036                                        em->block_start + csum_offset,
4037                                        em->block_start + csum_offset +
4038                                        csum_len - 1, &ordered_sums, 0);
4039         if (ret)
4040                 return ret;
4041
4042         while (!list_empty(&ordered_sums)) {
4043                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4044                                                    struct btrfs_ordered_sum,
4045                                                    list);
4046                 if (!ret)
4047                         ret = btrfs_csum_file_blocks(trans, log, sums);
4048                 list_del(&sums->list);
4049                 kfree(sums);
4050         }
4051
4052         return ret;
4053 }
4054
4055 static int log_one_extent(struct btrfs_trans_handle *trans,
4056                           struct btrfs_inode *inode, struct btrfs_root *root,
4057                           const struct extent_map *em,
4058                           struct btrfs_path *path,
4059                           const struct list_head *logged_list,
4060                           struct btrfs_log_ctx *ctx)
4061 {
4062         struct btrfs_root *log = root->log_root;
4063         struct btrfs_file_extent_item *fi;
4064         struct extent_buffer *leaf;
4065         struct btrfs_map_token token;
4066         struct btrfs_key key;
4067         u64 extent_offset = em->start - em->orig_start;
4068         u64 block_len;
4069         int ret;
4070         int extent_inserted = 0;
4071         bool ordered_io_err = false;
4072
4073         ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em, logged_list,
4074                                    &ordered_io_err);
4075         if (ret)
4076                 return ret;
4077
4078         if (ordered_io_err) {
4079                 ctx->io_err = -EIO;
4080                 return 0;
4081         }
4082
4083         btrfs_init_map_token(&token);
4084
4085         ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
4086                                    em->start + em->len, NULL, 0, 1,
4087                                    sizeof(*fi), &extent_inserted);
4088         if (ret)
4089                 return ret;
4090
4091         if (!extent_inserted) {
4092                 key.objectid = btrfs_ino(inode);
4093                 key.type = BTRFS_EXTENT_DATA_KEY;
4094                 key.offset = em->start;
4095
4096                 ret = btrfs_insert_empty_item(trans, log, path, &key,
4097                                               sizeof(*fi));
4098                 if (ret)
4099                         return ret;
4100         }
4101         leaf = path->nodes[0];
4102         fi = btrfs_item_ptr(leaf, path->slots[0],
4103                             struct btrfs_file_extent_item);
4104
4105         btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
4106                                                &token);
4107         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4108                 btrfs_set_token_file_extent_type(leaf, fi,
4109                                                  BTRFS_FILE_EXTENT_PREALLOC,
4110                                                  &token);
4111         else
4112                 btrfs_set_token_file_extent_type(leaf, fi,
4113                                                  BTRFS_FILE_EXTENT_REG,
4114                                                  &token);
4115
4116         block_len = max(em->block_len, em->orig_block_len);
4117         if (em->compress_type != BTRFS_COMPRESS_NONE) {
4118                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
4119                                                         em->block_start,
4120                                                         &token);
4121                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
4122                                                            &token);
4123         } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4124                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
4125                                                         em->block_start -
4126                                                         extent_offset, &token);
4127                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
4128                                                            &token);
4129         } else {
4130                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
4131                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
4132                                                            &token);
4133         }
4134
4135         btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
4136         btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
4137         btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
4138         btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
4139                                                 &token);
4140         btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
4141         btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
4142         btrfs_mark_buffer_dirty(leaf);
4143
4144         btrfs_release_path(path);
4145
4146         return ret;
4147 }
4148
4149 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4150                                      struct btrfs_root *root,
4151                                      struct btrfs_inode *inode,
4152                                      struct btrfs_path *path,
4153                                      struct list_head *logged_list,
4154                                      struct btrfs_log_ctx *ctx,
4155                                      const u64 start,
4156                                      const u64 end)
4157 {
4158         struct extent_map *em, *n;
4159         struct list_head extents;
4160         struct extent_map_tree *tree = &inode->extent_tree;
4161         u64 test_gen;
4162         int ret = 0;
4163         int num = 0;
4164
4165         INIT_LIST_HEAD(&extents);
4166
4167         down_write(&inode->dio_sem);
4168         write_lock(&tree->lock);
4169         test_gen = root->fs_info->last_trans_committed;
4170
4171         list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4172                 list_del_init(&em->list);
4173
4174                 /*
4175                  * Just an arbitrary number, this can be really CPU intensive
4176                  * once we start getting a lot of extents, and really once we
4177                  * have a bunch of extents we just want to commit since it will
4178                  * be faster.
4179                  */
4180                 if (++num > 32768) {
4181                         list_del_init(&tree->modified_extents);
4182                         ret = -EFBIG;
4183                         goto process;
4184                 }
4185
4186                 if (em->generation <= test_gen)
4187                         continue;
4188                 /* Need a ref to keep it from getting evicted from cache */
4189                 atomic_inc(&em->refs);
4190                 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4191                 list_add_tail(&em->list, &extents);
4192                 num++;
4193         }
4194
4195         list_sort(NULL, &extents, extent_cmp);
4196         btrfs_get_logged_extents(inode, logged_list, start, end);
4197         /*
4198          * Some ordered extents started by fsync might have completed
4199          * before we could collect them into the list logged_list, which
4200          * means they're gone, not in our logged_list nor in the inode's
4201          * ordered tree. We want the application/user space to know an
4202          * error happened while attempting to persist file data so that
4203          * it can take proper action. If such error happened, we leave
4204          * without writing to the log tree and the fsync must report the
4205          * file data write error and not commit the current transaction.
4206          */
4207         ret = filemap_check_errors(inode->vfs_inode.i_mapping);
4208         if (ret)
4209                 ctx->io_err = ret;
4210 process:
4211         while (!list_empty(&extents)) {
4212                 em = list_entry(extents.next, struct extent_map, list);
4213
4214                 list_del_init(&em->list);
4215
4216                 /*
4217                  * If we had an error we just need to delete everybody from our
4218                  * private list.
4219                  */
4220                 if (ret) {
4221                         clear_em_logging(tree, em);
4222                         free_extent_map(em);
4223                         continue;
4224                 }
4225
4226                 write_unlock(&tree->lock);
4227
4228                 ret = log_one_extent(trans, inode, root, em, path, logged_list,
4229                                      ctx);
4230                 write_lock(&tree->lock);
4231                 clear_em_logging(tree, em);
4232                 free_extent_map(em);
4233         }
4234         WARN_ON(!list_empty(&extents));
4235         write_unlock(&tree->lock);
4236         up_write(&inode->dio_sem);
4237
4238         btrfs_release_path(path);
4239         return ret;
4240 }
4241
4242 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4243                              struct btrfs_path *path, u64 *size_ret)
4244 {
4245         struct btrfs_key key;
4246         int ret;
4247
4248         key.objectid = btrfs_ino(inode);
4249         key.type = BTRFS_INODE_ITEM_KEY;
4250         key.offset = 0;
4251
4252         ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4253         if (ret < 0) {
4254                 return ret;
4255         } else if (ret > 0) {
4256                 *size_ret = 0;
4257         } else {
4258                 struct btrfs_inode_item *item;
4259
4260                 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4261                                       struct btrfs_inode_item);
4262                 *size_ret = btrfs_inode_size(path->nodes[0], item);
4263         }
4264
4265         btrfs_release_path(path);
4266         return 0;
4267 }
4268
4269 /*
4270  * At the moment we always log all xattrs. This is to figure out at log replay
4271  * time which xattrs must have their deletion replayed. If a xattr is missing
4272  * in the log tree and exists in the fs/subvol tree, we delete it. This is
4273  * because if a xattr is deleted, the inode is fsynced and a power failure
4274  * happens, causing the log to be replayed the next time the fs is mounted,
4275  * we want the xattr to not exist anymore (same behaviour as other filesystems
4276  * with a journal, ext3/4, xfs, f2fs, etc).
4277  */
4278 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4279                                 struct btrfs_root *root,
4280                                 struct btrfs_inode *inode,
4281                                 struct btrfs_path *path,
4282                                 struct btrfs_path *dst_path)
4283 {
4284         int ret;
4285         struct btrfs_key key;
4286         const u64 ino = btrfs_ino(inode);
4287         int ins_nr = 0;
4288         int start_slot = 0;
4289
4290         key.objectid = ino;
4291         key.type = BTRFS_XATTR_ITEM_KEY;
4292         key.offset = 0;
4293
4294         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4295         if (ret < 0)
4296                 return ret;
4297
4298         while (true) {
4299                 int slot = path->slots[0];
4300                 struct extent_buffer *leaf = path->nodes[0];
4301                 int nritems = btrfs_header_nritems(leaf);
4302
4303                 if (slot >= nritems) {
4304                         if (ins_nr > 0) {
4305                                 u64 last_extent = 0;
4306
4307                                 ret = copy_items(trans, inode, dst_path, path,
4308                                                  &last_extent, start_slot,
4309                                                  ins_nr, 1, 0);
4310                                 /* can't be 1, extent items aren't processed */
4311                                 ASSERT(ret <= 0);
4312                                 if (ret < 0)
4313                                         return ret;
4314                                 ins_nr = 0;
4315                         }
4316                         ret = btrfs_next_leaf(root, path);
4317                         if (ret < 0)
4318                                 return ret;
4319                         else if (ret > 0)
4320                                 break;
4321                         continue;
4322                 }
4323
4324                 btrfs_item_key_to_cpu(leaf, &key, slot);
4325                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4326                         break;
4327
4328                 if (ins_nr == 0)
4329                         start_slot = slot;
4330                 ins_nr++;
4331                 path->slots[0]++;
4332                 cond_resched();
4333         }
4334         if (ins_nr > 0) {
4335                 u64 last_extent = 0;
4336
4337                 ret = copy_items(trans, inode, dst_path, path,
4338                                  &last_extent, start_slot,
4339                                  ins_nr, 1, 0);
4340                 /* can't be 1, extent items aren't processed */
4341                 ASSERT(ret <= 0);
4342                 if (ret < 0)
4343                         return ret;
4344         }
4345
4346         return 0;
4347 }
4348
4349 /*
4350  * If the no holes feature is enabled we need to make sure any hole between the
4351  * last extent and the i_size of our inode is explicitly marked in the log. This
4352  * is to make sure that doing something like:
4353  *
4354  *      1) create file with 128Kb of data
4355  *      2) truncate file to 64Kb
4356  *      3) truncate file to 256Kb
4357  *      4) fsync file
4358  *      5) <crash/power failure>
4359  *      6) mount fs and trigger log replay
4360  *
4361  * Will give us a file with a size of 256Kb, the first 64Kb of data match what
4362  * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
4363  * file correspond to a hole. The presence of explicit holes in a log tree is
4364  * what guarantees that log replay will remove/adjust file extent items in the
4365  * fs/subvol tree.
4366  *
4367  * Here we do not need to care about holes between extents, that is already done
4368  * by copy_items(). We also only need to do this in the full sync path, where we
4369  * lookup for extents from the fs/subvol tree only. In the fast path case, we
4370  * lookup the list of modified extent maps and if any represents a hole, we
4371  * insert a corresponding extent representing a hole in the log tree.
4372  */
4373 static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
4374                                    struct btrfs_root *root,
4375                                    struct btrfs_inode *inode,
4376                                    struct btrfs_path *path)
4377 {
4378         struct btrfs_fs_info *fs_info = root->fs_info;
4379         int ret;
4380         struct btrfs_key key;
4381         u64 hole_start;
4382         u64 hole_size;
4383         struct extent_buffer *leaf;
4384         struct btrfs_root *log = root->log_root;
4385         const u64 ino = btrfs_ino(inode);
4386         const u64 i_size = i_size_read(&inode->vfs_inode);
4387
4388         if (!btrfs_fs_incompat(fs_info, NO_HOLES))
4389                 return 0;
4390
4391         key.objectid = ino;
4392         key.type = BTRFS_EXTENT_DATA_KEY;
4393         key.offset = (u64)-1;
4394
4395         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4396         ASSERT(ret != 0);
4397         if (ret < 0)
4398                 return ret;
4399
4400         ASSERT(path->slots[0] > 0);
4401         path->slots[0]--;
4402         leaf = path->nodes[0];
4403         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4404
4405         if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
4406                 /* inode does not have any extents */
4407                 hole_start = 0;
4408                 hole_size = i_size;
4409         } else {
4410                 struct btrfs_file_extent_item *extent;
4411                 u64 len;
4412
4413                 /*
4414                  * If there's an extent beyond i_size, an explicit hole was
4415                  * already inserted by copy_items().
4416                  */
4417                 if (key.offset >= i_size)
4418                         return 0;
4419
4420                 extent = btrfs_item_ptr(leaf, path->slots[0],
4421                                         struct btrfs_file_extent_item);
4422
4423                 if (btrfs_file_extent_type(leaf, extent) ==
4424                     BTRFS_FILE_EXTENT_INLINE) {
4425                         len = btrfs_file_extent_inline_len(leaf,
4426                                                            path->slots[0],
4427                                                            extent);
4428                         ASSERT(len == i_size);
4429                         return 0;
4430                 }
4431
4432                 len = btrfs_file_extent_num_bytes(leaf, extent);
4433                 /* Last extent goes beyond i_size, no need to log a hole. */
4434                 if (key.offset + len > i_size)
4435                         return 0;
4436                 hole_start = key.offset + len;
4437                 hole_size = i_size - hole_start;
4438         }
4439         btrfs_release_path(path);
4440
4441         /* Last extent ends at i_size. */
4442         if (hole_size == 0)
4443                 return 0;
4444
4445         hole_size = ALIGN(hole_size, fs_info->sectorsize);
4446         ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
4447                                        hole_size, 0, hole_size, 0, 0, 0);
4448         return ret;
4449 }
4450
4451 /*
4452  * When we are logging a new inode X, check if it doesn't have a reference that
4453  * matches the reference from some other inode Y created in a past transaction
4454  * and that was renamed in the current transaction. If we don't do this, then at
4455  * log replay time we can lose inode Y (and all its files if it's a directory):
4456  *
4457  * mkdir /mnt/x
4458  * echo "hello world" > /mnt/x/foobar
4459  * sync
4460  * mv /mnt/x /mnt/y
4461  * mkdir /mnt/x                 # or touch /mnt/x
4462  * xfs_io -c fsync /mnt/x
4463  * <power fail>
4464  * mount fs, trigger log replay
4465  *
4466  * After the log replay procedure, we would lose the first directory and all its
4467  * files (file foobar).
4468  * For the case where inode Y is not a directory we simply end up losing it:
4469  *
4470  * echo "123" > /mnt/foo
4471  * sync
4472  * mv /mnt/foo /mnt/bar
4473  * echo "abc" > /mnt/foo
4474  * xfs_io -c fsync /mnt/foo
4475  * <power fail>
4476  *
4477  * We also need this for cases where a snapshot entry is replaced by some other
4478  * entry (file or directory) otherwise we end up with an unreplayable log due to
4479  * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
4480  * if it were a regular entry:
4481  *
4482  * mkdir /mnt/x
4483  * btrfs subvolume snapshot /mnt /mnt/x/snap
4484  * btrfs subvolume delete /mnt/x/snap
4485  * rmdir /mnt/x
4486  * mkdir /mnt/x
4487  * fsync /mnt/x or fsync some new file inside it
4488  * <power fail>
4489  *
4490  * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
4491  * the same transaction.
4492  */
4493 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4494                                          const int slot,
4495                                          const struct btrfs_key *key,
4496                                          struct btrfs_inode *inode,
4497                                          u64 *other_ino)
4498 {
4499         int ret;
4500         struct btrfs_path *search_path;
4501         char *name = NULL;
4502         u32 name_len = 0;
4503         u32 item_size = btrfs_item_size_nr(eb, slot);
4504         u32 cur_offset = 0;
4505         unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
4506
4507         search_path = btrfs_alloc_path();
4508         if (!search_path)
4509                 return -ENOMEM;
4510         search_path->search_commit_root = 1;
4511         search_path->skip_locking = 1;
4512
4513         while (cur_offset < item_size) {
4514                 u64 parent;
4515                 u32 this_name_len;
4516                 u32 this_len;
4517                 unsigned long name_ptr;
4518                 struct btrfs_dir_item *di;
4519
4520                 if (key->type == BTRFS_INODE_REF_KEY) {
4521                         struct btrfs_inode_ref *iref;
4522
4523                         iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
4524                         parent = key->offset;
4525                         this_name_len = btrfs_inode_ref_name_len(eb, iref);
4526                         name_ptr = (unsigned long)(iref + 1);
4527                         this_len = sizeof(*iref) + this_name_len;
4528                 } else {
4529                         struct btrfs_inode_extref *extref;
4530
4531                         extref = (struct btrfs_inode_extref *)(ptr +
4532                                                                cur_offset);
4533                         parent = btrfs_inode_extref_parent(eb, extref);
4534                         this_name_len = btrfs_inode_extref_name_len(eb, extref);
4535                         name_ptr = (unsigned long)&extref->name;
4536                         this_len = sizeof(*extref) + this_name_len;
4537                 }
4538
4539                 if (this_name_len > name_len) {
4540                         char *new_name;
4541
4542                         new_name = krealloc(name, this_name_len, GFP_NOFS);
4543                         if (!new_name) {
4544                                 ret = -ENOMEM;
4545                                 goto out;
4546                         }
4547                         name_len = this_name_len;
4548                         name = new_name;
4549                 }
4550
4551                 read_extent_buffer(eb, name, name_ptr, this_name_len);
4552                 di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
4553                                 parent, name, this_name_len, 0);
4554                 if (di && !IS_ERR(di)) {
4555                         struct btrfs_key di_key;
4556
4557                         btrfs_dir_item_key_to_cpu(search_path->nodes[0],
4558                                                   di, &di_key);
4559                         if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4560                                 ret = 1;
4561                                 *other_ino = di_key.objectid;
4562                         } else {
4563                                 ret = -EAGAIN;
4564                         }
4565                         goto out;
4566                 } else if (IS_ERR(di)) {
4567                         ret = PTR_ERR(di);
4568                         goto out;
4569                 }
4570                 btrfs_release_path(search_path);
4571
4572                 cur_offset += this_len;
4573         }
4574         ret = 0;
4575 out:
4576         btrfs_free_path(search_path);
4577         kfree(name);
4578         return ret;
4579 }
4580
4581 /* log a single inode in the tree log.
4582  * At least one parent directory for this inode must exist in the tree
4583  * or be logged already.
4584  *
4585  * Any items from this inode changed by the current transaction are copied
4586  * to the log tree.  An extra reference is taken on any extents in this
4587  * file, allowing us to avoid a whole pile of corner cases around logging
4588  * blocks that have been removed from the tree.
4589  *
4590  * See LOG_INODE_ALL and related defines for a description of what inode_only
4591  * does.
4592  *
4593  * This handles both files and directories.
4594  */
4595 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4596                            struct btrfs_root *root, struct inode *inode,
4597                            int inode_only,
4598                            const loff_t start,
4599                            const loff_t end,
4600                            struct btrfs_log_ctx *ctx)
4601 {
4602         struct btrfs_fs_info *fs_info = root->fs_info;
4603         struct btrfs_path *path;
4604         struct btrfs_path *dst_path;
4605         struct btrfs_key min_key;
4606         struct btrfs_key max_key;
4607         struct btrfs_root *log = root->log_root;
4608         struct extent_buffer *src = NULL;
4609         LIST_HEAD(logged_list);
4610         u64 last_extent = 0;
4611         int err = 0;
4612         int ret;
4613         int nritems;
4614         int ins_start_slot = 0;
4615         int ins_nr;
4616         bool fast_search = false;
4617         u64 ino = btrfs_ino(BTRFS_I(inode));
4618         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4619         u64 logged_isize = 0;
4620         bool need_log_inode_item = true;
4621
4622         path = btrfs_alloc_path();
4623         if (!path)
4624                 return -ENOMEM;
4625         dst_path = btrfs_alloc_path();
4626         if (!dst_path) {
4627                 btrfs_free_path(path);
4628                 return -ENOMEM;
4629         }
4630
4631         min_key.objectid = ino;
4632         min_key.type = BTRFS_INODE_ITEM_KEY;
4633         min_key.offset = 0;
4634
4635         max_key.objectid = ino;
4636
4637
4638         /* today the code can only do partial logging of directories */
4639         if (S_ISDIR(inode->i_mode) ||
4640             (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4641                        &BTRFS_I(inode)->runtime_flags) &&
4642              inode_only >= LOG_INODE_EXISTS))
4643                 max_key.type = BTRFS_XATTR_ITEM_KEY;
4644         else
4645                 max_key.type = (u8)-1;
4646         max_key.offset = (u64)-1;
4647
4648         /*
4649          * Only run delayed items if we are a dir or a new file.
4650          * Otherwise commit the delayed inode only, which is needed in
4651          * order for the log replay code to mark inodes for link count
4652          * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
4653          */
4654         if (S_ISDIR(inode->i_mode) ||
4655             BTRFS_I(inode)->generation > fs_info->last_trans_committed)
4656                 ret = btrfs_commit_inode_delayed_items(trans, BTRFS_I(inode));
4657         else
4658                 ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
4659
4660         if (ret) {
4661                 btrfs_free_path(path);
4662                 btrfs_free_path(dst_path);
4663                 return ret;
4664         }
4665
4666         if (inode_only == LOG_OTHER_INODE) {
4667                 inode_only = LOG_INODE_EXISTS;
4668                 mutex_lock_nested(&BTRFS_I(inode)->log_mutex,
4669                                   SINGLE_DEPTH_NESTING);
4670         } else {
4671                 mutex_lock(&BTRFS_I(inode)->log_mutex);
4672         }
4673
4674         /*
4675          * a brute force approach to making sure we get the most uptodate
4676          * copies of everything.
4677          */
4678         if (S_ISDIR(inode->i_mode)) {
4679                 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
4680
4681                 if (inode_only == LOG_INODE_EXISTS)
4682                         max_key_type = BTRFS_XATTR_ITEM_KEY;
4683                 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
4684         } else {
4685                 if (inode_only == LOG_INODE_EXISTS) {
4686                         /*
4687                          * Make sure the new inode item we write to the log has
4688                          * the same isize as the current one (if it exists).
4689                          * This is necessary to prevent data loss after log
4690                          * replay, and also to prevent doing a wrong expanding
4691                          * truncate - for e.g. create file, write 4K into offset
4692                          * 0, fsync, write 4K into offset 4096, add hard link,
4693                          * fsync some other file (to sync log), power fail - if
4694                          * we use the inode's current i_size, after log replay
4695                          * we get a 8Kb file, with the last 4Kb extent as a hole
4696                          * (zeroes), as if an expanding truncate happened,
4697                          * instead of getting a file of 4Kb only.
4698                          */
4699                         err = logged_inode_size(log, BTRFS_I(inode), path,
4700                                                 &logged_isize);
4701                         if (err)
4702                                 goto out_unlock;
4703                 }
4704                 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4705                              &BTRFS_I(inode)->runtime_flags)) {
4706                         if (inode_only == LOG_INODE_EXISTS) {
4707                                 max_key.type = BTRFS_XATTR_ITEM_KEY;
4708                                 ret = drop_objectid_items(trans, log, path, ino,
4709                                                           max_key.type);
4710                         } else {
4711                                 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4712                                           &BTRFS_I(inode)->runtime_flags);
4713                                 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4714                                           &BTRFS_I(inode)->runtime_flags);
4715                                 while(1) {
4716                                         ret = btrfs_truncate_inode_items(trans,
4717                                                          log, inode, 0, 0);
4718                                         if (ret != -EAGAIN)
4719                                                 break;
4720                                 }
4721                         }
4722                 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4723                                               &BTRFS_I(inode)->runtime_flags) ||
4724                            inode_only == LOG_INODE_EXISTS) {
4725                         if (inode_only == LOG_INODE_ALL)
4726                                 fast_search = true;
4727                         max_key.type = BTRFS_XATTR_ITEM_KEY;
4728                         ret = drop_objectid_items(trans, log, path, ino,
4729                                                   max_key.type);
4730                 } else {
4731                         if (inode_only == LOG_INODE_ALL)
4732                                 fast_search = true;
4733                         goto log_extents;
4734                 }
4735
4736         }
4737         if (ret) {
4738                 err = ret;
4739                 goto out_unlock;
4740         }
4741
4742         while (1) {
4743                 ins_nr = 0;
4744                 ret = btrfs_search_forward(root, &min_key,
4745                                            path, trans->transid);
4746                 if (ret < 0) {
4747                         err = ret;
4748                         goto out_unlock;
4749                 }
4750                 if (ret != 0)
4751                         break;
4752 again:
4753                 /* note, ins_nr might be > 0 here, cleanup outside the loop */
4754                 if (min_key.objectid != ino)
4755                         break;
4756                 if (min_key.type > max_key.type)
4757                         break;
4758
4759                 if (min_key.type == BTRFS_INODE_ITEM_KEY)
4760                         need_log_inode_item = false;
4761
4762                 if ((min_key.type == BTRFS_INODE_REF_KEY ||
4763                      min_key.type == BTRFS_INODE_EXTREF_KEY) &&
4764                     BTRFS_I(inode)->generation == trans->transid) {
4765                         u64 other_ino = 0;
4766
4767                         ret = btrfs_check_ref_name_override(path->nodes[0],
4768                                                             path->slots[0],
4769                                                             &min_key, BTRFS_I(inode),
4770                                                             &other_ino);
4771                         if (ret < 0) {
4772                                 err = ret;
4773                                 goto out_unlock;
4774                         } else if (ret > 0 && ctx &&
4775                                    other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
4776                                 struct btrfs_key inode_key;
4777                                 struct inode *other_inode;
4778
4779                                 if (ins_nr > 0) {
4780                                         ins_nr++;
4781                                 } else {
4782                                         ins_nr = 1;
4783                                         ins_start_slot = path->slots[0];
4784                                 }
4785                                 ret = copy_items(trans, BTRFS_I(inode), dst_path, path,
4786                                                  &last_extent, ins_start_slot,
4787                                                  ins_nr, inode_only,
4788                                                  logged_isize);
4789                                 if (ret < 0) {
4790                                         err = ret;
4791                                         goto out_unlock;
4792                                 }
4793                                 ins_nr = 0;
4794                                 btrfs_release_path(path);
4795                                 inode_key.objectid = other_ino;
4796                                 inode_key.type = BTRFS_INODE_ITEM_KEY;
4797                                 inode_key.offset = 0;
4798                                 other_inode = btrfs_iget(fs_info->sb,
4799                                                          &inode_key, root,
4800                                                          NULL);
4801                                 /*
4802                                  * If the other inode that had a conflicting dir
4803                                  * entry was deleted in the current transaction,
4804                                  * we don't need to do more work nor fallback to
4805                                  * a transaction commit.
4806                                  */
4807                                 if (IS_ERR(other_inode) &&
4808                                     PTR_ERR(other_inode) == -ENOENT) {
4809                                         goto next_key;
4810                                 } else if (IS_ERR(other_inode)) {
4811                                         err = PTR_ERR(other_inode);
4812                                         goto out_unlock;
4813                                 }
4814                                 /*
4815                                  * We are safe logging the other inode without
4816                                  * acquiring its i_mutex as long as we log with
4817                                  * the LOG_INODE_EXISTS mode. We're safe against
4818                                  * concurrent renames of the other inode as well
4819                                  * because during a rename we pin the log and
4820                                  * update the log with the new name before we
4821                                  * unpin it.
4822                                  */
4823                                 err = btrfs_log_inode(trans, root, other_inode,
4824                                                       LOG_OTHER_INODE,
4825                                                       0, LLONG_MAX, ctx);
4826                                 iput(other_inode);
4827                                 if (err)
4828                                         goto out_unlock;
4829                                 else
4830                                         goto next_key;
4831                         }
4832                 }
4833
4834                 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
4835                 if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
4836                         if (ins_nr == 0)
4837                                 goto next_slot;
4838                         ret = copy_items(trans, BTRFS_I(inode), dst_path, path,
4839                                          &last_extent, ins_start_slot,
4840                                          ins_nr, inode_only, logged_isize);
4841                         if (ret < 0) {
4842                                 err = ret;
4843                                 goto out_unlock;
4844                         }
4845                         ins_nr = 0;
4846                         if (ret) {
4847                                 btrfs_release_path(path);
4848                                 continue;
4849                         }
4850                         goto next_slot;
4851                 }
4852
4853                 src = path->nodes[0];
4854                 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
4855                         ins_nr++;
4856                         goto next_slot;
4857                 } else if (!ins_nr) {
4858                         ins_start_slot = path->slots[0];
4859                         ins_nr = 1;
4860                         goto next_slot;
4861                 }
4862
4863                 ret = copy_items(trans, BTRFS_I(inode), dst_path, path, &last_extent,
4864                                  ins_start_slot, ins_nr, inode_only,
4865                                  logged_isize);
4866                 if (ret < 0) {
4867                         err = ret;
4868                         goto out_unlock;
4869                 }
4870                 if (ret) {
4871                         ins_nr = 0;
4872                         btrfs_release_path(path);
4873                         continue;
4874                 }
4875                 ins_nr = 1;
4876                 ins_start_slot = path->slots[0];
4877 next_slot:
4878
4879                 nritems = btrfs_header_nritems(path->nodes[0]);
4880                 path->slots[0]++;
4881                 if (path->slots[0] < nritems) {
4882                         btrfs_item_key_to_cpu(path->nodes[0], &min_key,
4883                                               path->slots[0]);
4884                         goto again;
4885                 }
4886                 if (ins_nr) {
4887                         ret = copy_items(trans, BTRFS_I(inode), dst_path, path,
4888                                          &last_extent, ins_start_slot,
4889                                          ins_nr, inode_only, logged_isize);
4890                         if (ret < 0) {
4891                                 err = ret;
4892                                 goto out_unlock;
4893                         }
4894                         ret = 0;
4895                         ins_nr = 0;
4896                 }
4897                 btrfs_release_path(path);
4898 next_key:
4899                 if (min_key.offset < (u64)-1) {
4900                         min_key.offset++;
4901                 } else if (min_key.type < max_key.type) {
4902                         min_key.type++;
4903                         min_key.offset = 0;
4904                 } else {
4905                         break;
4906                 }
4907         }
4908         if (ins_nr) {
4909                 ret = copy_items(trans, BTRFS_I(inode), dst_path, path, &last_extent,
4910                                  ins_start_slot, ins_nr, inode_only,
4911                                  logged_isize);
4912                 if (ret < 0) {
4913                         err = ret;
4914                         goto out_unlock;
4915                 }
4916                 ret = 0;
4917                 ins_nr = 0;
4918         }
4919
4920         btrfs_release_path(path);
4921         btrfs_release_path(dst_path);
4922         err = btrfs_log_all_xattrs(trans, root, BTRFS_I(inode), path, dst_path);
4923         if (err)
4924                 goto out_unlock;
4925         if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
4926                 btrfs_release_path(path);
4927                 btrfs_release_path(dst_path);
4928                 err = btrfs_log_trailing_hole(trans, root, BTRFS_I(inode), path);
4929                 if (err)
4930                         goto out_unlock;
4931         }
4932 log_extents:
4933         btrfs_release_path(path);
4934         btrfs_release_path(dst_path);
4935         if (need_log_inode_item) {
4936                 err = log_inode_item(trans, log, dst_path, inode);
4937                 if (err)
4938                         goto out_unlock;
4939         }
4940         if (fast_search) {
4941                 ret = btrfs_log_changed_extents(trans, root, BTRFS_I(inode), dst_path,
4942                                                 &logged_list, ctx, start, end);
4943                 if (ret) {
4944                         err = ret;
4945                         goto out_unlock;
4946                 }
4947         } else if (inode_only == LOG_INODE_ALL) {
4948                 struct extent_map *em, *n;
4949
4950                 write_lock(&em_tree->lock);
4951                 /*
4952                  * We can't just remove every em if we're called for a ranged
4953                  * fsync - that is, one that doesn't cover the whole possible
4954                  * file range (0 to LLONG_MAX). This is because we can have
4955                  * em's that fall outside the range we're logging and therefore
4956                  * their ordered operations haven't completed yet
4957                  * (btrfs_finish_ordered_io() not invoked yet). This means we
4958                  * didn't get their respective file extent item in the fs/subvol
4959                  * tree yet, and need to let the next fast fsync (one which
4960                  * consults the list of modified extent maps) find the em so
4961                  * that it logs a matching file extent item and waits for the
4962                  * respective ordered operation to complete (if it's still
4963                  * running).
4964                  *
4965                  * Removing every em outside the range we're logging would make
4966                  * the next fast fsync not log their matching file extent items,
4967                  * therefore making us lose data after a log replay.
4968                  */
4969                 list_for_each_entry_safe(em, n, &em_tree->modified_extents,
4970                                          list) {
4971                         const u64 mod_end = em->mod_start + em->mod_len - 1;
4972
4973                         if (em->mod_start >= start && mod_end <= end)
4974                                 list_del_init(&em->list);
4975                 }
4976                 write_unlock(&em_tree->lock);
4977         }
4978
4979         if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
4980                 ret = log_directory_changes(trans, root, BTRFS_I(inode), path,
4981                                 dst_path, ctx);
4982                 if (ret) {
4983                         err = ret;
4984                         goto out_unlock;
4985                 }
4986         }
4987
4988         spin_lock(&BTRFS_I(inode)->lock);
4989         BTRFS_I(inode)->logged_trans = trans->transid;
4990         BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
4991         spin_unlock(&BTRFS_I(inode)->lock);
4992 out_unlock:
4993         if (unlikely(err))
4994                 btrfs_put_logged_extents(&logged_list);
4995         else
4996                 btrfs_submit_logged_extents(&logged_list, log);
4997         mutex_unlock(&BTRFS_I(inode)->log_mutex);
4998
4999         btrfs_free_path(path);
5000         btrfs_free_path(dst_path);
5001         return err;
5002 }
5003
5004 /*
5005  * Check if we must fallback to a transaction commit when logging an inode.
5006  * This must be called after logging the inode and is used only in the context
5007  * when fsyncing an inode requires the need to log some other inode - in which
5008  * case we can't lock the i_mutex of each other inode we need to log as that
5009  * can lead to deadlocks with concurrent fsync against other inodes (as we can
5010  * log inodes up or down in the hierarchy) or rename operations for example. So
5011  * we take the log_mutex of the inode after we have logged it and then check for
5012  * its last_unlink_trans value - this is safe because any task setting
5013  * last_unlink_trans must take the log_mutex and it must do this before it does
5014  * the actual unlink operation, so if we do this check before a concurrent task
5015  * sets last_unlink_trans it means we've logged a consistent version/state of
5016  * all the inode items, otherwise we are not sure and must do a transaction
5017  * commit (the concurrent task might have only updated last_unlink_trans before
5018  * we logged the inode or it might have also done the unlink).
5019  */
5020 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
5021                                           struct btrfs_inode *inode)
5022 {
5023         struct btrfs_fs_info *fs_info = inode->root->fs_info;
5024         bool ret = false;
5025
5026         mutex_lock(&inode->log_mutex);
5027         if (inode->last_unlink_trans > fs_info->last_trans_committed) {
5028                 /*
5029                  * Make sure any commits to the log are forced to be full
5030                  * commits.
5031                  */
5032                 btrfs_set_log_full_commit(fs_info, trans);
5033                 ret = true;
5034         }
5035         mutex_unlock(&inode->log_mutex);
5036
5037         return ret;
5038 }
5039
5040 /*
5041  * follow the dentry parent pointers up the chain and see if any
5042  * of the directories in it require a full commit before they can
5043  * be logged.  Returns zero if nothing special needs to be done or 1 if
5044  * a full commit is required.
5045  */
5046 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
5047                                                struct inode *inode,
5048                                                struct dentry *parent,
5049                                                struct super_block *sb,
5050                                                u64 last_committed)
5051 {
5052         int ret = 0;
5053         struct dentry *old_parent = NULL;
5054         struct inode *orig_inode = inode;
5055
5056         /*
5057          * for regular files, if its inode is already on disk, we don't
5058          * have to worry about the parents at all.  This is because
5059          * we can use the last_unlink_trans field to record renames
5060          * and other fun in this file.
5061          */
5062         if (S_ISREG(inode->i_mode) &&
5063             BTRFS_I(inode)->generation <= last_committed &&
5064             BTRFS_I(inode)->last_unlink_trans <= last_committed)
5065                         goto out;
5066
5067         if (!S_ISDIR(inode->i_mode)) {
5068                 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5069                         goto out;
5070                 inode = d_inode(parent);
5071         }
5072
5073         while (1) {
5074                 /*
5075                  * If we are logging a directory then we start with our inode,
5076                  * not our parent's inode, so we need to skip setting the
5077                  * logged_trans so that further down in the log code we don't
5078                  * think this inode has already been logged.
5079                  */
5080                 if (inode != orig_inode)
5081                         BTRFS_I(inode)->logged_trans = trans->transid;
5082                 smp_mb();
5083
5084                 if (btrfs_must_commit_transaction(trans, BTRFS_I(inode))) {
5085                         ret = 1;
5086                         break;
5087                 }
5088
5089                 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5090                         break;
5091
5092                 if (IS_ROOT(parent)) {
5093                         inode = d_inode(parent);
5094                         if (btrfs_must_commit_transaction(trans, BTRFS_I(inode)))
5095                                 ret = 1;
5096                         break;
5097                 }
5098
5099                 parent = dget_parent(parent);
5100                 dput(old_parent);
5101                 old_parent = parent;
5102                 inode = d_inode(parent);
5103
5104         }
5105         dput(old_parent);
5106 out:
5107         return ret;
5108 }
5109
5110 struct btrfs_dir_list {
5111         u64 ino;
5112         struct list_head list;
5113 };
5114
5115 /*
5116  * Log the inodes of the new dentries of a directory. See log_dir_items() for
5117  * details about the why it is needed.
5118  * This is a recursive operation - if an existing dentry corresponds to a
5119  * directory, that directory's new entries are logged too (same behaviour as
5120  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5121  * the dentries point to we do not lock their i_mutex, otherwise lockdep
5122  * complains about the following circular lock dependency / possible deadlock:
5123  *
5124  *        CPU0                                        CPU1
5125  *        ----                                        ----
5126  * lock(&type->i_mutex_dir_key#3/2);
5127  *                                            lock(sb_internal#2);
5128  *                                            lock(&type->i_mutex_dir_key#3/2);
5129  * lock(&sb->s_type->i_mutex_key#14);
5130  *
5131  * Where sb_internal is the lock (a counter that works as a lock) acquired by
5132  * sb_start_intwrite() in btrfs_start_transaction().
5133  * Not locking i_mutex of the inodes is still safe because:
5134  *
5135  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5136  *    that while logging the inode new references (names) are added or removed
5137  *    from the inode, leaving the logged inode item with a link count that does
5138  *    not match the number of logged inode reference items. This is fine because
5139  *    at log replay time we compute the real number of links and correct the
5140  *    link count in the inode item (see replay_one_buffer() and
5141  *    link_to_fixup_dir());
5142  *
5143  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5144  *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
5145  *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
5146  *    has a size that doesn't match the sum of the lengths of all the logged
5147  *    names. This does not result in a problem because if a dir_item key is
5148  *    logged but its matching dir_index key is not logged, at log replay time we
5149  *    don't use it to replay the respective name (see replay_one_name()). On the
5150  *    other hand if only the dir_index key ends up being logged, the respective
5151  *    name is added to the fs/subvol tree with both the dir_item and dir_index
5152  *    keys created (see replay_one_name()).
5153  *    The directory's inode item with a wrong i_size is not a problem as well,
5154  *    since we don't use it at log replay time to set the i_size in the inode
5155  *    item of the fs/subvol tree (see overwrite_item()).
5156  */
5157 static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5158                                 struct btrfs_root *root,
5159                                 struct btrfs_inode *start_inode,
5160                                 struct btrfs_log_ctx *ctx)
5161 {
5162         struct btrfs_fs_info *fs_info = root->fs_info;
5163         struct btrfs_root *log = root->log_root;
5164         struct btrfs_path *path;
5165         LIST_HEAD(dir_list);
5166         struct btrfs_dir_list *dir_elem;
5167         int ret = 0;
5168
5169         path = btrfs_alloc_path();
5170         if (!path)
5171                 return -ENOMEM;
5172
5173         dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5174         if (!dir_elem) {
5175                 btrfs_free_path(path);
5176                 return -ENOMEM;
5177         }
5178         dir_elem->ino = btrfs_ino(start_inode);
5179         list_add_tail(&dir_elem->list, &dir_list);
5180
5181         while (!list_empty(&dir_list)) {
5182                 struct extent_buffer *leaf;
5183                 struct btrfs_key min_key;
5184                 int nritems;
5185                 int i;
5186
5187                 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
5188                                             list);
5189                 if (ret)
5190                         goto next_dir_inode;
5191
5192                 min_key.objectid = dir_elem->ino;
5193                 min_key.type = BTRFS_DIR_ITEM_KEY;
5194                 min_key.offset = 0;
5195 again:
5196                 btrfs_release_path(path);
5197                 ret = btrfs_search_forward(log, &min_key, path, trans->transid);
5198                 if (ret < 0) {
5199                         goto next_dir_inode;
5200                 } else if (ret > 0) {
5201                         ret = 0;
5202                         goto next_dir_inode;
5203                 }
5204
5205 process_leaf:
5206                 leaf = path->nodes[0];
5207                 nritems = btrfs_header_nritems(leaf);
5208                 for (i = path->slots[0]; i < nritems; i++) {
5209                         struct btrfs_dir_item *di;
5210                         struct btrfs_key di_key;
5211                         struct inode *di_inode;
5212                         struct btrfs_dir_list *new_dir_elem;
5213                         int log_mode = LOG_INODE_EXISTS;
5214                         int type;
5215
5216                         btrfs_item_key_to_cpu(leaf, &min_key, i);
5217                         if (min_key.objectid != dir_elem->ino ||
5218                             min_key.type != BTRFS_DIR_ITEM_KEY)
5219                                 goto next_dir_inode;
5220
5221                         di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
5222                         type = btrfs_dir_type(leaf, di);
5223                         if (btrfs_dir_transid(leaf, di) < trans->transid &&
5224                             type != BTRFS_FT_DIR)
5225                                 continue;
5226                         btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5227                         if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5228                                 continue;
5229
5230                         btrfs_release_path(path);
5231                         di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL);
5232                         if (IS_ERR(di_inode)) {
5233                                 ret = PTR_ERR(di_inode);
5234                                 goto next_dir_inode;
5235                         }
5236
5237                         if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
5238                                 iput(di_inode);
5239                                 break;
5240                         }
5241
5242                         ctx->log_new_dentries = false;
5243                         if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
5244                                 log_mode = LOG_INODE_ALL;
5245                         ret = btrfs_log_inode(trans, root, di_inode,
5246                                               log_mode, 0, LLONG_MAX, ctx);
5247                         if (!ret &&
5248                             btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
5249                                 ret = 1;
5250                         iput(di_inode);
5251                         if (ret)
5252                                 goto next_dir_inode;
5253                         if (ctx->log_new_dentries) {
5254                                 new_dir_elem = kmalloc(sizeof(*new_dir_elem),
5255                                                        GFP_NOFS);
5256                                 if (!new_dir_elem) {
5257                                         ret = -ENOMEM;
5258                                         goto next_dir_inode;
5259                                 }
5260                                 new_dir_elem->ino = di_key.objectid;
5261                                 list_add_tail(&new_dir_elem->list, &dir_list);
5262                         }
5263                         break;
5264                 }
5265                 if (i == nritems) {
5266                         ret = btrfs_next_leaf(log, path);
5267                         if (ret < 0) {
5268                                 goto next_dir_inode;
5269                         } else if (ret > 0) {
5270                                 ret = 0;
5271                                 goto next_dir_inode;
5272                         }
5273                         goto process_leaf;
5274                 }
5275                 if (min_key.offset < (u64)-1) {
5276                         min_key.offset++;
5277                         goto again;
5278                 }
5279 next_dir_inode:
5280                 list_del(&dir_elem->list);
5281                 kfree(dir_elem);
5282         }
5283
5284         btrfs_free_path(path);
5285         return ret;
5286 }
5287
5288 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5289                                  struct inode *inode,
5290                                  struct btrfs_log_ctx *ctx)
5291 {
5292         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5293         int ret;
5294         struct btrfs_path *path;
5295         struct btrfs_key key;
5296         struct btrfs_root *root = BTRFS_I(inode)->root;
5297         const u64 ino = btrfs_ino(BTRFS_I(inode));
5298
5299         path = btrfs_alloc_path();
5300         if (!path)
5301                 return -ENOMEM;
5302         path->skip_locking = 1;
5303         path->search_commit_root = 1;
5304
5305         key.objectid = ino;
5306         key.type = BTRFS_INODE_REF_KEY;
5307         key.offset = 0;
5308         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5309         if (ret < 0)
5310                 goto out;
5311
5312         while (true) {
5313                 struct extent_buffer *leaf = path->nodes[0];
5314                 int slot = path->slots[0];
5315                 u32 cur_offset = 0;
5316                 u32 item_size;
5317                 unsigned long ptr;
5318
5319                 if (slot >= btrfs_header_nritems(leaf)) {
5320                         ret = btrfs_next_leaf(root, path);
5321                         if (ret < 0)
5322                                 goto out;
5323                         else if (ret > 0)
5324                                 break;
5325                         continue;
5326                 }
5327
5328                 btrfs_item_key_to_cpu(leaf, &key, slot);
5329                 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
5330                 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
5331                         break;
5332
5333                 item_size = btrfs_item_size_nr(leaf, slot);
5334                 ptr = btrfs_item_ptr_offset(leaf, slot);
5335                 while (cur_offset < item_size) {
5336                         struct btrfs_key inode_key;
5337                         struct inode *dir_inode;
5338
5339                         inode_key.type = BTRFS_INODE_ITEM_KEY;
5340                         inode_key.offset = 0;
5341
5342                         if (key.type == BTRFS_INODE_EXTREF_KEY) {
5343                                 struct btrfs_inode_extref *extref;
5344
5345                                 extref = (struct btrfs_inode_extref *)
5346                                         (ptr + cur_offset);
5347                                 inode_key.objectid = btrfs_inode_extref_parent(
5348                                         leaf, extref);
5349                                 cur_offset += sizeof(*extref);
5350                                 cur_offset += btrfs_inode_extref_name_len(leaf,
5351                                         extref);
5352                         } else {
5353                                 inode_key.objectid = key.offset;
5354                                 cur_offset = item_size;
5355                         }
5356
5357                         dir_inode = btrfs_iget(fs_info->sb, &inode_key,
5358                                                root, NULL);
5359                         /* If parent inode was deleted, skip it. */
5360                         if (IS_ERR(dir_inode))
5361                                 continue;
5362
5363                         if (ctx)
5364                                 ctx->log_new_dentries = false;
5365                         ret = btrfs_log_inode(trans, root, dir_inode,
5366                                               LOG_INODE_ALL, 0, LLONG_MAX, ctx);
5367                         if (!ret &&
5368                             btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
5369                                 ret = 1;
5370                         if (!ret && ctx && ctx->log_new_dentries)
5371                                 ret = log_new_dir_dentries(trans, root,
5372                                                            BTRFS_I(dir_inode), ctx);
5373                         iput(dir_inode);
5374                         if (ret)
5375                                 goto out;
5376                 }
5377                 path->slots[0]++;
5378         }
5379         ret = 0;
5380 out:
5381         btrfs_free_path(path);
5382         return ret;
5383 }
5384
5385 /*
5386  * helper function around btrfs_log_inode to make sure newly created
5387  * parent directories also end up in the log.  A minimal inode and backref
5388  * only logging is done of any parent directories that are older than
5389  * the last committed transaction
5390  */
5391 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
5392                                   struct btrfs_root *root, struct inode *inode,
5393                                   struct dentry *parent,
5394                                   const loff_t start,
5395                                   const loff_t end,
5396                                   int exists_only,
5397                                   struct btrfs_log_ctx *ctx)
5398 {
5399         struct btrfs_fs_info *fs_info = root->fs_info;
5400         int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
5401         struct super_block *sb;
5402         struct dentry *old_parent = NULL;
5403         int ret = 0;
5404         u64 last_committed = fs_info->last_trans_committed;
5405         bool log_dentries = false;
5406         struct inode *orig_inode = inode;
5407
5408         sb = inode->i_sb;
5409
5410         if (btrfs_test_opt(fs_info, NOTREELOG)) {
5411                 ret = 1;
5412                 goto end_no_trans;
5413         }
5414
5415         /*
5416          * The prev transaction commit doesn't complete, we need do
5417          * full commit by ourselves.
5418          */
5419         if (fs_info->last_trans_log_full_commit >
5420             fs_info->last_trans_committed) {
5421                 ret = 1;
5422                 goto end_no_trans;
5423         }
5424
5425         if (root != BTRFS_I(inode)->root ||
5426             btrfs_root_refs(&root->root_item) == 0) {
5427                 ret = 1;
5428                 goto end_no_trans;
5429         }
5430
5431         ret = check_parent_dirs_for_sync(trans, inode, parent,
5432                                          sb, last_committed);
5433         if (ret)
5434                 goto end_no_trans;
5435
5436         if (btrfs_inode_in_log(BTRFS_I(inode), trans->transid)) {
5437                 ret = BTRFS_NO_LOG_SYNC;
5438                 goto end_no_trans;
5439         }
5440
5441         ret = start_log_trans(trans, root, ctx);
5442         if (ret)
5443                 goto end_no_trans;
5444
5445         ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
5446         if (ret)
5447                 goto end_trans;
5448
5449         /*
5450          * for regular files, if its inode is already on disk, we don't
5451          * have to worry about the parents at all.  This is because
5452          * we can use the last_unlink_trans field to record renames
5453          * and other fun in this file.
5454          */
5455         if (S_ISREG(inode->i_mode) &&
5456             BTRFS_I(inode)->generation <= last_committed &&
5457             BTRFS_I(inode)->last_unlink_trans <= last_committed) {
5458                 ret = 0;
5459                 goto end_trans;
5460         }
5461
5462         if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
5463                 log_dentries = true;
5464
5465         /*
5466          * On unlink we must make sure all our current and old parent directory
5467          * inodes are fully logged. This is to prevent leaving dangling
5468          * directory index entries in directories that were our parents but are
5469          * not anymore. Not doing this results in old parent directory being
5470          * impossible to delete after log replay (rmdir will always fail with
5471          * error -ENOTEMPTY).
5472          *
5473          * Example 1:
5474          *
5475          * mkdir testdir
5476          * touch testdir/foo
5477          * ln testdir/foo testdir/bar
5478          * sync
5479          * unlink testdir/bar
5480          * xfs_io -c fsync testdir/foo
5481          * <power failure>
5482          * mount fs, triggers log replay
5483          *
5484          * If we don't log the parent directory (testdir), after log replay the
5485          * directory still has an entry pointing to the file inode using the bar
5486          * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
5487          * the file inode has a link count of 1.
5488          *
5489          * Example 2:
5490          *
5491          * mkdir testdir
5492          * touch foo
5493          * ln foo testdir/foo2
5494          * ln foo testdir/foo3
5495          * sync
5496          * unlink testdir/foo3
5497          * xfs_io -c fsync foo
5498          * <power failure>
5499          * mount fs, triggers log replay
5500          *
5501          * Similar as the first example, after log replay the parent directory
5502          * testdir still has an entry pointing to the inode file with name foo3
5503          * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
5504          * and has a link count of 2.
5505          */
5506         if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
5507                 ret = btrfs_log_all_parents(trans, orig_inode, ctx);
5508                 if (ret)
5509                         goto end_trans;
5510         }
5511
5512         while (1) {
5513                 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5514                         break;
5515
5516                 inode = d_inode(parent);
5517                 if (root != BTRFS_I(inode)->root)
5518                         break;
5519
5520                 if (BTRFS_I(inode)->generation > last_committed) {
5521                         ret = btrfs_log_inode(trans, root, inode,
5522                                               LOG_INODE_EXISTS,
5523                                               0, LLONG_MAX, ctx);
5524                         if (ret)
5525                                 goto end_trans;
5526                 }
5527                 if (IS_ROOT(parent))
5528                         break;
5529
5530                 parent = dget_parent(parent);
5531                 dput(old_parent);
5532                 old_parent = parent;
5533         }
5534         if (log_dentries)
5535                 ret = log_new_dir_dentries(trans, root, BTRFS_I(orig_inode), ctx);
5536         else
5537                 ret = 0;
5538 end_trans:
5539         dput(old_parent);
5540         if (ret < 0) {
5541                 btrfs_set_log_full_commit(fs_info, trans);
5542                 ret = 1;
5543         }
5544
5545         if (ret)
5546                 btrfs_remove_log_ctx(root, ctx);
5547         btrfs_end_log_trans(root);
5548 end_no_trans:
5549         return ret;
5550 }
5551
5552 /*
5553  * it is not safe to log dentry if the chunk root has added new
5554  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
5555  * If this returns 1, you must commit the transaction to safely get your
5556  * data on disk.
5557  */
5558 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
5559                           struct btrfs_root *root, struct dentry *dentry,
5560                           const loff_t start,
5561                           const loff_t end,
5562                           struct btrfs_log_ctx *ctx)
5563 {
5564         struct dentry *parent = dget_parent(dentry);
5565         int ret;
5566
5567         ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent,
5568                                      start, end, 0, ctx);
5569         dput(parent);
5570
5571         return ret;
5572 }
5573
5574 /*
5575  * should be called during mount to recover any replay any log trees
5576  * from the FS
5577  */
5578 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
5579 {
5580         int ret;
5581         struct btrfs_path *path;
5582         struct btrfs_trans_handle *trans;
5583         struct btrfs_key key;
5584         struct btrfs_key found_key;
5585         struct btrfs_key tmp_key;
5586         struct btrfs_root *log;
5587         struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
5588         struct walk_control wc = {
5589                 .process_func = process_one_buffer,
5590                 .stage = 0,
5591         };
5592
5593         path = btrfs_alloc_path();
5594         if (!path)
5595                 return -ENOMEM;
5596
5597         set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
5598
5599         trans = btrfs_start_transaction(fs_info->tree_root, 0);
5600         if (IS_ERR(trans)) {
5601                 ret = PTR_ERR(trans);
5602                 goto error;
5603         }
5604
5605         wc.trans = trans;
5606         wc.pin = 1;
5607
5608         ret = walk_log_tree(trans, log_root_tree, &wc);
5609         if (ret) {
5610                 btrfs_handle_fs_error(fs_info, ret,
5611                         "Failed to pin buffers while recovering log root tree.");
5612                 goto error;
5613         }
5614
5615 again:
5616         key.objectid = BTRFS_TREE_LOG_OBJECTID;
5617         key.offset = (u64)-1;
5618         key.type = BTRFS_ROOT_ITEM_KEY;
5619
5620         while (1) {
5621                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
5622
5623                 if (ret < 0) {
5624                         btrfs_handle_fs_error(fs_info, ret,
5625                                     "Couldn't find tree log root.");
5626                         goto error;
5627                 }
5628                 if (ret > 0) {
5629                         if (path->slots[0] == 0)
5630                                 break;
5631                         path->slots[0]--;
5632                 }
5633                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
5634                                       path->slots[0]);
5635                 btrfs_release_path(path);
5636                 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
5637                         break;
5638
5639                 log = btrfs_read_fs_root(log_root_tree, &found_key);
5640                 if (IS_ERR(log)) {
5641                         ret = PTR_ERR(log);
5642                         btrfs_handle_fs_error(fs_info, ret,
5643                                     "Couldn't read tree log root.");
5644                         goto error;
5645                 }
5646
5647                 tmp_key.objectid = found_key.offset;
5648                 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
5649                 tmp_key.offset = (u64)-1;
5650
5651                 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
5652                 if (IS_ERR(wc.replay_dest)) {
5653                         ret = PTR_ERR(wc.replay_dest);
5654                         free_extent_buffer(log->node);
5655                         free_extent_buffer(log->commit_root);
5656                         kfree(log);
5657                         btrfs_handle_fs_error(fs_info, ret,
5658                                 "Couldn't read target root for tree log recovery.");
5659                         goto error;
5660                 }
5661
5662                 wc.replay_dest->log_root = log;
5663                 btrfs_record_root_in_trans(trans, wc.replay_dest);
5664                 ret = walk_log_tree(trans, log, &wc);
5665
5666                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
5667                         ret = fixup_inode_link_counts(trans, wc.replay_dest,
5668                                                       path);
5669                 }
5670
5671                 key.offset = found_key.offset - 1;
5672                 wc.replay_dest->log_root = NULL;
5673                 free_extent_buffer(log->node);
5674                 free_extent_buffer(log->commit_root);
5675                 kfree(log);
5676
5677                 if (ret)
5678                         goto error;
5679
5680                 if (found_key.offset == 0)
5681                         break;
5682         }
5683         btrfs_release_path(path);
5684
5685         /* step one is to pin it all, step two is to replay just inodes */
5686         if (wc.pin) {
5687                 wc.pin = 0;
5688                 wc.process_func = replay_one_buffer;
5689                 wc.stage = LOG_WALK_REPLAY_INODES;
5690                 goto again;
5691         }
5692         /* step three is to replay everything */
5693         if (wc.stage < LOG_WALK_REPLAY_ALL) {
5694                 wc.stage++;
5695                 goto again;
5696         }
5697
5698         btrfs_free_path(path);
5699
5700         /* step 4: commit the transaction, which also unpins the blocks */
5701         ret = btrfs_commit_transaction(trans);
5702         if (ret)
5703                 return ret;
5704
5705         free_extent_buffer(log_root_tree->node);
5706         log_root_tree->log_root = NULL;
5707         clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
5708         kfree(log_root_tree);
5709
5710         return 0;
5711 error:
5712         if (wc.trans)
5713                 btrfs_end_transaction(wc.trans);
5714         btrfs_free_path(path);
5715         return ret;
5716 }
5717
5718 /*
5719  * there are some corner cases where we want to force a full
5720  * commit instead of allowing a directory to be logged.
5721  *
5722  * They revolve around files there were unlinked from the directory, and
5723  * this function updates the parent directory so that a full commit is
5724  * properly done if it is fsync'd later after the unlinks are done.
5725  *
5726  * Must be called before the unlink operations (updates to the subvolume tree,
5727  * inodes, etc) are done.
5728  */
5729 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5730                              struct btrfs_inode *dir, struct btrfs_inode *inode,
5731                              int for_rename)
5732 {
5733         /*
5734          * when we're logging a file, if it hasn't been renamed
5735          * or unlinked, and its inode is fully committed on disk,
5736          * we don't have to worry about walking up the directory chain
5737          * to log its parents.
5738          *
5739          * So, we use the last_unlink_trans field to put this transid
5740          * into the file.  When the file is logged we check it and
5741          * don't log the parents if the file is fully on disk.
5742          */
5743         mutex_lock(&inode->log_mutex);
5744         inode->last_unlink_trans = trans->transid;
5745         mutex_unlock(&inode->log_mutex);
5746
5747         /*
5748          * if this directory was already logged any new
5749          * names for this file/dir will get recorded
5750          */
5751         smp_mb();
5752         if (dir->logged_trans == trans->transid)
5753                 return;
5754
5755         /*
5756          * if the inode we're about to unlink was logged,
5757          * the log will be properly updated for any new names
5758          */
5759         if (inode->logged_trans == trans->transid)
5760                 return;
5761
5762         /*
5763          * when renaming files across directories, if the directory
5764          * there we're unlinking from gets fsync'd later on, there's
5765          * no way to find the destination directory later and fsync it
5766          * properly.  So, we have to be conservative and force commits
5767          * so the new name gets discovered.
5768          */
5769         if (for_rename)
5770                 goto record;
5771
5772         /* we can safely do the unlink without any special recording */
5773         return;
5774
5775 record:
5776         mutex_lock(&dir->log_mutex);
5777         dir->last_unlink_trans = trans->transid;
5778         mutex_unlock(&dir->log_mutex);
5779 }
5780
5781 /*
5782  * Make sure that if someone attempts to fsync the parent directory of a deleted
5783  * snapshot, it ends up triggering a transaction commit. This is to guarantee
5784  * that after replaying the log tree of the parent directory's root we will not
5785  * see the snapshot anymore and at log replay time we will not see any log tree
5786  * corresponding to the deleted snapshot's root, which could lead to replaying
5787  * it after replaying the log tree of the parent directory (which would replay
5788  * the snapshot delete operation).
5789  *
5790  * Must be called before the actual snapshot destroy operation (updates to the
5791  * parent root and tree of tree roots trees, etc) are done.
5792  */
5793 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
5794                                    struct btrfs_inode *dir)
5795 {
5796         mutex_lock(&dir->log_mutex);
5797         dir->last_unlink_trans = trans->transid;
5798         mutex_unlock(&dir->log_mutex);
5799 }
5800
5801 /*
5802  * Call this after adding a new name for a file and it will properly
5803  * update the log to reflect the new name.
5804  *
5805  * It will return zero if all goes well, and it will return 1 if a
5806  * full transaction commit is required.
5807  */
5808 int btrfs_log_new_name(struct btrfs_trans_handle *trans,
5809                         struct btrfs_inode *inode, struct btrfs_inode *old_dir,
5810                         struct dentry *parent)
5811 {
5812         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
5813         struct btrfs_root * root = inode->root;
5814
5815         /*
5816          * this will force the logging code to walk the dentry chain
5817          * up for the file
5818          */
5819         if (S_ISREG(inode->vfs_inode.i_mode))
5820                 inode->last_unlink_trans = trans->transid;
5821
5822         /*
5823          * if this inode hasn't been logged and directory we're renaming it
5824          * from hasn't been logged, we don't need to log it
5825          */
5826         if (inode->logged_trans <= fs_info->last_trans_committed &&
5827             (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
5828                 return 0;
5829
5830         return btrfs_log_inode_parent(trans, root, &inode->vfs_inode, parent, 0,
5831                                       LLONG_MAX, 1, NULL);
5832 }
5833