// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "inode-item.h"
#include "fs.h"
#include "accessors.h"

#define MAX_CONFLICT_INODES 10

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory foo was fully removed from the FS, but fsync was never
 * called on foo, only on its parent dir f1.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in RAM, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make anyone
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;
	const bool zoned = btrfs_is_zoned(fs_info);
	int ret = 0;
	bool created = false;

	/*
	 * First check if the log root tree was already created. If not, create
	 * it before locking the root's log_mutex, just to keep lockdep happy.
	 */
	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
		mutex_lock(&tree_root->log_mutex);
		if (!fs_info->log_root_tree) {
			ret = btrfs_init_log_root_tree(trans, fs_info);
			if (!ret) {
				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
				created = true;
			}
		}
		mutex_unlock(&tree_root->log_mutex);
		if (ret)
			return ret;
	}

	mutex_lock(&root->log_mutex);

again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		if (btrfs_need_log_full_commit(trans)) {
			ret = BTRFS_LOG_FORCE_COMMIT;
			goto out;
		}

		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		/*
		 * This means fs_info->log_root_tree was already created
		 * for some other FS trees. Do the full commit not to mix
		 * nodes from multiple log transactions to do sequential
		 * writing.
		 */
		if (zoned && !created) {
			ret = BTRFS_LOG_FORCE_COMMIT;
			goto out;
		}

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_writers);
	if (!ctx->logging_new_name) {
		int index = root->log_transid % 2;

		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	const bool zoned = btrfs_is_zoned(root->fs_info);
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		ret = 0;
		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}

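/*
 * Wait for writeback of the pages backing the given tree block to finish.
 */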
static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	filemap_fdatawait_range(buf->pages[0]->mapping,
				buf->start, buf->start + buf->len - 1);
}

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin) {
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);
		if (ret)
			return ret;

		if (btrfs_buffer_uptodate(eb, gen, 0) &&
		    btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
	}
	return ret;
}

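/*
 * Overwrite the item at @slot in @eb into @root. The caller must already
 * have searched @root for the item's key and left the result in @path;
 * overwrite_item() below is the wrapper that does that search first.
 */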
static int do_overwrite_item(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct extent_buffer *eb, int slot,
			     struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* Our caller must have done a search for the key for us. */
	ASSERT(path->nodes[0] != NULL);

	/*
	 * And the slot must point to the exact key or the slot where the key
	 * should be at (the first item with a key greater than 'key')
	 */
	if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
		struct btrfs_key found_key;

		btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
		ret = btrfs_comp_cpu_keys(&found_key, key);
		ASSERT(ret >= 0);
	} else {
		ret = 1;
	}

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size(path->nodes[0],
					       path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size(path->nodes[0],
					     path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static int overwrite_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct btrfs_path *path,
			  struct extent_buffer *eb, int slot,
			  struct btrfs_key *key)
{
	int ret;

	/* Look for the key in the destination tree. */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	return do_overwrite_item(trans, root, path, eb, slot, key);
}

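/*
 * Copy a name of the given length out of an extent buffer into a freshly
 * allocated buffer and store it in @name. The caller must free name->name.
 */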
static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
			       struct fscrypt_str *name)
{
	char *buf;

	buf = kmalloc(len, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(eb, buf, (unsigned long)start, len);
	name->name = buf;
	name->len = len;
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;

	inode = btrfs_iget(root->fs_info->sb, objectid, root);
	if (IS_ERR(inode))
		inode = NULL;
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inode's nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	drop_args.start = start;
	drop_args.end = extent_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record the dirty extent for qgroups: we did a
		 * shallow copy of the file extent item and skipped the normal
		 * backref update, modifying the extent tree all by ourselves.
		 * The owner of the file extent changed from the log tree
		 * (which doesn't affect qgroups) to the fs/file tree (which
		 * does).
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item));
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret < 0) {
				goto out;
			} else if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset, 0, false);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0, false);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				struct btrfs_root *csum_root;

				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				csum_root = btrfs_csum_root(fs_info,
							    sums->bytenr);
				if (!ret)
					ret = btrfs_del_csums(trans, csum_root,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
								     csum_root,
								     sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

update_inode:
	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
	iput(inode);
	return ret;
}

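/*
 * Unlink an inode from its parent directory during log replay, then run the
 * delayed items created by the unlink so that later name lookups against the
 * fs/subvolume tree see the updated state.
 */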
static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
				       struct btrfs_inode *dir,
				       struct btrfs_inode *inode,
				       const struct fscrypt_str *name)
{
	int ret;

	ret = btrfs_unlink_inode(trans, dir, inode, name);
	if (ret)
		return ret;
	/*
	 * Whenever we need to check if a name exists or not, we check the
	 * fs/subvolume tree. So after an unlink we must run delayed items, so
	 * that future checks for a name during log replay see that the name
	 * does not exist anymore.
	 */
	return btrfs_run_delayed_items(trans);
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_root *root = dir->root;
	struct inode *inode;
	struct fscrypt_str name;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
	if (ret)
		return -ENOMEM;

	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
out:
	kfree(name.name);
	iput(inode);
	return ret;
}

/*
 * See if a given name and sequence number found in an inode back reference are
 * already in a directory and correctly point to this inode.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
 * exists.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 struct fscrypt_str *name)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int ret = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else {
		goto out;
	}

	btrfs_release_path(path);
	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid == objectid)
			ret = 1;
	}
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const struct fscrypt_str *name)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid, name);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0], name);
out:
	btrfs_free_path(path);
	return ret;
}

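/*
 * Resolve conflicts that would prevent replaying a back reference for the
 * given (inode, parent) pair: unlink any names on that pair that are absent
 * from the log, then drop any directory entry that clashes with the new
 * name or with its index number. Returns 1 if the back reference belongs to
 * the root directory itself, 0 on success and a negative errno on failure.
 */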
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, struct fscrypt_str *name)
{
	int ret;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory?
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			struct fscrypt_str victim_name;

			victim_ref = (struct btrfs_inode_ref *)ptr;
			ret = read_alloc_one_name(leaf, (victim_ref + 1),
				 btrfs_inode_ref_name_len(leaf, victim_ref),
				 &victim_name);
			if (ret)
				return ret;

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, &victim_name);
			if (ret < 0) {
				kfree(victim_name.name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = unlink_inode_for_log_replay(trans, dir, inode,
								  &victim_name);
				kfree(victim_name.name);
				if (ret)
					return ret;
				goto again;
			}
			kfree(victim_name.name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
		}
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (IS_ERR(extref)) {
		return PTR_ERR(extref);
	} else if (extref) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			struct fscrypt_str victim_name;

			extref = (struct btrfs_inode_extref *)(base + cur_offset);
			/*
			 * Read the name length before the parent check so the
			 * "goto next" below never uses it uninitialized.
			 */
			victim_name.len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			ret = read_alloc_one_name(leaf, &extref->name,
						  victim_name.len, &victim_name);
			if (ret)
				return ret;

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name.name,
							      victim_name.len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, &victim_name);
			if (ret < 0) {
				kfree(victim_name.name);
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = unlink_inode_for_log_replay(trans,
							BTRFS_I(victim_parent),
							inode, &victim_name);
				}
				iput(victim_parent);
				kfree(victim_name.name);
				if (ret)
					return ret;
				goto again;
			}
			kfree(victim_name.name);
next:
			cur_offset += victim_name.len + sizeof(*extref);
		}
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

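/*
 * Extract the name, index and parent objectid from an extended inode ref at
 * the given offset inside an extent buffer.
 */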
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     struct fscrypt_str *name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;
	int ret;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	ret = read_alloc_one_name(eb, &extref->name,
				  btrfs_inode_extref_name_len(eb, extref), name);
	if (ret)
		return ret;

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

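/*
 * Same as extref_get_fields() but for an old style inode ref, where the
 * parent objectid is taken from the item key by the caller instead.
 */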
static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  struct fscrypt_str *name, u64 *index)
{
	struct btrfs_inode_ref *ref;
	int ret;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
				  name);
	if (ret)
		return ret;

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

/*
 * Take an inode reference item from the log tree and iterate all names from the
 * inode reference item in the subvolume tree with the same key (if it exists).
 * For any name that is not in the inode reference item from the log tree, do a
 * proper unlink of that name (that is, remove its entry from the inode
 * reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		struct fscrypt_str name;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &name, NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, &name);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name.name);
				goto out;
			}
			ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
							  inode, &name);
			kfree(name.name);
			iput(dir);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name.name);
		ref_ptr += name.len;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	/* Initialized so the kfree() at 'out' is safe on early errors. */
	struct fscrypt_str name = { 0 };
	int ret;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
		}
		if (ret)
			goto out;

		ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				   btrfs_ino(BTRFS_I(inode)), ref_index, &name);
		if (ret < 0) {
			goto out;
		} else if (ret == 0) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */
			ret = __add_inode_ref(trans, root, path, log,
					      BTRFS_I(dir), BTRFS_I(inode),
					      inode_objectid, parent_objectid,
					      ref_index, &name);
			if (ret) {
				if (ret == 1)
					ret = 0;
				goto out;
			}

			/* insert our name */
			ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
					     &name, 0, ref_index);
			if (ret)
				goto out;

			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
			if (ret)
				goto out;
		}
		/* Else, ret == 1, we already have a perfect match, we're done. */

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
		kfree(name.name);
		name.name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				    key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name.name);
	iput(dir);
	iput(inode);
	return ret;
}

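/*
 * Count how many names an inode has by walking all of its extended inode
 * refs. Returns the count on success and a negative errno on failure.
 */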
static int count_inode_extrefs(struct btrfs_root *root,
		struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

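/*
 * Same as count_inode_extrefs() but for old style inode reference items.
 */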
static int count_inode_refs(struct btrfs_root *root,
			struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size(path->nodes[0],
						path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corner cases where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
		if (ret)
			goto out;
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = btrfs_insert_orphan_item(trans, root, ino);
		if (ret == -EEXIST)
			ret = 0;
	}

out:
	btrfs_free_path(path);
	return ret;
}

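/*
 * Walk all orphan items recorded under BTRFS_TREE_LOG_FIXUP_OBJECTID and
 * recompute the link count of every inode they point to.
 */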
1667static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1668 struct btrfs_root *root,
1669 struct btrfs_path *path)
1670{
1671 int ret;
1672 struct btrfs_key key;
1673 struct inode *inode;
1674
1675 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1676 key.type = BTRFS_ORPHAN_ITEM_KEY;
1677 key.offset = (u64)-1;
d397712b 1678 while (1) {
e02119d5
CM
1679 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1680 if (ret < 0)
1681 break;
1682
1683 if (ret == 1) {
011b28ac 1684 ret = 0;
e02119d5
CM
1685 if (path->slots[0] == 0)
1686 break;
1687 path->slots[0]--;
1688 }
1689
1690 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1691 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1692 key.type != BTRFS_ORPHAN_ITEM_KEY)
1693 break;
1694
1695 ret = btrfs_del_item(trans, root, path);
65a246c5 1696 if (ret)
011b28ac 1697 break;
e02119d5 1698
b3b4aa74 1699 btrfs_release_path(path);
e02119d5 1700 inode = read_one_inode(root, key.offset);
011b28ac
JB
1701 if (!inode) {
1702 ret = -EIO;
1703 break;
1704 }
e02119d5
CM
1705
1706 ret = fixup_inode_link_count(trans, root, inode);
e02119d5 1707 iput(inode);
3650860b 1708 if (ret)
011b28ac 1709 break;
e02119d5 1710
12fcfd22
CM
1711 /*
1712 * fixup on a directory may create new entries,
1713 * make sure we always look for the highset possible
1714 * offset
1715 */
1716 key.offset = (u64)-1;
e02119d5 1717 }
b3b4aa74 1718 btrfs_release_path(path);
65a246c5 1719 return ret;
e02119d5
CM
1720}

/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done. The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
	} else if (ret == -EEXIST) {
		ret = 0;
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist. This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    const struct fscrypt_str *name,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			     1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}
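
/*
 * Userspace illustration, a minimal sketch only: since replay inserts a
 * name only when the target inode exists in the log, fsyncing a directory
 * does not make the new files inside it durable; each file still needs its
 * own fsync:
 */
#if 0
	int dfd = open("dir", O_DIRECTORY | O_RDONLY);
	int fd = openat(dfd, "newfile", O_CREAT | O_WRONLY, 0644);

	write(fd, "data", 4);
	fsync(dfd);	/* persists the directory's log, not newfile's data */
	fsync(fd);	/* this is what makes newfile itself survive a crash */
#endif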

static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
					struct btrfs_inode *dir,
					struct btrfs_path *path,
					struct btrfs_dir_item *dst_di,
					const struct btrfs_key *log_key,
					u8 log_flags,
					bool exists)
{
	struct btrfs_key found_key;

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* The existing dentry points to the same inode, don't delete it. */
	if (found_key.objectid == log_key->objectid &&
	    found_key.type == log_key->type &&
	    found_key.offset == log_key->offset &&
	    btrfs_dir_flags(path->nodes[0], dst_di) == log_flags)
		return 1;

	/*
	 * Don't drop the conflicting directory entry if the inode for the new
	 * entry doesn't exist.
	 */
	if (!exists)
		return 0;

	return drop_one_dir_item(trans, path, dir, dst_di);
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped. fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	struct fscrypt_str name;
	struct btrfs_dir_item *dir_dst_di;
	struct btrfs_dir_item *index_dst_di;
	bool dir_dst_matches = false;
	bool index_dst_matches = false;
	struct btrfs_key log_key;
	struct btrfs_key search_key;
	struct inode *dir;
	u8 log_flags;
	bool exists;
	int ret;
	bool update_size = true;
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
	if (ret)
		goto out;

	log_flags = btrfs_dir_flags(eb, di);
	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	btrfs_release_path(path);
	if (ret < 0)
		goto out;
	exists = (ret == 0);
	ret = 0;

	dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					   &name, 1);
	if (IS_ERR(dir_dst_di)) {
		ret = PTR_ERR(dir_dst_di);
		goto out;
	} else if (dir_dst_di) {
		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
						   dir_dst_di, &log_key,
						   log_flags, exists);
		if (ret < 0)
			goto out;
		dir_dst_matches = (ret == 1);
	}

	btrfs_release_path(path);

	index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						   key->objectid, key->offset,
						   &name, 1);
	if (IS_ERR(index_dst_di)) {
		ret = PTR_ERR(index_dst_di);
		goto out;
	} else if (index_dst_di) {
		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
						   index_dst_di, &log_key,
						   log_flags, exists);
		if (ret < 0)
			goto out;
		index_dst_matches = (ret == 1);
	}

	btrfs_release_path(path);

	if (dir_dst_matches && index_dst_matches) {
		ret = 0;
		update_size = false;
		goto out;
	}

	/*
	 * Check if the inode reference exists in the log for the given name,
	 * inode and parent inode
	 */
	search_key.objectid = log_key.objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = key->objectid;
	ret = backref_in_log(root->log_root, &search_key, 0, &name);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}

	search_key.objectid = log_key.objectid;
	search_key.type = BTRFS_INODE_EXTREF_KEY;
	search_key.offset = key->objectid;
	ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      &name, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;

out:
	if (!ret && update_size) {
		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
		ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
	}
	kfree(name.name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;
}

/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret;
	struct btrfs_dir_item *di;

	/* We only log dir index keys, which only contain a single dir item. */
	ASSERT(key->type == BTRFS_DIR_INDEX_KEY);

	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
	ret = replay_one_name(trans, root, path, eb, di, key);
	if (ret < 0)
		return ret;

	/*
	 * If this entry refers to a non-directory (directories can not have a
	 * link count > 1) and it was added in a transaction that was not
	 * committed, make sure we fixup the link count of the inode the entry
	 * points to. Otherwise something like the following would result in a
	 * directory pointing to an inode with a wrong link count that does not
	 * account for this dir entry:
	 *
	 * mkdir testdir
	 * touch testdir/foo
	 * touch testdir/bar
	 * sync
	 *
	 * ln testdir/bar testdir/bar_link
	 * ln testdir/foo testdir/foo_link
	 * xfs_io -c "fsync" testdir/bar
	 *
	 * <power failure>
	 *
	 * mount fs, log replay happens
	 *
	 * File foo would remain with a link count of 1 when it has two entries
	 * pointing to it in the directory testdir. This would make it
	 * impossible to ever delete the parent directory, as it would result
	 * in stale dentries that can never be deleted.
	 */
	if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
		struct btrfs_path *fixup_path;
		struct btrfs_key di_key;

		fixup_path = btrfs_alloc_path();
		if (!fixup_path)
			return -ENOMEM;

		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
		ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
		btrfs_free_path(fixup_path);
	}

	return ret;
}
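
/*
 * The reproducer from the comment above, written out as a minimal
 * userspace sketch (illustration only, assuming the current directory is
 * on a btrfs mount):
 */
#if 0
	mkdir("testdir", 0755);
	close(open("testdir/foo", O_CREAT | O_WRONLY, 0644));
	close(open("testdir/bar", O_CREAT | O_WRONLY, 0644));
	sync();
	link("testdir/bar", "testdir/bar_link");
	link("testdir/foo", "testdir/foo_link");
	fsync(open("testdir/bar", O_RDONLY));
	/*
	 * <power failure>, mount again: without the fixup above, foo would
	 * keep a link count of 1 despite two names pointing at it.
	 */
#endif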

/*
 * directory replay has two parts. There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for. During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log,
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = BTRFS_DIR_LOG_INDEX_KEY;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	path->slots[0]++;
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
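
/*
 * Sketch of the caller-side contract (this mirrors the outer loop of
 * replay_dir_deletes() below and is shown only to make the start/end
 * semantics of find_dir_range() explicit):
 */
#if 0
	u64 start = 0, end = 0;

	while (find_dir_range(log, path, dirid, &start, &end) == 0) {
		/*
		 * The log is authoritative for index keys in [start, end];
		 * any such key present in the subvolume but absent from the
		 * log was deleted before the fsync and must be removed.
		 */
		if (end == (u64)-1)
			break;
		start = end + 1;
	}
#endif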

/*
 * this looks for a given directory item in the log. If the directory
 * item is not in the log, the item is removed and the inode it points
 * to is unlinked
 */
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				      struct btrfs_root *log,
				      struct btrfs_path *path,
				      struct btrfs_path *log_path,
				      struct inode *dir,
				      struct btrfs_key *dir_key)
{
	struct btrfs_root *root = BTRFS_I(dir)->root;
	int ret;
	struct extent_buffer *eb;
	int slot;
	struct btrfs_dir_item *di;
	struct fscrypt_str name;
	struct inode *inode = NULL;
	struct btrfs_key location;

	/*
	 * Currently we only log dir index keys. Even if we replay a log created
	 * by an older kernel that logged both dir index and dir item keys, all
	 * we need to do is process the dir index keys, we (and our caller) can
	 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
	 */
	ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);

	eb = path->nodes[0];
	slot = path->slots[0];
	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
	ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
	if (ret)
		goto out;

	if (log) {
		struct btrfs_dir_item *log_di;

		log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
						     dir_key->objectid,
						     dir_key->offset, &name, 0);
		if (IS_ERR(log_di)) {
			ret = PTR_ERR(log_di);
			goto out;
		} else if (log_di) {
			/* The dentry exists in the log, we have nothing to do. */
			ret = 0;
			goto out;
		}
	}

	btrfs_dir_item_key_to_cpu(eb, di, &location);
	btrfs_release_path(path);
	btrfs_release_path(log_path);
	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	inc_nlink(inode);
	ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
					  &name);
	/*
	 * Unlike dir item keys, dir index keys can only have one name (entry) in
	 * them, as there are no key collisions since each key has a unique offset
	 * (an index number), so we're done.
	 */
out:
	btrfs_release_path(path);
	btrfs_release_path(log_path);
	kfree(name.name);
	iput(inode);
	return ret;
}

static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_root *log,
				struct btrfs_path *path,
				const u64 ino)
{
	struct btrfs_key search_key;
	struct btrfs_path *log_path;
	int i;
	int nritems;
	int ret;

	log_path = btrfs_alloc_path();
	if (!log_path)
		return -ENOMEM;

	search_key.objectid = ino;
	search_key.type = BTRFS_XATTR_ITEM_KEY;
	search_key.offset = 0;
again:
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret < 0)
		goto out;
process_leaf:
	nritems = btrfs_header_nritems(path->nodes[0]);
	for (i = path->slots[0]; i < nritems; i++) {
		struct btrfs_key key;
		struct btrfs_dir_item *di;
		struct btrfs_dir_item *log_di;
		u32 total_size;
		u32 cur;

		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
			ret = 0;
			goto out;
		}

		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
		total_size = btrfs_item_size(path->nodes[0], i);
		cur = 0;
		while (cur < total_size) {
			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
			u32 this_len = sizeof(*di) + name_len + data_len;
			char *name;

			name = kmalloc(name_len, GFP_NOFS);
			if (!name) {
				ret = -ENOMEM;
				goto out;
			}
			read_extent_buffer(path->nodes[0], name,
					   (unsigned long)(di + 1), name_len);

			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
						    name, name_len, 0);
			btrfs_release_path(log_path);
			if (!log_di) {
				/* Doesn't exist in log tree, so delete it. */
				btrfs_release_path(path);
				di = btrfs_lookup_xattr(trans, root, path, ino,
							name, name_len, -1);
				kfree(name);
				if (IS_ERR(di)) {
					ret = PTR_ERR(di);
					goto out;
				}
				ASSERT(di);
				ret = btrfs_delete_one_dir_name(trans, root,
								path, di);
				if (ret)
					goto out;
				btrfs_release_path(path);
				search_key = key;
				goto again;
			}
			kfree(name);
			if (IS_ERR(log_di)) {
				ret = PTR_ERR(log_di);
				goto out;
			}
			cur += this_len;
			di = (struct btrfs_dir_item *)((char *)di + this_len);
		}
	}
	ret = btrfs_next_leaf(root, path);
	if (ret > 0)
		ret = 0;
	else if (ret == 0)
		goto process_leaf;
out:
	btrfs_free_path(log_path);
	btrfs_release_path(path);
	return ret;
}

/*
 * deletion replay happens before we copy any new directory items
 * out of the log or out of backreferences from inodes. It
 * scans the log to find ranges of keys that the log is authoritative for,
 * and then scans the directory to find items in those ranges that are
 * not present in the log.
 *
 * Anything we don't find in the log is unlinked and removed from the
 * directory.
 */
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all)
{
	u64 range_start;
	u64 range_end;
	int ret = 0;
	struct btrfs_key dir_key;
	struct btrfs_key found_key;
	struct btrfs_path *log_path;
	struct inode *dir;

	dir_key.objectid = dirid;
	dir_key.type = BTRFS_DIR_INDEX_KEY;
	log_path = btrfs_alloc_path();
	if (!log_path)
		return -ENOMEM;

	dir = read_one_inode(root, dirid);
	/* it isn't an error if the inode isn't there, that can happen
	 * because we replay the deletes before we copy in the inode item
	 * from the log
	 */
	if (!dir) {
		btrfs_free_path(log_path);
		return 0;
	}

	range_start = 0;
	range_end = 0;
	while (1) {
		if (del_all)
			range_end = (u64)-1;
		else {
			ret = find_dir_range(log, path, dirid,
					     &range_start, &range_end);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
		}

		dir_key.offset = range_start;
		while (1) {
			int nritems;
			ret = btrfs_search_slot(NULL, root, &dir_key, path,
						0, 0);
			if (ret < 0)
				goto out;

			nritems = btrfs_header_nritems(path->nodes[0]);
			if (path->slots[0] >= nritems) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 1)
					break;
				else if (ret < 0)
					goto out;
			}
			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
					      path->slots[0]);
			if (found_key.objectid != dirid ||
			    found_key.type != dir_key.type) {
				ret = 0;
				goto out;
			}

			if (found_key.offset > range_end)
				break;

			ret = check_item_in_log(trans, log, path,
						log_path, dir,
						&found_key);
			if (ret)
				goto out;
			if (found_key.offset == (u64)-1)
				break;
			dir_key.offset = found_key.offset + 1;
		}
		btrfs_release_path(path);
		if (range_end == (u64)-1)
			break;
		range_start = range_end + 1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	btrfs_free_path(log_path);
	iput(dir);
	return ret;
}

/*
 * the process_func used to replay items from the log tree. This
 * gets called in two different stages. The first stage just looks
 * for inodes and makes sure they are all copied into the subvolume.
 *
 * The second stage copies all the other item types from the log into
 * the subvolume. The two stage approach is slower, but gets rid of
 * lots of complexity around inodes referencing other inodes that exist
 * only in the log (references come from either directory items or inode
 * back refs).
 */
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
			     struct walk_control *wc, u64 gen, int level)
{
	int nritems;
	struct btrfs_path *path;
	struct btrfs_root *root = wc->replay_dest;
	struct btrfs_key key;
	int i;
	int ret;

	ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
	if (ret)
		return ret;

	level = btrfs_header_level(eb);

	if (level != 0)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	nritems = btrfs_header_nritems(eb);
	for (i = 0; i < nritems; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);

		/* inode keys are done during the first stage */
		if (key.type == BTRFS_INODE_ITEM_KEY &&
		    wc->stage == LOG_WALK_REPLAY_INODES) {
			struct btrfs_inode_item *inode_item;
			u32 mode;

			inode_item = btrfs_item_ptr(eb, i,
						    struct btrfs_inode_item);
			/*
			 * If we have a tmpfile (O_TMPFILE) that got fsync'ed
			 * and never got linked before the fsync, skip it, as
			 * replaying it is pointless since it would be deleted
			 * later. We skip logging tmpfiles, but it's always
			 * possible we are replaying a log created with a kernel
			 * that used to log tmpfiles.
			 */
			if (btrfs_inode_nlink(eb, inode_item) == 0) {
				wc->ignore_cur_inode = true;
				continue;
			} else {
				wc->ignore_cur_inode = false;
			}
			ret = replay_xattr_deletes(wc->trans, root, log,
						   path, key.objectid);
			if (ret)
				break;
			mode = btrfs_inode_mode(eb, inode_item);
			if (S_ISDIR(mode)) {
				ret = replay_dir_deletes(wc->trans,
					 root, log, path, key.objectid, 0);
				if (ret)
					break;
			}
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			if (ret)
				break;

			/*
			 * Before replaying extents, truncate the inode to its
			 * size. We need to do it now and not after log replay
			 * because before an fsync we can have prealloc extents
			 * added beyond the inode's i_size. If we did it after,
			 * through orphan cleanup for example, we would drop
			 * those prealloc extents just after replaying them.
			 */
			if (S_ISREG(mode)) {
				struct btrfs_drop_extents_args drop_args = { 0 };
				struct inode *inode;
				u64 from;

				inode = read_one_inode(root, key.objectid);
				if (!inode) {
					ret = -EIO;
					break;
				}
				from = ALIGN(i_size_read(inode),
					     root->fs_info->sectorsize);
				drop_args.start = from;
				drop_args.end = (u64)-1;
				drop_args.drop_cache = true;
				ret = btrfs_drop_extents(wc->trans, root,
							 BTRFS_I(inode),
							 &drop_args);
				if (!ret) {
					inode_sub_bytes(inode,
							drop_args.bytes_found);
					/* Update the inode's nbytes. */
					ret = btrfs_update_inode(wc->trans,
							root, BTRFS_I(inode));
				}
				iput(inode);
				if (ret)
					break;
			}

			ret = link_to_fixup_dir(wc->trans, root,
						path, key.objectid);
			if (ret)
				break;
		}

		if (wc->ignore_cur_inode)
			continue;

		if (key.type == BTRFS_DIR_INDEX_KEY &&
		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
			ret = replay_one_dir_item(wc->trans, root, path,
						  eb, i, &key);
			if (ret)
				break;
		}

		if (wc->stage < LOG_WALK_REPLAY_ALL)
			continue;

		/* these keys are simply copied */
		if (key.type == BTRFS_XATTR_ITEM_KEY) {
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			if (ret)
				break;
		} else if (key.type == BTRFS_INODE_REF_KEY ||
			   key.type == BTRFS_INODE_EXTREF_KEY) {
			ret = add_inode_ref(wc->trans, root, log, path,
					    eb, i, &key);
			if (ret && ret != -ENOENT)
				break;
			ret = 0;
		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
			ret = replay_one_extent(wc->trans, root, path,
						eb, i, &key);
			if (ret)
				break;
		}
		/*
		 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
		 * BTRFS_DIR_INDEX_KEY items which we use to derive the
		 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
		 * older kernel with such keys, ignore them.
		 */
	}
	btrfs_free_path(path);
	return ret;
}

/*
 * Correctly adjust the reserved bytes occupied by a log tree extent buffer
 */
static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
{
	struct btrfs_block_group *cache;

	cache = btrfs_lookup_block_group(fs_info, start);
	if (!cache) {
		btrfs_err(fs_info, "unable to find block group for %llu", start);
		return;
	}

	spin_lock(&cache->space_info->lock);
	spin_lock(&cache->lock);
	cache->reserved -= fs_info->nodesize;
	cache->space_info->bytes_reserved -= fs_info->nodesize;
	spin_unlock(&cache->lock);
	spin_unlock(&cache->space_info->lock);

	btrfs_put_block_group(cache);
}

static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_path *path, int *level,
				       struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 ptr_gen;
	struct extent_buffer *next;
	struct extent_buffer *cur;
	u32 blocksize;
	int ret = 0;

	while (*level > 0) {
		struct btrfs_key first_key;

		cur = path->nodes[*level];

		WARN_ON(btrfs_header_level(cur) != *level);

		if (path->slots[*level] >=
		    btrfs_header_nritems(cur))
			break;

		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
		btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
		blocksize = fs_info->nodesize;

		next = btrfs_find_create_tree_block(fs_info, bytenr,
						    btrfs_header_owner(cur),
						    *level - 1);
		if (IS_ERR(next))
			return PTR_ERR(next);

		if (*level == 1) {
			ret = wc->process_func(root, next, wc, ptr_gen,
					       *level - 1);
			if (ret) {
				free_extent_buffer(next);
				return ret;
			}

			path->slots[*level]++;
			if (wc->free) {
				ret = btrfs_read_extent_buffer(next, ptr_gen,
						*level - 1, &first_key);
				if (ret) {
					free_extent_buffer(next);
					return ret;
				}

				if (trans) {
					btrfs_tree_lock(next);
					btrfs_clean_tree_block(next);
					btrfs_wait_tree_block_writeback(next);
					btrfs_tree_unlock(next);
					ret = btrfs_pin_reserved_extent(trans,
							bytenr, blocksize);
					if (ret) {
						free_extent_buffer(next);
						return ret;
					}
					btrfs_redirty_list_add(
						trans->transaction, next);
				} else {
					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
						clear_extent_buffer_dirty(next);
					unaccount_log_buffer(fs_info, bytenr);
				}
			}
			free_extent_buffer(next);
			continue;
		}
		ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key);
		if (ret) {
			free_extent_buffer(next);
			return ret;
		}

		if (path->nodes[*level-1])
			free_extent_buffer(path->nodes[*level-1]);
		path->nodes[*level-1] = next;
		*level = btrfs_header_level(next);
		path->slots[*level] = 0;
		cond_resched();
	}
	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);

	cond_resched();
	return 0;
}

static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path, int *level,
				     struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int i;
	int slot;
	int ret;

	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
		slot = path->slots[i];
		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
			path->slots[i]++;
			*level = i;
			WARN_ON(*level == 0);
			return 0;
		} else {
			ret = wc->process_func(root, path->nodes[*level], wc,
				 btrfs_header_generation(path->nodes[*level]),
				 *level);
			if (ret)
				return ret;

			if (wc->free) {
				struct extent_buffer *next;

				next = path->nodes[*level];

				if (trans) {
					btrfs_tree_lock(next);
					btrfs_clean_tree_block(next);
					btrfs_wait_tree_block_writeback(next);
					btrfs_tree_unlock(next);
					ret = btrfs_pin_reserved_extent(trans,
						     path->nodes[*level]->start,
						     path->nodes[*level]->len);
					if (ret)
						return ret;
					btrfs_redirty_list_add(trans->transaction,
							       next);
				} else {
					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
						clear_extent_buffer_dirty(next);

					unaccount_log_buffer(fs_info,
						path->nodes[*level]->start);
				}
			}
			free_extent_buffer(path->nodes[*level]);
			path->nodes[*level] = NULL;
			*level = i + 1;
		}
	}
	return 1;
}

/*
 * drop the reference count on the tree rooted at 'log'. This traverses
 * the tree freeing any blocks that have a ref count of zero after being
 * decremented.
 */
static int walk_log_tree(struct btrfs_trans_handle *trans,
			 struct btrfs_root *log, struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;
	int wret;
	int level;
	struct btrfs_path *path;
	int orig_level;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	level = btrfs_header_level(log->node);
	orig_level = level;
	path->nodes[level] = log->node;
	atomic_inc(&log->node->refs);
	path->slots[level] = 0;

	while (1) {
		wret = walk_down_log_tree(trans, log, path, &level, wc);
		if (wret > 0)
			break;
		if (wret < 0) {
			ret = wret;
			goto out;
		}

		wret = walk_up_log_tree(trans, log, path, &level, wc);
		if (wret > 0)
			break;
		if (wret < 0) {
			ret = wret;
			goto out;
		}
	}

	/* was the root node processed? if not, catch it here */
	if (path->nodes[orig_level]) {
		ret = wc->process_func(log, path->nodes[orig_level], wc,
			 btrfs_header_generation(path->nodes[orig_level]),
			 orig_level);
		if (ret)
			goto out;
		if (wc->free) {
			struct extent_buffer *next;

			next = path->nodes[orig_level];

			if (trans) {
				btrfs_tree_lock(next);
				btrfs_clean_tree_block(next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);
				ret = btrfs_pin_reserved_extent(trans,
						next->start, next->len);
				if (ret)
					goto out;
				btrfs_redirty_list_add(trans->transaction, next);
			} else {
				if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
					clear_extent_buffer_dirty(next);
				unaccount_log_buffer(fs_info, next->start);
			}
		}
	}

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to update the item for a given subvolume's log root
 * in the tree of log roots
 */
static int update_log_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *log,
			   struct btrfs_root_item *root_item)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret;

	if (log->log_transid == 1) {
		/* insert root item on the first sync */
		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
					&log->root_key, root_item);
	} else {
		ret = btrfs_update_root(trans, fs_info->log_root_tree,
					&log->root_key, root_item);
	}
	return ret;
}

static void wait_log_commit(struct btrfs_root *root, int transid)
{
	DEFINE_WAIT(wait);
	int index = transid % 2;

	/*
	 * we only allow two pending log transactions at a time,
	 * so we know that if ours is more than 2 older than the
	 * current transaction, we're done
	 */
	for (;;) {
		prepare_to_wait(&root->log_commit_wait[index],
				&wait, TASK_UNINTERRUPTIBLE);

		if (!(root->log_transid_committed < transid &&
		      atomic_read(&root->log_commit[index])))
			break;

		mutex_unlock(&root->log_mutex);
		schedule();
		mutex_lock(&root->log_mutex);
	}
	finish_wait(&root->log_commit_wait[index], &wait);
}

static void wait_for_writer(struct btrfs_root *root)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&root->log_writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (!atomic_read(&root->log_writers))
			break;

		mutex_unlock(&root->log_mutex);
		schedule();
		mutex_lock(&root->log_mutex);
	}
	finish_wait(&root->log_writer_wait, &wait);
}
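
/*
 * Illustrative model, not part of the build: log commits are double
 * buffered, so at most two log transactions can be in flight per root and
 * all of the commit bookkeeping is indexed by transid % 2:
 */
#if 0
	int index = transid % 2;	/* slot used by this log transaction */
	int other = (index + 1) % 2;	/* slot of the previous one */

	/* a committer first drains the previous slot, then claims its own */
	if (atomic_read(&root->log_commit[other]))
		wait_log_commit(root, transid - 1);
	atomic_set(&root->log_commit[index], 1);
#endif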

static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
					struct btrfs_log_ctx *ctx)
{
	mutex_lock(&root->log_mutex);
	list_del_init(&ctx->list);
	mutex_unlock(&root->log_mutex);
}

/*
 * Invoked with the log mutex held, or when it is certain that no other
 * task can access the list.
 */
static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
					     int index, int error)
{
	struct btrfs_log_ctx *ctx;
	struct btrfs_log_ctx *safe;

	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
		list_del_init(&ctx->list);
		ctx->log_ret = error;
	}
}

/*
 * btrfs_sync_log sends a given tree log down to the disk and
 * updates the super blocks to record it. When this call is done,
 * you know that any inodes previously logged are safely on disk only
 * if it returns 0.
 *
 * Any other return value means you need to call btrfs_commit_transaction.
 * Some of the edge cases for fsyncing directories that have had unlinks
 * or renames done in the past mean that sometimes the only safe
 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
 * that has happened.
 */
int btrfs_sync_log(struct btrfs_trans_handle *trans,
		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
	int index1;
	int index2;
	int mark;
	int ret;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *log = root->log_root;
	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
	struct btrfs_root_item new_root_item;
	int log_transid = 0;
	struct btrfs_log_ctx root_log_ctx;
	struct blk_plug plug;
	u64 log_root_start;
	u64 log_root_level;

	mutex_lock(&root->log_mutex);
	log_transid = ctx->log_transid;
	if (root->log_transid_committed >= log_transid) {
		mutex_unlock(&root->log_mutex);
		return ctx->log_ret;
	}

	index1 = log_transid % 2;
	if (atomic_read(&root->log_commit[index1])) {
		wait_log_commit(root, log_transid);
		mutex_unlock(&root->log_mutex);
		return ctx->log_ret;
	}
	ASSERT(log_transid == root->log_transid);
	atomic_set(&root->log_commit[index1], 1);

	/* wait for previous tree log sync to complete */
	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
		wait_log_commit(root, log_transid - 1);

	while (1) {
		int batch = atomic_read(&root->log_batch);
		/* when we're on an ssd, just kick the log commit out */
		if (!btrfs_test_opt(fs_info, SSD) &&
		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
			mutex_unlock(&root->log_mutex);
			schedule_timeout_uninterruptible(1);
			mutex_lock(&root->log_mutex);
		}
		wait_for_writer(root);
		if (batch == atomic_read(&root->log_batch))
			break;
	}

	/* bail out if we need to do a full commit */
	if (btrfs_need_log_full_commit(trans)) {
		ret = BTRFS_LOG_FORCE_COMMIT;
		mutex_unlock(&root->log_mutex);
		goto out;
	}

	if (log_transid % 2 == 0)
		mark = EXTENT_DIRTY;
	else
		mark = EXTENT_NEW;

	/* we start IO on all the marked extents here, but we don't actually
	 * wait for them until later.
	 */
	blk_start_plug(&plug);
	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
	/*
	 * -EAGAIN happens when someone, e.g., a concurrent transaction
	 * commit, writes a dirty extent in this tree-log commit. This
	 * concurrent write will create a hole writing out the extents,
	 * and we cannot proceed on a zoned filesystem, which requires
	 * sequential writing. While we could bail out to a full commit
	 * here, we continue instead, hoping the concurrent writing fills
	 * the hole.
	 */
	if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
		ret = 0;
	if (ret) {
		blk_finish_plug(&plug);
		btrfs_abort_transaction(trans, ret);
		btrfs_set_log_full_commit(trans);
		mutex_unlock(&root->log_mutex);
		goto out;
	}

	/*
	 * We _must_ update under the root->log_mutex in order to make sure we
	 * have a consistent view of the log root we are trying to commit at
	 * this moment.
	 *
	 * We _must_ copy this into a local copy, because we are not holding the
	 * log_root_tree->log_mutex yet. This is important because when we
	 * commit the log_root_tree we must have a consistent view of the
	 * log_root_tree when we update the super block to point at the
	 * log_root_tree bytenr. If we update the log_root_tree here we'll race
	 * with the commit and possibly point at the new block which we may not
	 * have written out.
	 */
	btrfs_set_root_node(&log->root_item, log->node);
	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));

	root->log_transid++;
	log->log_transid = root->log_transid;
	root->log_start_pid = 0;
	/*
	 * IO has been started, blocks of the log tree have WRITTEN flag set
	 * in their headers. new modifications of the log will be written to
	 * new positions. so it's safe to allow log writers to go in.
	 */
	mutex_unlock(&root->log_mutex);

	if (btrfs_is_zoned(fs_info)) {
		mutex_lock(&fs_info->tree_root->log_mutex);
		if (!log_root_tree->node) {
			ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
			if (ret) {
				mutex_unlock(&fs_info->tree_root->log_mutex);
				blk_finish_plug(&plug);
				goto out;
			}
		}
		mutex_unlock(&fs_info->tree_root->log_mutex);
	}

	btrfs_init_log_ctx(&root_log_ctx, NULL);

	mutex_lock(&log_root_tree->log_mutex);

	index2 = log_root_tree->log_transid % 2;
	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
	root_log_ctx.log_transid = log_root_tree->log_transid;

	/*
	 * Now we are safe to update the log_root_tree because we're under the
	 * log_mutex, and we're a current writer so we're holding the commit
	 * open until we drop the log_mutex.
	 */
	ret = update_log_root(trans, log, &new_root_item);
	if (ret) {
		if (!list_empty(&root_log_ctx.list))
			list_del_init(&root_log_ctx.list);

		blk_finish_plug(&plug);
		btrfs_set_log_full_commit(trans);

		if (ret != -ENOSPC) {
			btrfs_abort_transaction(trans, ret);
			mutex_unlock(&log_root_tree->log_mutex);
			goto out;
		}
		btrfs_wait_tree_log_extents(log, mark);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = BTRFS_LOG_FORCE_COMMIT;
		goto out;
	}

	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
		blk_finish_plug(&plug);
		list_del_init(&root_log_ctx.list);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = root_log_ctx.log_ret;
		goto out;
	}

	index2 = root_log_ctx.log_transid % 2;
	if (atomic_read(&log_root_tree->log_commit[index2])) {
		blk_finish_plug(&plug);
		ret = btrfs_wait_tree_log_extents(log, mark);
		wait_log_commit(log_root_tree,
				root_log_ctx.log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		if (!ret)
			ret = root_log_ctx.log_ret;
		goto out;
	}
	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
	atomic_set(&log_root_tree->log_commit[index2], 1);

	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
		wait_log_commit(log_root_tree,
				root_log_ctx.log_transid - 1);
	}

	/*
	 * now that we've moved on to the tree of log tree roots,
	 * check the full commit flag again
	 */
	if (btrfs_need_log_full_commit(trans)) {
		blk_finish_plug(&plug);
		btrfs_wait_tree_log_extents(log, mark);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = BTRFS_LOG_FORCE_COMMIT;
		goto out_wake_log_root;
	}

	ret = btrfs_write_marked_extents(fs_info,
					 &log_root_tree->dirty_log_pages,
					 EXTENT_DIRTY | EXTENT_NEW);
	blk_finish_plug(&plug);
	/*
	 * As described above, -EAGAIN indicates a hole in the extents. We
	 * cannot wait for these write outs since the waiting would cause a
	 * deadlock. Bail out to the full commit instead.
	 */
	if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
		btrfs_set_log_full_commit(trans);
		btrfs_wait_tree_log_extents(log, mark);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	} else if (ret) {
		btrfs_set_log_full_commit(trans);
		btrfs_abort_transaction(trans, ret);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	}
	ret = btrfs_wait_tree_log_extents(log, mark);
	if (!ret)
		ret = btrfs_wait_tree_log_extents(log_root_tree,
						  EXTENT_NEW | EXTENT_DIRTY);
	if (ret) {
		btrfs_set_log_full_commit(trans);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	}

	log_root_start = log_root_tree->node->start;
	log_root_level = btrfs_header_level(log_root_tree->node);
	log_root_tree->log_transid++;
	mutex_unlock(&log_root_tree->log_mutex);

	/*
	 * Here we are guaranteed that nobody is going to write the superblock
	 * for the current transaction before us, and that we do not write
	 * our superblock before the previous transaction finishes its commit
	 * and writes its superblock, because:
	 *
	 * 1) We are holding a handle on the current transaction, so nobody
	 *    can commit it until we release the handle;
	 *
	 * 2) Before writing our superblock we acquire the tree_log_mutex, so
	 *    if the previous transaction is still committing, and hasn't yet
	 *    written its superblock, we wait for it to do it, because a
	 *    transaction commit acquires the tree_log_mutex when the commit
	 *    begins and releases it only after writing its superblock.
	 */
	mutex_lock(&fs_info->tree_log_mutex);

	/*
	 * The previous transaction writeout phase could have failed, and thus
	 * marked the fs in an error state. We must not commit here, as we
	 * could have updated our generation in the super_for_commit and
	 * writing the super here would result in transid mismatches. If there
	 * is an error here just bail.
	 */
	if (BTRFS_FS_ERROR(fs_info)) {
		ret = -EIO;
		btrfs_set_log_full_commit(trans);
		btrfs_abort_transaction(trans, ret);
		mutex_unlock(&fs_info->tree_log_mutex);
		goto out_wake_log_root;
	}

	btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
	btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
	ret = write_all_supers(fs_info, 1);
	mutex_unlock(&fs_info->tree_log_mutex);
	if (ret) {
		btrfs_set_log_full_commit(trans);
		btrfs_abort_transaction(trans, ret);
		goto out_wake_log_root;
	}

	/*
	 * We know there can only be one task here, since we have not yet set
	 * root->log_commit[index1] to 0 and any task attempting to sync the
	 * log must wait for the previous log transaction to commit if it's
	 * still in progress or wait for the current log transaction commit if
	 * someone else already started it. We use <= and not < because the
	 * first log transaction has an ID of 0.
	 */
	ASSERT(root->last_log_commit <= log_transid);
	root->last_log_commit = log_transid;

out_wake_log_root:
	mutex_lock(&log_root_tree->log_mutex);
	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);

	log_root_tree->log_transid_committed++;
	atomic_set(&log_root_tree->log_commit[index2], 0);
	mutex_unlock(&log_root_tree->log_mutex);

	/*
	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
	 * all the updates above are seen by the woken threads. It might not be
	 * necessary, but proving that seems to be hard.
	 */
	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
out:
	mutex_lock(&root->log_mutex);
	btrfs_remove_all_log_ctxs(root, index1, ret);
	root->log_transid_committed++;
	atomic_set(&root->log_commit[index1], 0);
	mutex_unlock(&root->log_mutex);

	/*
	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
	 * all the updates above are seen by the woken threads. It might not be
	 * necessary, but proving that seems to be hard.
	 */
	cond_wake_up(&root->log_commit_wait[index1]);
	return ret;
}

static void free_log_tree(struct btrfs_trans_handle *trans,
			  struct btrfs_root *log)
{
	int ret;
	struct walk_control wc = {
		.free = 1,
		.process_func = process_one_buffer
	};

	if (log->node) {
		ret = walk_log_tree(trans, log, &wc);
		if (ret) {
			/*
			 * We weren't able to traverse the entire log tree, the
			 * typical scenario is getting an -EIO when reading an
			 * extent buffer of the tree, due to a previous writeback
			 * failure of it.
			 */
			set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
				&log->fs_info->fs_state);

			/*
			 * Some extent buffers of the log tree may still be dirty
			 * and not yet written back to storage, because we may
			 * have updates to a log tree without syncing a log tree,
			 * such as during rename and link operations. So flush
			 * them out and wait for their writeback to complete, so
			 * that we properly cleanup their state and pages.
			 */
			btrfs_write_marked_extents(log->fs_info,
						   &log->dirty_log_pages,
						   EXTENT_DIRTY | EXTENT_NEW);
			btrfs_wait_tree_log_extents(log,
						    EXTENT_DIRTY | EXTENT_NEW);

			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(log->fs_info, ret, NULL);
		}
	}

	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
			  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
	extent_io_tree_release(&log->log_csum_range);

	btrfs_put_root(log);
}

/*
 * free all the extents used by the tree log. This should be called
 * at commit time of the full transaction
 */
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
{
	if (root->log_root) {
		free_log_tree(trans, root->log_root);
		root->log_root = NULL;
		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
	}
	return 0;
}

int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info)
{
	if (fs_info->log_root_tree) {
		free_log_tree(trans, fs_info->log_root_tree);
		fs_info->log_root_tree = NULL;
		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
	}
	return 0;
}
3296
803f0f64 3297/*
0f8ce498
FM
3298 * Check if an inode was logged in the current transaction. This correctly deals
3299 * with the case where the inode was logged but has a logged_trans of 0, which
3300 * happens if the inode is evicted and loaded again, as logged_trans is an in
3301 * memory only field (not persisted).
3302 *
3303 * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
3304 * and < 0 on error.
803f0f64 3305 */
0f8ce498
FM
3306static int inode_logged(struct btrfs_trans_handle *trans,
3307 struct btrfs_inode *inode,
3308 struct btrfs_path *path_in)
803f0f64 3309{
0f8ce498
FM
3310 struct btrfs_path *path = path_in;
3311 struct btrfs_key key;
3312 int ret;
3313
803f0f64 3314 if (inode->logged_trans == trans->transid)
0f8ce498 3315 return 1;
803f0f64 3316
0f8ce498
FM
3317 /*
3318 * If logged_trans is not 0, then we know the inode logged was not logged
3319 * in this transaction, so we can return false right away.
3320 */
3321 if (inode->logged_trans > 0)
3322 return 0;
3323
3324 /*
3325 * If no log tree was created for this root in this transaction, then
3326 * the inode can not have been logged in this transaction. In that case
3327 * set logged_trans to anything greater than 0 and less than the current
3328 * transaction's ID, to avoid the search below in a future call in case
3329 * a log tree gets created after this.
3330 */
3331 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
3332 inode->logged_trans = trans->transid - 1;
3333 return 0;
3334 }
3335
3336 /*
3337 * We have a log tree and the inode's logged_trans is 0. We can't tell
3338 * for sure if the inode was logged before in this transaction by looking
3339 * only at logged_trans. We could be pessimistic and assume it was, but
3340 * that can lead to unnecessarily logging an inode during rename and link
3341 * operations, and then further updating the log in followup rename and
3342 * link operations, specially if it's a directory, which adds latency
3343 * visible to applications doing a series of rename or link operations.
3344 *
3345 * A logged_trans of 0 here can mean several things:
3346 *
3347 * 1) The inode was never logged since the filesystem was mounted, and may
3348 * or may have not been evicted and loaded again;
3349 *
3350 * 2) The inode was logged in a previous transaction, then evicted and
3351 * then loaded again;
3352 *
3353 * 3) The inode was logged in the current transaction, then evicted and
3354 * then loaded again.
3355 *
3356 * For cases 1) and 2) we don't want to return true, but we need to detect
3357 * case 3) and return true. So we do a search in the log root for the inode
3358 * item.
3359 */
3360 key.objectid = btrfs_ino(inode);
3361 key.type = BTRFS_INODE_ITEM_KEY;
3362 key.offset = 0;
3363
3364 if (!path) {
3365 path = btrfs_alloc_path();
3366 if (!path)
3367 return -ENOMEM;
3368 }
3369
3370 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
3371
3372 if (path_in)
3373 btrfs_release_path(path);
3374 else
3375 btrfs_free_path(path);
1e0860f3 3376
6e8e777d 3377 /*
0f8ce498
FM
3378 * Logging an inode always results in logging its inode item. So if we
3379 * did not find the item we know the inode was not logged for sure.
6e8e777d 3380 */
0f8ce498
FM
3381 if (ret < 0) {
3382 return ret;
3383 } else if (ret > 0) {
3384 /*
3385 * Set logged_trans to a value greater than 0 and less then the
3386 * current transaction to avoid doing the search in future calls.
3387 */
3388 inode->logged_trans = trans->transid - 1;
3389 return 0;
3390 }
3391
3392 /*
3393 * The inode was previously logged and then evicted, set logged_trans to
3394 * the current transacion's ID, to avoid future tree searches as long as
3395 * the inode is not evicted again.
3396 */
3397 inode->logged_trans = trans->transid;
3398
3399 /*
3400 * If it's a directory, then we must set last_dir_index_offset to the
3401 * maximum possible value, so that the next attempt to log the inode does
3402 * not skip checking if dir index keys found in modified subvolume tree
3403 * leaves have been logged before, otherwise it would result in attempts
3404 * to insert duplicate dir index keys in the log tree. This must be done
3405 * because last_dir_index_offset is an in-memory only field, not persisted
3406 * in the inode item or any other on-disk structure, so its value is lost
3407 * once the inode is evicted.
3408 */
3409 if (S_ISDIR(inode->vfs_inode.i_mode))
3410 inode->last_dir_index_offset = (u64)-1;
803f0f64 3411
0f8ce498 3412 return 1;
3413}
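
/*
 * Illustrative sketch, not part of the original file: the logged_trans
 * scheme above is a tri-state memoization. A value of 0 means "unknown",
 * any non-zero value below the current transaction ID means "known not
 * logged in this transaction", and a value equal to it means "known
 * logged". The names below are hypothetical.
 */
static int cached_logged_state(u64 *logged_trans, u64 transid,
			       bool found_in_log_tree)
{
	if (*logged_trans == transid)
		return 1;			/* cached positive result */
	if (*logged_trans > 0)
		return 0;			/* cached negative result */
	if (!found_in_log_tree) {
		*logged_trans = transid - 1;	/* memoize "not logged" */
		return 0;
	}
	*logged_trans = transid;		/* memoize "logged" */
	return 1;
}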
3414
3415/*
3416 * Delete a directory entry from the log if it exists.
3417 *
3418 * Returns < 0 on error
3419 * 1 if the entry does not exist
3420 * 0 if the entry existed and was successfully deleted
3421 */
3422static int del_logged_dentry(struct btrfs_trans_handle *trans,
3423 struct btrfs_root *log,
3424 struct btrfs_path *path,
3425 u64 dir_ino,
6db75318 3426 const struct fscrypt_str *name,
3427 u64 index)
3428{
3429 struct btrfs_dir_item *di;
3430
3431 /*
3432 * We only log dir index items of a directory, so we don't need to look
3433 * for dir item keys.
3434 */
3435 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
e43eec81 3436 index, name, -1);
3437 if (IS_ERR(di))
3438 return PTR_ERR(di);
3439 else if (!di)
3440 return 1;
3441
3442 /*
3443 * We do not need to update the size field of the directory's
3444 * inode item because on log replay we update the field to reflect
3445 * all existing entries in the directory (see overwrite_item()).
3446 */
3447 return btrfs_delete_one_dir_name(trans, log, path, di);
3448}
3449
3450/*
3451 * If both a file and directory are logged, and unlinks or renames are
3452 * mixed in, we have a few interesting corners:
3453 *
3454 * create file X in dir Y
3455 * link file X to X.link in dir Y
3456 * fsync file X
3457 * unlink file X but leave X.link
3458 * fsync dir Y
3459 *
3460 * After a crash we would expect only X.link to exist. But file X
3461 * didn't get fsync'd again so the log has back refs for X and X.link.
3462 *
3463 * We solve this by removing directory entries and inode backrefs from the
3464 * log when a file that was logged in the current transaction is
3465 * unlinked. Any later fsync will include the updated log entries, and
3466 * we'll be able to reconstruct the proper directory items from backrefs.
3467 *
3468 * This optimization allows us to avoid relogging the entire inode
3469 * or the entire directory.
3470 */
3471void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3472 struct btrfs_root *root,
6db75318 3473 const struct fscrypt_str *name,
9a35fc95 3474 struct btrfs_inode *dir, u64 index)
e02119d5 3475{
3476 struct btrfs_path *path;
3477 int ret;
e02119d5 3478
3479 ret = inode_logged(trans, dir, NULL);
3480 if (ret == 0)
3481 return;
3482 else if (ret < 0) {
3483 btrfs_set_log_full_commit(trans);
9a35fc95 3484 return;
0f8ce498 3485 }
3a5f1d45 3486
3487 ret = join_running_log_trans(root);
3488 if (ret)
9a35fc95 3489 return;
e02119d5 3490
49f34d1f 3491 mutex_lock(&dir->log_mutex);
e02119d5 3492
e02119d5 3493 path = btrfs_alloc_path();
a62f44a5 3494 if (!path) {
839061fe 3495 ret = -ENOMEM;
3496 goto out_unlock;
3497 }
2a29edc6 3498
839061fe 3499 ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
e43eec81 3500 name, index);
e02119d5 3501 btrfs_free_path(path);
a62f44a5 3502out_unlock:
49f34d1f 3503 mutex_unlock(&dir->log_mutex);
839061fe 3504 if (ret < 0)
90787766 3505 btrfs_set_log_full_commit(trans);
12fcfd22 3506 btrfs_end_log_trans(root);
3507}
3508
3509/* see comments for btrfs_del_dir_entries_in_log */
3510void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3511 struct btrfs_root *root,
6db75318 3512 const struct fscrypt_str *name,
9a35fc95 3513 struct btrfs_inode *inode, u64 dirid)
3514{
3515 struct btrfs_root *log;
3516 u64 index;
3517 int ret;
3518
3519 ret = inode_logged(trans, inode, NULL);
3520 if (ret == 0)
9a35fc95 3521 return;
3522 else if (ret < 0) {
3523 btrfs_set_log_full_commit(trans);
3524 return;
3525 }
3a5f1d45 3526
3527 ret = join_running_log_trans(root);
3528 if (ret)
9a35fc95 3529 return;
e02119d5 3530 log = root->log_root;
a491abb2 3531 mutex_lock(&inode->log_mutex);
e02119d5 3532
e43eec81 3533 ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode),
e02119d5 3534 dirid, &index);
a491abb2 3535 mutex_unlock(&inode->log_mutex);
9a35fc95 3536 if (ret < 0 && ret != -ENOENT)
90787766 3537 btrfs_set_log_full_commit(trans);
12fcfd22 3538 btrfs_end_log_trans(root);
3539}
3540
3541/*
3542 * creates a range item in the log for 'dirid'. first_offset and
3543 * last_offset tell us which parts of the key space the log should
3544 * be considered authoritative for.
3545 */
3546static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3547 struct btrfs_root *log,
3548 struct btrfs_path *path,
339d0354 3549 u64 dirid,
3550 u64 first_offset, u64 last_offset)
3551{
3552 int ret;
3553 struct btrfs_key key;
3554 struct btrfs_dir_log_item *item;
3555
3556 key.objectid = dirid;
3557 key.offset = first_offset;
339d0354 3558 key.type = BTRFS_DIR_LOG_INDEX_KEY;
e02119d5 3559 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3560 /*
3561 * -EEXIST is fine and can happen sporadically when we are logging a
3562 * directory and have concurrent insertions in the subvolume's tree for
3563 * items from other inodes and that result in pushing off some dir items
3564 * from one leaf to another in order to accommodate for the new items.
3565 * This results in logging the same dir index range key.
3566 */
3567 if (ret && ret != -EEXIST)
4a500fd1 3568 return ret;
3569
3570 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3571 struct btrfs_dir_log_item);
3572 if (ret == -EEXIST) {
3573 const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
3574
3575 /*
3576 * btrfs_del_dir_entries_in_log() might have been called during
3577 * an unlink between the initial insertion of this key and the
3578 * current update, or we might be logging a single entry deletion
3579 * during a rename, so set the new last_offset to the max value.
3580 */
3581 last_offset = max(last_offset, curr_end);
3582 }
3583 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3584 btrfs_mark_buffer_dirty(path->nodes[0]);
b3b4aa74 3585 btrfs_release_path(path);
3586 return 0;
3587}
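
/*
 * Hypothetical helper, for illustration only: during log replay, a dir
 * index key with offset idx found in the subvolume tree is a candidate
 * for deletion only when a BTRFS_DIR_LOG_INDEX_KEY item makes the log
 * authoritative for that part of the key space, i.e. when some logged
 * range item satisfies first_offset <= idx <= last_offset.
 */
static bool dir_index_covered_by_log(u64 idx, u64 first_offset, u64 last_offset)
{
	return idx >= first_offset && idx <= last_offset;
}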
3588
3589static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
3590 struct btrfs_root *log,
3591 struct extent_buffer *src,
3592 struct btrfs_path *dst_path,
3593 int start_slot,
3594 int count)
3595{
3596 char *ins_data = NULL;
b7ef5f3a 3597 struct btrfs_item_batch batch;
086dcbfa 3598 struct extent_buffer *dst;
3599 unsigned long src_offset;
3600 unsigned long dst_offset;
3601 struct btrfs_key key;
3602 u32 item_size;
3603 int ret;
3604 int i;
3605
3606 ASSERT(count > 0);
b7ef5f3a 3607 batch.nr = count;
3608
3609 if (count == 1) {
3610 btrfs_item_key_to_cpu(src, &key, start_slot);
3212fa14 3611 item_size = btrfs_item_size(src, start_slot);
3612 batch.keys = &key;
3613 batch.data_sizes = &item_size;
3614 batch.total_data_size = item_size;
086dcbfa 3615 } else {
3616 struct btrfs_key *ins_keys;
3617 u32 *ins_sizes;
3618
3619 ins_data = kmalloc(count * sizeof(u32) +
3620 count * sizeof(struct btrfs_key), GFP_NOFS);
3621 if (!ins_data)
3622 return -ENOMEM;
3623
3624 ins_sizes = (u32 *)ins_data;
3625 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
3626 batch.keys = ins_keys;
3627 batch.data_sizes = ins_sizes;
3628 batch.total_data_size = 0;
3629
3630 for (i = 0; i < count; i++) {
3631 const int slot = start_slot + i;
3632
3633 btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
3212fa14 3634 ins_sizes[i] = btrfs_item_size(src, slot);
b7ef5f3a 3635 batch.total_data_size += ins_sizes[i];
3636 }
3637 }
3638
b7ef5f3a 3639 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
3640 if (ret)
3641 goto out;
3642
3643 dst = dst_path->nodes[0];
3644 /*
3645 * Copy all the items in bulk, in a single copy operation. Item data is
3646 * organized such that it's placed at the end of a leaf and from right
3647 * to left. For example, the data for the second item ends at an offset
3648 * that matches the offset where the data for the first item starts, the
3649 * data for the third item ends at an offset that matches the offset
3650 * where the data of the second item starts, and so on.
3651 * Therefore our source and destination start offsets for copy match the
3652 * offsets of the last items (highest slots).
3653 */
3654 dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
3655 src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
3656 copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
3657 btrfs_release_path(dst_path);
3658out:
3659 kfree(ins_data);
3660
3661 return ret;
3662}
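
/*
 * Illustrative sketch of the single bulk copy done above, with made-up
 * names: because item data is laid out from the end of a leaf towards
 * its start, the data for a run of consecutive slots is one contiguous
 * region whose lowest offset belongs to the run's last (highest) slot,
 * so a single copy of total_data_size bytes moves all of it.
 */
static void copy_item_run(u8 *dst_leaf, const u8 *src_leaf,
			  unsigned long dst_last_item_off,
			  unsigned long src_last_item_off,
			  u32 total_data_size)
{
	memcpy(dst_leaf + dst_last_item_off, src_leaf + src_last_item_off,
	       total_data_size);
}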
3663
3664static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
3665 struct btrfs_inode *inode,
3666 struct btrfs_path *path,
3667 struct btrfs_path *dst_path,
3668 struct btrfs_log_ctx *ctx,
3669 u64 *last_old_dentry_offset)
3670{
3671 struct btrfs_root *log = inode->root->log_root;
3672 struct extent_buffer *src;
3673 const int nritems = btrfs_header_nritems(path->nodes[0]);
eb10d85e 3674 const u64 ino = btrfs_ino(inode);
3675 bool last_found = false;
3676 int batch_start = 0;
3677 int batch_size = 0;
3678 int i;
3679
3680 /*
3681 * We need to clone the leaf, release the read lock on it, and use the
3682 * clone before modifying the log tree. See the comment at copy_items()
3683 * about why we need to do this.
3684 */
3685 src = btrfs_clone_extent_buffer(path->nodes[0]);
3686 if (!src)
3687 return -ENOMEM;
3688
3689 i = path->slots[0];
3690 btrfs_release_path(path);
3691 path->nodes[0] = src;
3692 path->slots[0] = i;
3693
3694 for (; i < nritems; i++) {
732d591a 3695 struct btrfs_dir_item *di;
eb10d85e 3696 struct btrfs_key key;
3697 int ret;
3698
3699 btrfs_item_key_to_cpu(src, &key, i);
3700
339d0354 3701 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
3702 last_found = true;
3703 break;
3704 }
eb10d85e 3705
732d591a 3706 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
dc287224 3707 ctx->last_dir_item_offset = key.offset;
3708
3709 /*
3710 * Skip ranges of items that consist only of dir item keys created
3711 * in past transactions. However if we find a gap, we must log a
3712 * dir index range item for that gap, so that index keys in that
3713 * gap are deleted during log replay.
3714 */
3715 if (btrfs_dir_transid(src, di) < trans->transid) {
3716 if (key.offset > *last_old_dentry_offset + 1) {
3717 ret = insert_dir_log_key(trans, log, dst_path,
3718 ino, *last_old_dentry_offset + 1,
3719 key.offset - 1);
3720 if (ret < 0)
3721 return ret;
3722 }
3723
3724 *last_old_dentry_offset = key.offset;
3725 continue;
3726 }
3727
3728 /* If we logged this dir index item before, we can skip it. */
3729 if (key.offset <= inode->last_dir_index_offset)
3730 continue;
3731
3732 /*
3733 * We must make sure that when we log a directory entry, the
3734 * corresponding inode, after log replay, has a matching link
3735 * count. For example:
3736 *
3737 * touch foo
3738 * mkdir mydir
3739 * sync
3740 * ln foo mydir/bar
3741 * xfs_io -c "fsync" mydir
3742 * <crash>
3743 * <mount fs and log replay>
3744 *
3745 * Would result in a fsync log that when replayed, our file inode
3746 * would have a link count of 1, but we get two directory entries
3747 * pointing to the same inode. After removing one of the names,
3748 * it would not be possible to remove the other name, which always
3749 * resulted in stale file handle errors, and it would not be
3750 * possible to rmdir the parent directory, since its i_size could
3751 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
3752 * resulting in -ENOTEMPTY errors.
3753 */
086dcbfa 3754 if (!ctx->log_new_dentries) {
3755 struct btrfs_key di_key;
3756
086dcbfa 3757 btrfs_dir_item_key_to_cpu(src, di, &di_key);
732d591a 3758 if (di_key.type != BTRFS_ROOT_ITEM_KEY)
086dcbfa
FM
3759 ctx->log_new_dentries = true;
3760 }
3761
3762 if (batch_size == 0)
3763 batch_start = i;
3764 batch_size++;
3765 }
3766
3767 if (batch_size > 0) {
3768 int ret;
3769
3770 ret = flush_dir_items_batch(trans, log, src, dst_path,
3771 batch_start, batch_size);
3772 if (ret < 0)
3773 return ret;
3774 }
3775
3776 return last_found ? 1 : 0;
3777}
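
/*
 * Illustrative sketch with hypothetical names: the gap handling above
 * reduces to this rule - when walking dir index keys in ascending order,
 * a jump from last_old to key_offset with key_offset > last_old + 1 means
 * the index keys in between were deleted, so the log must claim authority
 * over [last_old + 1, key_offset - 1] for replay to delete them too.
 */
static bool dir_index_gap(u64 last_old, u64 key_offset,
			  u64 *range_start, u64 *range_end)
{
	if (key_offset <= last_old + 1)
		return false;		/* contiguous, no deletions to replay */
	*range_start = last_old + 1;
	*range_end = key_offset - 1;
	return true;
}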
3778
3779/*
3780 * log all the items included in the current transaction for a given
3781 * directory. This also creates the range items in the log tree required
3782 * to replay anything deleted before the fsync
3783 */
3784static noinline int log_dir_items(struct btrfs_trans_handle *trans,
90d04510 3785 struct btrfs_inode *inode,
e02119d5 3786 struct btrfs_path *path,
339d0354 3787 struct btrfs_path *dst_path,
2f2ff0ee 3788 struct btrfs_log_ctx *ctx,
3789 u64 min_offset, u64 *last_offset_ret)
3790{
3791 struct btrfs_key min_key;
90d04510 3792 struct btrfs_root *root = inode->root;
e02119d5 3793 struct btrfs_root *log = root->log_root;
4a500fd1 3794 int err = 0;
e02119d5 3795 int ret;
732d591a 3796 u64 last_old_dentry_offset = min_offset - 1;
e02119d5 3797 u64 last_offset = (u64)-1;
684a5773 3798 u64 ino = btrfs_ino(inode);
e02119d5 3799
33345d01 3800 min_key.objectid = ino;
339d0354 3801 min_key.type = BTRFS_DIR_INDEX_KEY;
3802 min_key.offset = min_offset;
3803
6174d3cb 3804 ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3805
3806 /*
3807 * we didn't find anything from this transaction, see if there
3808 * is anything at all
3809 */
3810 if (ret != 0 || min_key.objectid != ino ||
3811 min_key.type != BTRFS_DIR_INDEX_KEY) {
33345d01 3812 min_key.objectid = ino;
339d0354 3813 min_key.type = BTRFS_DIR_INDEX_KEY;
e02119d5 3814 min_key.offset = (u64)-1;
b3b4aa74 3815 btrfs_release_path(path);
3816 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3817 if (ret < 0) {
b3b4aa74 3818 btrfs_release_path(path);
3819 return ret;
3820 }
339d0354 3821 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3822
3823 /* if ret == 0 there are items for this type,
3824 * create a range to tell us the last key of this type.
3825 * otherwise, there are no items in this directory after
3826 * *min_offset, and we create a range to indicate that.
3827 */
3828 if (ret == 0) {
3829 struct btrfs_key tmp;
732d591a 3830
3831 btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3832 path->slots[0]);
339d0354 3833 if (tmp.type == BTRFS_DIR_INDEX_KEY)
732d591a 3834 last_old_dentry_offset = tmp.offset;
3835 }
3836 goto done;
3837 }
3838
3839 /* go backward to find any previous key */
339d0354 3840 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3841 if (ret == 0) {
3842 struct btrfs_key tmp;
a450a4af 3843
e02119d5 3844 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3845 /*
3846 * The dir index key before the first one we found that needs to
3847 * be logged might be in a previous leaf, and there might be a
3848 * gap between these keys, meaning that we had deletions that
3849 * happened. So the key range item we log (key type
3850 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
3851 * previous key's offset plus 1, so that those deletes are replayed.
3852 */
3853 if (tmp.type == BTRFS_DIR_INDEX_KEY)
732d591a 3854 last_old_dentry_offset = tmp.offset;
e02119d5 3855 }
b3b4aa74 3856 btrfs_release_path(path);
e02119d5 3857
3858 /*
3859 * Find the first key from this transaction again. See the note for
3860 * log_new_dir_dentries, if we're logging a directory recursively we
3861 * won't be holding its i_mutex, which means we can modify the directory
3862 * while we're logging it. If we remove an entry between our first
3863 * search and this search we'll not find the key again and can just
3864 * bail.
3865 */
bb56f02f 3866search:
e02119d5 3867 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2cc83342 3868 if (ret != 0)
e02119d5 3869 goto done;
3870
3871 /*
3872 * we have a block from this transaction, log every item in it
3873 * from our directory
3874 */
d397712b 3875 while (1) {
3876 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
3877 &last_old_dentry_offset);
3878 if (ret != 0) {
3879 if (ret < 0)
4a500fd1 3880 err = ret;
eb10d85e 3881 goto done;
e02119d5 3882 }
eb10d85e 3883 path->slots[0] = btrfs_header_nritems(path->nodes[0]);
3884
3885 /*
3886 * look ahead to the next item and see if it is also
3887 * from this directory and from this transaction
3888 */
3889 ret = btrfs_next_leaf(root, path);
3890 if (ret) {
3891 if (ret == 1)
3892 last_offset = (u64)-1;
3893 else
3894 err = ret;
3895 goto done;
3896 }
eb10d85e 3897 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
339d0354 3898 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
3899 last_offset = (u64)-1;
3900 goto done;
3901 }
3902 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3903 /*
3904 * The next leaf was not changed in the current transaction
3905 * and has at least one dir index key.
3906 * We check for the next key because there might have been
3907 * one or more deletions between the last key we logged and
3908 * that next key. So the key range item we log (key type
3909 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
3910 * offset minus 1, so that those deletes are replayed.
3911 */
3912 last_offset = min_key.offset - 1;
3913 goto done;
3914 }
3915 if (need_resched()) {
3916 btrfs_release_path(path);
3917 cond_resched();
3918 goto search;
3919 }
3920 }
3921done:
3922 btrfs_release_path(path);
3923 btrfs_release_path(dst_path);
e02119d5 3924
3925 if (err == 0) {
3926 *last_offset_ret = last_offset;
3927 /*
3928 * In case the leaf was changed in the current transaction but
3929 * all its dir items are from a past transaction, the last item
3930 * in the leaf is a dir item and there's no gap between that last
3931 * dir item and the first one on the next leaf (which did not
3932 * change in the current transaction), then we don't need to log
3933 * a range, since last_old_dentry_offset is equal to last_offset.
4a500fd1 3934 */
3935 ASSERT(last_old_dentry_offset <= last_offset);
3936 if (last_old_dentry_offset < last_offset) {
3937 ret = insert_dir_log_key(trans, log, path, ino,
3938 last_old_dentry_offset + 1,
3939 last_offset);
3940 if (ret)
3941 err = ret;
3942 }
3943 }
3944 return err;
3945}
3946
3947/*
3948 * If the inode was logged before and it was evicted, then its
3949 * last_dir_index_offset is (u64)-1, so we don't know the value of the last index
3950 * key offset. If that's the case, search for it and update the inode. This
3951 * is to avoid lookups in the log tree every time we try to insert a dir index
3952 * key from a leaf changed in the current transaction, and to allow us to always
3953 * do batch insertions of dir index keys.
3954 */
3955static int update_last_dir_index_offset(struct btrfs_inode *inode,
3956 struct btrfs_path *path,
3957 const struct btrfs_log_ctx *ctx)
3958{
3959 const u64 ino = btrfs_ino(inode);
3960 struct btrfs_key key;
3961 int ret;
3962
3963 lockdep_assert_held(&inode->log_mutex);
3964
3965 if (inode->last_dir_index_offset != (u64)-1)
3966 return 0;
3967
3968 if (!ctx->logged_before) {
3969 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
3970 return 0;
3971 }
3972
3973 key.objectid = ino;
3974 key.type = BTRFS_DIR_INDEX_KEY;
3975 key.offset = (u64)-1;
3976
3977 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
3978 /*
3979 * An error happened or we actually have an index key with an offset
3980 * value of (u64)-1. Bail out, we're done.
3981 */
3982 if (ret <= 0)
3983 goto out;
3984
3985 ret = 0;
3986 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
3987
3988 /*
3989 * No dir index items, bail out and leave last_dir_index_offset with
3990 * the value right before the first valid index value.
3991 */
3992 if (path->slots[0] == 0)
3993 goto out;
3994
3995 /*
3996 * btrfs_search_slot() left us at one slot beyond the slot with the last
3997 * index key, or beyond the last key of the directory that is not an
3998 * index key. If we have an index key before, set last_dir_index_offset
3999 * to its offset value, otherwise leave it with a value right before the
4000 * first valid index value, as it means we have an empty directory.
4001 */
4002 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
4003 if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
4004 inode->last_dir_index_offset = key.offset;
4005
4006out:
4007 btrfs_release_path(path);
4008
4009 return ret;
4010}
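
/*
 * Illustrative sketch, with hypothetical names: the lookup above relies on
 * the usual btree idiom of searching for the largest possible key and then
 * stepping back one slot. Over a plain sorted array of index offsets the
 * same idea reads:
 */
static u64 last_dir_index_offset_of(const u64 *sorted_offsets, int nr,
				    u64 empty_dir_value)
{
	/*
	 * btrfs_search_slot() with key.offset == (u64)-1 returns > 0 and
	 * leaves the path one slot past the last index key; the previous
	 * slot, when it exists, holds the answer.
	 */
	if (nr == 0)
		return empty_dir_value;
	return sorted_offsets[nr - 1];
}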
4011
4012/*
4013 * logging directories is very similar to logging inodes. We find all the items
4014 * from the current transaction and write them to the log.
4015 *
4016 * The recovery code scans the directory in the subvolume, and if it finds a
4017 * key in the range logged that is not present in the log tree, then it means
4018 * that dir entry was unlinked during the transaction.
4019 *
4020 * In order for that scan to work, we must include one key smaller than
4021 * the smallest logged by this transaction and one key larger than the largest
4022 * key logged by this transaction.
4023 */
4024static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
90d04510 4025 struct btrfs_inode *inode,
e02119d5 4026 struct btrfs_path *path,
4027 struct btrfs_path *dst_path,
4028 struct btrfs_log_ctx *ctx)
4029{
4030 u64 min_key;
4031 u64 max_key;
4032 int ret;
e02119d5 4033
4034 ret = update_last_dir_index_offset(inode, path, ctx);
4035 if (ret)
4036 return ret;
4037
732d591a 4038 min_key = BTRFS_DIR_START_INDEX;
e02119d5 4039 max_key = 0;
339d0354 4040 ctx->last_dir_item_offset = inode->last_dir_index_offset;
dc287224 4041
d397712b 4042 while (1) {
339d0354 4043 ret = log_dir_items(trans, inode, path, dst_path,
dbf39ea4 4044 ctx, min_key, &max_key);
4045 if (ret)
4046 return ret;
4047 if (max_key == (u64)-1)
4048 break;
4049 min_key = max_key + 1;
4050 }
4051
4052 inode->last_dir_index_offset = ctx->last_dir_item_offset;
4053
4054 return 0;
4055}
4056
4057/*
4058 * a helper function to drop items from the log before we relog an
4059 * inode. max_key_type indicates the highest item type to remove.
4060 * This cannot be run for file data extents because it does not
4061 * free the extents they point to.
4062 */
88e221cd 4063static int drop_inode_items(struct btrfs_trans_handle *trans,
4064 struct btrfs_root *log,
4065 struct btrfs_path *path,
4066 struct btrfs_inode *inode,
4067 int max_key_type)
4068{
4069 int ret;
4070 struct btrfs_key key;
4071 struct btrfs_key found_key;
18ec90d6 4072 int start_slot;
e02119d5 4073
88e221cd 4074 key.objectid = btrfs_ino(inode);
4075 key.type = max_key_type;
4076 key.offset = (u64)-1;
4077
d397712b 4078 while (1) {
e02119d5 4079 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3650860b 4080 BUG_ON(ret == 0); /* Logic error */
4a500fd1 4081 if (ret < 0)
4082 break;
4083
4084 if (path->slots[0] == 0)
4085 break;
4086
4087 path->slots[0]--;
4088 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4089 path->slots[0]);
4090
88e221cd 4091 if (found_key.objectid != key.objectid)
4092 break;
4093
4094 found_key.offset = 0;
4095 found_key.type = 0;
e3b83361 4096 ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
4097 if (ret < 0)
4098 break;
4099
4100 ret = btrfs_del_items(trans, log, path, start_slot,
4101 path->slots[0] - start_slot + 1);
4102 /*
4103 * If start slot isn't 0 then we don't need to re-search, we've
4104 * found the last guy with the objectid in this tree.
4105 */
4106 if (ret || start_slot != 0)
65a246c5 4107 break;
b3b4aa74 4108 btrfs_release_path(path);
e02119d5 4109 }
b3b4aa74 4110 btrfs_release_path(path);
4111 if (ret > 0)
4112 ret = 0;
4a500fd1 4113 return ret;
4114}
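
/*
 * Illustrative sketch (hypothetical, array model): rather than deleting
 * one item per search, the loop above locates the run of items sharing
 * the objectid inside a leaf (binary search for the smallest key) and
 * removes the whole run with a single ranged delete.
 */
static int delete_objectid_run(const u64 *objectids, int nritems, u64 objectid)
{
	int end = nritems;
	int start = end;

	/* walk back over the trailing run of matching items */
	while (start > 0 && objectids[start - 1] == objectid)
		start--;

	/* one ranged delete covers [start, end), like btrfs_del_items() */
	return end - start;
}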
4115
4116static int truncate_inode_items(struct btrfs_trans_handle *trans,
4117 struct btrfs_root *log_root,
4118 struct btrfs_inode *inode,
4119 u64 new_size, u32 min_type)
4120{
4121 struct btrfs_truncate_control control = {
4122 .new_size = new_size,
487e81d2 4123 .ino = btrfs_ino(inode),
d9ac19c3 4124 .min_type = min_type,
5caa490e 4125 .skip_ref_updates = true,
d9ac19c3 4126 };
8a2b3da1 4127
8697b8f8 4128 return btrfs_truncate_inode_items(trans, log_root, &control);
4129}
4130
4131static void fill_inode_item(struct btrfs_trans_handle *trans,
4132 struct extent_buffer *leaf,
4133 struct btrfs_inode_item *item,
4134 struct inode *inode, int log_inode_only,
4135 u64 logged_isize)
94edf4ae 4136{
0b1c6cca 4137 struct btrfs_map_token token;
77eea05e 4138 u64 flags;
0b1c6cca 4139
c82f823c 4140 btrfs_init_map_token(&token, leaf);
4141
4142 if (log_inode_only) {
4143 /* set the generation to zero so the recovery code
4144 * can tell the difference between logging
4145 * just to say 'this inode exists' and logging
4146 * to say 'update this inode with these values'
4147 */
4148 btrfs_set_token_inode_generation(&token, item, 0);
4149 btrfs_set_token_inode_size(&token, item, logged_isize);
94edf4ae 4150 } else {
4151 btrfs_set_token_inode_generation(&token, item,
4152 BTRFS_I(inode)->generation);
4153 btrfs_set_token_inode_size(&token, item, inode->i_size);
4154 }
4155
4156 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
4157 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
4158 btrfs_set_token_inode_mode(&token, item, inode->i_mode);
4159 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
4160
4161 btrfs_set_token_timespec_sec(&token, &item->atime,
4162 inode->i_atime.tv_sec);
4163 btrfs_set_token_timespec_nsec(&token, &item->atime,
4164 inode->i_atime.tv_nsec);
4165
4166 btrfs_set_token_timespec_sec(&token, &item->mtime,
4167 inode->i_mtime.tv_sec);
4168 btrfs_set_token_timespec_nsec(&token, &item->mtime,
4169 inode->i_mtime.tv_nsec);
4170
4171 btrfs_set_token_timespec_sec(&token, &item->ctime,
4172 inode->i_ctime.tv_sec);
4173 btrfs_set_token_timespec_nsec(&token, &item->ctime,
4174 inode->i_ctime.tv_nsec);
4175
4176 /*
4177 * We do not need to set the nbytes field, in fact during a fast fsync
4178 * its value may not even be correct, since a fast fsync does not wait
4179 * for ordered extent completion, which is where we update nbytes, it
4180 * only waits for writeback to complete. During log replay as we find
4181 * file extent items and replay them, we adjust the nbytes field of the
4182 * inode item in subvolume tree as needed (see overwrite_item()).
4183 */
4184
4185 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
4186 btrfs_set_token_inode_transid(&token, item, trans->transid);
4187 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
4188 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4189 BTRFS_I(inode)->ro_flags);
4190 btrfs_set_token_inode_flags(&token, item, flags);
cc4c13d5 4191 btrfs_set_token_inode_block_group(&token, item, 0);
4192}
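
/*
 * Illustrative sketch (not the actual replay code): the generation field
 * written above doubles as a flag for log replay - generation == 0 means
 * "this inode exists" (LOG_INODE_EXISTS), while a non-zero generation
 * means "update the inode with these values".
 */
static bool is_inode_exists_only_item(u64 item_generation)
{
	return item_generation == 0;
}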
4193
4194static int log_inode_item(struct btrfs_trans_handle *trans,
4195 struct btrfs_root *log, struct btrfs_path *path,
2ac691d8 4196 struct btrfs_inode *inode, bool inode_item_dropped)
4197{
4198 struct btrfs_inode_item *inode_item;
a95249b3
JB
4199 int ret;
4200
2ac691d8
FM
4201 /*
4202 * If we are doing a fast fsync and the inode was logged before in the
4203 * current transaction, then we know the inode was previously logged and
4204 * it exists in the log tree. For performance reasons, in this case use
4205 * btrfs_search_slot() directly with ins_len set to 0 so that we never
4206 * attempt a write lock on the leaf's parent, which adds unnecessary lock
4207 * contention in case there are concurrent fsyncs for other inodes of the
4208 * same subvolume. Using btrfs_insert_empty_item() when the inode item
4209 * already exists can also result in unnecessarily splitting a leaf.
4210 */
4211 if (!inode_item_dropped && inode->logged_trans == trans->transid) {
4212 ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
4213 ASSERT(ret <= 0);
4214 if (ret > 0)
4215 ret = -ENOENT;
4216 } else {
4217 /*
4218 * This means it is the first fsync in the current transaction,
4219 * so the inode item is not in the log and we need to insert it.
4220 * We can never get -EEXIST because we are only called for a fast
4221 * fsync and in case an inode eviction happens after the inode was
4222 * logged before in the current transaction, when we load again
4223 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
4224 * flags and set ->logged_trans to 0.
4225 */
4226 ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
4227 sizeof(*inode_item));
4228 ASSERT(ret != -EEXIST);
4229 }
4230 if (ret)
4231 return ret;
4232 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4233 struct btrfs_inode_item);
4234 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
4235 0, 0);
4236 btrfs_release_path(path);
4237 return 0;
4238}
4239
40e046ac 4240static int log_csums(struct btrfs_trans_handle *trans,
3ebac17c 4241 struct btrfs_inode *inode,
4242 struct btrfs_root *log_root,
4243 struct btrfs_ordered_sum *sums)
4244{
4245 const u64 lock_end = sums->bytenr + sums->len - 1;
4246 struct extent_state *cached_state = NULL;
4247 int ret;
4248
4249 /*
4250 * If this inode was not used for reflink operations in the current
4251 * transaction with new extents, then do the fast path, no need to
4252 * worry about logging checksum items with overlapping ranges.
4253 */
4254 if (inode->last_reflink_trans < trans->transid)
4255 return btrfs_csum_file_blocks(trans, log_root, sums);
4256
4257 /*
4258 * Serialize logging for checksums. This is to avoid racing with the
4259 * same checksum being logged by another task that is logging another
4260 * file which happens to refer to the same extent as well. Such races
4261 * can leave checksum items in the log with overlapping ranges.
4262 */
4263 ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
4264 &cached_state);
4265 if (ret)
4266 return ret;
4267 /*
4268 * Due to extent cloning, we might have logged a csum item that covers a
4269 * subrange of a cloned extent, and later we can end up logging a csum
4270 * item for a larger subrange of the same extent or the entire range.
4271 * This would leave csum items in the log tree that cover the same range
4272 * and break the searches for checksums in the log tree, resulting in
4273 * some checksums missing in the fs/subvolume tree. So just delete (or
4274 * trim and adjust) any existing csum items in the log for this range.
4275 */
4276 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
4277 if (!ret)
4278 ret = btrfs_csum_file_blocks(trans, log_root, sums);
40e046ac 4279
4280 unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
4281 &cached_state);
4282
4283 return ret;
4284}
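
/*
 * Hypothetical model of the overlap avoidance above: before inserting a
 * csum item for [start, start + len), any previously logged item for an
 * overlapping byte range must be deleted or trimmed, otherwise two log
 * items would cover the same bytes and break csum lookups during replay.
 */
static bool csum_ranges_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len)
{
	return a_start < b_start + b_len && b_start < a_start + a_len;
}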
4285
31ff1cd2 4286static noinline int copy_items(struct btrfs_trans_handle *trans,
44d70e19 4287 struct btrfs_inode *inode,
31ff1cd2 4288 struct btrfs_path *dst_path,
0e56315c 4289 struct btrfs_path *src_path,
4290 int start_slot, int nr, int inode_only,
4291 u64 logged_isize)
31ff1cd2 4292{
44d70e19 4293 struct btrfs_root *log = inode->root->log_root;
31ff1cd2 4294 struct btrfs_file_extent_item *extent;
796787c9 4295 struct extent_buffer *src;
7f30c072 4296 int ret = 0;
4297 struct btrfs_key *ins_keys;
4298 u32 *ins_sizes;
b7ef5f3a 4299 struct btrfs_item_batch batch;
4300 char *ins_data;
4301 int i;
7f30c072 4302 int dst_index;
4303 const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
4304 const u64 i_size = i_size_read(&inode->vfs_inode);
d20f7043 4305
4306 /*
4307 * To keep lockdep happy and avoid deadlocks, clone the source leaf and
4308 * use the clone. This is because otherwise we would be changing the log
4309 * tree, to insert items from the subvolume tree or insert csum items,
4310 * while holding a read lock on a leaf from the subvolume tree, which
4311 * creates a nasty lock dependency when COWing log tree nodes/leaves:
4312 *
4313 * 1) Modifying the log tree triggers an extent buffer allocation while
4314 * holding a write lock on a parent extent buffer from the log tree.
4315 * Allocating the pages for an extent buffer, or the extent buffer
4316 * struct, can trigger inode eviction and finally the inode eviction
4317 * will trigger a release/remove of a delayed node, which requires
4318 * taking the delayed node's mutex;
4319 *
4320 * 2) Allocating a metadata extent for a log tree can trigger the async
4321 * reclaim thread and make us wait for it to release enough space and
4322 * unblock our reservation ticket. The reclaim thread can start
4323 * flushing delayed items, and that in turn results in the need to
4324 * lock delayed node mutexes and in the need to write lock extent
4325 * buffers of a subvolume tree - all this while holding a write lock
4326 * on the parent extent buffer in the log tree.
4327 *
4328 * So one task in scenario 1) running in parallel with another task in
4329 * scenario 2) could lead to a deadlock, one wanting to lock a delayed
4330 * node mutex while having a read lock on a leaf from the subvolume,
4331 * while the other is holding the delayed node's mutex and wants to
4332 * write lock the same subvolume leaf for flushing delayed items.
4333 */
4334 src = btrfs_clone_extent_buffer(src_path->nodes[0]);
4335 if (!src)
4336 return -ENOMEM;
4337
4338 i = src_path->slots[0];
4339 btrfs_release_path(src_path);
4340 src_path->nodes[0] = src;
4341 src_path->slots[0] = i;
4342
4343 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
4344 nr * sizeof(u32), GFP_NOFS);
2a29edc6 4345 if (!ins_data)
4346 return -ENOMEM;
4347
4348 ins_sizes = (u32 *)ins_data;
4349 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4350 batch.keys = ins_keys;
4351 batch.data_sizes = ins_sizes;
4352 batch.total_data_size = 0;
7f30c072 4353 batch.nr = 0;
31ff1cd2 4354
7f30c072 4355 dst_index = 0;
31ff1cd2 4356 for (i = 0; i < nr; i++) {
4357 const int src_slot = start_slot + i;
4358 struct btrfs_root *csum_root;
4359 struct btrfs_ordered_sum *sums;
4360 struct btrfs_ordered_sum *sums_next;
4361 LIST_HEAD(ordered_sums);
4362 u64 disk_bytenr;
4363 u64 disk_num_bytes;
4364 u64 extent_offset;
4365 u64 extent_num_bytes;
4366 bool is_old_extent;
4367
4368 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
4369
4370 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
4371 goto add_to_batch;
4372
4373 extent = btrfs_item_ptr(src, src_slot,
4374 struct btrfs_file_extent_item);
4375
4376 is_old_extent = (btrfs_file_extent_generation(src, extent) <
4377 trans->transid);
4378
4379 /*
4380 * Don't copy extents from past generations. That would make us
4381 * log a lot more metadata for common cases like doing only a
4382 * few random writes into a file and then fsync it for the first
4383 * time or after the full sync flag is set on the inode. We can
4384 * get leaves full of extent items, most of which are from past
4385 * generations, so we can skip them - as long as the inode has
4386 * not been the target of a reflink operation in this transaction,
4387 * as in that case it might have had file extent items with old
4388 * generations copied into it. We also must always log prealloc
4389 * extents that start at or beyond eof, otherwise we would lose
4390 * them on log replay.
4391 */
4392 if (is_old_extent &&
4393 ins_keys[dst_index].offset < i_size &&
4394 inode->last_reflink_trans < trans->transid)
4395 continue;
4396
4397 if (skip_csum)
4398 goto add_to_batch;
4399
4400 /* Only regular extents have checksums. */
4401 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
4402 goto add_to_batch;
4403
4404 /*
4405 * If it's an extent created in a past transaction, then its
4406 * checksums are already accessible from the committed csum tree,
4407 * no need to log them.
4408 */
4409 if (is_old_extent)
4410 goto add_to_batch;
4411
4412 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
4413 /* If it's an explicit hole, there are no checksums. */
4414 if (disk_bytenr == 0)
4415 goto add_to_batch;
4416
4417 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
4418
4419 if (btrfs_file_extent_compression(src, extent)) {
4420 extent_offset = 0;
4421 extent_num_bytes = disk_num_bytes;
4422 } else {
4423 extent_offset = btrfs_file_extent_offset(src, extent);
4424 extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
4425 }
4426
4427 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
4428 disk_bytenr += extent_offset;
4429 ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
4430 disk_bytenr + extent_num_bytes - 1,
26ce9114 4431 &ordered_sums, 0, false);
4432 if (ret)
4433 goto out;
4434
4435 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
4436 if (!ret)
4437 ret = log_csums(trans, inode, log, sums);
4438 list_del(&sums->list);
4439 kfree(sums);
4440 }
4441 if (ret)
4442 goto out;
4443
4444add_to_batch:
4445 ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
4446 batch.total_data_size += ins_sizes[dst_index];
4447 batch.nr++;
4448 dst_index++;
31ff1cd2 4449 }
4450
4451 /*
4452 * We have a leaf full of old extent items that don't need to be logged,
4453 * so we don't need to do anything.
4454 */
4455 if (batch.nr == 0)
4456 goto out;
4457
b7ef5f3a 4458 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
4459 if (ret)
4460 goto out;
4461
4462 dst_index = 0;
4463 for (i = 0; i < nr; i++) {
4464 const int src_slot = start_slot + i;
4465 const int dst_slot = dst_path->slots[0] + dst_index;
4466 struct btrfs_key key;
4467 unsigned long src_offset;
4468 unsigned long dst_offset;
4469
4470 /*
4471 * We're done, all the remaining items in the source leaf
4472 * correspond to old file extent items.
4473 */
4474 if (dst_index >= batch.nr)
4475 break;
4476
4477 btrfs_item_key_to_cpu(src, &key, src_slot);
4478
4479 if (key.type != BTRFS_EXTENT_DATA_KEY)
4480 goto copy_item;
31ff1cd2 4481
4482 extent = btrfs_item_ptr(src, src_slot,
4483 struct btrfs_file_extent_item);
31ff1cd2 4484
4485 /* See the comment in the previous loop, same logic. */
4486 if (btrfs_file_extent_generation(src, extent) < trans->transid &&
4487 key.offset < i_size &&
4488 inode->last_reflink_trans < trans->transid)
4489 continue;
4490
4491copy_item:
4492 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
4493 src_offset = btrfs_item_ptr_offset(src, src_slot);
31ff1cd2 4494
4495 if (key.type == BTRFS_INODE_ITEM_KEY) {
4496 struct btrfs_inode_item *inode_item;
4497
4498 inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
31ff1cd2 4499 struct btrfs_inode_item);
94edf4ae 4500 fill_inode_item(trans, dst_path->nodes[0], inode_item,
4501 &inode->vfs_inode,
4502 inode_only == LOG_INODE_EXISTS,
1a4bcf47 4503 logged_isize);
4504 } else {
4505 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
7f30c072 4506 src_offset, ins_sizes[dst_index]);
31ff1cd2 4507 }
94edf4ae 4508
7f30c072 4509 dst_index++;
4510 }
4511
4512 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
b3b4aa74 4513 btrfs_release_path(dst_path);
7f30c072 4514out:
31ff1cd2 4515 kfree(ins_data);
d20f7043 4516
4a500fd1 4517 return ret;
4518}
4519
4520static int extent_cmp(void *priv, const struct list_head *a,
4521 const struct list_head *b)
5dc562c5 4522{
214cc184 4523 const struct extent_map *em1, *em2;
4524
4525 em1 = list_entry(a, struct extent_map, list);
4526 em2 = list_entry(b, struct extent_map, list);
4527
4528 if (em1->start < em2->start)
4529 return -1;
4530 else if (em1->start > em2->start)
4531 return 1;
4532 return 0;
4533}
4534
4535static int log_extent_csums(struct btrfs_trans_handle *trans,
4536 struct btrfs_inode *inode,
a9ecb653 4537 struct btrfs_root *log_root,
48778179
FM
4538 const struct extent_map *em,
4539 struct btrfs_log_ctx *ctx)
5dc562c5 4540{
48778179 4541 struct btrfs_ordered_extent *ordered;
fc28b25e 4542 struct btrfs_root *csum_root;
4543 u64 csum_offset;
4544 u64 csum_len;
4545 u64 mod_start = em->mod_start;
4546 u64 mod_len = em->mod_len;
4547 LIST_HEAD(ordered_sums);
4548 int ret = 0;
0aa4a17d 4549
4550 if (inode->flags & BTRFS_INODE_NODATASUM ||
4551 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
8407f553 4552 em->block_start == EXTENT_MAP_HOLE)
70c8a91c 4553 return 0;
5dc562c5 4554
4555 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4556 const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4557 const u64 mod_end = mod_start + mod_len;
4558 struct btrfs_ordered_sum *sums;
4559
4560 if (mod_len == 0)
4561 break;
4562
4563 if (ordered_end <= mod_start)
4564 continue;
4565 if (mod_end <= ordered->file_offset)
4566 break;
4567
4568 /*
4569 * We are going to copy all the csums on this ordered extent, so
4570 * go ahead and adjust mod_start and mod_len in case this ordered
4571 * extent has already been logged.
4572 */
4573 if (ordered->file_offset > mod_start) {
4574 if (ordered_end >= mod_end)
4575 mod_len = ordered->file_offset - mod_start;
4576 /*
4577 * If we have this case
4578 *
4579 * |--------- logged extent ---------|
4580 * |----- ordered extent ----|
4581 *
4582 * Just don't mess with mod_start and mod_len, we'll
4583 * just end up logging more csums than we need and it
4584 * will be ok.
4585 */
4586 } else {
4587 if (ordered_end < mod_end) {
4588 mod_len = mod_end - ordered_end;
4589 mod_start = ordered_end;
4590 } else {
4591 mod_len = 0;
4592 }
4593 }
4594
4595 /*
4596 * To keep us from looping for the above case of an ordered
4597 * extent that falls inside of the logged extent.
4598 */
4599 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4600 continue;
4601
4602 list_for_each_entry(sums, &ordered->list, list) {
4603 ret = log_csums(trans, inode, log_root, sums);
4604 if (ret)
4605 return ret;
4606 }
4607 }
4608
4609 /* We're done, found all csums in the ordered extents. */
4610 if (mod_len == 0)
4611 return 0;
4612
e7175a69 4613 /* If we're compressed we have to save the entire range of csums. */
4614 if (em->compress_type) {
4615 csum_offset = 0;
8407f553 4616 csum_len = max(em->block_len, em->orig_block_len);
488111aa 4617 } else {
4618 csum_offset = mod_start - em->start;
4619 csum_len = mod_len;
488111aa 4620 }
2ab28f32 4621
70c8a91c 4622 /* block start is already adjusted for the file extent offset. */
4623 csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
4624 ret = btrfs_lookup_csums_range(csum_root,
4625 em->block_start + csum_offset,
4626 em->block_start + csum_offset +
26ce9114 4627 csum_len - 1, &ordered_sums, 0, false);
4628 if (ret)
4629 return ret;
5dc562c5 4630
4631 while (!list_empty(&ordered_sums)) {
4632 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4633 struct btrfs_ordered_sum,
4634 list);
4635 if (!ret)
3ebac17c 4636 ret = log_csums(trans, inode, log_root, sums);
4637 list_del(&sums->list);
4638 kfree(sums);
4639 }
4640
70c8a91c 4641 return ret;
4642}
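
/*
 * Illustrative sketch with hypothetical names: the mod_start/mod_len
 * adjustment above, isolated. An already-handled ordered extent that
 * overlaps the front of the logged range advances its start; one that
 * overlaps the tail shortens its length; full coverage empties it.
 */
static void trim_range_by_ordered(u64 *mod_start, u64 *mod_len,
				  u64 ordered_start, u64 ordered_end)
{
	const u64 mod_end = *mod_start + *mod_len;

	if (ordered_end <= *mod_start || ordered_start >= mod_end)
		return;			/* no overlap, nothing to trim */

	if (ordered_start > *mod_start) {
		if (ordered_end >= mod_end)
			*mod_len = ordered_start - *mod_start;
		/* else: ordered extent nested inside, harmless to keep */
	} else if (ordered_end < mod_end) {
		*mod_len = mod_end - ordered_end;
		*mod_start = ordered_end;
	} else {
		*mod_len = 0;		/* range fully covered */
	}
}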
4643
8407f553 4644static int log_one_extent(struct btrfs_trans_handle *trans,
90d04510 4645 struct btrfs_inode *inode,
4646 const struct extent_map *em,
4647 struct btrfs_path *path,
4648 struct btrfs_log_ctx *ctx)
4649{
5893dfb9 4650 struct btrfs_drop_extents_args drop_args = { 0 };
90d04510 4651 struct btrfs_root *log = inode->root->log_root;
e1f53ed8 4652 struct btrfs_file_extent_item fi = { 0 };
8407f553 4653 struct extent_buffer *leaf;
4654 struct btrfs_key key;
4655 u64 extent_offset = em->start - em->orig_start;
4656 u64 block_len;
4657 int ret;
8407f553 4658
4659 btrfs_set_stack_file_extent_generation(&fi, trans->transid);
4660 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4661 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
4662 else
4663 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
4664
4665 block_len = max(em->block_len, em->orig_block_len);
4666 if (em->compress_type != BTRFS_COMPRESS_NONE) {
4667 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
4668 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4669 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4670 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
4671 extent_offset);
4672 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4673 }
4674
4675 btrfs_set_stack_file_extent_offset(&fi, extent_offset);
4676 btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
4677 btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
4678 btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
4679
48778179 4680 ret = log_extent_csums(trans, inode, log, em, ctx);
4681 if (ret)
4682 return ret;
4683
4684 /*
4685 * If this is the first time we are logging the inode in the current
4686 * transaction, we can avoid btrfs_drop_extents(), which is expensive
4687 * because it does a deletion search, which always acquires write locks
4688 * for extent buffers at levels 2, 1 and 0. This not only wastes time
4689 * but also adds significant contention in a log tree, since log trees
4690 * are small, with a root at level 2 or 3 at most, due to their short
4691 * life span.
4692 */
0f8ce498 4693 if (ctx->logged_before) {
4694 drop_args.path = path;
4695 drop_args.start = em->start;
4696 drop_args.end = em->start + em->len;
4697 drop_args.replace_extent = true;
e1f53ed8 4698 drop_args.extent_item_size = sizeof(fi);
4699 ret = btrfs_drop_extents(trans, log, inode, &drop_args);
4700 if (ret)
4701 return ret;
4702 }
8407f553 4703
5893dfb9 4704 if (!drop_args.extent_inserted) {
9d122629 4705 key.objectid = btrfs_ino(inode);
4706 key.type = BTRFS_EXTENT_DATA_KEY;
4707 key.offset = em->start;
4708
4709 ret = btrfs_insert_empty_item(trans, log, path, &key,
e1f53ed8 4710 sizeof(fi));
4711 if (ret)
4712 return ret;
4713 }
4714 leaf = path->nodes[0];
4715 write_extent_buffer(leaf, &fi,
4716 btrfs_item_ptr_offset(leaf, path->slots[0]),
4717 sizeof(fi));
4718 btrfs_mark_buffer_dirty(leaf);
4719
4720 btrfs_release_path(path);
4721
4722 return ret;
4723}
4724
4725/*
4726 * Log all prealloc extents beyond the inode's i_size to make sure we do not
d9947887 4727 * lose them after doing a full/fast fsync and replaying the log. We scan the
4728 * subvolume's root instead of iterating the inode's extent map tree because
4729 * otherwise we can log incorrect extent items based on extent map conversion.
4730 * That can happen due to the fact that extent maps are merged when they
4731 * are not in the extent map tree's list of modified extents.
4732 */
4733static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4734 struct btrfs_inode *inode,
4735 struct btrfs_path *path)
4736{
4737 struct btrfs_root *root = inode->root;
4738 struct btrfs_key key;
4739 const u64 i_size = i_size_read(&inode->vfs_inode);
4740 const u64 ino = btrfs_ino(inode);
4741 struct btrfs_path *dst_path = NULL;
0e56315c 4742 bool dropped_extents = false;
4743 u64 truncate_offset = i_size;
4744 struct extent_buffer *leaf;
4745 int slot;
4746 int ins_nr = 0;
4747 int start_slot;
4748 int ret;
4749
4750 if (!(inode->flags & BTRFS_INODE_PREALLOC))
4751 return 0;
4752
4753 key.objectid = ino;
4754 key.type = BTRFS_EXTENT_DATA_KEY;
4755 key.offset = i_size;
4756 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4757 if (ret < 0)
4758 goto out;
4759
4760 /*
4761 * We must check if there is a prealloc extent that starts before the
4762 * i_size and crosses the i_size boundary. This is to ensure later we
4763 * truncate down to the end of that extent and not to the i_size, as
4764 * otherwise we end up losing part of the prealloc extent after a log
4765 * replay and with an implicit hole if there is another prealloc extent
4766 * that starts at an offset beyond i_size.
4767 */
4768 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4769 if (ret < 0)
4770 goto out;
4771
4772 if (ret == 0) {
4773 struct btrfs_file_extent_item *ei;
4774
4775 leaf = path->nodes[0];
4776 slot = path->slots[0];
4777 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4778
4779 if (btrfs_file_extent_type(leaf, ei) ==
4780 BTRFS_FILE_EXTENT_PREALLOC) {
4781 u64 extent_end;
4782
4783 btrfs_item_key_to_cpu(leaf, &key, slot);
4784 extent_end = key.offset +
4785 btrfs_file_extent_num_bytes(leaf, ei);
4786
4787 if (extent_end > i_size)
4788 truncate_offset = extent_end;
4789 }
4790 } else {
4791 ret = 0;
4792 }
4793
31d11b83 4794 while (true) {
4795 leaf = path->nodes[0];
4796 slot = path->slots[0];
4797
4798 if (slot >= btrfs_header_nritems(leaf)) {
4799 if (ins_nr > 0) {
4800 ret = copy_items(trans, inode, dst_path, path,
0e56315c 4801 start_slot, ins_nr, 1, 0);
4802 if (ret < 0)
4803 goto out;
4804 ins_nr = 0;
4805 }
4806 ret = btrfs_next_leaf(root, path);
4807 if (ret < 0)
4808 goto out;
4809 if (ret > 0) {
4810 ret = 0;
4811 break;
4812 }
4813 continue;
4814 }
4815
4816 btrfs_item_key_to_cpu(leaf, &key, slot);
4817 if (key.objectid > ino)
4818 break;
4819 if (WARN_ON_ONCE(key.objectid < ino) ||
4820 key.type < BTRFS_EXTENT_DATA_KEY ||
4821 key.offset < i_size) {
4822 path->slots[0]++;
4823 continue;
4824 }
0e56315c 4825 if (!dropped_extents) {
4826 /*
4827 * Avoid logging extent items logged in past fsync calls
4828 * and leading to duplicate keys in the log tree.
4829 */
4830 ret = truncate_inode_items(trans, root->log_root, inode,
4831 truncate_offset,
4832 BTRFS_EXTENT_DATA_KEY);
4833 if (ret)
4834 goto out;
0e56315c 4835 dropped_extents = true;
4836 }
4837 if (ins_nr == 0)
4838 start_slot = slot;
4839 ins_nr++;
4840 path->slots[0]++;
4841 if (!dst_path) {
4842 dst_path = btrfs_alloc_path();
4843 if (!dst_path) {
4844 ret = -ENOMEM;
4845 goto out;
4846 }
4847 }
4848 }
0bc2d3c0 4849 if (ins_nr > 0)
0e56315c 4850 ret = copy_items(trans, inode, dst_path, path,
31d11b83 4851 start_slot, ins_nr, 1, 0);
4852out:
4853 btrfs_release_path(path);
4854 btrfs_free_path(dst_path);
4855 return ret;
4856}
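
/*
 * Illustrative sketch (hypothetical helper): the truncate_offset logic
 * above in one place - if a prealloc extent starts before i_size and
 * ends beyond it, truncating the log at i_size would cut that extent
 * short, so the truncation point moves up to the extent's end.
 */
static u64 log_truncate_offset(u64 i_size, u64 extent_start, u64 extent_end)
{
	if (extent_start < i_size && extent_end > i_size)
		return extent_end;
	return i_size;
}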
4857
5dc562c5 4858static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
9d122629 4859 struct btrfs_inode *inode,
827463c4 4860 struct btrfs_path *path,
48778179 4861 struct btrfs_log_ctx *ctx)
5dc562c5 4862{
4863 struct btrfs_ordered_extent *ordered;
4864 struct btrfs_ordered_extent *tmp;
4865 struct extent_map *em, *n;
4866 struct list_head extents;
9d122629 4867 struct extent_map_tree *tree = &inode->extent_tree;
5dc562c5 4868 int ret = 0;
2ab28f32 4869 int num = 0;
4870
4871 INIT_LIST_HEAD(&extents);
4872
5dc562c5 4873 write_lock(&tree->lock);
4874
4875 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4876 list_del_init(&em->list);
4877 /*
4878 * Just an arbitrary number, this can be really CPU intensive
4879 * once we start getting a lot of extents, and really once we
4880 * have a bunch of extents we just want to commit since it will
4881 * be faster.
4882 */
4883 if (++num > 32768) {
4884 list_del_init(&tree->modified_extents);
4885 ret = -EFBIG;
4886 goto process;
4887 }
4888
5f96bfb7 4889 if (em->generation < trans->transid)
5dc562c5 4890 continue;
8c6c5928 4891
4892 /* We log prealloc extents beyond eof later. */
4893 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4894 em->start >= i_size_read(&inode->vfs_inode))
4895 continue;
4896
ff44c6e3 4897 /* Need a ref to keep it from getting evicted from cache */
490b54d6 4898 refcount_inc(&em->refs);
ff44c6e3 4899 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
5dc562c5 4900 list_add_tail(&em->list, &extents);
2ab28f32 4901 num++;
4902 }
4903
4904 list_sort(NULL, &extents, extent_cmp);
2ab28f32 4905process:
4906 while (!list_empty(&extents)) {
4907 em = list_entry(extents.next, struct extent_map, list);
4908
4909 list_del_init(&em->list);
4910
4911 /*
4912 * If we had an error we just need to delete everybody from our
4913 * private list.
4914 */
ff44c6e3 4915 if (ret) {
201a9038 4916 clear_em_logging(tree, em);
ff44c6e3 4917 free_extent_map(em);
5dc562c5 4918 continue;
4919 }
4920
4921 write_unlock(&tree->lock);
5dc562c5 4922
90d04510 4923 ret = log_one_extent(trans, inode, em, path, ctx);
ff44c6e3 4924 write_lock(&tree->lock);
4925 clear_em_logging(tree, em);
4926 free_extent_map(em);
5dc562c5 4927 }
4928 WARN_ON(!list_empty(&extents));
4929 write_unlock(&tree->lock);
5dc562c5 4930
4931 if (!ret)
4932 ret = btrfs_log_prealloc_extents(trans, inode, path);
4933 if (ret)
4934 return ret;
31d11b83 4935
4936 /*
4937 * We have logged all extents successfully, now make sure the commit of
4938 * the current transaction waits for the ordered extents to complete
4939 * before it commits and wipes out the log trees, otherwise we would
4940 * lose data if an ordered extent completes after the transaction
4941 * commits and a power failure happens after the transaction commit.
4942 */
4943 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4944 list_del_init(&ordered->log_list);
4945 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4946
4947 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4948 spin_lock_irq(&inode->ordered_tree.lock);
4949 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4950 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4951 atomic_inc(&trans->transaction->pending_ordered);
4952 }
4953 spin_unlock_irq(&inode->ordered_tree.lock);
4954 }
4955 btrfs_put_ordered_extent(ordered);
4956 }
4957
4958 return 0;
4959}
4960
481b01c0 4961static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4962 struct btrfs_path *path, u64 *size_ret)
4963{
4964 struct btrfs_key key;
4965 int ret;
4966
481b01c0 4967 key.objectid = btrfs_ino(inode);
4968 key.type = BTRFS_INODE_ITEM_KEY;
4969 key.offset = 0;
4970
4971 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4972 if (ret < 0) {
4973 return ret;
4974 } else if (ret > 0) {
2f2ff0ee 4975 *size_ret = 0;
4976 } else {
4977 struct btrfs_inode_item *item;
4978
4979 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4980 struct btrfs_inode_item);
4981 *size_ret = btrfs_inode_size(path->nodes[0], item);
4982 /*
4983 * If the in-memory inode's i_size is smaller than the inode
4984 * size stored in the btree, return the inode's i_size, so
4985 * that we get a correct inode size after replaying the log
4986 * when before a power failure we had a shrinking truncate
4987 * followed by addition of a new name (rename / new hard link).
4988 * Otherwise return the inode size from the btree, to avoid
4989 * data loss when replaying a log due to previously doing a
4990 * write that expands the inode's size and logging a new name
4991 * immediately after.
4992 */
4993 if (*size_ret > inode->vfs_inode.i_size)
4994 *size_ret = inode->vfs_inode.i_size;
1a4bcf47
FM
4995 }
4996
4997 btrfs_release_path(path);
4998 return 0;
4999}
5000
36283bf7
FM
5001/*
5002 * At the moment we always log all xattrs. This is to figure out at log replay
5003 * time which xattrs must have their deletion replayed. If a xattr is missing
5004 * in the log tree and exists in the fs/subvol tree, we delete it. This is
5005 * because if a xattr is deleted, the inode is fsynced and a power failure
5006 * happens, causing the log to be replayed the next time the fs is mounted,
5007 * we want the xattr to not exist anymore (same behaviour as other filesystems
5008 * with a journal, ext3/4, xfs, f2fs, etc).
5009 */
5010static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
1a93c36a 5011 struct btrfs_inode *inode,
36283bf7
FM
5012 struct btrfs_path *path,
5013 struct btrfs_path *dst_path)
5014{
90d04510 5015 struct btrfs_root *root = inode->root;
36283bf7
FM
5016 int ret;
5017 struct btrfs_key key;
1a93c36a 5018 const u64 ino = btrfs_ino(inode);
36283bf7
FM
5019 int ins_nr = 0;
5020 int start_slot = 0;
f2f121ab
FM
5021 bool found_xattrs = false;
5022
5023 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
5024 return 0;
36283bf7
FM
5025
5026 key.objectid = ino;
5027 key.type = BTRFS_XATTR_ITEM_KEY;
5028 key.offset = 0;
5029
5030 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5031 if (ret < 0)
5032 return ret;
5033
5034 while (true) {
5035 int slot = path->slots[0];
5036 struct extent_buffer *leaf = path->nodes[0];
5037 int nritems = btrfs_header_nritems(leaf);
5038
5039 if (slot >= nritems) {
5040 if (ins_nr > 0) {
1a93c36a 5041 ret = copy_items(trans, inode, dst_path, path,
0e56315c 5042 start_slot, ins_nr, 1, 0);
36283bf7
FM
5043 if (ret < 0)
5044 return ret;
5045 ins_nr = 0;
5046 }
5047 ret = btrfs_next_leaf(root, path);
5048 if (ret < 0)
5049 return ret;
5050 else if (ret > 0)
5051 break;
5052 continue;
5053 }
5054
5055 btrfs_item_key_to_cpu(leaf, &key, slot);
5056 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
5057 break;
5058
5059 if (ins_nr == 0)
5060 start_slot = slot;
5061 ins_nr++;
5062 path->slots[0]++;
f2f121ab 5063 found_xattrs = true;
36283bf7
FM
5064 cond_resched();
5065 }
5066 if (ins_nr > 0) {
1a93c36a 5067 ret = copy_items(trans, inode, dst_path, path,
0e56315c 5068 start_slot, ins_nr, 1, 0);
36283bf7
FM
5069 if (ret < 0)
5070 return ret;
5071 }
5072
f2f121ab
FM
5073 if (!found_xattrs)
5074 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
5075
36283bf7
FM
5076 return 0;
5077}
5078
a89ca6f2 5079/*
0e56315c
FM
5080 * When using the NO_HOLES feature if we punched a hole that causes the
5081 * deletion of entire leafs or all the extent items of the first leaf (the one
5082 * that contains the inode item and references) we may end up not processing
5083 * any extents, because there are no leafs with a generation matching the
5084 * current transaction that have extent items for our inode. So we need to find
5085 * if any holes exist and then log them. We also need to log holes after any
5086 * truncate operation that changes the inode's size.
a89ca6f2 5087 */
0e56315c 5088static int btrfs_log_holes(struct btrfs_trans_handle *trans,
0e56315c 5089 struct btrfs_inode *inode,
7af59743 5090 struct btrfs_path *path)
a89ca6f2 5091{
90d04510 5092 struct btrfs_root *root = inode->root;
0b246afa 5093 struct btrfs_fs_info *fs_info = root->fs_info;
a89ca6f2 5094 struct btrfs_key key;
a0308dd7
NB
5095 const u64 ino = btrfs_ino(inode);
5096 const u64 i_size = i_size_read(&inode->vfs_inode);
7af59743 5097 u64 prev_extent_end = 0;
0e56315c 5098 int ret;
a89ca6f2 5099
0e56315c 5100 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
a89ca6f2
FM
5101 return 0;
5102
5103 key.objectid = ino;
5104 key.type = BTRFS_EXTENT_DATA_KEY;
7af59743 5105 key.offset = 0;
a89ca6f2
FM
5106
5107 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
a89ca6f2
FM
5108 if (ret < 0)
5109 return ret;
5110
0e56315c 5111 while (true) {
0e56315c 5112 struct extent_buffer *leaf = path->nodes[0];
a89ca6f2 5113
0e56315c
FM
5114 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5115 ret = btrfs_next_leaf(root, path);
5116 if (ret < 0)
5117 return ret;
5118 if (ret > 0) {
5119 ret = 0;
5120 break;
5121 }
5122 leaf = path->nodes[0];
5123 }
5124
5125 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5126 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
5127 break;
5128
5129 /* We have a hole, log it. */
5130 if (prev_extent_end < key.offset) {
7af59743 5131 const u64 hole_len = key.offset - prev_extent_end;
0e56315c
FM
5132
5133 /*
5134 * Release the path to avoid deadlocks with other code
5135 * paths that search the root while holding locks on
5136 * leafs from the log root.
5137 */
5138 btrfs_release_path(path);
d1f68ba0
OS
5139 ret = btrfs_insert_hole_extent(trans, root->log_root,
5140 ino, prev_extent_end,
5141 hole_len);
0e56315c
FM
5142 if (ret < 0)
5143 return ret;
5144
5145 /*
5146 * Search for the same key again in the root. Since it's
5147 * an extent item and we are holding the inode lock, the
5148 * key must still exist. If it doesn't just emit warning
5149 * and return an error to fall back to a transaction
5150 * commit.
5151 */
5152 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5153 if (ret < 0)
5154 return ret;
5155 if (WARN_ON(ret > 0))
5156 return -ENOENT;
5157 leaf = path->nodes[0];
5158 }
a89ca6f2 5159
7af59743 5160 prev_extent_end = btrfs_file_extent_end(path);
0e56315c
FM
5161 path->slots[0]++;
5162 cond_resched();
a89ca6f2 5163 }
a89ca6f2 5164
7af59743 5165 if (prev_extent_end < i_size) {
0e56315c 5166 u64 hole_len;
a89ca6f2 5167
0e56315c 5168 btrfs_release_path(path);
7af59743 5169 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
d1f68ba0
OS
5170 ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
5171 prev_extent_end, hole_len);
0e56315c
FM
5172 if (ret < 0)
5173 return ret;
5174 }
5175
5176 return 0;
a89ca6f2
FM
5177}
5178
56f23fdb
FM
5179/*
5180 * When we are logging a new inode X, check if it doesn't have a reference that
5181 * matches the reference from some other inode Y created in a past transaction
5182 * and that was renamed in the current transaction. If we don't do this, then at
5183 * log replay time we can lose inode Y (and all its files if it's a directory):
5184 *
5185 * mkdir /mnt/x
5186 * echo "hello world" > /mnt/x/foobar
5187 * sync
5188 * mv /mnt/x /mnt/y
5189 * mkdir /mnt/x # or touch /mnt/x
5190 * xfs_io -c fsync /mnt/x
5191 * <power fail>
5192 * mount fs, trigger log replay
5193 *
5194 * After the log replay procedure, we would lose the first directory and all its
5195 * files (file foobar).
5196 * For the case where inode Y is not a directory we simply end up losing it:
5197 *
5198 * echo "123" > /mnt/foo
5199 * sync
5200 * mv /mnt/foo /mnt/bar
5201 * echo "abc" > /mnt/foo
5202 * xfs_io -c fsync /mnt/foo
5203 * <power fail>
5204 *
5205 * We also need this for cases where a snapshot entry is replaced by some other
5206 * entry (file or directory) otherwise we end up with an unreplayable log due to
5207 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
5208 * if it were a regular entry:
5209 *
5210 * mkdir /mnt/x
5211 * btrfs subvolume snapshot /mnt /mnt/x/snap
5212 * btrfs subvolume delete /mnt/x/snap
5213 * rmdir /mnt/x
5214 * mkdir /mnt/x
5215 * fsync /mnt/x or fsync some new file inside it
5216 * <power fail>
5217 *
5218 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
5219 * the same transaction.
5220 */
5221static int btrfs_check_ref_name_override(struct extent_buffer *eb,
5222 const int slot,
5223 const struct btrfs_key *key,
4791c8f1 5224 struct btrfs_inode *inode,
a3baaf0d 5225 u64 *other_ino, u64 *other_parent)
56f23fdb
FM
5226{
5227 int ret;
5228 struct btrfs_path *search_path;
5229 char *name = NULL;
5230 u32 name_len = 0;
3212fa14 5231 u32 item_size = btrfs_item_size(eb, slot);
56f23fdb
FM
5232 u32 cur_offset = 0;
5233 unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
5234
5235 search_path = btrfs_alloc_path();
5236 if (!search_path)
5237 return -ENOMEM;
5238 search_path->search_commit_root = 1;
5239 search_path->skip_locking = 1;
5240
5241 while (cur_offset < item_size) {
5242 u64 parent;
5243 u32 this_name_len;
5244 u32 this_len;
5245 unsigned long name_ptr;
5246 struct btrfs_dir_item *di;
6db75318 5247 struct fscrypt_str name_str;
56f23fdb
FM
5248
5249 if (key->type == BTRFS_INODE_REF_KEY) {
5250 struct btrfs_inode_ref *iref;
5251
5252 iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
5253 parent = key->offset;
5254 this_name_len = btrfs_inode_ref_name_len(eb, iref);
5255 name_ptr = (unsigned long)(iref + 1);
5256 this_len = sizeof(*iref) + this_name_len;
5257 } else {
5258 struct btrfs_inode_extref *extref;
5259
5260 extref = (struct btrfs_inode_extref *)(ptr +
5261 cur_offset);
5262 parent = btrfs_inode_extref_parent(eb, extref);
5263 this_name_len = btrfs_inode_extref_name_len(eb, extref);
5264 name_ptr = (unsigned long)&extref->name;
5265 this_len = sizeof(*extref) + this_name_len;
5266 }
5267
5268 if (this_name_len > name_len) {
5269 char *new_name;
5270
5271 new_name = krealloc(name, this_name_len, GFP_NOFS);
5272 if (!new_name) {
5273 ret = -ENOMEM;
5274 goto out;
5275 }
5276 name_len = this_name_len;
5277 name = new_name;
5278 }
5279
5280 read_extent_buffer(eb, name, name_ptr, this_name_len);
e43eec81
STD
5281
5282 name_str.name = name;
5283 name_str.len = this_name_len;
4791c8f1 5284 di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
e43eec81 5285 parent, &name_str, 0);
56f23fdb 5286 if (di && !IS_ERR(di)) {
44f714da
FM
5287 struct btrfs_key di_key;
5288
5289 btrfs_dir_item_key_to_cpu(search_path->nodes[0],
5290 di, &di_key);
5291 if (di_key.type == BTRFS_INODE_ITEM_KEY) {
6b5fc433
FM
5292 if (di_key.objectid != key->objectid) {
5293 ret = 1;
5294 *other_ino = di_key.objectid;
a3baaf0d 5295 *other_parent = parent;
6b5fc433
FM
5296 } else {
5297 ret = 0;
5298 }
44f714da
FM
5299 } else {
5300 ret = -EAGAIN;
5301 }
56f23fdb
FM
5302 goto out;
5303 } else if (IS_ERR(di)) {
5304 ret = PTR_ERR(di);
5305 goto out;
5306 }
5307 btrfs_release_path(search_path);
5308
5309 cur_offset += this_len;
5310 }
5311 ret = 0;
5312out:
5313 btrfs_free_path(search_path);
5314 kfree(name);
5315 return ret;
5316}
5317
a3751024
FM
5318/*
5319 * Check if we need to log an inode. This is used in contexts where while
5320 * logging an inode we need to log another inode (either that it exists or in
5321 * full mode). This is used instead of btrfs_inode_in_log() because the later
5322 * requires the inode to be in the log and have the log transaction committed,
5323 * while here we do not care if the log transaction was already committed - our
5324 * caller will commit the log later - and we want to avoid logging an inode
5325 * multiple times when multiple tasks have joined the same log transaction.
5326 */
5327static bool need_log_inode(const struct btrfs_trans_handle *trans,
5328 const struct btrfs_inode *inode)
5329{
5330 /*
5331 * If a directory was not modified, no dentries added or removed, we can
5332 * and should avoid logging it.
5333 */
5334 if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5335 return false;
5336
5337 /*
5338 * If this inode does not have new/updated/deleted xattrs since the last
5339 * time it was logged and is flagged as logged in the current transaction,
5340 * we can skip logging it. As for new/deleted names, those are updated in
5341 * the log by link/unlink/rename operations.
5342 * In case the inode was logged and then evicted and reloaded, its
5343 * logged_trans will be 0, in which case we have to fully log it since
5344 * logged_trans is a transient field, not persisted.
5345 */
5346 if (inode->logged_trans == trans->transid &&
5347 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5348 return false;
5349
5350 return true;
5351}
5352
f6d86dbe
FM
5353struct btrfs_dir_list {
5354 u64 ino;
5355 struct list_head list;
5356};
5357
5358/*
5359 * Log the inodes of the new dentries of a directory.
5360 * See process_dir_items_leaf() for details about why it is needed.
5361 * This is a recursive operation - if an existing dentry corresponds to a
5362 * directory, that directory's new entries are logged too (same behaviour as
5363 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5364 * the dentries point to we do not acquire their VFS lock, otherwise lockdep
5365 * complains about the following circular lock dependency / possible deadlock:
5366 *
5367 * CPU0 CPU1
5368 * ---- ----
5369 * lock(&type->i_mutex_dir_key#3/2);
5370 * lock(sb_internal#2);
5371 * lock(&type->i_mutex_dir_key#3/2);
5372 * lock(&sb->s_type->i_mutex_key#14);
5373 *
5374 * Where sb_internal is the lock (a counter that works as a lock) acquired by
5375 * sb_start_intwrite() in btrfs_start_transaction().
5376 * Not acquiring the VFS lock of the inodes is still safe because:
5377 *
5378 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5379 * that while logging the inode new references (names) are added or removed
5380 * from the inode, leaving the logged inode item with a link count that does
5381 * not match the number of logged inode reference items. This is fine because
5382 * at log replay time we compute the real number of links and correct the
5383 * link count in the inode item (see replay_one_buffer() and
5384 * link_to_fixup_dir());
5385 *
5386 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5387 * while logging the inode's items new index items (key type
5388 * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
5389 * has a size that doesn't match the sum of the lengths of all the logged
5390 * names - this is ok, not a problem, because at log replay time we set the
5391 * directory's i_size to the correct value (see replay_one_name() and
5392 * do_overwrite_item()).
5393 */
5394static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5395 struct btrfs_inode *start_inode,
5396 struct btrfs_log_ctx *ctx)
5397{
5398 struct btrfs_root *root = start_inode->root;
5399 struct btrfs_fs_info *fs_info = root->fs_info;
5400 struct btrfs_path *path;
5401 LIST_HEAD(dir_list);
5402 struct btrfs_dir_list *dir_elem;
5403 u64 ino = btrfs_ino(start_inode);
5404 int ret = 0;
5405
5406 /*
5407 * If we are logging a new name, as part of a link or rename operation,
5408 * don't bother logging new dentries, as we just want to log the names
5409 * of an inode and that any new parents exist.
5410 */
5411 if (ctx->logging_new_name)
5412 return 0;
5413
5414 path = btrfs_alloc_path();
5415 if (!path)
5416 return -ENOMEM;
5417
5418 while (true) {
5419 struct extent_buffer *leaf;
5420 struct btrfs_key min_key;
5421 bool continue_curr_inode = true;
5422 int nritems;
5423 int i;
5424
5425 min_key.objectid = ino;
5426 min_key.type = BTRFS_DIR_INDEX_KEY;
5427 min_key.offset = 0;
5428again:
5429 btrfs_release_path(path);
5430 ret = btrfs_search_forward(root, &min_key, path, trans->transid);
5431 if (ret < 0) {
5432 break;
5433 } else if (ret > 0) {
5434 ret = 0;
5435 goto next;
5436 }
5437
5438 leaf = path->nodes[0];
5439 nritems = btrfs_header_nritems(leaf);
5440 for (i = path->slots[0]; i < nritems; i++) {
5441 struct btrfs_dir_item *di;
5442 struct btrfs_key di_key;
5443 struct inode *di_inode;
5444 int log_mode = LOG_INODE_EXISTS;
5445 int type;
5446
5447 btrfs_item_key_to_cpu(leaf, &min_key, i);
5448 if (min_key.objectid != ino ||
5449 min_key.type != BTRFS_DIR_INDEX_KEY) {
5450 continue_curr_inode = false;
5451 break;
5452 }
5453
5454 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
94a48aef 5455 type = btrfs_dir_ftype(leaf, di);
f6d86dbe
FM
5456 if (btrfs_dir_transid(leaf, di) < trans->transid)
5457 continue;
5458 btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5459 if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5460 continue;
5461
5462 btrfs_release_path(path);
5463 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
5464 if (IS_ERR(di_inode)) {
5465 ret = PTR_ERR(di_inode);
5466 goto out;
5467 }
5468
5469 if (!need_log_inode(trans, BTRFS_I(di_inode))) {
5470 btrfs_add_delayed_iput(di_inode);
5471 break;
5472 }
5473
5474 ctx->log_new_dentries = false;
5475 if (type == BTRFS_FT_DIR)
5476 log_mode = LOG_INODE_ALL;
5477 ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
5478 log_mode, ctx);
5479 btrfs_add_delayed_iput(di_inode);
5480 if (ret)
5481 goto out;
5482 if (ctx->log_new_dentries) {
5483 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5484 if (!dir_elem) {
5485 ret = -ENOMEM;
5486 goto out;
5487 }
5488 dir_elem->ino = di_key.objectid;
5489 list_add_tail(&dir_elem->list, &dir_list);
5490 }
5491 break;
5492 }
5493
5494 if (continue_curr_inode && min_key.offset < (u64)-1) {
5495 min_key.offset++;
5496 goto again;
5497 }
5498
5499next:
5500 if (list_empty(&dir_list))
5501 break;
5502
5503 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
5504 ino = dir_elem->ino;
5505 list_del(&dir_elem->list);
5506 kfree(dir_elem);
5507 }
5508out:
5509 btrfs_free_path(path);
5510 if (ret) {
5511 struct btrfs_dir_list *next;
5512
5513 list_for_each_entry_safe(dir_elem, next, &dir_list, list)
5514 kfree(dir_elem);
5515 }
5516
5517 return ret;
5518}
5519
6b5fc433
FM
5520struct btrfs_ino_list {
5521 u64 ino;
a3baaf0d 5522 u64 parent;
6b5fc433
FM
5523 struct list_head list;
5524};
5525
e09d94c9
FM
5526static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
5527{
5528 struct btrfs_ino_list *curr;
5529 struct btrfs_ino_list *next;
5530
5531 list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
5532 list_del(&curr->list);
5533 kfree(curr);
5534 }
5535}
5536
5557a069
FM
5537static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
5538 struct btrfs_path *path)
5539{
5540 struct btrfs_key key;
5541 int ret;
5542
5543 key.objectid = ino;
5544 key.type = BTRFS_INODE_ITEM_KEY;
5545 key.offset = 0;
5546
5547 path->search_commit_root = 1;
5548 path->skip_locking = 1;
5549
5550 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5551 if (WARN_ON_ONCE(ret > 0)) {
5552 /*
5553 * We have previously found the inode through the commit root
5554 * so this should not happen. If it does, just error out and
5555 * fallback to a transaction commit.
5556 */
5557 ret = -ENOENT;
5558 } else if (ret == 0) {
5559 struct btrfs_inode_item *item;
5560
5561 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5562 struct btrfs_inode_item);
5563 if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
5564 ret = 1;
5565 }
5566
5567 btrfs_release_path(path);
5568 path->search_commit_root = 0;
5569 path->skip_locking = 0;
5570
5571 return ret;
5572}
5573
e09d94c9
FM
5574static int add_conflicting_inode(struct btrfs_trans_handle *trans,
5575 struct btrfs_root *root,
5557a069 5576 struct btrfs_path *path,
e09d94c9
FM
5577 u64 ino, u64 parent,
5578 struct btrfs_log_ctx *ctx)
6b5fc433
FM
5579{
5580 struct btrfs_ino_list *ino_elem;
e09d94c9
FM
5581 struct inode *inode;
5582
5583 /*
5584 * It's rare to have a lot of conflicting inodes, in practice it is not
5585 * common to have more than 1 or 2. We don't want to collect too many,
5586 * as we could end up logging too many inodes (even if only in
5587 * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
5588 * commits.
5589 */
5590 if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
5591 return BTRFS_LOG_FORCE_COMMIT;
5592
5593 inode = btrfs_iget(root->fs_info->sb, ino, root);
5594 /*
5595 * If the other inode that had a conflicting dir entry was deleted in
5557a069
FM
5596 * the current transaction then we either:
5597 *
5598 * 1) Log the parent directory (later after adding it to the list) if
5599 * the inode is a directory. This is because it may be a deleted
5600 * subvolume/snapshot or it may be a regular directory that had
5601 * deleted subvolumes/snapshots (or subdirectories that had them),
5602 * and at the moment we can't deal with dropping subvolumes/snapshots
5603 * during log replay. So we just log the parent, which will result in
5604 * a fallback to a transaction commit if we are dealing with those
5605 * cases (last_unlink_trans will match the current transaction);
5606 *
5607 * 2) Do nothing if it's not a directory. During log replay we simply
5608 * unlink the conflicting dentry from the parent directory and then
5609 * add the dentry for our inode. Like this we can avoid logging the
5610 * parent directory (and maybe fallback to a transaction commit in
5611 * case it has a last_unlink_trans == trans->transid, due to moving
5612 * some inode from it to some other directory).
e09d94c9
FM
5613 */
5614 if (IS_ERR(inode)) {
5615 int ret = PTR_ERR(inode);
5616
5617 if (ret != -ENOENT)
5618 return ret;
5619
5557a069
FM
5620 ret = conflicting_inode_is_dir(root, ino, path);
5621 /* Not a directory or we got an error. */
5622 if (ret <= 0)
5623 return ret;
5624
5625 /* Conflicting inode is a directory, so we'll log its parent. */
e09d94c9
FM
5626 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5627 if (!ino_elem)
5628 return -ENOMEM;
5629 ino_elem->ino = ino;
5630 ino_elem->parent = parent;
5631 list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
5632 ctx->num_conflict_inodes++;
5633
5634 return 0;
5635 }
5636
5637 /*
5638 * If the inode was already logged skip it - otherwise we can hit an
5639 * infinite loop. Example:
5640 *
5641 * From the commit root (previous transaction) we have the following
5642 * inodes:
5643 *
5644 * inode 257 a directory
5645 * inode 258 with references "zz" and "zz_link" on inode 257
5646 * inode 259 with reference "a" on inode 257
5647 *
5648 * And in the current (uncommitted) transaction we have:
5649 *
5650 * inode 257 a directory, unchanged
5651 * inode 258 with references "a" and "a2" on inode 257
5652 * inode 259 with reference "zz_link" on inode 257
5653 * inode 261 with reference "zz" on inode 257
5654 *
5655 * When logging inode 261 the following infinite loop could
5656 * happen if we don't skip already logged inodes:
5657 *
5658 * - we detect inode 258 as a conflicting inode, with inode 261
5659 * on reference "zz", and log it;
5660 *
5661 * - we detect inode 259 as a conflicting inode, with inode 258
5662 * on reference "a", and log it;
5663 *
5664 * - we detect inode 258 as a conflicting inode, with inode 259
5665 * on reference "zz_link", and log it - again! After this we
5666 * repeat the above steps forever.
5667 *
5668 * Here we can use need_log_inode() because we only need to log the
5669 * inode in LOG_INODE_EXISTS mode and rename operations update the log,
5670 * so that the log ends up with the new name and without the old name.
5671 */
5672 if (!need_log_inode(trans, BTRFS_I(inode))) {
5673 btrfs_add_delayed_iput(inode);
5674 return 0;
5675 }
5676
5677 btrfs_add_delayed_iput(inode);
6b5fc433
FM
5678
5679 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5680 if (!ino_elem)
5681 return -ENOMEM;
5682 ino_elem->ino = ino;
a3baaf0d 5683 ino_elem->parent = parent;
e09d94c9
FM
5684 list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
5685 ctx->num_conflict_inodes++;
6b5fc433 5686
e09d94c9
FM
5687 return 0;
5688}
6b5fc433 5689
e09d94c9
FM
5690static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
5691 struct btrfs_root *root,
5692 struct btrfs_log_ctx *ctx)
5693{
5694 struct btrfs_fs_info *fs_info = root->fs_info;
5695 int ret = 0;
6b5fc433 5696
e09d94c9
FM
5697 /*
5698 * Conflicting inodes are logged by the first call to btrfs_log_inode(),
5699 * otherwise we could have unbounded recursion of btrfs_log_inode()
5700 * calls. This check guarantees we can have only 1 level of recursion.
5701 */
5702 if (ctx->logging_conflict_inodes)
5703 return 0;
5704
5705 ctx->logging_conflict_inodes = true;
5706
5707 /*
5708 * New conflicting inodes may be found and added to the list while we
5709 * are logging a conflicting inode, so keep iterating while the list is
5710 * not empty.
5711 */
5712 while (!list_empty(&ctx->conflict_inodes)) {
5713 struct btrfs_ino_list *curr;
5714 struct inode *inode;
5715 u64 ino;
5716 u64 parent;
5717
5718 curr = list_first_entry(&ctx->conflict_inodes,
5719 struct btrfs_ino_list, list);
5720 ino = curr->ino;
5721 parent = curr->parent;
5722 list_del(&curr->list);
5723 kfree(curr);
6b5fc433 5724
0202e83f 5725 inode = btrfs_iget(fs_info->sb, ino, root);
6b5fc433
FM
5726 /*
5727 * If the other inode that had a conflicting dir entry was
a3baaf0d 5728 * deleted in the current transaction, we need to log its parent
e09d94c9 5729 * directory. See the comment at add_conflicting_inode().
6b5fc433
FM
5730 */
5731 if (IS_ERR(inode)) {
5732 ret = PTR_ERR(inode);
e09d94c9
FM
5733 if (ret != -ENOENT)
5734 break;
5735
5736 inode = btrfs_iget(fs_info->sb, parent, root);
5737 if (IS_ERR(inode)) {
5738 ret = PTR_ERR(inode);
5739 break;
a3baaf0d 5740 }
e09d94c9
FM
5741
5742 /*
5743 * Always log the directory, we cannot make this
5744 * conditional on need_log_inode() because the directory
5745 * might have been logged in LOG_INODE_EXISTS mode or
5746 * the dir index of the conflicting inode is not in a
5747 * dir index key range logged for the directory. So we
5748 * must make sure the deletion is recorded.
5749 */
5750 ret = btrfs_log_inode(trans, BTRFS_I(inode),
5751 LOG_INODE_ALL, ctx);
5752 btrfs_add_delayed_iput(inode);
5753 if (ret)
5754 break;
6b5fc433
FM
5755 continue;
5756 }
e09d94c9 5757
b5e4ff9d 5758 /*
e09d94c9
FM
5759 * Here we can use need_log_inode() because we only need to log
5760 * the inode in LOG_INODE_EXISTS mode and rename operations
5761 * update the log, so that the log ends up with the new name and
5762 * without the old name.
b5e4ff9d 5763 *
e09d94c9
FM
5764 * We did this check at add_conflicting_inode(), but here we do
5765 * it again because if some other task logged the inode after
5766 * that, we can avoid doing it again.
b5e4ff9d 5767 */
e09d94c9 5768 if (!need_log_inode(trans, BTRFS_I(inode))) {
b5e4ff9d
FM
5769 btrfs_add_delayed_iput(inode);
5770 continue;
5771 }
e09d94c9 5772
6b5fc433
FM
5773 /*
5774 * We are safe logging the other inode without acquiring its
5775 * lock as long as we log with the LOG_INODE_EXISTS mode. We
5776 * are safe against concurrent renames of the other inode as
5777 * well because during a rename we pin the log and update the
5778 * log with the new name before we unpin it.
5779 */
e09d94c9 5780 ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
410f954c 5781 btrfs_add_delayed_iput(inode);
e09d94c9
FM
5782 if (ret)
5783 break;
6b5fc433
FM
5784 }
5785
e09d94c9
FM
5786 ctx->logging_conflict_inodes = false;
5787 if (ret)
5788 free_conflicting_inodes(ctx);
5789
6b5fc433
FM
5790 return ret;
5791}
5792
da447009
FM
5793static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5794 struct btrfs_inode *inode,
5795 struct btrfs_key *min_key,
5796 const struct btrfs_key *max_key,
5797 struct btrfs_path *path,
5798 struct btrfs_path *dst_path,
5799 const u64 logged_isize,
da447009
FM
5800 const int inode_only,
5801 struct btrfs_log_ctx *ctx,
5802 bool *need_log_inode_item)
5803{
d9947887 5804 const u64 i_size = i_size_read(&inode->vfs_inode);
da447009
FM
5805 struct btrfs_root *root = inode->root;
5806 int ins_start_slot = 0;
5807 int ins_nr = 0;
5808 int ret;
5809
5810 while (1) {
5811 ret = btrfs_search_forward(root, min_key, path, trans->transid);
5812 if (ret < 0)
5813 return ret;
5814 if (ret > 0) {
5815 ret = 0;
5816 break;
5817 }
5818again:
5819 /* Note, ins_nr might be > 0 here, cleanup outside the loop */
5820 if (min_key->objectid != max_key->objectid)
5821 break;
5822 if (min_key->type > max_key->type)
5823 break;
5824
d9947887 5825 if (min_key->type == BTRFS_INODE_ITEM_KEY) {
da447009 5826 *need_log_inode_item = false;
d9947887
FM
5827 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
5828 min_key->offset >= i_size) {
5829 /*
5830 * Extents at and beyond eof are logged with
5831 * btrfs_log_prealloc_extents().
5832 * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
5833 * and no keys greater than that, so bail out.
5834 */
5835 break;
5836 } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
5837 min_key->type == BTRFS_INODE_EXTREF_KEY) &&
e09d94c9
FM
5838 (inode->generation == trans->transid ||
5839 ctx->logging_conflict_inodes)) {
da447009
FM
5840 u64 other_ino = 0;
5841 u64 other_parent = 0;
5842
5843 ret = btrfs_check_ref_name_override(path->nodes[0],
5844 path->slots[0], min_key, inode,
5845 &other_ino, &other_parent);
5846 if (ret < 0) {
5847 return ret;
289cffcb 5848 } else if (ret > 0 &&
da447009
FM
5849 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
5850 if (ins_nr > 0) {
5851 ins_nr++;
5852 } else {
5853 ins_nr = 1;
5854 ins_start_slot = path->slots[0];
5855 }
5856 ret = copy_items(trans, inode, dst_path, path,
5857 ins_start_slot, ins_nr,
5858 inode_only, logged_isize);
5859 if (ret < 0)
5860 return ret;
5861 ins_nr = 0;
5862
e09d94c9 5863 btrfs_release_path(path);
5557a069 5864 ret = add_conflicting_inode(trans, root, path,
e09d94c9
FM
5865 other_ino,
5866 other_parent, ctx);
da447009
FM
5867 if (ret)
5868 return ret;
da447009
FM
5869 goto next_key;
5870 }
d9947887
FM
5871 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5872 /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
da447009
FM
5873 if (ins_nr == 0)
5874 goto next_slot;
5875 ret = copy_items(trans, inode, dst_path, path,
5876 ins_start_slot,
5877 ins_nr, inode_only, logged_isize);
5878 if (ret < 0)
5879 return ret;
5880 ins_nr = 0;
5881 goto next_slot;
5882 }
5883
5884 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5885 ins_nr++;
5886 goto next_slot;
5887 } else if (!ins_nr) {
5888 ins_start_slot = path->slots[0];
5889 ins_nr = 1;
5890 goto next_slot;
5891 }
5892
5893 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5894 ins_nr, inode_only, logged_isize);
5895 if (ret < 0)
5896 return ret;
5897 ins_nr = 1;
5898 ins_start_slot = path->slots[0];
5899next_slot:
5900 path->slots[0]++;
5901 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5902 btrfs_item_key_to_cpu(path->nodes[0], min_key,
5903 path->slots[0]);
5904 goto again;
5905 }
5906 if (ins_nr) {
5907 ret = copy_items(trans, inode, dst_path, path,
5908 ins_start_slot, ins_nr, inode_only,
5909 logged_isize);
5910 if (ret < 0)
5911 return ret;
5912 ins_nr = 0;
5913 }
5914 btrfs_release_path(path);
5915next_key:
5916 if (min_key->offset < (u64)-1) {
5917 min_key->offset++;
5918 } else if (min_key->type < max_key->type) {
5919 min_key->type++;
5920 min_key->offset = 0;
5921 } else {
5922 break;
5923 }
96acb375
FM
5924
5925 /*
5926 * We may process many leaves full of items for our inode, so
5927 * avoid monopolizing a cpu for too long by rescheduling while
5928 * not holding locks on any tree.
5929 */
5930 cond_resched();
da447009 5931 }
d9947887 5932 if (ins_nr) {
da447009
FM
5933 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5934 ins_nr, inode_only, logged_isize);
d9947887
FM
5935 if (ret)
5936 return ret;
5937 }
5938
5939 if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
5940 /*
5941 * Release the path because otherwise we might attempt to double
5942 * lock the same leaf with btrfs_log_prealloc_extents() below.
5943 */
5944 btrfs_release_path(path);
5945 ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
5946 }
da447009
FM
5947
5948 return ret;
5949}
5950
30b80f3c
FM
5951static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
5952 struct btrfs_root *log,
5953 struct btrfs_path *path,
5954 const struct btrfs_item_batch *batch,
5955 const struct btrfs_delayed_item *first_item)
5956{
5957 const struct btrfs_delayed_item *curr = first_item;
5958 int ret;
5959
5960 ret = btrfs_insert_empty_items(trans, log, path, batch);
5961 if (ret)
5962 return ret;
5963
5964 for (int i = 0; i < batch->nr; i++) {
5965 char *data_ptr;
5966
5967 data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
5968 write_extent_buffer(path->nodes[0], &curr->data,
5969 (unsigned long)data_ptr, curr->data_len);
5970 curr = list_next_entry(curr, log_list);
5971 path->slots[0]++;
5972 }
5973
5974 btrfs_release_path(path);
5975
5976 return 0;
5977}
5978
5979static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
5980 struct btrfs_inode *inode,
5981 struct btrfs_path *path,
5982 const struct list_head *delayed_ins_list,
5983 struct btrfs_log_ctx *ctx)
5984{
5985 /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
5986 const int max_batch_size = 195;
5987 const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
5988 const u64 ino = btrfs_ino(inode);
5989 struct btrfs_root *log = inode->root->log_root;
5990 struct btrfs_item_batch batch = {
5991 .nr = 0,
5992 .total_data_size = 0,
5993 };
5994 const struct btrfs_delayed_item *first = NULL;
5995 const struct btrfs_delayed_item *curr;
5996 char *ins_data;
5997 struct btrfs_key *ins_keys;
5998 u32 *ins_sizes;
5999 u64 curr_batch_size = 0;
6000 int batch_idx = 0;
6001 int ret;
6002
6003 /* We are adding dir index items to the log tree. */
6004 lockdep_assert_held(&inode->log_mutex);
6005
6006 /*
6007 * We collect delayed items before copying index keys from the subvolume
6008 * to the log tree. However just after we collected them, they may have
6009 * been flushed (all of them or just some of them), and therefore we
6010 * could have copied them from the subvolume tree to the log tree.
6011 * So find the first delayed item that was not yet logged (they are
6012 * sorted by index number).
6013 */
6014 list_for_each_entry(curr, delayed_ins_list, log_list) {
6015 if (curr->index > inode->last_dir_index_offset) {
6016 first = curr;
6017 break;
6018 }
6019 }
6020
6021 /* Empty list or all delayed items were already logged. */
6022 if (!first)
6023 return 0;
6024
6025 ins_data = kmalloc(max_batch_size * sizeof(u32) +
6026 max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
6027 if (!ins_data)
6028 return -ENOMEM;
6029 ins_sizes = (u32 *)ins_data;
6030 batch.data_sizes = ins_sizes;
6031 ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
6032 batch.keys = ins_keys;
6033
6034 curr = first;
6035 while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
6036 const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
6037
6038 if (curr_batch_size + curr_size > leaf_data_size ||
6039 batch.nr == max_batch_size) {
6040 ret = insert_delayed_items_batch(trans, log, path,
6041 &batch, first);
6042 if (ret)
6043 goto out;
6044 batch_idx = 0;
6045 batch.nr = 0;
6046 batch.total_data_size = 0;
6047 curr_batch_size = 0;
6048 first = curr;
6049 }
6050
6051 ins_sizes[batch_idx] = curr->data_len;
6052 ins_keys[batch_idx].objectid = ino;
6053 ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
6054 ins_keys[batch_idx].offset = curr->index;
6055 curr_batch_size += curr_size;
6056 batch.total_data_size += curr->data_len;
6057 batch.nr++;
6058 batch_idx++;
6059 curr = list_next_entry(curr, log_list);
6060 }
6061
6062 ASSERT(batch.nr >= 1);
6063 ret = insert_delayed_items_batch(trans, log, path, &batch, first);
6064
6065 curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
6066 log_list);
6067 inode->last_dir_index_offset = curr->index;
6068out:
6069 kfree(ins_data);
6070
6071 return ret;
6072}
6073
6074static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
6075 struct btrfs_inode *inode,
6076 struct btrfs_path *path,
6077 const struct list_head *delayed_del_list,
6078 struct btrfs_log_ctx *ctx)
6079{
6080 const u64 ino = btrfs_ino(inode);
6081 const struct btrfs_delayed_item *curr;
6082
6083 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6084 log_list);
6085
6086 while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6087 u64 first_dir_index = curr->index;
6088 u64 last_dir_index;
6089 const struct btrfs_delayed_item *next;
6090 int ret;
6091
6092 /*
6093 * Find a range of consecutive dir index items to delete. Like
6094 * this we log a single dir range item spanning several contiguous
6095 * dir items instead of logging one range item per dir index item.
6096 */
6097 next = list_next_entry(curr, log_list);
6098 while (!list_entry_is_head(next, delayed_del_list, log_list)) {
6099 if (next->index != curr->index + 1)
6100 break;
6101 curr = next;
6102 next = list_next_entry(next, log_list);
6103 }
6104
6105 last_dir_index = curr->index;
6106 ASSERT(last_dir_index >= first_dir_index);
6107
6108 ret = insert_dir_log_key(trans, inode->root->log_root, path,
6109 ino, first_dir_index, last_dir_index);
6110 if (ret)
6111 return ret;
6112 curr = list_next_entry(curr, log_list);
6113 }
6114
6115 return 0;
6116}
6117
6118static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
6119 struct btrfs_inode *inode,
6120 struct btrfs_path *path,
6121 struct btrfs_log_ctx *ctx,
6122 const struct list_head *delayed_del_list,
6123 const struct btrfs_delayed_item *first,
6124 const struct btrfs_delayed_item **last_ret)
6125{
6126 const struct btrfs_delayed_item *next;
6127 struct extent_buffer *leaf = path->nodes[0];
6128 const int last_slot = btrfs_header_nritems(leaf) - 1;
6129 int slot = path->slots[0] + 1;
6130 const u64 ino = btrfs_ino(inode);
6131
6132 next = list_next_entry(first, log_list);
6133
6134 while (slot < last_slot &&
6135 !list_entry_is_head(next, delayed_del_list, log_list)) {
6136 struct btrfs_key key;
6137
6138 btrfs_item_key_to_cpu(leaf, &key, slot);
6139 if (key.objectid != ino ||
6140 key.type != BTRFS_DIR_INDEX_KEY ||
6141 key.offset != next->index)
6142 break;
6143
6144 slot++;
6145 *last_ret = next;
6146 next = list_next_entry(next, log_list);
6147 }
6148
6149 return btrfs_del_items(trans, inode->root->log_root, path,
6150 path->slots[0], slot - path->slots[0]);
6151}
6152
6153static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
6154 struct btrfs_inode *inode,
6155 struct btrfs_path *path,
6156 const struct list_head *delayed_del_list,
6157 struct btrfs_log_ctx *ctx)
6158{
6159 struct btrfs_root *log = inode->root->log_root;
6160 const struct btrfs_delayed_item *curr;
6161 u64 last_range_start;
6162 u64 last_range_end = 0;
6163 struct btrfs_key key;
6164
6165 key.objectid = btrfs_ino(inode);
6166 key.type = BTRFS_DIR_INDEX_KEY;
6167 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6168 log_list);
6169
6170 while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6171 const struct btrfs_delayed_item *last = curr;
6172 u64 first_dir_index = curr->index;
6173 u64 last_dir_index;
6174 bool deleted_items = false;
6175 int ret;
6176
6177 key.offset = curr->index;
6178 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
6179 if (ret < 0) {
6180 return ret;
6181 } else if (ret == 0) {
6182 ret = batch_delete_dir_index_items(trans, inode, path, ctx,
6183 delayed_del_list, curr,
6184 &last);
6185 if (ret)
6186 return ret;
6187 deleted_items = true;
6188 }
6189
6190 btrfs_release_path(path);
6191
6192 /*
6193 * If we deleted items from the leaf, it means we have a range
6194 * item logging their range, so no need to add one or update an
6195 * existing one. Otherwise we have to log a dir range item.
6196 */
6197 if (deleted_items)
6198 goto next_batch;
6199
6200 last_dir_index = last->index;
6201 ASSERT(last_dir_index >= first_dir_index);
6202 /*
6203 * If this range starts right after where the previous one ends,
6204 * then we want to reuse the previous range item and change its
6205 * end offset to the end of this range. This is just to minimize
6206 * leaf space usage, by avoiding adding a new range item.
6207 */
6208 if (last_range_end != 0 && first_dir_index == last_range_end + 1)
6209 first_dir_index = last_range_start;
6210
6211 ret = insert_dir_log_key(trans, log, path, key.objectid,
6212 first_dir_index, last_dir_index);
6213 if (ret)
6214 return ret;
6215
6216 last_range_start = first_dir_index;
6217 last_range_end = last_dir_index;
6218next_batch:
6219 curr = list_next_entry(last, log_list);
6220 }
6221
6222 return 0;
6223}
6224
6225static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
6226 struct btrfs_inode *inode,
6227 struct btrfs_path *path,
6228 const struct list_head *delayed_del_list,
6229 struct btrfs_log_ctx *ctx)
6230{
6231 /*
6232 * We are deleting dir index items from the log tree or adding range
6233 * items to it.
6234 */
6235 lockdep_assert_held(&inode->log_mutex);
6236
6237 if (list_empty(delayed_del_list))
6238 return 0;
6239
6240 if (ctx->logged_before)
6241 return log_delayed_deletions_incremental(trans, inode, path,
6242 delayed_del_list, ctx);
6243
6244 return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
6245 ctx);
6246}
6247
6248/*
6249 * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
6250 * items instead of the subvolume tree.
6251 */
6252static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
6253 struct btrfs_inode *inode,
6254 const struct list_head *delayed_ins_list,
6255 struct btrfs_log_ctx *ctx)
6256{
6257 const bool orig_log_new_dentries = ctx->log_new_dentries;
6258 struct btrfs_fs_info *fs_info = trans->fs_info;
6259 struct btrfs_delayed_item *item;
6260 int ret = 0;
6261
6262 /*
6263 * No need for the log mutex, plus to avoid potential deadlocks or
6264 * lockdep annotations due to nesting of delayed inode mutexes and log
6265 * mutexes.
6266 */
6267 lockdep_assert_not_held(&inode->log_mutex);
6268
6269 ASSERT(!ctx->logging_new_delayed_dentries);
6270 ctx->logging_new_delayed_dentries = true;
6271
6272 list_for_each_entry(item, delayed_ins_list, log_list) {
6273 struct btrfs_dir_item *dir_item;
6274 struct inode *di_inode;
6275 struct btrfs_key key;
6276 int log_mode = LOG_INODE_EXISTS;
6277
6278 dir_item = (struct btrfs_dir_item *)item->data;
6279 btrfs_disk_key_to_cpu(&key, &dir_item->location);
6280
6281 if (key.type == BTRFS_ROOT_ITEM_KEY)
6282 continue;
6283
6284 di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root);
6285 if (IS_ERR(di_inode)) {
6286 ret = PTR_ERR(di_inode);
6287 break;
6288 }
6289
6290 if (!need_log_inode(trans, BTRFS_I(di_inode))) {
6291 btrfs_add_delayed_iput(di_inode);
6292 continue;
6293 }
6294
94a48aef 6295 if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR)
30b80f3c
FM
6296 log_mode = LOG_INODE_ALL;
6297
6298 ctx->log_new_dentries = false;
6299 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
6300
6301 if (!ret && ctx->log_new_dentries)
6302 ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
6303
6304 btrfs_add_delayed_iput(di_inode);
6305
6306 if (ret)
6307 break;
6308 }
6309
6310 ctx->log_new_dentries = orig_log_new_dentries;
6311 ctx->logging_new_delayed_dentries = false;
6312
6313 return ret;
6314}
6315
e02119d5
CM
6316/* log a single inode in the tree log.
6317 * At least one parent directory for this inode must exist in the tree
6318 * or be logged already.
6319 *
6320 * Any items from this inode changed by the current transaction are copied
6321 * to the log tree. An extra reference is taken on any extents in this
6322 * file, allowing us to avoid a whole pile of corner cases around logging
6323 * blocks that have been removed from the tree.
6324 *
6325 * See LOG_INODE_ALL and related defines for a description of what inode_only
6326 * does.
6327 *
6328 * This handles both files and directories.
6329 */
12fcfd22 6330static int btrfs_log_inode(struct btrfs_trans_handle *trans,
90d04510 6331 struct btrfs_inode *inode,
49dae1bc 6332 int inode_only,
8407f553 6333 struct btrfs_log_ctx *ctx)
e02119d5
CM
6334{
6335 struct btrfs_path *path;
6336 struct btrfs_path *dst_path;
6337 struct btrfs_key min_key;
6338 struct btrfs_key max_key;
90d04510 6339 struct btrfs_root *log = inode->root->log_root;
65faced5 6340 int ret;
5dc562c5 6341 bool fast_search = false;
a59108a7
NB
6342 u64 ino = btrfs_ino(inode);
6343 struct extent_map_tree *em_tree = &inode->extent_tree;
1a4bcf47 6344 u64 logged_isize = 0;
e4545de5 6345 bool need_log_inode_item = true;
9a8fca62 6346 bool xattrs_logged = false;
2ac691d8 6347 bool inode_item_dropped = true;
30b80f3c
FM
6348 bool full_dir_logging = false;
6349 LIST_HEAD(delayed_ins_list);
6350 LIST_HEAD(delayed_del_list);
e02119d5 6351
e02119d5 6352 path = btrfs_alloc_path();
5df67083
TI
6353 if (!path)
6354 return -ENOMEM;
e02119d5 6355 dst_path = btrfs_alloc_path();
5df67083
TI
6356 if (!dst_path) {
6357 btrfs_free_path(path);
6358 return -ENOMEM;
6359 }
e02119d5 6360
33345d01 6361 min_key.objectid = ino;
e02119d5
CM
6362 min_key.type = BTRFS_INODE_ITEM_KEY;
6363 min_key.offset = 0;
6364
33345d01 6365 max_key.objectid = ino;
12fcfd22 6366
12fcfd22 6367
5dc562c5 6368 /* today the code can only do partial logging of directories */
a59108a7 6369 if (S_ISDIR(inode->vfs_inode.i_mode) ||
5269b67e 6370 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
a59108a7 6371 &inode->runtime_flags) &&
781feef7 6372 inode_only >= LOG_INODE_EXISTS))
e02119d5
CM
6373 max_key.type = BTRFS_XATTR_ITEM_KEY;
6374 else
6375 max_key.type = (u8)-1;
6376 max_key.offset = (u64)-1;
6377
30b80f3c
FM
6378 if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
6379 full_dir_logging = true;
6380
2c2c452b 6381 /*
30b80f3c
FM
6382 * If we are logging a directory while we are logging dentries of the
6383 * delayed items of some other inode, then we need to flush the delayed
6384 * items of this directory and not log the delayed items directly. This
6385 * is to prevent more than one level of recursion into btrfs_log_inode()
6386 * by having something like this:
6387 *
6388 * $ mkdir -p a/b/c/d/e/f/g/h/...
6389 * $ xfs_io -c "fsync" a
6390 *
6391 * Where all directories in the path did not exist before and are
6392 * created in the current transaction.
6393 * So in such a case we directly log the delayed items of the main
6394 * directory ("a") without flushing them first, while for each of its
6395 * subdirectories we flush their delayed items before logging them.
6396 * This prevents a potential unbounded recursion like this:
6397 *
6398 * btrfs_log_inode()
6399 * log_new_delayed_dentries()
6400 * btrfs_log_inode()
6401 * log_new_delayed_dentries()
6402 * btrfs_log_inode()
6403 * log_new_delayed_dentries()
6404 * (...)
6405 *
6406 * We have thresholds for the maximum number of delayed items to have in
6407 * memory, and once they are hit, the items are flushed asynchronously.
6408 * However the limit is quite high, so lets prevent deep levels of
6409 * recursion to happen by limiting the maximum depth to be 1.
2c2c452b 6410 */
30b80f3c 6411 if (full_dir_logging && ctx->logging_new_delayed_dentries) {
65faced5
FM
6412 ret = btrfs_commit_inode_delayed_items(trans, inode);
6413 if (ret)
f6df27dd 6414 goto out;
16cdcec7
MX
6415 }
6416
e09d94c9 6417 mutex_lock(&inode->log_mutex);
e02119d5 6418
d0e64a98
FM
6419 /*
6420 * For symlinks, we must always log their content, which is stored in an
6421 * inline extent, otherwise we could end up with an empty symlink after
6422 * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
6423 * one attempts to create an empty symlink).
6424 * We don't need to worry about flushing delalloc, because when we create
6425 * the inline extent when the symlink is created (we never have delalloc
6426 * for symlinks).
6427 */
6428 if (S_ISLNK(inode->vfs_inode.i_mode))
6429 inode_only = LOG_INODE_ALL;
6430
0f8ce498
FM
6431 /*
6432 * Before logging the inode item, cache the value returned by
6433 * inode_logged(), because after that we have the need to figure out if
6434 * the inode was previously logged in this transaction.
6435 */
6436 ret = inode_logged(trans, inode, path);
65faced5 6437 if (ret < 0)
0f8ce498 6438 goto out_unlock;
0f8ce498 6439 ctx->logged_before = (ret == 1);
65faced5 6440 ret = 0;
0f8ce498 6441
64d6b281
FM
6442 /*
6443 * This is for cases where logging a directory could result in losing a
6444 * a file after replaying the log. For example, if we move a file from a
6445 * directory A to a directory B, then fsync directory A, we have no way
6446 * to known the file was moved from A to B, so logging just A would
6447 * result in losing the file after a log replay.
6448 */
30b80f3c 6449 if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
64d6b281 6450 btrfs_set_log_full_commit(trans);
f31f09f6 6451 ret = BTRFS_LOG_FORCE_COMMIT;
64d6b281
FM
6452 goto out_unlock;
6453 }
6454
e02119d5
CM
6455 /*
6456 * a brute force approach to making sure we get the most uptodate
6457 * copies of everything.
6458 */
a59108a7 6459 if (S_ISDIR(inode->vfs_inode.i_mode)) {
ab12313a 6460 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
0f8ce498
FM
6461 if (ctx->logged_before)
6462 ret = drop_inode_items(trans, log, path, inode,
04fc7d51 6463 BTRFS_XATTR_ITEM_KEY);
e02119d5 6464 } else {
0f8ce498 6465 if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
1a4bcf47
FM
6466 /*
6467 * Make sure the new inode item we write to the log has
6468 * the same isize as the current one (if it exists).
6469 * This is necessary to prevent data loss after log
6470 * replay, and also to prevent doing a wrong expanding
6471 * truncate - for e.g. create file, write 4K into offset
6472 * 0, fsync, write 4K into offset 4096, add hard link,
6473 * fsync some other file (to sync log), power fail - if
6474 * we use the inode's current i_size, after log replay
6475 * we get a 8Kb file, with the last 4Kb extent as a hole
6476 * (zeroes), as if an expanding truncate happened,
6477 * instead of getting a file of 4Kb only.
6478 */
65faced5
FM
6479 ret = logged_inode_size(log, inode, path, &logged_isize);
6480 if (ret)
1a4bcf47
FM
6481 goto out_unlock;
6482 }
a742994a 6483 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
a59108a7 6484 &inode->runtime_flags)) {
a742994a 6485 if (inode_only == LOG_INODE_EXISTS) {
4f764e51 6486 max_key.type = BTRFS_XATTR_ITEM_KEY;
0f8ce498
FM
6487 if (ctx->logged_before)
6488 ret = drop_inode_items(trans, log, path,
6489 inode, max_key.type);
a742994a
FM
6490 } else {
6491 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
a59108a7 6492 &inode->runtime_flags);
a742994a 6493 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
a59108a7 6494 &inode->runtime_flags);
0f8ce498 6495 if (ctx->logged_before)
4934a815
FM
6496 ret = truncate_inode_items(trans, log,
6497 inode, 0, 0);
a742994a 6498 }
4f764e51 6499 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
a59108a7 6500 &inode->runtime_flags) ||
6cfab851 6501 inode_only == LOG_INODE_EXISTS) {
4f764e51 6502 if (inode_only == LOG_INODE_ALL)
183f37fa 6503 fast_search = true;
4f764e51 6504 max_key.type = BTRFS_XATTR_ITEM_KEY;
0f8ce498
FM
6505 if (ctx->logged_before)
6506 ret = drop_inode_items(trans, log, path, inode,
6507 max_key.type);
a95249b3
JB
6508 } else {
6509 if (inode_only == LOG_INODE_ALL)
6510 fast_search = true;
2ac691d8 6511 inode_item_dropped = false;
a95249b3 6512 goto log_extents;
5dc562c5 6513 }
a95249b3 6514
e02119d5 6515 }
65faced5 6516 if (ret)
4a500fd1 6517 goto out_unlock;
e02119d5 6518
30b80f3c
FM
6519 /*
6520 * If we are logging a directory in full mode, collect the delayed items
6521 * before iterating the subvolume tree, so that we don't miss any new
6522 * dir index items in case they get flushed while or right after we are
6523 * iterating the subvolume tree.
6524 */
6525 if (full_dir_logging && !ctx->logging_new_delayed_dentries)
6526 btrfs_log_get_delayed_items(inode, &delayed_ins_list,
6527 &delayed_del_list);
6528
65faced5 6529 ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
da447009 6530 path, dst_path, logged_isize,
e09d94c9 6531 inode_only, ctx,
7af59743 6532 &need_log_inode_item);
65faced5 6533 if (ret)
da447009 6534 goto out_unlock;
5dc562c5 6535
36283bf7
FM
6536 btrfs_release_path(path);
6537 btrfs_release_path(dst_path);
65faced5
FM
6538 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
6539 if (ret)
36283bf7 6540 goto out_unlock;
9a8fca62 6541 xattrs_logged = true;
a89ca6f2
FM
6542 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
6543 btrfs_release_path(path);
6544 btrfs_release_path(dst_path);
65faced5
FM
6545 ret = btrfs_log_holes(trans, inode, path);
6546 if (ret)
a89ca6f2
FM
6547 goto out_unlock;
6548 }
a95249b3 6549log_extents:
f3b15ccd
JB
6550 btrfs_release_path(path);
6551 btrfs_release_path(dst_path);
e4545de5 6552 if (need_log_inode_item) {
65faced5
FM
6553 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
6554 if (ret)
b590b839
FM
6555 goto out_unlock;
6556 /*
6557 * If we are doing a fast fsync and the inode was logged before
6558 * in this transaction, we don't need to log the xattrs because
6559 * they were logged before. If xattrs were added, changed or
6560 * deleted since the last time we logged the inode, then we have
6561 * already logged them because the inode had the runtime flag
6562 * BTRFS_INODE_COPY_EVERYTHING set.
6563 */
6564 if (!xattrs_logged && inode->logged_trans < trans->transid) {
65faced5
FM
6565 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
6566 if (ret)
b590b839 6567 goto out_unlock;
9a8fca62
FM
6568 btrfs_release_path(path);
6569 }
e4545de5 6570 }
5dc562c5 6571 if (fast_search) {
90d04510 6572 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
65faced5 6573 if (ret)
5dc562c5 6574 goto out_unlock;
d006a048 6575 } else if (inode_only == LOG_INODE_ALL) {
06d3d22b
LB
6576 struct extent_map *em, *n;
6577
49dae1bc 6578 write_lock(&em_tree->lock);
48778179
FM
6579 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
6580 list_del_init(&em->list);
49dae1bc 6581 write_unlock(&em_tree->lock);
5dc562c5
JB
6582 }
6583
30b80f3c 6584 if (full_dir_logging) {
90d04510 6585 ret = log_directory_changes(trans, inode, path, dst_path, ctx);
65faced5 6586 if (ret)
4a500fd1 6587 goto out_unlock;
30b80f3c
FM
6588 ret = log_delayed_insertion_items(trans, inode, path,
6589 &delayed_ins_list, ctx);
6590 if (ret)
6591 goto out_unlock;
6592 ret = log_delayed_deletion_items(trans, inode, path,
6593 &delayed_del_list, ctx);
6594 if (ret)
6595 goto out_unlock;
e02119d5 6596 }
49dae1bc 6597
130341be
FM
6598 spin_lock(&inode->lock);
6599 inode->logged_trans = trans->transid;
d1d832a0 6600 /*
130341be
FM
6601 * Don't update last_log_commit if we logged that an inode exists.
6602 * We do this for three reasons:
6603 *
6604 * 1) We might have had buffered writes to this inode that were
6605 * flushed and had their ordered extents completed in this
6606 * transaction, but we did not previously log the inode with
6607 * LOG_INODE_ALL. Later the inode was evicted and after that
6608 * it was loaded again and this LOG_INODE_EXISTS log operation
6609 * happened. We must make sure that if an explicit fsync against
6610 * the inode is performed later, it logs the new extents, an
6611 * updated inode item, etc, and syncs the log. The same logic
6612 * applies to direct IO writes instead of buffered writes.
6613 *
6614 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
6615 * is logged with an i_size of 0 or whatever value was logged
6616 * before. If later the i_size of the inode is increased by a
6617 * truncate operation, the log is synced through an fsync of
6618 * some other inode and then finally an explicit fsync against
6619 * this inode is made, we must make sure this fsync logs the
6620 * inode with the new i_size, the hole between old i_size and
6621 * the new i_size, and syncs the log.
6622 *
6623 * 3) If we are logging that an ancestor inode exists as part of
6624 * logging a new name from a link or rename operation, don't update
6625 * its last_log_commit - otherwise if an explicit fsync is made
6626 * against an ancestor, the fsync considers the inode in the log
6627 * and doesn't sync the log, resulting in the ancestor missing after
6628 * a power failure unless the log was synced as part of an fsync
6629 * against any other unrelated inode.
d1d832a0 6630 */
130341be
FM
6631 if (inode_only != LOG_INODE_EXISTS)
6632 inode->last_log_commit = inode->last_sub_trans;
6633 spin_unlock(&inode->lock);
23e3337f
FM
6634
6635 /*
6636 * Reset the last_reflink_trans so that the next fsync does not need to
6637 * go through the slower path when logging extents and their checksums.
6638 */
6639 if (inode_only == LOG_INODE_ALL)
6640 inode->last_reflink_trans = 0;
6641
4a500fd1 6642out_unlock:
a59108a7 6643 mutex_unlock(&inode->log_mutex);
f6df27dd 6644out:
e02119d5
CM
6645 btrfs_free_path(path);
6646 btrfs_free_path(dst_path);
	if (ret)
		free_conflicting_inodes(ctx);
	else
		ret = log_conflicting_inodes(trans, inode->root, ctx);

	if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
		if (!ret)
			ret = log_new_delayed_dentries(trans, inode,
						       &delayed_ins_list, ctx);

		btrfs_log_put_delayed_items(inode, &delayed_ins_list,
					    &delayed_del_list);
	}

	return ret;
}

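/*
 * Log, with LOG_INODE_ALL, every directory that contains the given inode,
 * according to the commit root, by walking the inode's INODE_REF and
 * INODE_EXTREF items. This is used when the inode was unlinked or renamed
 * in the current transaction, so that stale index entries in its old
 * parent directories get deleted at log replay time.
 */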
static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
				 struct btrfs_inode *inode,
				 struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_root *root = inode->root;
	const u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->skip_locking = 1;
	path->search_commit_root = 1;

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (true) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		u32 cur_offset = 0;
		u32 item_size;
		unsigned long ptr;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);
		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
			break;

		item_size = btrfs_item_size(leaf, slot);
		ptr = btrfs_item_ptr_offset(leaf, slot);
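		/*
		 * An extref item packs one entry per (parent, name) pair and
		 * is parsed entry by entry; a regular ref item has a single
		 * parent directory (key.offset), so one pass consumes it.
		 */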
		while (cur_offset < item_size) {
			struct btrfs_key inode_key;
			struct inode *dir_inode;

			inode_key.type = BTRFS_INODE_ITEM_KEY;
			inode_key.offset = 0;

			if (key.type == BTRFS_INODE_EXTREF_KEY) {
				struct btrfs_inode_extref *extref;

				extref = (struct btrfs_inode_extref *)
					(ptr + cur_offset);
				inode_key.objectid = btrfs_inode_extref_parent(
					leaf, extref);
				cur_offset += sizeof(*extref);
				cur_offset += btrfs_inode_extref_name_len(leaf,
									  extref);
			} else {
				inode_key.objectid = key.offset;
				cur_offset = item_size;
			}

			dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
					       root);
			/*
			 * If the parent inode was deleted, return an error to
			 * fall back to a transaction commit. This is to prevent
			 * getting an inode that was moved from one parent A to
			 * a parent B, got its former parent A deleted and then
			 * it got fsync'ed, from existing at both parents after
			 * a log replay (and the old parent still existing).
			 * Example:
			 *
			 * mkdir /mnt/A
			 * mkdir /mnt/B
			 * touch /mnt/B/bar
			 * sync
			 * mv /mnt/B/bar /mnt/A/bar
			 * mv -T /mnt/A /mnt/B
			 * fsync /mnt/B/bar
			 * <power fail>
			 *
			 * If we ignore the old parent B which got deleted,
			 * after a log replay we would have file bar linked
			 * at both parents and the old parent B would still
			 * exist.
			 */
			if (IS_ERR(dir_inode)) {
				ret = PTR_ERR(dir_inode);
				goto out;
			}

			if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
				btrfs_add_delayed_iput(dir_inode);
				continue;
			}

			ctx->log_new_dentries = false;
			ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
					      LOG_INODE_ALL, ctx);
			if (!ret && ctx->log_new_dentries)
				ret = log_new_dir_dentries(trans,
						BTRFS_I(dir_inode), ctx);
			btrfs_add_delayed_iput(dir_inode);
			if (ret)
				goto out;
		}
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

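/*
 * Walk up the ancestor chain starting at the INODE_REF key currently in
 * @path and log, with LOG_INODE_EXISTS, every ancestor directory that was
 * created in the current transaction and still needs logging, until the
 * subvolume root (BTRFS_FIRST_FREE_OBJECTID) is reached.
 */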
static int log_new_ancestors(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct btrfs_log_ctx *ctx)
{
	struct btrfs_key found_key;

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	while (true) {
		struct btrfs_fs_info *fs_info = root->fs_info;
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_key search_key;
		struct inode *inode;
		u64 ino;
		int ret = 0;

		btrfs_release_path(path);

		ino = found_key.offset;

		search_key.objectid = found_key.offset;
		search_key.type = BTRFS_INODE_ITEM_KEY;
		search_key.offset = 0;
		inode = btrfs_iget(fs_info->sb, ino, root);
		if (IS_ERR(inode))
			return PTR_ERR(inode);

		if (BTRFS_I(inode)->generation >= trans->transid &&
		    need_log_inode(trans, BTRFS_I(inode)))
			ret = btrfs_log_inode(trans, BTRFS_I(inode),
					      LOG_INODE_EXISTS, ctx);
		btrfs_add_delayed_iput(inode);
		if (ret)
			return ret;

		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
			break;

		search_key.type = BTRFS_INODE_REF_KEY;
		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
		if (ret < 0)
			return ret;

		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				return -ENOENT;
			leaf = path->nodes[0];
			slot = path->slots[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.objectid != search_key.objectid ||
		    found_key.type != BTRFS_INODE_REF_KEY)
			return -ENOENT;
	}
	return 0;
}

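/*
 * Fast path for logging new ancestors of an inode with a single hard link:
 * walk up the dentry chain instead of searching the fs/subvolume tree,
 * logging with LOG_INODE_EXISTS each ancestor created in the current
 * transaction, and stop at the root dentry or at a subvolume boundary.
 */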
static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
				  struct btrfs_inode *inode,
				  struct dentry *parent,
				  struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	struct dentry *old_parent = NULL;
	struct super_block *sb = inode->vfs_inode.i_sb;
	int ret = 0;

	while (true) {
		if (!parent || d_really_is_negative(parent) ||
		    sb != parent->d_sb)
			break;

		inode = BTRFS_I(d_inode(parent));
		if (root != inode->root)
			break;

		if (inode->generation >= trans->transid &&
		    need_log_inode(trans, inode)) {
			ret = btrfs_log_inode(trans, inode,
					      LOG_INODE_EXISTS, ctx);
			if (ret)
				break;
		}
		if (IS_ROOT(parent))
			break;

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
	}
	dput(old_parent);

	return ret;
}

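/*
 * Log, with LOG_INODE_EXISTS, all ancestors of the given inode that were
 * created in the current transaction, considering all the inode's hard
 * links. Inodes with a single link take a dentry based fast path; otherwise
 * every INODE_REF item is visited and log_new_ancestors() is run for each
 * parent. Extended references make this return -EMLINK so that the caller
 * falls back to a transaction commit.
 */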
static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
				 struct btrfs_inode *inode,
				 struct dentry *parent,
				 struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	const u64 ino = btrfs_ino(inode);
	struct btrfs_path *path;
	struct btrfs_key search_key;
	int ret;

	/*
	 * For a single hard link case, go through a fast path that does not
	 * need to iterate the fs/subvolume tree.
	 */
	if (inode->vfs_inode.i_nlink < 2)
		return log_new_ancestors_fast(trans, inode, parent, ctx);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	search_key.objectid = ino;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = 0;
again:
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret == 0)
		path->slots[0]++;

	while (true) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_key found_key;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.objectid != ino ||
		    found_key.type > BTRFS_INODE_EXTREF_KEY)
			break;

		/*
		 * Don't deal with extended references because they are rare
		 * cases and too complex to deal with (we would need to keep
		 * track of which subitem we are processing for each item in
		 * this loop, etc). So just return some error to fall back to
		 * a transaction commit.
		 */
		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
			ret = -EMLINK;
			goto out;
		}

		/*
		 * Logging ancestors needs to do more searches on the fs/subvol
		 * tree, so it releases the path as needed to avoid deadlocks.
		 * Keep track of the last inode ref key and resume from that key
		 * after logging all new ancestors for the current hard link.
		 */
		memcpy(&search_key, &found_key, sizeof(search_key));

		ret = log_new_ancestors(trans, root, path, ctx);
		if (ret)
			goto out;
		btrfs_release_path(path);
		goto again;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Helper function around btrfs_log_inode to make sure newly created
 * parent directories also end up in the log. Minimal, inode-and-backref-only
 * logging is done for any parent directories that are older than the last
 * committed transaction.
 */
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				  struct btrfs_inode *inode,
				  struct dentry *parent,
				  int inode_only,
				  struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	bool log_dentries = false;

	if (btrfs_test_opt(fs_info, NOTREELOG)) {
		ret = BTRFS_LOG_FORCE_COMMIT;
		goto end_no_trans;
	}

	if (btrfs_root_refs(&root->root_item) == 0) {
		ret = BTRFS_LOG_FORCE_COMMIT;
		goto end_no_trans;
	}

	/*
	 * Skip already logged inodes or inodes corresponding to tmpfiles
	 * (since logging them is pointless, a link count of 0 means they
	 * will never be accessible).
	 */
	if ((btrfs_inode_in_log(inode, trans->transid) &&
	     list_empty(&ctx->ordered_extents)) ||
	    inode->vfs_inode.i_nlink == 0) {
		ret = BTRFS_NO_LOG_SYNC;
		goto end_no_trans;
	}

	ret = start_log_trans(trans, root, ctx);
	if (ret)
		goto end_no_trans;

	ret = btrfs_log_inode(trans, inode, inode_only, ctx);
	if (ret)
		goto end_trans;

	/*
	 * For regular files, if the inode is already on disk, we don't
	 * have to worry about the parents at all. This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->vfs_inode.i_mode) &&
	    inode->generation < trans->transid &&
	    inode->last_unlink_trans < trans->transid) {
		ret = 0;
		goto end_trans;
	}

	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
		log_dentries = true;

	/*
	 * On unlink we must make sure all our current and old parent directory
	 * inodes are fully logged. This is to prevent leaving dangling
	 * directory index entries in directories that were our parents but are
	 * not anymore. Not doing this results in the old parent directory being
	 * impossible to delete after log replay (rmdir will always fail with
	 * error -ENOTEMPTY).
	 *
	 * Example 1:
	 *
	 * mkdir testdir
	 * touch testdir/foo
	 * ln testdir/foo testdir/bar
	 * sync
	 * unlink testdir/bar
	 * xfs_io -c fsync testdir/foo
	 * <power failure>
	 * mount fs, triggers log replay
	 *
	 * If we don't log the parent directory (testdir), after log replay the
	 * directory still has an entry pointing to the file inode using the bar
	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
	 * the file inode has a link count of 1.
	 *
	 * Example 2:
	 *
	 * mkdir testdir
	 * touch foo
	 * ln foo testdir/foo2
	 * ln foo testdir/foo3
	 * sync
	 * unlink testdir/foo3
	 * xfs_io -c fsync foo
	 * <power failure>
	 * mount fs, triggers log replay
	 *
	 * Similar to the first example, after log replay the parent directory
	 * testdir still has an entry pointing to the inode file with name foo3
	 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
	 * and has a link count of 2.
	 */
	if (inode->last_unlink_trans >= trans->transid) {
		ret = btrfs_log_all_parents(trans, inode, ctx);
		if (ret)
			goto end_trans;
	}

	ret = log_all_new_ancestors(trans, inode, parent, ctx);
	if (ret)
		goto end_trans;

	if (log_dentries)
		ret = log_new_dir_dentries(trans, inode, ctx);
	else
		ret = 0;
end_trans:
	if (ret < 0) {
		btrfs_set_log_full_commit(trans);
		ret = BTRFS_LOG_FORCE_COMMIT;
	}

	if (ret)
		btrfs_remove_log_ctx(root, ctx);
	btrfs_end_log_trans(root);
end_no_trans:
	return ret;
}

/*
 * It is not safe to log a dentry if the chunk root has added new
 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
 * If this returns 1, you must commit the transaction to safely get your
 * data on disk.
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct dentry *dentry,
			  struct btrfs_log_ctx *ctx)
{
	struct dentry *parent = dget_parent(dentry);
	int ret;

	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
				     LOG_INODE_ALL, ctx);
	dput(parent);

	return ret;
}

/*
 * should be called during mount to recover and replay any log trees
 * from the FS
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *log;
	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
	struct walk_control wc = {
		.process_func = process_one_buffer,
		.stage = LOG_WALK_PIN_ONLY,
	};

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error;
	}

	wc.trans = trans;
	wc.pin = 1;

	ret = walk_log_tree(trans, log_root_tree, &wc);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error;
	}

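	/*
	 * The loop below runs once per stage of the walk_control: first to
	 * pin all the log tree blocks, then to replay inode items, then
	 * directory index items, and finally everything else.
	 */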
again:
	key.objectid = BTRFS_TREE_LOG_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

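	/*
	 * Log tree roots are stored with objectid BTRFS_TREE_LOG_OBJECTID and
	 * the id of the subvolume they belong to as the key offset, so walk
	 * them from the highest offset downwards.
	 */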
	while (1) {
		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);

		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto error;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		btrfs_release_path(path);
		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			break;

		log = btrfs_read_tree_root(log_root_tree, &found_key);
		if (IS_ERR(log)) {
			ret = PTR_ERR(log);
			btrfs_abort_transaction(trans, ret);
			goto error;
		}

		wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
						   true);
		if (IS_ERR(wc.replay_dest)) {
			ret = PTR_ERR(wc.replay_dest);

			/*
			 * We didn't find the subvol, likely because it was
			 * deleted. This is ok, simply skip this log and go to
			 * the next one.
			 *
			 * We need to exclude the root because we can't have
			 * other log replays overwriting this log as we'll read
			 * it back in a few more times. This will keep our
			 * block from being modified, and we'll just bail for
			 * each subsequent pass.
			 */
			if (ret == -ENOENT)
				ret = btrfs_pin_extent_for_log_replay(trans,
							log->node->start,
							log->node->len);
			btrfs_put_root(log);

			if (!ret)
				goto next;
			btrfs_abort_transaction(trans, ret);
			goto error;
		}

		wc.replay_dest->log_root = log;
		ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
		if (ret)
			/* The loop needs to continue due to the root refs */
			btrfs_abort_transaction(trans, ret);
		else
			ret = walk_log_tree(trans, log, &wc);

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			ret = fixup_inode_link_counts(trans, wc.replay_dest,
						      path);
			if (ret)
				btrfs_abort_transaction(trans, ret);
		}

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			struct btrfs_root *root = wc.replay_dest;

			btrfs_release_path(path);

			/*
			 * We have just replayed everything, and the highest
			 * objectid of fs roots probably has changed in case
			 * some inode_item's got replayed.
			 *
			 * root->objectid_mutex is not acquired as log replay
			 * could only happen during mount.
			 */
			ret = btrfs_init_root_free_objectid(root);
			if (ret)
				btrfs_abort_transaction(trans, ret);
		}

		wc.replay_dest->log_root = NULL;
		btrfs_put_root(wc.replay_dest);
		btrfs_put_root(log);

		if (ret)
			goto error;
next:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	btrfs_release_path(path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
	}
	/* step three is to replay everything */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	/* step 4: commit the transaction, which also unpins the blocks */
	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	log_root_tree->log_root = NULL;
	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
	btrfs_put_root(log_root_tree);

	return 0;
error:
	if (wc.trans)
		btrfs_end_transaction(wc.trans);
	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
	btrfs_free_path(path);
	return ret;
}

/*
 * there are some corner cases where we want to force a full
 * commit instead of allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and
 * this function updates the parent directory so that a full commit is
 * properly done if it is fsync'd later after the unlinks are done.
 *
 * Must be called before the unlink operations (updates to the subvolume tree,
 * inodes, etc) are done.
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_inode *dir, struct btrfs_inode *inode,
			     int for_rename)
{
	/*
	 * when we're logging a file, if it hasn't been renamed
	 * or unlinked, and its inode is fully committed on disk,
	 * we don't have to worry about walking up the directory chain
	 * to log its parents.
	 *
	 * So, we use the last_unlink_trans field to put this transid
	 * into the file. When the file is logged we check it and
	 * don't log the parents if the file is fully on disk.
	 */
	mutex_lock(&inode->log_mutex);
	inode->last_unlink_trans = trans->transid;
	mutex_unlock(&inode->log_mutex);

	/*
	 * if this directory was already logged any new
	 * names for this file/dir will get recorded
	 */
	if (dir->logged_trans == trans->transid)
		return;

	/*
	 * if the inode we're about to unlink was logged,
	 * the log will be properly updated for any new names
	 */
	if (inode->logged_trans == trans->transid)
		return;

	/*
	 * when renaming files across directories, if the directory
	 * we're unlinking from gets fsync'd later on, there's
	 * no way to find the destination directory later and fsync it
	 * properly. So, we have to be conservative and force commits
	 * so the new name gets discovered.
	 */
	if (for_rename)
		goto record;

	/* we can safely do the unlink without any special recording */
	return;

record:
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/*
 * Make sure that if someone attempts to fsync the parent directory of a deleted
 * snapshot, it ends up triggering a transaction commit. This is to guarantee
 * that after replaying the log tree of the parent directory's root we will not
 * see the snapshot anymore and at log replay time we will not see any log tree
 * corresponding to the deleted snapshot's root, which could lead to replaying
 * it after replaying the log tree of the parent directory (which would replay
 * the snapshot delete operation).
 *
 * Must be called before the actual snapshot destroy operation (updates to the
 * parent root and the tree of tree roots, etc) are done.
 */
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
				   struct btrfs_inode *dir)
{
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/**
 * Update the log after adding a new name for an inode.
 *
 * @trans:          Transaction handle.
 * @old_dentry:     The dentry associated with the old name and the old
 *                  parent directory.
 * @old_dir:        The inode of the previous parent directory for the case
 *                  of a rename. For a link operation, it must be NULL.
 * @old_dir_index:  The index number associated with the old name, meaningful
 *                  only for rename operations (when @old_dir is not NULL).
 *                  Ignored for link operations.
 * @parent:         The dentry associated with the directory under which the
 *                  new name is located.
 *
 * Call this after adding a new name for an inode, as a result of a link or
 * rename operation, and it will properly update the log to reflect the new name.
 */
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
			struct dentry *old_dentry, struct btrfs_inode *old_dir,
			u64 old_dir_index, struct dentry *parent)
{
	struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
	struct btrfs_root *root = inode->root;
	struct btrfs_log_ctx ctx;
	bool log_pinned = false;
	int ret;

	/*
	 * this will force the logging code to walk the dentry chain
	 * up for the file
	 */
	if (!S_ISDIR(inode->vfs_inode.i_mode))
		inode->last_unlink_trans = trans->transid;

	/*
	 * if this inode hasn't been logged and the directory we're renaming it
	 * from hasn't been logged, we don't need to log it
	 */
	ret = inode_logged(trans, inode, NULL);
	if (ret < 0) {
		goto out;
	} else if (ret == 0) {
		if (!old_dir)
			return;
		/*
		 * If the inode was not logged and we are doing a rename (old_dir is not
		 * NULL), check if old_dir was logged - if it was not we can return and
		 * do nothing.
		 */
		ret = inode_logged(trans, old_dir, NULL);
		if (ret < 0)
			goto out;
		else if (ret == 0)
			return;
	}
	ret = 0;

	/*
	 * If we are doing a rename (old_dir is not NULL) from a directory that
	 * was previously logged, make sure that on log replay we get the old
	 * dir entry deleted. This is needed because we will also log the new
	 * name of the renamed inode, so we need to make sure that after log
	 * replay we don't end up with both the new and old dir entries existing.
	 */
	if (old_dir && old_dir->logged_trans == trans->transid) {
		struct btrfs_root *log = old_dir->root->log_root;
		struct btrfs_path *path;
		struct fscrypt_name fname;

		ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);

		ret = fscrypt_setup_filename(&old_dir->vfs_inode,
					     &old_dentry->d_name, 0, &fname);
		if (ret)
			goto out;
		/*
		 * We have two inodes to update in the log, the old directory and
		 * the inode that got renamed, so we must pin the log to prevent
		 * anyone from syncing the log until we have updated both inodes
		 * in the log.
		 */
		ret = join_running_log_trans(root);
		/*
		 * At least one of the inodes was logged before, so this should
		 * not fail, but if it does, it's not serious, just bail out and
		 * mark the log for a full commit.
		 */
		if (WARN_ON_ONCE(ret < 0))
			goto out;
		log_pinned = true;

		path = btrfs_alloc_path();
		if (!path) {
			ret = -ENOMEM;
			fscrypt_free_filename(&fname);
			goto out;
		}

		/*
		 * Another concurrent task might be logging the old directory,
		 * as that can be triggered when logging another inode that had
		 * or still has a dentry in the old directory. We lock the old
		 * directory's log_mutex to ensure the deletion of the old
		 * name is persisted, because during directory logging we
		 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
		 * the old name's dir index item is in the delayed items, so
		 * it could be missed by an in-progress directory logging.
		 */
		mutex_lock(&old_dir->log_mutex);
		ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
					&fname.disk_name, old_dir_index);
		if (ret > 0) {
			/*
			 * The dentry does not exist in the log, so record its
			 * deletion.
			 */
			btrfs_release_path(path);
			ret = insert_dir_log_key(trans, log, path,
						 btrfs_ino(old_dir),
						 old_dir_index, old_dir_index);
		}
		mutex_unlock(&old_dir->log_mutex);

		btrfs_free_path(path);
		fscrypt_free_filename(&fname);
		if (ret < 0)
			goto out;
	}

	btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
	ctx.logging_new_name = true;
	/*
	 * We don't care about the return value. If we fail to log the new name
	 * then we know the next attempt to sync the log will fall back to a full
	 * transaction commit (due to a call to btrfs_set_log_full_commit()), so
	 * we don't need to worry about getting a log committed that has an
	 * inconsistent state after a rename operation.
	 */
	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
	ASSERT(list_empty(&ctx.conflict_inodes));
out:
	/*
	 * If an error happened mark the log for a full commit because it's not
	 * consistent and up to date or we couldn't find out if one of the
	 * inodes was logged before in this transaction. Do it before unpinning
	 * the log, to avoid any races with someone else trying to commit it.
	 */
	if (ret < 0)
		btrfs_set_log_full_commit(trans);
	if (log_pinned)
		btrfs_end_log_trans(root);
}
