btrfs: push printk index code into their respective helpers
fs/btrfs/delayed-inode.c (linux-block.git)
c1d7c514 1// SPDX-License-Identifier: GPL-2.0
16cdcec7
MX
2/*
3 * Copyright (C) 2011 Fujitsu. All rights reserved.
4 * Written by Miao Xie <miaox@cn.fujitsu.com>
16cdcec7
MX
5 */
6
7#include <linux/slab.h>
c7f88c4e 8#include <linux/iversion.h>
9b569ea0 9#include "messages.h"
602cbe91 10#include "misc.h"
16cdcec7
MX
11#include "delayed-inode.h"
12#include "disk-io.h"
13#include "transaction.h"
3cae210f 14#include "ctree.h"
4f5427cc 15#include "qgroup.h"
1f95ec01 16#include "locking.h"
26c2c454 17#include "inode-item.h"
f1e5c618 18#include "space-info.h"
16cdcec7 19
de3cb945
CM
20#define BTRFS_DELAYED_WRITEBACK 512
21#define BTRFS_DELAYED_BACKGROUND 128
22#define BTRFS_DELAYED_BATCH 16
16cdcec7
MX
23
24static struct kmem_cache *delayed_node_cache;
25
26int __init btrfs_delayed_inode_init(void)
27{
837e1972 28 delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
16cdcec7
MX
29 sizeof(struct btrfs_delayed_node),
30 0,
fba4b697 31 SLAB_MEM_SPREAD,
16cdcec7
MX
32 NULL);
33 if (!delayed_node_cache)
34 return -ENOMEM;
35 return 0;
36}
37
e67c718b 38void __cold btrfs_delayed_inode_exit(void)
16cdcec7 39{
5598e900 40 kmem_cache_destroy(delayed_node_cache);
16cdcec7
MX
41}
42
43static inline void btrfs_init_delayed_node(
44 struct btrfs_delayed_node *delayed_node,
45 struct btrfs_root *root, u64 inode_id)
46{
47 delayed_node->root = root;
48 delayed_node->inode_id = inode_id;
6de5f18e 49 refcount_set(&delayed_node->refs, 0);
03a1d4c8
LB
50 delayed_node->ins_root = RB_ROOT_CACHED;
51 delayed_node->del_root = RB_ROOT_CACHED;
16cdcec7 52 mutex_init(&delayed_node->mutex);
16cdcec7
MX
53 INIT_LIST_HEAD(&delayed_node->n_list);
54 INIT_LIST_HEAD(&delayed_node->p_list);
16cdcec7
MX
55}
56
f85b7379
DS
57static struct btrfs_delayed_node *btrfs_get_delayed_node(
58 struct btrfs_inode *btrfs_inode)
16cdcec7 59{
16cdcec7 60 struct btrfs_root *root = btrfs_inode->root;
4a0cc7ca 61 u64 ino = btrfs_ino(btrfs_inode);
2f7e33d4 62 struct btrfs_delayed_node *node;
16cdcec7 63
20c7bcec 64 node = READ_ONCE(btrfs_inode->delayed_node);
16cdcec7 65 if (node) {
6de5f18e 66 refcount_inc(&node->refs);
16cdcec7
MX
67 return node;
68 }
69
70 spin_lock(&root->inode_lock);
088aea3b 71 node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
ec35e48b 72
16cdcec7
MX
73 if (node) {
74 if (btrfs_inode->delayed_node) {
6de5f18e 75 refcount_inc(&node->refs); /* can be accessed */
2f7e33d4 76 BUG_ON(btrfs_inode->delayed_node != node);
16cdcec7 77 spin_unlock(&root->inode_lock);
2f7e33d4 78 return node;
16cdcec7 79 }
ec35e48b
CM
80
81 /*
82 * It's possible that we're racing into the middle of removing
088aea3b 83 * this node from the radix tree. In this case, the refcount
ec35e48b 84 * was zero and it should never go back to one. Just return
088aea3b 85 * NULL like it was never in the radix at all; our release
ec35e48b
CM
86 * function is in the process of removing it.
87 *
88 * Some implementations of refcount_inc refuse to bump the
89 * refcount once it has hit zero. If we don't do this dance
90 * here, refcount_inc() may decide to just WARN_ONCE() instead
91 * of actually bumping the refcount.
92 *
088aea3b 93 * If this node is properly in the radix, we want to bump the
ec35e48b
CM
94 * refcount twice, once for the inode and once for this get
95 * operation.
96 */
97 if (refcount_inc_not_zero(&node->refs)) {
98 refcount_inc(&node->refs);
99 btrfs_inode->delayed_node = node;
100 } else {
101 node = NULL;
102 }
103
16cdcec7
MX
104 spin_unlock(&root->inode_lock);
105 return node;
106 }
107 spin_unlock(&root->inode_lock);
108
2f7e33d4
MX
109 return NULL;
110}
111
79787eaa 112/* Will return either the node or PTR_ERR(-ENOMEM) */
2f7e33d4 113static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
f85b7379 114 struct btrfs_inode *btrfs_inode)
2f7e33d4
MX
115{
116 struct btrfs_delayed_node *node;
2f7e33d4 117 struct btrfs_root *root = btrfs_inode->root;
4a0cc7ca 118 u64 ino = btrfs_ino(btrfs_inode);
2f7e33d4
MX
119 int ret;
120
088aea3b
DS
121again:
122 node = btrfs_get_delayed_node(btrfs_inode);
123 if (node)
124 return node;
2f7e33d4 125
088aea3b
DS
126 node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
127 if (!node)
128 return ERR_PTR(-ENOMEM);
129 btrfs_init_delayed_node(node, root, ino);
16cdcec7 130
088aea3b
DS
131 /* cached in the btrfs inode and can be accessed */
132 refcount_set(&node->refs, 2);
16cdcec7 133
088aea3b
DS
134 ret = radix_tree_preload(GFP_NOFS);
135 if (ret) {
136 kmem_cache_free(delayed_node_cache, node);
137 return ERR_PTR(ret);
138 }
139
140 spin_lock(&root->inode_lock);
141 ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
142 if (ret == -EEXIST) {
143 spin_unlock(&root->inode_lock);
144 kmem_cache_free(delayed_node_cache, node);
145 radix_tree_preload_end();
146 goto again;
147 }
16cdcec7
MX
148 btrfs_inode->delayed_node = node;
149 spin_unlock(&root->inode_lock);
088aea3b 150 radix_tree_preload_end();
16cdcec7
MX
151
152 return node;
153}
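/*
 * Editorial note, summarizing the reference counting used by the two lookup
 * helpers above (derived from the code in this file): a newly created
 * delayed node starts with refs == 2, one for the pointer cached in
 * btrfs_inode->delayed_node and one for the caller.  Each later
 * btrfs_get_delayed_node() takes another reference, and
 * __btrfs_release_delayed_node() drops one; once the count hits zero the
 * node is deleted from the root's radix tree and freed, and the count is
 * never allowed to climb back up from zero (hence refcount_inc_not_zero()
 * in the lookup path).
 */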
154
155/*
156 * Must be called with delayed_node->mutex held.
157 *
158 * If mod = 1, also add this node to the prepared list.
159 */
160static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
161 struct btrfs_delayed_node *node,
162 int mod)
163{
164 spin_lock(&root->lock);
7cf35d91 165 if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
16cdcec7
MX
166 if (!list_empty(&node->p_list))
167 list_move_tail(&node->p_list, &root->prepare_list);
168 else if (mod)
169 list_add_tail(&node->p_list, &root->prepare_list);
170 } else {
171 list_add_tail(&node->n_list, &root->node_list);
172 list_add_tail(&node->p_list, &root->prepare_list);
6de5f18e 173 refcount_inc(&node->refs); /* inserted into list */
16cdcec7 174 root->nodes++;
7cf35d91 175 set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
16cdcec7
MX
176 }
177 spin_unlock(&root->lock);
178}
179
180/* Must be called with delayed_node->mutex held. */
181static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
182 struct btrfs_delayed_node *node)
183{
184 spin_lock(&root->lock);
7cf35d91 185 if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
16cdcec7 186 root->nodes--;
6de5f18e 187 refcount_dec(&node->refs); /* not in the list */
16cdcec7
MX
188 list_del_init(&node->n_list);
189 if (!list_empty(&node->p_list))
190 list_del_init(&node->p_list);
7cf35d91 191 clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
16cdcec7
MX
192 }
193 spin_unlock(&root->lock);
194}
195
48a3b636 196static struct btrfs_delayed_node *btrfs_first_delayed_node(
16cdcec7
MX
197 struct btrfs_delayed_root *delayed_root)
198{
199 struct list_head *p;
200 struct btrfs_delayed_node *node = NULL;
201
202 spin_lock(&delayed_root->lock);
203 if (list_empty(&delayed_root->node_list))
204 goto out;
205
206 p = delayed_root->node_list.next;
207 node = list_entry(p, struct btrfs_delayed_node, n_list);
6de5f18e 208 refcount_inc(&node->refs);
16cdcec7
MX
209out:
210 spin_unlock(&delayed_root->lock);
211
212 return node;
213}
214
48a3b636 215static struct btrfs_delayed_node *btrfs_next_delayed_node(
16cdcec7
MX
216 struct btrfs_delayed_node *node)
217{
218 struct btrfs_delayed_root *delayed_root;
219 struct list_head *p;
220 struct btrfs_delayed_node *next = NULL;
221
222 delayed_root = node->root->fs_info->delayed_root;
223 spin_lock(&delayed_root->lock);
7cf35d91
MX
224 if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
225 /* not in the list */
16cdcec7
MX
226 if (list_empty(&delayed_root->node_list))
227 goto out;
228 p = delayed_root->node_list.next;
229 } else if (list_is_last(&node->n_list, &delayed_root->node_list))
230 goto out;
231 else
232 p = node->n_list.next;
233
234 next = list_entry(p, struct btrfs_delayed_node, n_list);
6de5f18e 235 refcount_inc(&next->refs);
16cdcec7
MX
236out:
237 spin_unlock(&delayed_root->lock);
238
239 return next;
240}
241
242static void __btrfs_release_delayed_node(
243 struct btrfs_delayed_node *delayed_node,
244 int mod)
245{
246 struct btrfs_delayed_root *delayed_root;
247
248 if (!delayed_node)
249 return;
250
251 delayed_root = delayed_node->root->fs_info->delayed_root;
252
253 mutex_lock(&delayed_node->mutex);
254 if (delayed_node->count)
255 btrfs_queue_delayed_node(delayed_root, delayed_node, mod);
256 else
257 btrfs_dequeue_delayed_node(delayed_root, delayed_node);
258 mutex_unlock(&delayed_node->mutex);
259
6de5f18e 260 if (refcount_dec_and_test(&delayed_node->refs)) {
16cdcec7 261 struct btrfs_root *root = delayed_node->root;
ec35e48b 262
16cdcec7 263 spin_lock(&root->inode_lock);
ec35e48b
CM
264 /*
265 * Once our refcount goes to zero, nobody is allowed to bump it
266 * back up. We can delete it now.
267 */
268 ASSERT(refcount_read(&delayed_node->refs) == 0);
088aea3b
DS
269 radix_tree_delete(&root->delayed_nodes_tree,
270 delayed_node->inode_id);
16cdcec7 271 spin_unlock(&root->inode_lock);
ec35e48b 272 kmem_cache_free(delayed_node_cache, delayed_node);
16cdcec7
MX
273 }
274}
275
276static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
277{
278 __btrfs_release_delayed_node(node, 0);
279}
280
48a3b636 281static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
16cdcec7
MX
282 struct btrfs_delayed_root *delayed_root)
283{
284 struct list_head *p;
285 struct btrfs_delayed_node *node = NULL;
286
287 spin_lock(&delayed_root->lock);
288 if (list_empty(&delayed_root->prepare_list))
289 goto out;
290
291 p = delayed_root->prepare_list.next;
292 list_del_init(p);
293 node = list_entry(p, struct btrfs_delayed_node, p_list);
6de5f18e 294 refcount_inc(&node->refs);
16cdcec7
MX
295out:
296 spin_unlock(&delayed_root->lock);
297
298 return node;
299}
300
301static inline void btrfs_release_prepared_delayed_node(
302 struct btrfs_delayed_node *node)
303{
304 __btrfs_release_delayed_node(node, 1);
305}
306
4c469798
FM
307static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
308 struct btrfs_delayed_node *node,
309 enum btrfs_delayed_item_type type)
16cdcec7
MX
310{
311 struct btrfs_delayed_item *item;
4c469798 312
16cdcec7
MX
313 item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
314 if (item) {
315 item->data_len = data_len;
4c469798 316 item->type = type;
16cdcec7 317 item->bytes_reserved = 0;
96d89923
FM
318 item->delayed_node = node;
319 RB_CLEAR_NODE(&item->rb_node);
30b80f3c
FM
320 INIT_LIST_HEAD(&item->log_list);
321 item->logged = false;
089e77e1 322 refcount_set(&item->refs, 1);
16cdcec7
MX
323 }
324 return item;
325}
326
327/*
328 * __btrfs_lookup_delayed_item - look up a delayed item by its dir index
329 * @root: the rbtree root to search (a delayed node's ins_root or del_root)
96d89923 330 * @index: the dir index value to look up (offset of a dir index key)
16cdcec7
MX
331 *
332 * Note: if no delayed item with the given index exists in the tree,
333 * NULL is returned.
334 */
335static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
336 struct rb_root *root,
4cbf37f5 337 u64 index)
16cdcec7 338{
4cbf37f5 339 struct rb_node *node = root->rb_node;
16cdcec7 340 struct btrfs_delayed_item *delayed_item = NULL;
16cdcec7
MX
341
342 while (node) {
343 delayed_item = rb_entry(node, struct btrfs_delayed_item,
344 rb_node);
96d89923 345 if (delayed_item->index < index)
16cdcec7 346 node = node->rb_right;
96d89923 347 else if (delayed_item->index > index)
16cdcec7
MX
348 node = node->rb_left;
349 else
350 return delayed_item;
351 }
352
16cdcec7
MX
353 return NULL;
354}
355
16cdcec7 356static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
c9d02ab4 357 struct btrfs_delayed_item *ins)
16cdcec7
MX
358{
359 struct rb_node **p, *node;
360 struct rb_node *parent_node = NULL;
03a1d4c8 361 struct rb_root_cached *root;
16cdcec7 362 struct btrfs_delayed_item *item;
03a1d4c8 363 bool leftmost = true;
16cdcec7 364
4c469798 365 if (ins->type == BTRFS_DELAYED_INSERTION_ITEM)
16cdcec7 366 root = &delayed_node->ins_root;
16cdcec7 367 else
4c469798
FM
368 root = &delayed_node->del_root;
369
03a1d4c8 370 p = &root->rb_root.rb_node;
16cdcec7
MX
371 node = &ins->rb_node;
372
373 while (*p) {
374 parent_node = *p;
375 item = rb_entry(parent_node, struct btrfs_delayed_item,
376 rb_node);
377
96d89923 378 if (item->index < ins->index) {
16cdcec7 379 p = &(*p)->rb_right;
03a1d4c8 380 leftmost = false;
96d89923 381 } else if (item->index > ins->index) {
16cdcec7 382 p = &(*p)->rb_left;
03a1d4c8 383 } else {
16cdcec7 384 return -EEXIST;
03a1d4c8 385 }
16cdcec7
MX
386 }
387
388 rb_link_node(node, parent_node, p);
03a1d4c8 389 rb_insert_color_cached(node, root, leftmost);
a176affe 390
4c469798 391 if (ins->type == BTRFS_DELAYED_INSERTION_ITEM &&
96d89923
FM
392 ins->index >= delayed_node->index_cnt)
393 delayed_node->index_cnt = ins->index + 1;
16cdcec7
MX
394
395 delayed_node->count++;
396 atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
397 return 0;
398}
399
de3cb945
CM
400static void finish_one_item(struct btrfs_delayed_root *delayed_root)
401{
402 int seq = atomic_inc_return(&delayed_root->items_seq);
ee863954 403
093258e6 404 /* atomic_dec_return implies a barrier */
de3cb945 405 if ((atomic_dec_return(&delayed_root->items) <
093258e6
DS
406 BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0))
407 cond_wake_up_nomb(&delayed_root->wait);
de3cb945
CM
408}
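/*
 * Illustrative note (based on finish_one_item() above and could_end_wait()
 * further down): items_seq counts completed delayed items while items counts
 * the ones still pending.  Tasks throttled in btrfs_balance_delayed_items()
 * sleep on delayed_root->wait and are woken either when the backlog drops
 * below BTRFS_DELAYED_BACKGROUND or on every BTRFS_DELAYED_BATCH-th
 * completion, so waiters make progress without needing a wakeup per item.
 */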
409
16cdcec7
MX
410static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
411{
03a1d4c8 412 struct rb_root_cached *root;
16cdcec7
MX
413 struct btrfs_delayed_root *delayed_root;
414
96d89923
FM
415 /* Not inserted, ignore it. */
416 if (RB_EMPTY_NODE(&delayed_item->rb_node))
933c22a7 417 return;
96d89923 418
16cdcec7
MX
419 delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
420
421 BUG_ON(!delayed_root);
16cdcec7 422
4c469798 423 if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
16cdcec7
MX
424 root = &delayed_item->delayed_node->ins_root;
425 else
426 root = &delayed_item->delayed_node->del_root;
427
03a1d4c8 428 rb_erase_cached(&delayed_item->rb_node, root);
96d89923 429 RB_CLEAR_NODE(&delayed_item->rb_node);
16cdcec7 430 delayed_item->delayed_node->count--;
de3cb945
CM
431
432 finish_one_item(delayed_root);
16cdcec7
MX
433}
434
435static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
436{
437 if (item) {
438 __btrfs_remove_delayed_item(item);
089e77e1 439 if (refcount_dec_and_test(&item->refs))
16cdcec7
MX
440 kfree(item);
441 }
442}
443
48a3b636 444static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
16cdcec7
MX
445 struct btrfs_delayed_node *delayed_node)
446{
447 struct rb_node *p;
448 struct btrfs_delayed_item *item = NULL;
449
03a1d4c8 450 p = rb_first_cached(&delayed_node->ins_root);
16cdcec7
MX
451 if (p)
452 item = rb_entry(p, struct btrfs_delayed_item, rb_node);
453
454 return item;
455}
456
48a3b636 457static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
16cdcec7
MX
458 struct btrfs_delayed_node *delayed_node)
459{
460 struct rb_node *p;
461 struct btrfs_delayed_item *item = NULL;
462
03a1d4c8 463 p = rb_first_cached(&delayed_node->del_root);
16cdcec7
MX
464 if (p)
465 item = rb_entry(p, struct btrfs_delayed_item, rb_node);
466
467 return item;
468}
469
48a3b636 470static struct btrfs_delayed_item *__btrfs_next_delayed_item(
16cdcec7
MX
471 struct btrfs_delayed_item *item)
472{
473 struct rb_node *p;
474 struct btrfs_delayed_item *next = NULL;
475
476 p = rb_next(&item->rb_node);
477 if (p)
478 next = rb_entry(p, struct btrfs_delayed_item, rb_node);
479
480 return next;
481}
482
16cdcec7 483static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
16cdcec7
MX
484 struct btrfs_delayed_item *item)
485{
486 struct btrfs_block_rsv *src_rsv;
487 struct btrfs_block_rsv *dst_rsv;
df492881 488 struct btrfs_fs_info *fs_info = trans->fs_info;
16cdcec7
MX
489 u64 num_bytes;
490 int ret;
491
492 if (!trans->bytes_reserved)
493 return 0;
494
495 src_rsv = trans->block_rsv;
0b246afa 496 dst_rsv = &fs_info->delayed_block_rsv;
16cdcec7 497
2bd36e7b 498 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
f218ea6c
QW
499
500 /*
501 * Here we migrate space rsv from the transaction rsv, since we have
502 * already reserved space when starting the transaction. So no need to reserve
503 * qgroup space here.
504 */
3a584174 505 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
8c2a3ca2 506 if (!ret) {
0b246afa 507 trace_btrfs_space_reservation(fs_info, "delayed_item",
96d89923 508 item->delayed_node->inode_id,
8c2a3ca2 509 num_bytes, 1);
763748b2
FM
510 /*
511 * For insertions we track reserved metadata space by accounting
512 * for the number of leaves that will be used, based on the delayed
513 * node's curr_index_batch_size and index_item_leaves fields.
514 */
4c469798 515 if (item->type == BTRFS_DELAYED_DELETION_ITEM)
763748b2 516 item->bytes_reserved = num_bytes;
8c2a3ca2 517 }
16cdcec7
MX
518
519 return ret;
520}
521
4f5427cc 522static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
16cdcec7
MX
523 struct btrfs_delayed_item *item)
524{
19fd2949 525 struct btrfs_block_rsv *rsv;
4f5427cc 526 struct btrfs_fs_info *fs_info = root->fs_info;
19fd2949 527
16cdcec7
MX
528 if (!item->bytes_reserved)
529 return;
530
0b246afa 531 rsv = &fs_info->delayed_block_rsv;
f218ea6c
QW
532 /*
533 * Check btrfs_delayed_item_reserve_metadata() to see why we don't need
534 * to release/reserve qgroup space.
535 */
0b246afa 536 trace_btrfs_space_reservation(fs_info, "delayed_item",
96d89923
FM
537 item->delayed_node->inode_id,
538 item->bytes_reserved, 0);
63f018be 539 btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
16cdcec7
MX
540}
541
763748b2
FM
542static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node,
543 unsigned int num_leaves)
544{
545 struct btrfs_fs_info *fs_info = node->root->fs_info;
546 const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves);
547
548 /* There are no space reservations during log replay, bail out. */
549 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
550 return;
551
552 trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id,
553 bytes, 0);
554 btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL);
555}
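/*
 * Rough sizing example (an assumption based on the usual definition of
 * btrfs_calc_insert_metadata_size(), which is not part of this file):
 * the helper returns 2 * nodesize * BTRFS_MAX_LEVEL * num_items, so with a
 * 16K nodesize and BTRFS_MAX_LEVEL == 8 each reserved leaf unit accounts
 * for 2 * 16K * 8 = 256K of metadata space, released here once the
 * corresponding leaf worth of dir index items has been flushed or dropped.
 */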
556
16cdcec7
MX
557static int btrfs_delayed_inode_reserve_metadata(
558 struct btrfs_trans_handle *trans,
559 struct btrfs_root *root,
560 struct btrfs_delayed_node *node)
561{
0b246afa 562 struct btrfs_fs_info *fs_info = root->fs_info;
16cdcec7
MX
563 struct btrfs_block_rsv *src_rsv;
564 struct btrfs_block_rsv *dst_rsv;
565 u64 num_bytes;
566 int ret;
567
16cdcec7 568 src_rsv = trans->block_rsv;
0b246afa 569 dst_rsv = &fs_info->delayed_block_rsv;
16cdcec7 570
bcacf5f3 571 num_bytes = btrfs_calc_metadata_size(fs_info, 1);
c06a0e12
JB
572
573 /*
574 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
575 * which doesn't reserve space for speed. This is a problem since we
576 * still need to reserve space for this update, so try to reserve the
577 * space.
578 *
579 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
69fe2d75 580 * we always reserve enough to update the inode item.
c06a0e12 581 */
e755d9ab 582 if (!src_rsv || (!trans->bytes_reserved &&
66d8f3dd 583 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
4d14c5cd
NB
584 ret = btrfs_qgroup_reserve_meta(root, num_bytes,
585 BTRFS_QGROUP_RSV_META_PREALLOC, true);
f218ea6c
QW
586 if (ret < 0)
587 return ret;
9270501c 588 ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes,
08e007d2 589 BTRFS_RESERVE_NO_FLUSH);
98686ffc
NB
590 /* NO_FLUSH could only fail with -ENOSPC */
591 ASSERT(ret == 0 || ret == -ENOSPC);
592 if (ret)
0f9c03d8 593 btrfs_qgroup_free_meta_prealloc(root, num_bytes);
98686ffc
NB
594 } else {
595 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
c06a0e12
JB
596 }
597
8c2a3ca2 598 if (!ret) {
0b246afa 599 trace_btrfs_space_reservation(fs_info, "delayed_inode",
8e3c9d3c 600 node->inode_id, num_bytes, 1);
16cdcec7 601 node->bytes_reserved = num_bytes;
8c2a3ca2 602 }
16cdcec7
MX
603
604 return ret;
605}
606
2ff7e61e 607static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
4f5427cc
QW
608 struct btrfs_delayed_node *node,
609 bool qgroup_free)
16cdcec7
MX
610{
611 struct btrfs_block_rsv *rsv;
612
613 if (!node->bytes_reserved)
614 return;
615
0b246afa
JM
616 rsv = &fs_info->delayed_block_rsv;
617 trace_btrfs_space_reservation(fs_info, "delayed_inode",
8c2a3ca2 618 node->inode_id, node->bytes_reserved, 0);
63f018be 619 btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL);
4f5427cc
QW
620 if (qgroup_free)
621 btrfs_qgroup_free_meta_prealloc(node->root,
622 node->bytes_reserved);
623 else
624 btrfs_qgroup_convert_reserved_meta(node->root,
625 node->bytes_reserved);
16cdcec7
MX
626 node->bytes_reserved = 0;
627}
628
629/*
06ac264f
FM
630 * Insert a single delayed item or a batch of delayed items, as many as possible
631 * that fit in a leaf. The delayed items (dir index keys) are sorted by their key
632 * in the rbtree, and if there's a gap between two consecutive dir index items,
633 * then it means at some point we had delayed dir indexes to add but they got
634 * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them
635 * into the subvolume tree. Dir index keys also have their offsets coming from a
636 * monotonically increasing counter, so we can't get new keys with an offset that
637 * fits within a gap between delayed dir index items.
16cdcec7 638 */
506650dc
FM
639static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
640 struct btrfs_root *root,
641 struct btrfs_path *path,
642 struct btrfs_delayed_item *first_item)
16cdcec7 643{
763748b2
FM
644 struct btrfs_fs_info *fs_info = root->fs_info;
645 struct btrfs_delayed_node *node = first_item->delayed_node;
b7ef5f3a 646 LIST_HEAD(item_list);
506650dc
FM
647 struct btrfs_delayed_item *curr;
648 struct btrfs_delayed_item *next;
763748b2 649 const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info);
b7ef5f3a 650 struct btrfs_item_batch batch;
96d89923 651 struct btrfs_key first_key;
4c469798 652 const u32 first_data_size = first_item->data_len;
506650dc 653 int total_size;
506650dc 654 char *ins_data = NULL;
506650dc 655 int ret;
71b68e9e 656 bool continuous_keys_only = false;
16cdcec7 657
763748b2
FM
658 lockdep_assert_held(&node->mutex);
659
71b68e9e
JB
660 /*
661 * During normal operation the delayed index offset is continuously
662 * increasing, so we can batch insert all items as there will not be any
663 * overlapping keys in the tree.
664 *
665 * The exception to this is log replay, where we may have interleaved
666 * offsets in the tree, so our batch needs to be continuous keys only in
667 * order to ensure we do not end up with out of order items in our leaf.
668 */
669 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
670 continuous_keys_only = true;
671
763748b2
FM
672 /*
673 * For delayed items to insert, we track reserved metadata bytes based
674 * on the number of leaves that we will use.
675 * See btrfs_insert_delayed_dir_index() and
677 * btrfs_delayed_item_reserve_metadata().
677 */
678 ASSERT(first_item->bytes_reserved == 0);
679
b7ef5f3a 680 list_add_tail(&first_item->tree_list, &item_list);
4c469798 681 batch.total_data_size = first_data_size;
b7ef5f3a 682 batch.nr = 1;
4c469798 683 total_size = first_data_size + sizeof(struct btrfs_item);
506650dc 684 curr = first_item;
16cdcec7 685
506650dc
FM
686 while (true) {
687 int next_size;
16cdcec7 688
16cdcec7 689 next = __btrfs_next_delayed_item(curr);
06ac264f 690 if (!next)
16cdcec7
MX
691 break;
692
71b68e9e
JB
693 /*
694 * We cannot allow gaps in the key space if we're doing log
695 * replay.
696 */
96d89923 697 if (continuous_keys_only && (next->index != curr->index + 1))
71b68e9e
JB
698 break;
699
763748b2
FM
700 ASSERT(next->bytes_reserved == 0);
701
506650dc
FM
702 next_size = next->data_len + sizeof(struct btrfs_item);
703 if (total_size + next_size > max_size)
16cdcec7 704 break;
16cdcec7 705
b7ef5f3a
FM
706 list_add_tail(&next->tree_list, &item_list);
707 batch.nr++;
506650dc 708 total_size += next_size;
b7ef5f3a 709 batch.total_data_size += next->data_len;
506650dc 710 curr = next;
16cdcec7
MX
711 }
712
b7ef5f3a 713 if (batch.nr == 1) {
96d89923
FM
714 first_key.objectid = node->inode_id;
715 first_key.type = BTRFS_DIR_INDEX_KEY;
716 first_key.offset = first_item->index;
717 batch.keys = &first_key;
4c469798 718 batch.data_sizes = &first_data_size;
506650dc 719 } else {
b7ef5f3a
FM
720 struct btrfs_key *ins_keys;
721 u32 *ins_sizes;
506650dc 722 int i = 0;
16cdcec7 723
b7ef5f3a
FM
724 ins_data = kmalloc(batch.nr * sizeof(u32) +
725 batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
506650dc
FM
726 if (!ins_data) {
727 ret = -ENOMEM;
728 goto out;
729 }
730 ins_sizes = (u32 *)ins_data;
b7ef5f3a
FM
731 ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
732 batch.keys = ins_keys;
733 batch.data_sizes = ins_sizes;
734 list_for_each_entry(curr, &item_list, tree_list) {
96d89923
FM
735 ins_keys[i].objectid = node->inode_id;
736 ins_keys[i].type = BTRFS_DIR_INDEX_KEY;
737 ins_keys[i].offset = curr->index;
506650dc
FM
738 ins_sizes[i] = curr->data_len;
739 i++;
740 }
16cdcec7
MX
741 }
742
b7ef5f3a 743 ret = btrfs_insert_empty_items(trans, root, path, &batch);
506650dc
FM
744 if (ret)
745 goto out;
16cdcec7 746
b7ef5f3a 747 list_for_each_entry(curr, &item_list, tree_list) {
506650dc 748 char *data_ptr;
16cdcec7 749
506650dc
FM
750 data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
751 write_extent_buffer(path->nodes[0], &curr->data,
752 (unsigned long)data_ptr, curr->data_len);
753 path->slots[0]++;
754 }
16cdcec7 755
506650dc
FM
756 /*
757 * Now release our path before releasing the delayed items and their
758 * metadata reservations, so that we don't block other tasks for more
759 * time than needed.
760 */
761 btrfs_release_path(path);
16cdcec7 762
763748b2
FM
763 ASSERT(node->index_item_leaves > 0);
764
71b68e9e
JB
765 /*
766 * For normal operations we will batch an entire leaf's worth of delayed
767 * items, so if there are more items to process we can decrement
768 * index_item_leaves by 1 as we inserted 1 leaf's worth of items.
769 *
770 * However for log replay we may not have inserted an entire leaf's
771 * worth of items, we may have not had continuous items, so decrementing
772 * here would mess up the index_item_leaves accounting. For this case
773 * only clean up the accounting when there are no items left.
774 */
775 if (next && !continuous_keys_only) {
763748b2
FM
776 /*
777 * We inserted one batch of items into a leaf and there are more
778 * items to flush in a future batch, so now release one unit of
779 * metadata space from the delayed block reserve, corresponding to
780 * the leaf we just flushed.
781 */
782 btrfs_delayed_item_release_leaves(node, 1);
783 node->index_item_leaves--;
71b68e9e 784 } else if (!next) {
763748b2
FM
785 /*
786 * There are no more items to insert. We can have a number of
787 * reserved leaves > 1 here - this happens when many dir index
788 * items are added and then removed before they are flushed (file
789 * names with a very short life, never span a transaction). So
790 * release all remaining leaves.
791 */
792 btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
793 node->index_item_leaves = 0;
794 }
795
b7ef5f3a 796 list_for_each_entry_safe(curr, next, &item_list, tree_list) {
16cdcec7
MX
797 list_del(&curr->tree_list);
798 btrfs_release_delayed_item(curr);
799 }
16cdcec7 800out:
506650dc 801 kfree(ins_data);
16cdcec7
MX
802 return ret;
803}
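/*
 * Worked example of the batch sizing above (illustrative): each delayed
 * insertion item consumes sizeof(struct btrfs_item) + data_len bytes of
 * leaf space, where data_len for a dir index item is
 * sizeof(struct btrfs_dir_item) plus the name length (see
 * btrfs_insert_delayed_dir_index() below).  Items keep being appended to
 * the batch until total_size would exceed BTRFS_LEAF_DATA_SIZE(fs_info),
 * i.e. until the batch no longer fits in a single empty leaf.
 */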
804
16cdcec7
MX
805static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
806 struct btrfs_path *path,
807 struct btrfs_root *root,
808 struct btrfs_delayed_node *node)
809{
16cdcec7
MX
810 int ret = 0;
811
506650dc
FM
812 while (ret == 0) {
813 struct btrfs_delayed_item *curr;
16cdcec7 814
506650dc
FM
815 mutex_lock(&node->mutex);
816 curr = __btrfs_first_delayed_insertion_item(node);
817 if (!curr) {
818 mutex_unlock(&node->mutex);
819 break;
820 }
821 ret = btrfs_insert_delayed_item(trans, root, path, curr);
822 mutex_unlock(&node->mutex);
16cdcec7 823 }
16cdcec7 824
16cdcec7
MX
825 return ret;
826}
827
828static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
829 struct btrfs_root *root,
830 struct btrfs_path *path,
831 struct btrfs_delayed_item *item)
832{
96d89923 833 const u64 ino = item->delayed_node->inode_id;
1f4f639f 834 struct btrfs_fs_info *fs_info = root->fs_info;
16cdcec7 835 struct btrfs_delayed_item *curr, *next;
659192e6 836 struct extent_buffer *leaf = path->nodes[0];
4bd02d90
FM
837 LIST_HEAD(batch_list);
838 int nitems, slot, last_slot;
839 int ret;
1f4f639f 840 u64 total_reserved_size = item->bytes_reserved;
16cdcec7 841
659192e6 842 ASSERT(leaf != NULL);
16cdcec7 843
4bd02d90
FM
844 slot = path->slots[0];
845 last_slot = btrfs_header_nritems(leaf) - 1;
659192e6
FM
846 /*
847 * Our caller always gives us a path pointing to an existing item, so
848 * this can not happen.
849 */
4bd02d90
FM
850 ASSERT(slot <= last_slot);
851 if (WARN_ON(slot > last_slot))
659192e6 852 return -ENOENT;
16cdcec7 853
4bd02d90
FM
854 nitems = 1;
855 curr = item;
856 list_add_tail(&curr->tree_list, &batch_list);
857
16cdcec7 858 /*
4bd02d90
FM
859 * Keep checking if the next delayed item matches the next item in the
860 * leaf - if so, we can add it to the batch of items to delete from the
861 * leaf.
16cdcec7 862 */
4bd02d90
FM
863 while (slot < last_slot) {
864 struct btrfs_key key;
16cdcec7 865
16cdcec7
MX
866 next = __btrfs_next_delayed_item(curr);
867 if (!next)
868 break;
869
4bd02d90
FM
870 slot++;
871 btrfs_item_key_to_cpu(leaf, &key, slot);
96d89923
FM
872 if (key.objectid != ino ||
873 key.type != BTRFS_DIR_INDEX_KEY ||
874 key.offset != next->index)
16cdcec7 875 break;
4bd02d90
FM
876 nitems++;
877 curr = next;
878 list_add_tail(&curr->tree_list, &batch_list);
1f4f639f 879 total_reserved_size += curr->bytes_reserved;
16cdcec7
MX
880 }
881
16cdcec7
MX
882 ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
883 if (ret)
4bd02d90 884 return ret;
16cdcec7 885
1f4f639f
NB
886 /* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */
887 if (total_reserved_size > 0) {
888 /*
889 * Check btrfs_delayed_item_reserve_metadata() to see why we
890 * don't need to release/reserve qgroup space.
891 */
96d89923
FM
892 trace_btrfs_space_reservation(fs_info, "delayed_item", ino,
893 total_reserved_size, 0);
1f4f639f
NB
894 btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
895 total_reserved_size, NULL);
896 }
897
4bd02d90 898 list_for_each_entry_safe(curr, next, &batch_list, tree_list) {
16cdcec7
MX
899 list_del(&curr->tree_list);
900 btrfs_release_delayed_item(curr);
901 }
902
4bd02d90 903 return 0;
16cdcec7
MX
904}
905
906static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
907 struct btrfs_path *path,
908 struct btrfs_root *root,
909 struct btrfs_delayed_node *node)
910{
96d89923 911 struct btrfs_key key;
16cdcec7
MX
912 int ret = 0;
913
96d89923
FM
914 key.objectid = node->inode_id;
915 key.type = BTRFS_DIR_INDEX_KEY;
916
36baa2c7
FM
917 while (ret == 0) {
918 struct btrfs_delayed_item *item;
919
920 mutex_lock(&node->mutex);
921 item = __btrfs_first_delayed_deletion_item(node);
922 if (!item) {
923 mutex_unlock(&node->mutex);
924 break;
925 }
926
96d89923
FM
927 key.offset = item->index;
928 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
36baa2c7
FM
929 if (ret > 0) {
930 /*
931 * There's no matching item in the leaf. This means we
932 * have already deleted this item in a past run of the
933 * delayed items. We ignore errors when running delayed
934 * items from an async context, through a work queue job
935 * running btrfs_async_run_delayed_root(), and don't
936 * release delayed items that failed to complete. This
937 * is because we will retry later, and at transaction
938 * commit time we always run delayed items and will
939 * then deal with errors if they fail to run again.
940 *
941 * So just release delayed items for which we can't find
942 * an item in the tree, and move to the next item.
943 */
944 btrfs_release_path(path);
945 btrfs_release_delayed_item(item);
946 ret = 0;
947 } else if (ret == 0) {
948 ret = btrfs_batch_delete_items(trans, root, path, item);
949 btrfs_release_path(path);
950 }
16cdcec7 951
16cdcec7 952 /*
36baa2c7
FM
953 * We unlock and relock on each iteration, this is to prevent
954 * blocking other tasks for too long while we are being run from
955 * the async context (work queue job). Those tasks are typically
956 * running system calls like creat/mkdir/rename/unlink/etc which
957 * need to add delayed items to this delayed node.
16cdcec7 958 */
36baa2c7 959 mutex_unlock(&node->mutex);
16cdcec7
MX
960 }
961
16cdcec7
MX
962 return ret;
963}
964
965static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
966{
967 struct btrfs_delayed_root *delayed_root;
968
7cf35d91
MX
969 if (delayed_node &&
970 test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
16cdcec7 971 BUG_ON(!delayed_node->root);
7cf35d91 972 clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
16cdcec7
MX
973 delayed_node->count--;
974
975 delayed_root = delayed_node->root->fs_info->delayed_root;
de3cb945 976 finish_one_item(delayed_root);
16cdcec7
MX
977 }
978}
979
67de1176
MX
980static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
981{
67de1176 982
a4cb90dc
JB
983 if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) {
984 struct btrfs_delayed_root *delayed_root;
67de1176 985
a4cb90dc
JB
986 ASSERT(delayed_node->root);
987 delayed_node->count--;
988
989 delayed_root = delayed_node->root->fs_info->delayed_root;
990 finish_one_item(delayed_root);
991 }
67de1176
MX
992}
993
0e8c36a9
MX
994static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
995 struct btrfs_root *root,
996 struct btrfs_path *path,
997 struct btrfs_delayed_node *node)
16cdcec7 998{
2ff7e61e 999 struct btrfs_fs_info *fs_info = root->fs_info;
16cdcec7
MX
1000 struct btrfs_key key;
1001 struct btrfs_inode_item *inode_item;
1002 struct extent_buffer *leaf;
67de1176 1003 int mod;
16cdcec7
MX
1004 int ret;
1005
16cdcec7 1006 key.objectid = node->inode_id;
962a298f 1007 key.type = BTRFS_INODE_ITEM_KEY;
16cdcec7 1008 key.offset = 0;
0e8c36a9 1009
67de1176
MX
1010 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
1011 mod = -1;
1012 else
1013 mod = 1;
1014
1015 ret = btrfs_lookup_inode(trans, root, path, &key, mod);
bb385bed
JB
1016 if (ret > 0)
1017 ret = -ENOENT;
1018 if (ret < 0)
1019 goto out;
16cdcec7 1020
16cdcec7
MX
1021 leaf = path->nodes[0];
1022 inode_item = btrfs_item_ptr(leaf, path->slots[0],
1023 struct btrfs_inode_item);
1024 write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
1025 sizeof(struct btrfs_inode_item));
1026 btrfs_mark_buffer_dirty(leaf);
16cdcec7 1027
67de1176 1028 if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
a4cb90dc 1029 goto out;
67de1176
MX
1030
1031 path->slots[0]++;
1032 if (path->slots[0] >= btrfs_header_nritems(leaf))
1033 goto search;
1034again:
1035 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1036 if (key.objectid != node->inode_id)
1037 goto out;
1038
1039 if (key.type != BTRFS_INODE_REF_KEY &&
1040 key.type != BTRFS_INODE_EXTREF_KEY)
1041 goto out;
1042
1043 /*
1044 * Delayed iref deletion is only used for an inode that has a single
1045 * link, so there is only one iref; the case of several irefs packed
1046 * into the same item does not occur here.
1047 */
1048 btrfs_del_item(trans, root, path);
1049out:
1050 btrfs_release_delayed_iref(node);
67de1176
MX
1051 btrfs_release_path(path);
1052err_out:
4f5427cc 1053 btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
16cdcec7 1054 btrfs_release_delayed_inode(node);
16cdcec7 1055
04587ad9
JB
1056 /*
1057 * If we fail to update the delayed inode we need to abort the
1058 * transaction, because we could leave the inode with the improper
1059 * counts behind.
1060 */
1061 if (ret && ret != -ENOENT)
1062 btrfs_abort_transaction(trans, ret);
1063
67de1176
MX
1064 return ret;
1065
1066search:
1067 btrfs_release_path(path);
1068
962a298f 1069 key.type = BTRFS_INODE_EXTREF_KEY;
67de1176 1070 key.offset = -1;
351cbf6e 1071
67de1176
MX
1072 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1073 if (ret < 0)
1074 goto err_out;
1075 ASSERT(ret);
1076
1077 ret = 0;
1078 leaf = path->nodes[0];
1079 path->slots[0]--;
1080 goto again;
16cdcec7
MX
1081}
1082
0e8c36a9
MX
1083static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1084 struct btrfs_root *root,
1085 struct btrfs_path *path,
1086 struct btrfs_delayed_node *node)
1087{
1088 int ret;
1089
1090 mutex_lock(&node->mutex);
7cf35d91 1091 if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) {
0e8c36a9
MX
1092 mutex_unlock(&node->mutex);
1093 return 0;
1094 }
1095
1096 ret = __btrfs_update_delayed_inode(trans, root, path, node);
1097 mutex_unlock(&node->mutex);
1098 return ret;
1099}
1100
4ea41ce0
MX
1101static inline int
1102__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1103 struct btrfs_path *path,
1104 struct btrfs_delayed_node *node)
1105{
1106 int ret;
1107
1108 ret = btrfs_insert_delayed_items(trans, path, node->root, node);
1109 if (ret)
1110 return ret;
1111
1112 ret = btrfs_delete_delayed_items(trans, path, node->root, node);
1113 if (ret)
1114 return ret;
1115
1116 ret = btrfs_update_delayed_inode(trans, node->root, path, node);
1117 return ret;
1118}
1119
79787eaa
JM
1120/*
1121 * Called when committing the transaction.
1122 * Returns 0 on success.
1123 * Returns < 0 on error and returns with an aborted transaction with any
1124 * outstanding delayed items cleaned up.
1125 */
b84acab3 1126static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
16cdcec7 1127{
b84acab3 1128 struct btrfs_fs_info *fs_info = trans->fs_info;
16cdcec7
MX
1129 struct btrfs_delayed_root *delayed_root;
1130 struct btrfs_delayed_node *curr_node, *prev_node;
1131 struct btrfs_path *path;
19fd2949 1132 struct btrfs_block_rsv *block_rsv;
16cdcec7 1133 int ret = 0;
96c3f433 1134 bool count = (nr > 0);
16cdcec7 1135
bf31f87f 1136 if (TRANS_ABORTED(trans))
79787eaa
JM
1137 return -EIO;
1138
16cdcec7
MX
1139 path = btrfs_alloc_path();
1140 if (!path)
1141 return -ENOMEM;
16cdcec7 1142
19fd2949 1143 block_rsv = trans->block_rsv;
0b246afa 1144 trans->block_rsv = &fs_info->delayed_block_rsv;
19fd2949 1145
ccdf9b30 1146 delayed_root = fs_info->delayed_root;
16cdcec7
MX
1147
1148 curr_node = btrfs_first_delayed_node(delayed_root);
a4559e6f 1149 while (curr_node && (!count || nr--)) {
4ea41ce0
MX
1150 ret = __btrfs_commit_inode_delayed_items(trans, path,
1151 curr_node);
16cdcec7
MX
1152 if (ret) {
1153 btrfs_release_delayed_node(curr_node);
96c3f433 1154 curr_node = NULL;
66642832 1155 btrfs_abort_transaction(trans, ret);
16cdcec7
MX
1156 break;
1157 }
1158
1159 prev_node = curr_node;
1160 curr_node = btrfs_next_delayed_node(curr_node);
1161 btrfs_release_delayed_node(prev_node);
1162 }
1163
96c3f433
JB
1164 if (curr_node)
1165 btrfs_release_delayed_node(curr_node);
16cdcec7 1166 btrfs_free_path(path);
19fd2949 1167 trans->block_rsv = block_rsv;
79787eaa 1168
16cdcec7
MX
1169 return ret;
1170}
1171
e5c304e6 1172int btrfs_run_delayed_items(struct btrfs_trans_handle *trans)
96c3f433 1173{
b84acab3 1174 return __btrfs_run_delayed_items(trans, -1);
96c3f433
JB
1175}
1176
e5c304e6 1177int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr)
96c3f433 1178{
b84acab3 1179 return __btrfs_run_delayed_items(trans, nr);
96c3f433
JB
1180}
1181
16cdcec7 1182int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
5f4b32e9 1183 struct btrfs_inode *inode)
16cdcec7 1184{
5f4b32e9 1185 struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
4ea41ce0
MX
1186 struct btrfs_path *path;
1187 struct btrfs_block_rsv *block_rsv;
16cdcec7
MX
1188 int ret;
1189
1190 if (!delayed_node)
1191 return 0;
1192
1193 mutex_lock(&delayed_node->mutex);
1194 if (!delayed_node->count) {
1195 mutex_unlock(&delayed_node->mutex);
1196 btrfs_release_delayed_node(delayed_node);
1197 return 0;
1198 }
1199 mutex_unlock(&delayed_node->mutex);
1200
4ea41ce0 1201 path = btrfs_alloc_path();
3c77bd94
FDBM
1202 if (!path) {
1203 btrfs_release_delayed_node(delayed_node);
4ea41ce0 1204 return -ENOMEM;
3c77bd94 1205 }
4ea41ce0
MX
1206
1207 block_rsv = trans->block_rsv;
1208 trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
1209
1210 ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
1211
16cdcec7 1212 btrfs_release_delayed_node(delayed_node);
4ea41ce0
MX
1213 btrfs_free_path(path);
1214 trans->block_rsv = block_rsv;
1215
16cdcec7
MX
1216 return ret;
1217}
1218
aa79021f 1219int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
0e8c36a9 1220{
3ffbd68c 1221 struct btrfs_fs_info *fs_info = inode->root->fs_info;
0e8c36a9 1222 struct btrfs_trans_handle *trans;
aa79021f 1223 struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
0e8c36a9
MX
1224 struct btrfs_path *path;
1225 struct btrfs_block_rsv *block_rsv;
1226 int ret;
1227
1228 if (!delayed_node)
1229 return 0;
1230
1231 mutex_lock(&delayed_node->mutex);
7cf35d91 1232 if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
0e8c36a9
MX
1233 mutex_unlock(&delayed_node->mutex);
1234 btrfs_release_delayed_node(delayed_node);
1235 return 0;
1236 }
1237 mutex_unlock(&delayed_node->mutex);
1238
1239 trans = btrfs_join_transaction(delayed_node->root);
1240 if (IS_ERR(trans)) {
1241 ret = PTR_ERR(trans);
1242 goto out;
1243 }
1244
1245 path = btrfs_alloc_path();
1246 if (!path) {
1247 ret = -ENOMEM;
1248 goto trans_out;
1249 }
0e8c36a9
MX
1250
1251 block_rsv = trans->block_rsv;
2ff7e61e 1252 trans->block_rsv = &fs_info->delayed_block_rsv;
0e8c36a9
MX
1253
1254 mutex_lock(&delayed_node->mutex);
7cf35d91 1255 if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
0e8c36a9
MX
1256 ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
1257 path, delayed_node);
1258 else
1259 ret = 0;
1260 mutex_unlock(&delayed_node->mutex);
1261
1262 btrfs_free_path(path);
1263 trans->block_rsv = block_rsv;
1264trans_out:
3a45bb20 1265 btrfs_end_transaction(trans);
2ff7e61e 1266 btrfs_btree_balance_dirty(fs_info);
0e8c36a9
MX
1267out:
1268 btrfs_release_delayed_node(delayed_node);
1269
1270 return ret;
1271}
1272
f48d1cf5 1273void btrfs_remove_delayed_node(struct btrfs_inode *inode)
16cdcec7
MX
1274{
1275 struct btrfs_delayed_node *delayed_node;
1276
f48d1cf5 1277 delayed_node = READ_ONCE(inode->delayed_node);
16cdcec7
MX
1278 if (!delayed_node)
1279 return;
1280
f48d1cf5 1281 inode->delayed_node = NULL;
16cdcec7
MX
1282 btrfs_release_delayed_node(delayed_node);
1283}
1284
de3cb945
CM
1285struct btrfs_async_delayed_work {
1286 struct btrfs_delayed_root *delayed_root;
1287 int nr;
d458b054 1288 struct btrfs_work work;
16cdcec7
MX
1289};
1290
d458b054 1291static void btrfs_async_run_delayed_root(struct btrfs_work *work)
16cdcec7 1292{
de3cb945
CM
1293 struct btrfs_async_delayed_work *async_work;
1294 struct btrfs_delayed_root *delayed_root;
16cdcec7
MX
1295 struct btrfs_trans_handle *trans;
1296 struct btrfs_path *path;
1297 struct btrfs_delayed_node *delayed_node = NULL;
1298 struct btrfs_root *root;
19fd2949 1299 struct btrfs_block_rsv *block_rsv;
de3cb945 1300 int total_done = 0;
16cdcec7 1301
de3cb945
CM
1302 async_work = container_of(work, struct btrfs_async_delayed_work, work);
1303 delayed_root = async_work->delayed_root;
16cdcec7
MX
1304
1305 path = btrfs_alloc_path();
1306 if (!path)
1307 goto out;
16cdcec7 1308
617c54a8
NB
1309 do {
1310 if (atomic_read(&delayed_root->items) <
1311 BTRFS_DELAYED_BACKGROUND / 2)
1312 break;
de3cb945 1313
617c54a8
NB
1314 delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
1315 if (!delayed_node)
1316 break;
de3cb945 1317
617c54a8 1318 root = delayed_node->root;
16cdcec7 1319
617c54a8
NB
1320 trans = btrfs_join_transaction(root);
1321 if (IS_ERR(trans)) {
1322 btrfs_release_path(path);
1323 btrfs_release_prepared_delayed_node(delayed_node);
1324 total_done++;
1325 continue;
1326 }
16cdcec7 1327
617c54a8
NB
1328 block_rsv = trans->block_rsv;
1329 trans->block_rsv = &root->fs_info->delayed_block_rsv;
19fd2949 1330
617c54a8 1331 __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
16cdcec7 1332
617c54a8
NB
1333 trans->block_rsv = block_rsv;
1334 btrfs_end_transaction(trans);
1335 btrfs_btree_balance_dirty_nodelay(root->fs_info);
de3cb945 1336
617c54a8
NB
1337 btrfs_release_path(path);
1338 btrfs_release_prepared_delayed_node(delayed_node);
1339 total_done++;
de3cb945 1340
617c54a8
NB
1341 } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK)
1342 || total_done < async_work->nr);
de3cb945 1343
16cdcec7
MX
1344 btrfs_free_path(path);
1345out:
de3cb945
CM
1346 wake_up(&delayed_root->wait);
1347 kfree(async_work);
16cdcec7
MX
1348}
1349
de3cb945 1350
16cdcec7 1351static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
a585e948 1352 struct btrfs_fs_info *fs_info, int nr)
16cdcec7 1353{
de3cb945 1354 struct btrfs_async_delayed_work *async_work;
16cdcec7 1355
de3cb945
CM
1356 async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
1357 if (!async_work)
16cdcec7 1358 return -ENOMEM;
16cdcec7 1359
de3cb945 1360 async_work->delayed_root = delayed_root;
a0cac0ec
OS
1361 btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL,
1362 NULL);
de3cb945 1363 async_work->nr = nr;
16cdcec7 1364
a585e948 1365 btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
16cdcec7
MX
1366 return 0;
1367}
1368
ccdf9b30 1369void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
e999376f 1370{
ccdf9b30 1371 WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root));
e999376f
CM
1372}
1373
0353808c 1374static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
de3cb945
CM
1375{
1376 int val = atomic_read(&delayed_root->items_seq);
1377
0353808c 1378 if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
de3cb945 1379 return 1;
0353808c
MX
1380
1381 if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
1382 return 1;
1383
de3cb945
CM
1384 return 0;
1385}
1386
2ff7e61e 1387void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
16cdcec7 1388{
2ff7e61e 1389 struct btrfs_delayed_root *delayed_root = fs_info->delayed_root;
16cdcec7 1390
8577787f
NB
1391 if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) ||
1392 btrfs_workqueue_normal_congested(fs_info->delayed_workers))
16cdcec7
MX
1393 return;
1394
1395 if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
0353808c 1396 int seq;
16cdcec7 1397 int ret;
0353808c
MX
1398
1399 seq = atomic_read(&delayed_root->items_seq);
de3cb945 1400
a585e948 1401 ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0);
16cdcec7
MX
1402 if (ret)
1403 return;
1404
0353808c
MX
1405 wait_event_interruptible(delayed_root->wait,
1406 could_end_wait(delayed_root, seq));
4dd466d3 1407 return;
16cdcec7
MX
1408 }
1409
a585e948 1410 btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
16cdcec7
MX
1411}
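/*
 * Editorial note on the thresholds used above (all defined at the top of
 * this file): once at least BTRFS_DELAYED_BACKGROUND (128) delayed items
 * are pending, an async worker is queued to flush a batch of them; once the
 * backlog reaches BTRFS_DELAYED_WRITEBACK (512) the current task also
 * waits, and could_end_wait() lets it continue after BTRFS_DELAYED_BATCH
 * (16) items have completed or the backlog has fallen back below the
 * background threshold.
 */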
1412
79787eaa 1413/* Will return 0 or -ENOMEM */
16cdcec7 1414int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
2ff7e61e 1415 const char *name, int name_len,
6f45d185 1416 struct btrfs_inode *dir,
16cdcec7
MX
1417 struct btrfs_disk_key *disk_key, u8 type,
1418 u64 index)
1419{
763748b2
FM
1420 struct btrfs_fs_info *fs_info = trans->fs_info;
1421 const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
16cdcec7
MX
1422 struct btrfs_delayed_node *delayed_node;
1423 struct btrfs_delayed_item *delayed_item;
1424 struct btrfs_dir_item *dir_item;
763748b2
FM
1425 bool reserve_leaf_space;
1426 u32 data_len;
16cdcec7
MX
1427 int ret;
1428
6f45d185 1429 delayed_node = btrfs_get_or_create_delayed_node(dir);
16cdcec7
MX
1430 if (IS_ERR(delayed_node))
1431 return PTR_ERR(delayed_node);
1432
96d89923 1433 delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len,
4c469798
FM
1434 delayed_node,
1435 BTRFS_DELAYED_INSERTION_ITEM);
16cdcec7
MX
1436 if (!delayed_item) {
1437 ret = -ENOMEM;
1438 goto release_node;
1439 }
1440
96d89923 1441 delayed_item->index = index;
16cdcec7
MX
1442
1443 dir_item = (struct btrfs_dir_item *)delayed_item->data;
1444 dir_item->location = *disk_key;
3cae210f
QW
1445 btrfs_set_stack_dir_transid(dir_item, trans->transid);
1446 btrfs_set_stack_dir_data_len(dir_item, 0);
1447 btrfs_set_stack_dir_name_len(dir_item, name_len);
1448 btrfs_set_stack_dir_type(dir_item, type);
16cdcec7
MX
1449 memcpy((char *)(dir_item + 1), name, name_len);
1450
763748b2 1451 data_len = delayed_item->data_len + sizeof(struct btrfs_item);
8c2a3ca2 1452
16cdcec7 1453 mutex_lock(&delayed_node->mutex);
763748b2
FM
1454
1455 if (delayed_node->index_item_leaves == 0 ||
1456 delayed_node->curr_index_batch_size + data_len > leaf_data_size) {
1457 delayed_node->curr_index_batch_size = data_len;
1458 reserve_leaf_space = true;
1459 } else {
1460 delayed_node->curr_index_batch_size += data_len;
1461 reserve_leaf_space = false;
1462 }
1463
1464 if (reserve_leaf_space) {
df492881 1465 ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item);
763748b2
FM
1466 /*
1467 * Space was reserved for a dir index item insertion when we
1468 * started the transaction, so getting a failure here should be
1469 * impossible.
1470 */
1471 if (WARN_ON(ret)) {
1472 mutex_unlock(&delayed_node->mutex);
1473 btrfs_release_delayed_item(delayed_item);
1474 goto release_node;
1475 }
1476
1477 delayed_node->index_item_leaves++;
1478 } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
1479 const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
1480
1481 /*
1482 * Adding the new dir index item does not require touching another
1483 * leaf, so we can release 1 unit of metadata that was previously
1484 * reserved when starting the transaction. This applies only to
1485 * the case where we had a transaction start and excludes the
1486 * transaction join case (when replaying log trees).
1487 */
1488 trace_btrfs_space_reservation(fs_info, "transaction",
1489 trans->transid, bytes, 0);
1490 btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
1491 ASSERT(trans->bytes_reserved >= bytes);
1492 trans->bytes_reserved -= bytes;
1493 }
1494
c9d02ab4 1495 ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
16cdcec7 1496 if (unlikely(ret)) {
4465c8b4 1497 btrfs_err(trans->fs_info,
5d163e0e 1498 "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
4fd786e6 1499 name_len, name, delayed_node->root->root_key.objectid,
5d163e0e 1500 delayed_node->inode_id, ret);
16cdcec7
MX
1501 BUG();
1502 }
1503 mutex_unlock(&delayed_node->mutex);
1504
1505release_node:
1506 btrfs_release_delayed_node(delayed_node);
1507 return ret;
1508}
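/*
 * Illustrative numbers for the leaf accounting above (assuming a 16K
 * nodesize, for which BTRFS_LEAF_DATA_SIZE is roughly 16K minus the ~100
 * byte leaf header): curr_index_batch_size accumulates the leaf space that
 * the pending dir index items of the current batch will need, and a new
 * leaf unit of metadata is reserved (index_item_leaves++) only when adding
 * another item would push the batch past the leaf data size.
 */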
1509
2ff7e61e 1510static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
16cdcec7 1511 struct btrfs_delayed_node *node,
96d89923 1512 u64 index)
16cdcec7
MX
1513{
1514 struct btrfs_delayed_item *item;
1515
1516 mutex_lock(&node->mutex);
4cbf37f5 1517 item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
16cdcec7
MX
1518 if (!item) {
1519 mutex_unlock(&node->mutex);
1520 return 1;
1521 }
1522
763748b2
FM
1523 /*
1524 * For delayed items to insert, we track reserved metadata bytes based
1525 * on the number of leaves that we will use.
1526 * See btrfs_insert_delayed_dir_index() and
1527 * btrfs_delayed_item_reserve_metadata().
1528 */
1529 ASSERT(item->bytes_reserved == 0);
1530 ASSERT(node->index_item_leaves > 0);
1531
1532 /*
1533 * If there's only one leaf reserved, we can decrement this item from the
1534 * current batch, otherwise we can not because we don't know which leaf
1535 * it belongs to. With the current limit on delayed items, we rarely
1536 * accumulate enough dir index items to fill more than one leaf (even
1537 * when using a leaf size of 4K).
1538 */
1539 if (node->index_item_leaves == 1) {
1540 const u32 data_len = item->data_len + sizeof(struct btrfs_item);
1541
1542 ASSERT(node->curr_index_batch_size >= data_len);
1543 node->curr_index_batch_size -= data_len;
1544 }
1545
16cdcec7 1546 btrfs_release_delayed_item(item);
763748b2
FM
1547
1548 /* If we now have no more dir index items, we can release all leaves. */
1549 if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) {
1550 btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
1551 node->index_item_leaves = 0;
1552 }
1553
16cdcec7
MX
1554 mutex_unlock(&node->mutex);
1555 return 0;
1556}
1557
1558int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
e67bbbb9 1559 struct btrfs_inode *dir, u64 index)
16cdcec7
MX
1560{
1561 struct btrfs_delayed_node *node;
1562 struct btrfs_delayed_item *item;
16cdcec7
MX
1563 int ret;
1564
e67bbbb9 1565 node = btrfs_get_or_create_delayed_node(dir);
16cdcec7
MX
1566 if (IS_ERR(node))
1567 return PTR_ERR(node);
1568
96d89923 1569 ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index);
16cdcec7
MX
1570 if (!ret)
1571 goto end;
1572
4c469798 1573 item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
16cdcec7
MX
1574 if (!item) {
1575 ret = -ENOMEM;
1576 goto end;
1577 }
1578
96d89923 1579 item->index = index;
16cdcec7 1580
df492881 1581 ret = btrfs_delayed_item_reserve_metadata(trans, item);
16cdcec7
MX
1582 /*
1583 * We reserved enough space when we started the transaction, so a
1584 * metadata reservation failure here should be impossible.
1585 */
933c22a7
QW
1586 if (ret < 0) {
1587 btrfs_err(trans->fs_info,
1588"metadata reservation failed for delayed dir item deltiona, should have been reserved");
1589 btrfs_release_delayed_item(item);
1590 goto end;
1591 }
16cdcec7
MX
1592
1593 mutex_lock(&node->mutex);
c9d02ab4 1594 ret = __btrfs_add_delayed_item(node, item);
16cdcec7 1595 if (unlikely(ret)) {
9add2945 1596 btrfs_err(trans->fs_info,
5d163e0e 1597 "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
4fd786e6
MT
1598 index, node->root->root_key.objectid,
1599 node->inode_id, ret);
933c22a7
QW
1600 btrfs_delayed_item_release_metadata(dir->root, item);
1601 btrfs_release_delayed_item(item);
16cdcec7
MX
1602 }
1603 mutex_unlock(&node->mutex);
1604end:
1605 btrfs_release_delayed_node(node);
1606 return ret;
1607}
1608
f5cc7b80 1609int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
16cdcec7 1610{
f5cc7b80 1611 struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
16cdcec7
MX
1612
1613 if (!delayed_node)
1614 return -ENOENT;
1615
1616 /*
1617 * Since we hold the i_mutex of this directory, no new directory index
1618 * can be added to the delayed node and index_cnt cannot be updated
1619 * now, so we do not need to lock the delayed node.
1620 */
2f7e33d4
MX
1621 if (!delayed_node->index_cnt) {
1622 btrfs_release_delayed_node(delayed_node);
16cdcec7 1623 return -EINVAL;
2f7e33d4 1624 }
16cdcec7 1625
f5cc7b80 1626 inode->index_cnt = delayed_node->index_cnt;
2f7e33d4
MX
1627 btrfs_release_delayed_node(delayed_node);
1628 return 0;
16cdcec7
MX
1629}
1630
02dbfc99
OS
1631bool btrfs_readdir_get_delayed_items(struct inode *inode,
1632 struct list_head *ins_list,
1633 struct list_head *del_list)
16cdcec7
MX
1634{
1635 struct btrfs_delayed_node *delayed_node;
1636 struct btrfs_delayed_item *item;
1637
340c6ca9 1638 delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
16cdcec7 1639 if (!delayed_node)
02dbfc99
OS
1640 return false;
1641
1642 /*
1643 * We can only do one readdir with delayed items at a time because of
1644 * item->readdir_list.
1645 */
64708539
JB
1646 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
1647 btrfs_inode_lock(inode, 0);
16cdcec7
MX
1648
1649 mutex_lock(&delayed_node->mutex);
1650 item = __btrfs_first_delayed_insertion_item(delayed_node);
1651 while (item) {
089e77e1 1652 refcount_inc(&item->refs);
16cdcec7
MX
1653 list_add_tail(&item->readdir_list, ins_list);
1654 item = __btrfs_next_delayed_item(item);
1655 }
1656
1657 item = __btrfs_first_delayed_deletion_item(delayed_node);
1658 while (item) {
089e77e1 1659 refcount_inc(&item->refs);
16cdcec7
MX
1660 list_add_tail(&item->readdir_list, del_list);
1661 item = __btrfs_next_delayed_item(item);
1662 }
1663 mutex_unlock(&delayed_node->mutex);
1664 /*
1665 * This delayed node is still cached in the btrfs inode, so refs
1666 * must be > 1 now, and we do not need to check whether it is about
1667 * to be freed.
1668 *
1669 * Besides that, this function is only used for reading a directory,
1670 * so no delayed items are inserted or deleted during this period and
1671 * we do not need to requeue or dequeue this delayed node either.
1672 */
6de5f18e 1673 refcount_dec(&delayed_node->refs);
02dbfc99
OS
1674
1675 return true;
16cdcec7
MX
1676}
1677
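/*
 * Drop the references taken on the delayed items collected by
 * btrfs_readdir_get_delayed_items() and downgrade the exclusive inode lock
 * back to the read lock the VFS expects to release.
 */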
02dbfc99
OS
1678void btrfs_readdir_put_delayed_items(struct inode *inode,
1679 struct list_head *ins_list,
1680 struct list_head *del_list)
16cdcec7
MX
1681{
1682 struct btrfs_delayed_item *curr, *next;
1683
1684 list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
1685 list_del(&curr->readdir_list);
089e77e1 1686 if (refcount_dec_and_test(&curr->refs))
16cdcec7
MX
1687 kfree(curr);
1688 }
1689
1690 list_for_each_entry_safe(curr, next, del_list, readdir_list) {
1691 list_del(&curr->readdir_list);
089e77e1 1692 if (refcount_dec_and_test(&curr->refs))
16cdcec7
MX
1693 kfree(curr);
1694 }
02dbfc99
OS
1695
1696 /*
1697 * The VFS is going to do up_read(), so we need to downgrade back to a
1698 * read lock.
1699 */
1700 downgrade_write(&inode->i_rwsem);
16cdcec7
MX
1701}
1702
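/*
 * Return 1 if the given directory index is on the list of delayed deletion
 * items collected for readdir, meaning the on-disk entry must be skipped,
 * and 0 otherwise. The list is sorted by index, so the scan stops early.
 */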
1703int btrfs_should_delete_dir_index(struct list_head *del_list,
1704 u64 index)
1705{
e4fd493c
JB
1706 struct btrfs_delayed_item *curr;
1707 int ret = 0;
16cdcec7 1708
e4fd493c 1709 list_for_each_entry(curr, del_list, readdir_list) {
96d89923 1710 if (curr->index > index)
16cdcec7 1711 break;
96d89923 1712 if (curr->index == index) {
e4fd493c
JB
1713 ret = 1;
1714 break;
1715 }
16cdcec7 1716 }
e4fd493c 1717 return ret;
16cdcec7
MX
1718}
1719
1720/*
1721 * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
1722 * and emit the collected entries through the given readdir context.
1723 */
9cdda8d3 1724int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
d2fbb2b5 1725 struct list_head *ins_list)
16cdcec7
MX
1726{
1727 struct btrfs_dir_item *di;
1728 struct btrfs_delayed_item *curr, *next;
1729 struct btrfs_key location;
1730 char *name;
1731 int name_len;
1732 int over = 0;
1733 unsigned char d_type;
1734
1735 if (list_empty(ins_list))
1736 return 0;
1737
1738 /*
1739 * The data of a delayed item never changes once it is created, so
1740 * we don't need to lock the items. And since we hold the i_mutex of
1741 * the directory, nobody can delete any directory index now.
1742 */
1743 list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
1744 list_del(&curr->readdir_list);
1745
96d89923 1746 if (curr->index < ctx->pos) {
089e77e1 1747 if (refcount_dec_and_test(&curr->refs))
16cdcec7
MX
1748 kfree(curr);
1749 continue;
1750 }
1751
96d89923 1752 ctx->pos = curr->index;
16cdcec7
MX
1753
1754 di = (struct btrfs_dir_item *)curr->data;
1755 name = (char *)(di + 1);
3cae210f 1756 name_len = btrfs_stack_dir_name_len(di);
16cdcec7 1757
7d157c3d 1758 d_type = fs_ftype_to_dtype(di->type);
16cdcec7
MX
1759 btrfs_disk_key_to_cpu(&location, &di->location);
1760
9cdda8d3 1761 over = !dir_emit(ctx, name, name_len,
16cdcec7
MX
1762 location.objectid, d_type);
1763
089e77e1 1764 if (refcount_dec_and_test(&curr->refs))
16cdcec7
MX
1765 kfree(curr);
1766
1767 if (over)
1768 return 1;
42e9cc46 1769 ctx->pos++;
16cdcec7
MX
1770 }
1771 return 0;
1772}
1773
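/*
 * Copy the fields of the VFS inode into the btrfs_inode_item kept in the
 * delayed node, using the "stack" accessors since the item lives in regular
 * memory rather than in an extent buffer.
 */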
16cdcec7
MX
1774static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
1775 struct btrfs_inode_item *inode_item,
1776 struct inode *inode)
1777{
77eea05e
BB
1778 u64 flags;
1779
2f2f43d3
EB
1780 btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
1781 btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
16cdcec7
MX
1782 btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
1783 btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
1784 btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
1785 btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
1786 btrfs_set_stack_inode_generation(inode_item,
1787 BTRFS_I(inode)->generation);
c7f88c4e
JL
1788 btrfs_set_stack_inode_sequence(inode_item,
1789 inode_peek_iversion(inode));
16cdcec7
MX
1790 btrfs_set_stack_inode_transid(inode_item, trans->transid);
1791 btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
77eea05e
BB
1792 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
1793 BTRFS_I(inode)->ro_flags);
1794 btrfs_set_stack_inode_flags(inode_item, flags);
ff5714cc 1795 btrfs_set_stack_inode_block_group(inode_item, 0);
16cdcec7 1796
a937b979 1797 btrfs_set_stack_timespec_sec(&inode_item->atime,
16cdcec7 1798 inode->i_atime.tv_sec);
a937b979 1799 btrfs_set_stack_timespec_nsec(&inode_item->atime,
16cdcec7
MX
1800 inode->i_atime.tv_nsec);
1801
a937b979 1802 btrfs_set_stack_timespec_sec(&inode_item->mtime,
16cdcec7 1803 inode->i_mtime.tv_sec);
a937b979 1804 btrfs_set_stack_timespec_nsec(&inode_item->mtime,
16cdcec7
MX
1805 inode->i_mtime.tv_nsec);
1806
a937b979 1807 btrfs_set_stack_timespec_sec(&inode_item->ctime,
16cdcec7 1808 inode->i_ctime.tv_sec);
a937b979 1809 btrfs_set_stack_timespec_nsec(&inode_item->ctime,
16cdcec7 1810 inode->i_ctime.tv_nsec);
9cc97d64 1811
1812 btrfs_set_stack_timespec_sec(&inode_item->otime,
1813 BTRFS_I(inode)->i_otime.tv_sec);
1814 btrfs_set_stack_timespec_nsec(&inode_item->otime,
1815 BTRFS_I(inode)->i_otime.tv_nsec);
16cdcec7
MX
1816}
1817
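/*
 * Fill the VFS inode from the inode item cached in its delayed node, if the
 * node holds a dirty inode item. Returns -ENOENT when there is no delayed
 * node or no dirty inode item, in which case the caller must read the inode
 * item from the fs tree instead.
 */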
2f7e33d4
MX
1818int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1819{
9ddc959e 1820 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2f7e33d4
MX
1821 struct btrfs_delayed_node *delayed_node;
1822 struct btrfs_inode_item *inode_item;
2f7e33d4 1823
340c6ca9 1824 delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
2f7e33d4
MX
1825 if (!delayed_node)
1826 return -ENOENT;
1827
1828 mutex_lock(&delayed_node->mutex);
7cf35d91 1829 if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
2f7e33d4
MX
1830 mutex_unlock(&delayed_node->mutex);
1831 btrfs_release_delayed_node(delayed_node);
1832 return -ENOENT;
1833 }
1834
1835 inode_item = &delayed_node->inode_item;
1836
2f2f43d3
EB
1837 i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
1838 i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
6ef06d27 1839 btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
9ddc959e
JB
1840 btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
1841 round_up(i_size_read(inode), fs_info->sectorsize));
2f7e33d4 1842 inode->i_mode = btrfs_stack_inode_mode(inode_item);
bfe86848 1843 set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
2f7e33d4
MX
1844 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
1845 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
6e17d30b
YD
1846 BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);
1847
c7f88c4e
JL
1848 inode_set_iversion_queried(inode,
1849 btrfs_stack_inode_sequence(inode_item));
2f7e33d4
MX
1850 inode->i_rdev = 0;
1851 *rdev = btrfs_stack_inode_rdev(inode_item);
77eea05e
BB
1852 btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
1853 &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
2f7e33d4 1854
a937b979
DS
1855 inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
1856 inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
2f7e33d4 1857
a937b979
DS
1858 inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
1859 inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
2f7e33d4 1860
a937b979
DS
1861 inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
1862 inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);
2f7e33d4 1863
9cc97d64 1864 BTRFS_I(inode)->i_otime.tv_sec =
1865 btrfs_stack_timespec_sec(&inode_item->otime);
1866 BTRFS_I(inode)->i_otime.tv_nsec =
1867 btrfs_stack_timespec_nsec(&inode_item->otime);
1868
2f7e33d4
MX
1869 inode->i_generation = BTRFS_I(inode)->generation;
1870 BTRFS_I(inode)->index_cnt = (u64)-1;
1871
1872 mutex_unlock(&delayed_node->mutex);
1873 btrfs_release_delayed_node(delayed_node);
1874 return 0;
1875}
1876
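/*
 * Stash an updated copy of the inode item in the inode's delayed node
 * instead of writing it to the fs tree immediately. Metadata is reserved
 * only the first time the node's inode item is dirtied.
 */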
16cdcec7 1877int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
f3fbcaef
NB
1878 struct btrfs_root *root,
1879 struct btrfs_inode *inode)
16cdcec7
MX
1880{
1881 struct btrfs_delayed_node *delayed_node;
aa0467d8 1882 int ret = 0;
16cdcec7 1883
f3fbcaef 1884 delayed_node = btrfs_get_or_create_delayed_node(inode);
16cdcec7
MX
1885 if (IS_ERR(delayed_node))
1886 return PTR_ERR(delayed_node);
1887
1888 mutex_lock(&delayed_node->mutex);
7cf35d91 1889 if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
f3fbcaef
NB
1890 fill_stack_inode_item(trans, &delayed_node->inode_item,
1891 &inode->vfs_inode);
16cdcec7
MX
1892 goto release_node;
1893 }
1894
8e3c9d3c 1895 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
c06a0e12
JB
1896 if (ret)
1897 goto release_node;
16cdcec7 1898
f3fbcaef 1899 fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode);
7cf35d91 1900 set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
16cdcec7
MX
1901 delayed_node->count++;
1902 atomic_inc(&root->fs_info->delayed_root->items);
1903release_node:
1904 mutex_unlock(&delayed_node->mutex);
1905 btrfs_release_delayed_node(delayed_node);
1906 return ret;
1907}
1908
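/*
 * Queue the deletion of the inode's single INODE_REF item by flagging the
 * delayed node, so the ref is removed when the delayed inode item is
 * committed. Returns -EAGAIN during log recovery, where delayed inode
 * updates are not allowed.
 */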
e07222c7 1909int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
67de1176 1910{
3ffbd68c 1911 struct btrfs_fs_info *fs_info = inode->root->fs_info;
67de1176
MX
1912 struct btrfs_delayed_node *delayed_node;
1913
6f896054
CM
1914 /*
1915 * We don't do delayed inode updates during log recovery because it
1916 * leads to ENOSPC problems. This means we also can't do
1917 * delayed inode refs.
1918 */
0b246afa 1919 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
6f896054
CM
1920 return -EAGAIN;
1921
e07222c7 1922 delayed_node = btrfs_get_or_create_delayed_node(inode);
67de1176
MX
1923 if (IS_ERR(delayed_node))
1924 return PTR_ERR(delayed_node);
1925
1926 /*
1927 * We don't reserve space for inode ref deletion because:
1928 * - We ONLY do async inode ref deletion for inodes that have only
1929 * one link (i_nlink == 1), which means there is only one inode ref.
1930 * In most cases the inode ref and the inode item are in the
1931 * same leaf, and we will deal with them at the same time.
1932 * Since we are sure we will reserve the space for the inode item,
1933 * it is unnecessary to reserve space for the inode ref deletion.
1934 * - If the inode ref and the inode item are not in the same leaf,
1935 * we also needn't worry about an ENOSPC problem, because we reserve
1936 * much more space for the inode update than it needs.
1937 * - At the worst, we can steal some space from the global reservation.
1938 * That is very rare.
1939 */
1940 mutex_lock(&delayed_node->mutex);
1941 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
1942 goto release_node;
1943
1944 set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
1945 delayed_node->count++;
0b246afa 1946 atomic_inc(&fs_info->delayed_root->items);
67de1176
MX
1947release_node:
1948 mutex_unlock(&delayed_node->mutex);
1949 btrfs_release_delayed_node(delayed_node);
1950 return 0;
1951}
1952
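/*
 * Drop every pending insertion and deletion item of a delayed node, release
 * the metadata and leaf reservations they hold, and clear the delayed iref
 * and dirty inode item state.
 */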
16cdcec7
MX
1953static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
1954{
1955 struct btrfs_root *root = delayed_node->root;
2ff7e61e 1956 struct btrfs_fs_info *fs_info = root->fs_info;
16cdcec7
MX
1957 struct btrfs_delayed_item *curr_item, *prev_item;
1958
1959 mutex_lock(&delayed_node->mutex);
1960 curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
1961 while (curr_item) {
16cdcec7
MX
1962 prev_item = curr_item;
1963 curr_item = __btrfs_next_delayed_item(prev_item);
1964 btrfs_release_delayed_item(prev_item);
1965 }
1966
763748b2
FM
1967 if (delayed_node->index_item_leaves > 0) {
1968 btrfs_delayed_item_release_leaves(delayed_node,
1969 delayed_node->index_item_leaves);
1970 delayed_node->index_item_leaves = 0;
1971 }
1972
16cdcec7
MX
1973 curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
1974 while (curr_item) {
4f5427cc 1975 btrfs_delayed_item_release_metadata(root, curr_item);
16cdcec7
MX
1976 prev_item = curr_item;
1977 curr_item = __btrfs_next_delayed_item(prev_item);
1978 btrfs_release_delayed_item(prev_item);
1979 }
1980
a4cb90dc 1981 btrfs_release_delayed_iref(delayed_node);
67de1176 1982
7cf35d91 1983 if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
4f5427cc 1984 btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false);
16cdcec7
MX
1985 btrfs_release_delayed_inode(delayed_node);
1986 }
1987 mutex_unlock(&delayed_node->mutex);
1988}
1989
4ccb5c72 1990void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
16cdcec7
MX
1991{
1992 struct btrfs_delayed_node *delayed_node;
1993
4ccb5c72 1994 delayed_node = btrfs_get_delayed_node(inode);
16cdcec7
MX
1995 if (!delayed_node)
1996 return;
1997
1998 __btrfs_kill_delayed_node(delayed_node);
1999 btrfs_release_delayed_node(delayed_node);
2000}
2001
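/*
 * Walk the root's radix tree of delayed nodes in batches and kill each node
 * that is still alive, skipping nodes whose refcount already dropped to
 * zero because they are being removed concurrently.
 */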
2002void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
2003{
088aea3b 2004 u64 inode_id = 0;
16cdcec7 2005 struct btrfs_delayed_node *delayed_nodes[8];
088aea3b 2006 int i, n;
16cdcec7
MX
2007
2008 while (1) {
2009 spin_lock(&root->inode_lock);
088aea3b
DS
2010 n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
2011 (void **)delayed_nodes, inode_id,
2012 ARRAY_SIZE(delayed_nodes));
2013 if (!n) {
16cdcec7 2014 spin_unlock(&root->inode_lock);
088aea3b 2015 break;
16cdcec7
MX
2016 }
2017
088aea3b
DS
2018 inode_id = delayed_nodes[n - 1]->inode_id + 1;
2019 for (i = 0; i < n; i++) {
baf320b9
JB
2020 /*
2021 * Don't increase refs in case the node is dead and
2022 * about to be removed from the tree in the loop below
2023 */
088aea3b
DS
2024 if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
2025 delayed_nodes[i] = NULL;
baf320b9 2026 }
16cdcec7
MX
2027 spin_unlock(&root->inode_lock);
2028
088aea3b
DS
2029 for (i = 0; i < n; i++) {
2030 if (!delayed_nodes[i])
2031 continue;
16cdcec7
MX
2032 __btrfs_kill_delayed_node(delayed_nodes[i]);
2033 btrfs_release_delayed_node(delayed_nodes[i]);
2034 }
2035 }
2036}
67cde344 2037
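/*
 * Kill every delayed node currently queued on the fs_info's delayed root,
 * dropping all of their pending items and reservations.
 */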
ccdf9b30 2038void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
67cde344 2039{
67cde344
MX
2040 struct btrfs_delayed_node *curr_node, *prev_node;
2041
ccdf9b30 2042 curr_node = btrfs_first_delayed_node(fs_info->delayed_root);
67cde344
MX
2043 while (curr_node) {
2044 __btrfs_kill_delayed_node(curr_node);
2045
2046 prev_node = curr_node;
2047 curr_node = btrfs_next_delayed_node(curr_node);
2048 btrfs_release_delayed_node(prev_node);
2049 }
2050}
2051
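/*
 * Collect the current delayed insertion and deletion items of an inode into
 * the given lists so they can be logged, taking an extra reference on each
 * item. Items that are already on another task's log list are skipped.
 */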
30b80f3c
FM
2052void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
2053 struct list_head *ins_list,
2054 struct list_head *del_list)
2055{
2056 struct btrfs_delayed_node *node;
2057 struct btrfs_delayed_item *item;
2058
2059 node = btrfs_get_delayed_node(inode);
2060 if (!node)
2061 return;
2062
2063 mutex_lock(&node->mutex);
2064 item = __btrfs_first_delayed_insertion_item(node);
2065 while (item) {
2066 /*
2067 * It's possible that the item is already in a log list. This
2068 * can happen in case two tasks are trying to log the same
2069 * directory. For example if we have tasks A and task B:
2070 *
2071 * Task A collected the delayed items into a log list while
2072 * under the inode's log_mutex (at btrfs_log_inode()), but it
2073 * only releases the items after logging the inodes they point
2074 * to (if they are new inodes), which happens after unlocking
2075 * the log mutex;
2076 *
2077 * Task B enters btrfs_log_inode() and acquires the log_mutex
2078 * of the same directory inode, before task A releases the
2079 * delayed items. This can happen, for example, because logging some
2080 * inode may require logging its parent directory as well, so
2081 * logging two files that have the same parent directory can
2082 * lead to this.
2083 *
2084 * If this happens, just ignore delayed items already in a log
2085 * list. All the tasks logging the directory are under a log
2086 * transaction and whichever finishes first can not sync the log
2087 * before the other completes and leaves the log transaction.
2088 */
2089 if (!item->logged && list_empty(&item->log_list)) {
2090 refcount_inc(&item->refs);
2091 list_add_tail(&item->log_list, ins_list);
2092 }
2093 item = __btrfs_next_delayed_item(item);
2094 }
2095
2096 item = __btrfs_first_delayed_deletion_item(node);
2097 while (item) {
2098 /* It may be non-empty, for the same reason mentioned above. */
2099 if (!item->logged && list_empty(&item->log_list)) {
2100 refcount_inc(&item->refs);
2101 list_add_tail(&item->log_list, del_list);
2102 }
2103 item = __btrfs_next_delayed_item(item);
2104 }
2105 mutex_unlock(&node->mutex);
2106
2107 /*
2108 * We are called during inode logging, which means the inode is in use
2109 * and can not be evicted before we finish logging the inode. So we never
2110 * have the last reference on the delayed inode.
2111 * Also, we don't use btrfs_release_delayed_node() because that would
2112 * requeue the delayed inode (change its order in the list of prepared
2113 * nodes) and we don't want to make such a change because we don't create or
2114 * delete delayed items.
2115 */
2116 ASSERT(refcount_read(&node->refs) > 1);
2117 refcount_dec(&node->refs);
2118}
2119
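/*
 * Mark the previously collected delayed items as logged and drop the extra
 * references taken by btrfs_log_get_delayed_items().
 */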
2120void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
2121 struct list_head *ins_list,
2122 struct list_head *del_list)
2123{
2124 struct btrfs_delayed_node *node;
2125 struct btrfs_delayed_item *item;
2126 struct btrfs_delayed_item *next;
2127
2128 node = btrfs_get_delayed_node(inode);
2129 if (!node)
2130 return;
2131
2132 mutex_lock(&node->mutex);
2133
2134 list_for_each_entry_safe(item, next, ins_list, log_list) {
2135 item->logged = true;
2136 list_del_init(&item->log_list);
2137 if (refcount_dec_and_test(&item->refs))
2138 kfree(item);
2139 }
2140
2141 list_for_each_entry_safe(item, next, del_list, log_list) {
2142 item->logged = true;
2143 list_del_init(&item->log_list);
2144 if (refcount_dec_and_test(&item->refs))
2145 kfree(item);
2146 }
2147
2148 mutex_unlock(&node->mutex);
2149
2150 /*
2151 * We are called during inode logging, which means the inode is in use
2152 * and can not be evicted before we finish logging the inode. So we never
2153 * have the last reference on the delayed inode.
2154 * Also, we don't use btrfs_release_delayed_node() because that would
2155 * requeue the delayed inode (change its order in the list of prepared
2156 * nodes) and we don't want to make such a change because we don't create or
2157 * delete delayed items.
2158 */
2159 ASSERT(refcount_read(&node->refs) > 1);
2160 refcount_dec(&node->refs);
2161}