btrfs: move super prototypes into super.h
[linux-block.git] / fs / btrfs / tree-mod-log.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "messages.h"
4 #include "tree-mod-log.h"
5 #include "disk-io.h"
6 #include "fs.h"
7 #include "accessors.h"
8
9 struct tree_mod_root {
10         u64 logical;
11         u8 level;
12 };
13
14 struct tree_mod_elem {
15         struct rb_node node;
16         u64 logical;
17         u64 seq;
18         enum btrfs_mod_log_op op;
19
20         /*
21          * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
22          * operations.
23          */
24         int slot;
25
26         /* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
27         u64 generation;
28
29         /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
30         struct btrfs_disk_key key;
31         u64 blockptr;
32
33         /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
34         struct {
35                 int dst_slot;
36                 int nr_items;
37         } move;
38
39         /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
40         struct tree_mod_root old_root;
41 };
42
43 /*
44  * Pull a new tree mod seq number for our operation.
45  */
46 static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
47 {
48         return atomic64_inc_return(&fs_info->tree_mod_seq);
49 }
50
51 /*
52  * This adds a new blocker to the tree mod log's blocker list if the @elem
53  * passed does not already have a sequence number set. So when a caller expects
54  * to record tree modifications, it should ensure to set elem->seq to zero
55  * before calling btrfs_get_tree_mod_seq.
56  * Returns a fresh, unused tree log modification sequence number, even if no new
57  * blocker was added.
58  */
59 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
60                            struct btrfs_seq_list *elem)
61 {
62         write_lock(&fs_info->tree_mod_log_lock);
63         if (!elem->seq) {
64                 elem->seq = btrfs_inc_tree_mod_seq(fs_info);
65                 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
66                 set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
67         }
68         write_unlock(&fs_info->tree_mod_log_lock);
69
70         return elem->seq;
71 }
72
73 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
74                             struct btrfs_seq_list *elem)
75 {
76         struct rb_root *tm_root;
77         struct rb_node *node;
78         struct rb_node *next;
79         struct tree_mod_elem *tm;
80         u64 min_seq = BTRFS_SEQ_LAST;
81         u64 seq_putting = elem->seq;
82
83         if (!seq_putting)
84                 return;
85
86         write_lock(&fs_info->tree_mod_log_lock);
87         list_del(&elem->list);
88         elem->seq = 0;
89
90         if (list_empty(&fs_info->tree_mod_seq_list)) {
91                 clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
92         } else {
93                 struct btrfs_seq_list *first;
94
95                 first = list_first_entry(&fs_info->tree_mod_seq_list,
96                                          struct btrfs_seq_list, list);
97                 if (seq_putting > first->seq) {
98                         /*
99                          * Blocker with lower sequence number exists, we cannot
100                          * remove anything from the log.
101                          */
102                         write_unlock(&fs_info->tree_mod_log_lock);
103                         return;
104                 }
105                 min_seq = first->seq;
106         }
107
108         /*
109          * Anything that's lower than the lowest existing (read: blocked)
110          * sequence number can be removed from the tree.
111          */
112         tm_root = &fs_info->tree_mod_log;
113         for (node = rb_first(tm_root); node; node = next) {
114                 next = rb_next(node);
115                 tm = rb_entry(node, struct tree_mod_elem, node);
116                 if (tm->seq >= min_seq)
117                         continue;
118                 rb_erase(node, tm_root);
119                 kfree(tm);
120         }
121         write_unlock(&fs_info->tree_mod_log_lock);
122 }
123
124 /*
125  * Key order of the log:
126  *       node/leaf start address -> sequence
127  *
128  * The 'start address' is the logical address of the *new* root node for root
129  * replace operations, or the logical address of the affected block for all
130  * other operations.
131  */
132 static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
133                                         struct tree_mod_elem *tm)
134 {
135         struct rb_root *tm_root;
136         struct rb_node **new;
137         struct rb_node *parent = NULL;
138         struct tree_mod_elem *cur;
139
140         lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
141
142         tm->seq = btrfs_inc_tree_mod_seq(fs_info);
143
144         tm_root = &fs_info->tree_mod_log;
145         new = &tm_root->rb_node;
146         while (*new) {
147                 cur = rb_entry(*new, struct tree_mod_elem, node);
148                 parent = *new;
149                 if (cur->logical < tm->logical)
150                         new = &((*new)->rb_left);
151                 else if (cur->logical > tm->logical)
152                         new = &((*new)->rb_right);
153                 else if (cur->seq < tm->seq)
154                         new = &((*new)->rb_left);
155                 else if (cur->seq > tm->seq)
156                         new = &((*new)->rb_right);
157                 else
158                         return -EEXIST;
159         }
160
161         rb_link_node(&tm->node, parent, new);
162         rb_insert_color(&tm->node, tm_root);
163         return 0;
164 }
165
166 /*
167  * Determines if logging can be omitted. Returns true if it can. Otherwise, it
168  * returns false with the tree_mod_log_lock acquired. The caller must hold
169  * this until all tree mod log insertions are recorded in the rb tree and then
170  * write unlock fs_info::tree_mod_log_lock.
171  */
172 static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
173                                     struct extent_buffer *eb)
174 {
175         if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
176                 return true;
177         if (eb && btrfs_header_level(eb) == 0)
178                 return true;
179
180         write_lock(&fs_info->tree_mod_log_lock);
181         if (list_empty(&(fs_info)->tree_mod_seq_list)) {
182                 write_unlock(&fs_info->tree_mod_log_lock);
183                 return true;
184         }
185
186         return false;
187 }
188
189 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
190 static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
191                                     struct extent_buffer *eb)
192 {
193         if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
194                 return false;
195         if (eb && btrfs_header_level(eb) == 0)
196                 return false;
197
198         return true;
199 }
200
201 static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
202                                                  int slot,
203                                                  enum btrfs_mod_log_op op)
204 {
205         struct tree_mod_elem *tm;
206
207         tm = kzalloc(sizeof(*tm), GFP_NOFS);
208         if (!tm)
209                 return NULL;
210
211         tm->logical = eb->start;
212         if (op != BTRFS_MOD_LOG_KEY_ADD) {
213                 btrfs_node_key(eb, &tm->key, slot);
214                 tm->blockptr = btrfs_node_blockptr(eb, slot);
215         }
216         tm->op = op;
217         tm->slot = slot;
218         tm->generation = btrfs_node_ptr_generation(eb, slot);
219         RB_CLEAR_NODE(&tm->node);
220
221         return tm;
222 }
223
224 int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
225                                   enum btrfs_mod_log_op op)
226 {
227         struct tree_mod_elem *tm;
228         int ret;
229
230         if (!tree_mod_need_log(eb->fs_info, eb))
231                 return 0;
232
233         tm = alloc_tree_mod_elem(eb, slot, op);
234         if (!tm)
235                 return -ENOMEM;
236
237         if (tree_mod_dont_log(eb->fs_info, eb)) {
238                 kfree(tm);
239                 return 0;
240         }
241
242         ret = tree_mod_log_insert(eb->fs_info, tm);
243         write_unlock(&eb->fs_info->tree_mod_log_lock);
244         if (ret)
245                 kfree(tm);
246
247         return ret;
248 }
249
250 int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
251                                    int dst_slot, int src_slot,
252                                    int nr_items)
253 {
254         struct tree_mod_elem *tm = NULL;
255         struct tree_mod_elem **tm_list = NULL;
256         int ret = 0;
257         int i;
258         bool locked = false;
259
260         if (!tree_mod_need_log(eb->fs_info, eb))
261                 return 0;
262
263         tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
264         if (!tm_list)
265                 return -ENOMEM;
266
267         tm = kzalloc(sizeof(*tm), GFP_NOFS);
268         if (!tm) {
269                 ret = -ENOMEM;
270                 goto free_tms;
271         }
272
273         tm->logical = eb->start;
274         tm->slot = src_slot;
275         tm->move.dst_slot = dst_slot;
276         tm->move.nr_items = nr_items;
277         tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
278
279         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
280                 tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
281                                 BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
282                 if (!tm_list[i]) {
283                         ret = -ENOMEM;
284                         goto free_tms;
285                 }
286         }
287
288         if (tree_mod_dont_log(eb->fs_info, eb))
289                 goto free_tms;
290         locked = true;
291
292         /*
293          * When we override something during the move, we log these removals.
294          * This can only happen when we move towards the beginning of the
295          * buffer, i.e. dst_slot < src_slot.
296          */
297         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
298                 ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
299                 if (ret)
300                         goto free_tms;
301         }
302
303         ret = tree_mod_log_insert(eb->fs_info, tm);
304         if (ret)
305                 goto free_tms;
306         write_unlock(&eb->fs_info->tree_mod_log_lock);
307         kfree(tm_list);
308
309         return 0;
310
311 free_tms:
312         for (i = 0; i < nr_items; i++) {
313                 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
314                         rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
315                 kfree(tm_list[i]);
316         }
317         if (locked)
318                 write_unlock(&eb->fs_info->tree_mod_log_lock);
319         kfree(tm_list);
320         kfree(tm);
321
322         return ret;
323 }
324
325 static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
326                                        struct tree_mod_elem **tm_list,
327                                        int nritems)
328 {
329         int i, j;
330         int ret;
331
332         for (i = nritems - 1; i >= 0; i--) {
333                 ret = tree_mod_log_insert(fs_info, tm_list[i]);
334                 if (ret) {
335                         for (j = nritems - 1; j > i; j--)
336                                 rb_erase(&tm_list[j]->node,
337                                          &fs_info->tree_mod_log);
338                         return ret;
339                 }
340         }
341
342         return 0;
343 }
344
345 int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
346                                    struct extent_buffer *new_root,
347                                    bool log_removal)
348 {
349         struct btrfs_fs_info *fs_info = old_root->fs_info;
350         struct tree_mod_elem *tm = NULL;
351         struct tree_mod_elem **tm_list = NULL;
352         int nritems = 0;
353         int ret = 0;
354         int i;
355
356         if (!tree_mod_need_log(fs_info, NULL))
357                 return 0;
358
359         if (log_removal && btrfs_header_level(old_root) > 0) {
360                 nritems = btrfs_header_nritems(old_root);
361                 tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
362                                   GFP_NOFS);
363                 if (!tm_list) {
364                         ret = -ENOMEM;
365                         goto free_tms;
366                 }
367                 for (i = 0; i < nritems; i++) {
368                         tm_list[i] = alloc_tree_mod_elem(old_root, i,
369                             BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
370                         if (!tm_list[i]) {
371                                 ret = -ENOMEM;
372                                 goto free_tms;
373                         }
374                 }
375         }
376
377         tm = kzalloc(sizeof(*tm), GFP_NOFS);
378         if (!tm) {
379                 ret = -ENOMEM;
380                 goto free_tms;
381         }
382
383         tm->logical = new_root->start;
384         tm->old_root.logical = old_root->start;
385         tm->old_root.level = btrfs_header_level(old_root);
386         tm->generation = btrfs_header_generation(old_root);
387         tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;
388
389         if (tree_mod_dont_log(fs_info, NULL))
390                 goto free_tms;
391
392         if (tm_list)
393                 ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
394         if (!ret)
395                 ret = tree_mod_log_insert(fs_info, tm);
396
397         write_unlock(&fs_info->tree_mod_log_lock);
398         if (ret)
399                 goto free_tms;
400         kfree(tm_list);
401
402         return ret;
403
404 free_tms:
405         if (tm_list) {
406                 for (i = 0; i < nritems; i++)
407                         kfree(tm_list[i]);
408                 kfree(tm_list);
409         }
410         kfree(tm);
411
412         return ret;
413 }
414
415 static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
416                                                    u64 start, u64 min_seq,
417                                                    bool smallest)
418 {
419         struct rb_root *tm_root;
420         struct rb_node *node;
421         struct tree_mod_elem *cur = NULL;
422         struct tree_mod_elem *found = NULL;
423
424         read_lock(&fs_info->tree_mod_log_lock);
425         tm_root = &fs_info->tree_mod_log;
426         node = tm_root->rb_node;
427         while (node) {
428                 cur = rb_entry(node, struct tree_mod_elem, node);
429                 if (cur->logical < start) {
430                         node = node->rb_left;
431                 } else if (cur->logical > start) {
432                         node = node->rb_right;
433                 } else if (cur->seq < min_seq) {
434                         node = node->rb_left;
435                 } else if (!smallest) {
436                         /* We want the node with the highest seq */
437                         if (found)
438                                 BUG_ON(found->seq > cur->seq);
439                         found = cur;
440                         node = node->rb_left;
441                 } else if (cur->seq > min_seq) {
442                         /* We want the node with the smallest seq */
443                         if (found)
444                                 BUG_ON(found->seq < cur->seq);
445                         found = cur;
446                         node = node->rb_right;
447                 } else {
448                         found = cur;
449                         break;
450                 }
451         }
452         read_unlock(&fs_info->tree_mod_log_lock);
453
454         return found;
455 }
456
457 /*
458  * This returns the element from the log with the smallest time sequence
459  * value that's in the log (the oldest log item). Any element with a time
460  * sequence lower than min_seq will be ignored.
461  */
462 static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
463                                                         u64 start, u64 min_seq)
464 {
465         return __tree_mod_log_search(fs_info, start, min_seq, true);
466 }
467
468 /*
469  * This returns the element from the log with the largest time sequence
470  * value that's in the log (the most recent log item). Any element with
471  * a time sequence lower than min_seq will be ignored.
472  */
473 static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
474                                                  u64 start, u64 min_seq)
475 {
476         return __tree_mod_log_search(fs_info, start, min_seq, false);
477 }
478
479 int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
480                                struct extent_buffer *src,
481                                unsigned long dst_offset,
482                                unsigned long src_offset,
483                                int nr_items)
484 {
485         struct btrfs_fs_info *fs_info = dst->fs_info;
486         int ret = 0;
487         struct tree_mod_elem **tm_list = NULL;
488         struct tree_mod_elem **tm_list_add, **tm_list_rem;
489         int i;
490         bool locked = false;
491
492         if (!tree_mod_need_log(fs_info, NULL))
493                 return 0;
494
495         if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
496                 return 0;
497
498         tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
499                           GFP_NOFS);
500         if (!tm_list)
501                 return -ENOMEM;
502
503         tm_list_add = tm_list;
504         tm_list_rem = tm_list + nr_items;
505         for (i = 0; i < nr_items; i++) {
506                 tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
507                                                      BTRFS_MOD_LOG_KEY_REMOVE);
508                 if (!tm_list_rem[i]) {
509                         ret = -ENOMEM;
510                         goto free_tms;
511                 }
512
513                 tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
514                                                      BTRFS_MOD_LOG_KEY_ADD);
515                 if (!tm_list_add[i]) {
516                         ret = -ENOMEM;
517                         goto free_tms;
518                 }
519         }
520
521         if (tree_mod_dont_log(fs_info, NULL))
522                 goto free_tms;
523         locked = true;
524
525         for (i = 0; i < nr_items; i++) {
526                 ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
527                 if (ret)
528                         goto free_tms;
529                 ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
530                 if (ret)
531                         goto free_tms;
532         }
533
534         write_unlock(&fs_info->tree_mod_log_lock);
535         kfree(tm_list);
536
537         return 0;
538
539 free_tms:
540         for (i = 0; i < nr_items * 2; i++) {
541                 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
542                         rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
543                 kfree(tm_list[i]);
544         }
545         if (locked)
546                 write_unlock(&fs_info->tree_mod_log_lock);
547         kfree(tm_list);
548
549         return ret;
550 }
551
552 int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
553 {
554         struct tree_mod_elem **tm_list = NULL;
555         int nritems = 0;
556         int i;
557         int ret = 0;
558
559         if (!tree_mod_need_log(eb->fs_info, eb))
560                 return 0;
561
562         nritems = btrfs_header_nritems(eb);
563         tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
564         if (!tm_list)
565                 return -ENOMEM;
566
567         for (i = 0; i < nritems; i++) {
568                 tm_list[i] = alloc_tree_mod_elem(eb, i,
569                                     BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
570                 if (!tm_list[i]) {
571                         ret = -ENOMEM;
572                         goto free_tms;
573                 }
574         }
575
576         if (tree_mod_dont_log(eb->fs_info, eb))
577                 goto free_tms;
578
579         ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
580         write_unlock(&eb->fs_info->tree_mod_log_lock);
581         if (ret)
582                 goto free_tms;
583         kfree(tm_list);
584
585         return 0;
586
587 free_tms:
588         for (i = 0; i < nritems; i++)
589                 kfree(tm_list[i]);
590         kfree(tm_list);
591
592         return ret;
593 }
594
595 /*
596  * Returns the logical address of the oldest predecessor of the given root.
597  * Entries older than time_seq are ignored.
598  */
599 static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
600                                                       u64 time_seq)
601 {
602         struct tree_mod_elem *tm;
603         struct tree_mod_elem *found = NULL;
604         u64 root_logical = eb_root->start;
605         bool looped = false;
606
607         if (!time_seq)
608                 return NULL;
609
610         /*
611          * The very last operation that's logged for a root is the replacement
612          * operation (if it is replaced at all). This has the logical address
613          * of the *new* root, making it the very first operation that's logged
614          * for this root.
615          */
616         while (1) {
617                 tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
618                                                 time_seq);
619                 if (!looped && !tm)
620                         return NULL;
621                 /*
622                  * If there are no tree operation for the oldest root, we simply
623                  * return it. This should only happen if that (old) root is at
624                  * level 0.
625                  */
626                 if (!tm)
627                         break;
628
629                 /*
630                  * If there's an operation that's not a root replacement, we
631                  * found the oldest version of our root. Normally, we'll find a
632                  * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
633                  */
634                 if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
635                         break;
636
637                 found = tm;
638                 root_logical = tm->old_root.logical;
639                 looped = true;
640         }
641
642         /* If there's no old root to return, return what we found instead */
643         if (!found)
644                 found = tm;
645
646         return found;
647 }
648
649
650 /*
651  * tm is a pointer to the first operation to rewind within eb. Then, all
652  * previous operations will be rewound (until we reach something older than
653  * time_seq).
654  */
655 static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
656                                 struct extent_buffer *eb,
657                                 u64 time_seq,
658                                 struct tree_mod_elem *first_tm)
659 {
660         u32 n;
661         struct rb_node *next;
662         struct tree_mod_elem *tm = first_tm;
663         unsigned long o_dst;
664         unsigned long o_src;
665         unsigned long p_size = sizeof(struct btrfs_key_ptr);
666
667         n = btrfs_header_nritems(eb);
668         read_lock(&fs_info->tree_mod_log_lock);
669         while (tm && tm->seq >= time_seq) {
670                 /*
671                  * All the operations are recorded with the operator used for
672                  * the modification. As we're going backwards, we do the
673                  * opposite of each operation here.
674                  */
675                 switch (tm->op) {
676                 case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
677                         BUG_ON(tm->slot < n);
678                         fallthrough;
679                 case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
680                 case BTRFS_MOD_LOG_KEY_REMOVE:
681                         btrfs_set_node_key(eb, &tm->key, tm->slot);
682                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
683                         btrfs_set_node_ptr_generation(eb, tm->slot,
684                                                       tm->generation);
685                         n++;
686                         break;
687                 case BTRFS_MOD_LOG_KEY_REPLACE:
688                         BUG_ON(tm->slot >= n);
689                         btrfs_set_node_key(eb, &tm->key, tm->slot);
690                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
691                         btrfs_set_node_ptr_generation(eb, tm->slot,
692                                                       tm->generation);
693                         break;
694                 case BTRFS_MOD_LOG_KEY_ADD:
695                         /* if a move operation is needed it's in the log */
696                         n--;
697                         break;
698                 case BTRFS_MOD_LOG_MOVE_KEYS:
699                         o_dst = btrfs_node_key_ptr_offset(tm->slot);
700                         o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
701                         memmove_extent_buffer(eb, o_dst, o_src,
702                                               tm->move.nr_items * p_size);
703                         break;
704                 case BTRFS_MOD_LOG_ROOT_REPLACE:
705                         /*
706                          * This operation is special. For roots, this must be
707                          * handled explicitly before rewinding.
708                          * For non-roots, this operation may exist if the node
709                          * was a root: root A -> child B; then A gets empty and
710                          * B is promoted to the new root. In the mod log, we'll
711                          * have a root-replace operation for B, a tree block
712                          * that is no root. We simply ignore that operation.
713                          */
714                         break;
715                 }
716                 next = rb_next(&tm->node);
717                 if (!next)
718                         break;
719                 tm = rb_entry(next, struct tree_mod_elem, node);
720                 if (tm->logical != first_tm->logical)
721                         break;
722         }
723         read_unlock(&fs_info->tree_mod_log_lock);
724         btrfs_set_header_nritems(eb, n);
725 }
726
727 /*
728  * Called with eb read locked. If the buffer cannot be rewound, the same buffer
729  * is returned. If rewind operations happen, a fresh buffer is returned. The
730  * returned buffer is always read-locked. If the returned buffer is not the
731  * input buffer, the lock on the input buffer is released and the input buffer
732  * is freed (its refcount is decremented).
733  */
734 struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
735                                                 struct btrfs_path *path,
736                                                 struct extent_buffer *eb,
737                                                 u64 time_seq)
738 {
739         struct extent_buffer *eb_rewin;
740         struct tree_mod_elem *tm;
741
742         if (!time_seq)
743                 return eb;
744
745         if (btrfs_header_level(eb) == 0)
746                 return eb;
747
748         tm = tree_mod_log_search(fs_info, eb->start, time_seq);
749         if (!tm)
750                 return eb;
751
752         if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
753                 BUG_ON(tm->slot != 0);
754                 eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
755                 if (!eb_rewin) {
756                         btrfs_tree_read_unlock(eb);
757                         free_extent_buffer(eb);
758                         return NULL;
759                 }
760                 btrfs_set_header_bytenr(eb_rewin, eb->start);
761                 btrfs_set_header_backref_rev(eb_rewin,
762                                              btrfs_header_backref_rev(eb));
763                 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
764                 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
765         } else {
766                 eb_rewin = btrfs_clone_extent_buffer(eb);
767                 if (!eb_rewin) {
768                         btrfs_tree_read_unlock(eb);
769                         free_extent_buffer(eb);
770                         return NULL;
771                 }
772         }
773
774         btrfs_tree_read_unlock(eb);
775         free_extent_buffer(eb);
776
777         btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
778                                        eb_rewin, btrfs_header_level(eb_rewin));
779         btrfs_tree_read_lock(eb_rewin);
780         tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
781         WARN_ON(btrfs_header_nritems(eb_rewin) >
782                 BTRFS_NODEPTRS_PER_BLOCK(fs_info));
783
784         return eb_rewin;
785 }
786
787 /*
788  * Rewind the state of @root's root node to the given @time_seq value.
789  * If there are no changes, the current root->root_node is returned. If anything
790  * changed in between, there's a fresh buffer allocated on which the rewind
791  * operations are done. In any case, the returned buffer is read locked.
792  * Returns NULL on error (with no locks held).
793  */
794 struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
795 {
796         struct btrfs_fs_info *fs_info = root->fs_info;
797         struct tree_mod_elem *tm;
798         struct extent_buffer *eb = NULL;
799         struct extent_buffer *eb_root;
800         u64 eb_root_owner = 0;
801         struct extent_buffer *old;
802         struct tree_mod_root *old_root = NULL;
803         u64 old_generation = 0;
804         u64 logical;
805         int level;
806
807         eb_root = btrfs_read_lock_root_node(root);
808         tm = tree_mod_log_oldest_root(eb_root, time_seq);
809         if (!tm)
810                 return eb_root;
811
812         if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
813                 old_root = &tm->old_root;
814                 old_generation = tm->generation;
815                 logical = old_root->logical;
816                 level = old_root->level;
817         } else {
818                 logical = eb_root->start;
819                 level = btrfs_header_level(eb_root);
820         }
821
822         tm = tree_mod_log_search(fs_info, logical, time_seq);
823         if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
824                 btrfs_tree_read_unlock(eb_root);
825                 free_extent_buffer(eb_root);
826                 old = read_tree_block(fs_info, logical, root->root_key.objectid,
827                                       0, level, NULL);
828                 if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
829                         if (!IS_ERR(old))
830                                 free_extent_buffer(old);
831                         btrfs_warn(fs_info,
832                                    "failed to read tree block %llu from get_old_root",
833                                    logical);
834                 } else {
835                         struct tree_mod_elem *tm2;
836
837                         btrfs_tree_read_lock(old);
838                         eb = btrfs_clone_extent_buffer(old);
839                         /*
840                          * After the lookup for the most recent tree mod operation
841                          * above and before we locked and cloned the extent buffer
842                          * 'old', a new tree mod log operation may have been added.
843                          * So lookup for a more recent one to make sure the number
844                          * of mod log operations we replay is consistent with the
845                          * number of items we have in the cloned extent buffer,
846                          * otherwise we can hit a BUG_ON when rewinding the extent
847                          * buffer.
848                          */
849                         tm2 = tree_mod_log_search(fs_info, logical, time_seq);
850                         btrfs_tree_read_unlock(old);
851                         free_extent_buffer(old);
852                         ASSERT(tm2);
853                         ASSERT(tm2 == tm || tm2->seq > tm->seq);
854                         if (!tm2 || tm2->seq < tm->seq) {
855                                 free_extent_buffer(eb);
856                                 return NULL;
857                         }
858                         tm = tm2;
859                 }
860         } else if (old_root) {
861                 eb_root_owner = btrfs_header_owner(eb_root);
862                 btrfs_tree_read_unlock(eb_root);
863                 free_extent_buffer(eb_root);
864                 eb = alloc_dummy_extent_buffer(fs_info, logical);
865         } else {
866                 eb = btrfs_clone_extent_buffer(eb_root);
867                 btrfs_tree_read_unlock(eb_root);
868                 free_extent_buffer(eb_root);
869         }
870
871         if (!eb)
872                 return NULL;
873         if (old_root) {
874                 btrfs_set_header_bytenr(eb, eb->start);
875                 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
876                 btrfs_set_header_owner(eb, eb_root_owner);
877                 btrfs_set_header_level(eb, old_root->level);
878                 btrfs_set_header_generation(eb, old_generation);
879         }
880         btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
881                                        btrfs_header_level(eb));
882         btrfs_tree_read_lock(eb);
883         if (tm)
884                 tree_mod_log_rewind(fs_info, eb, time_seq, tm);
885         else
886                 WARN_ON(btrfs_header_level(eb) != 0);
887         WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
888
889         return eb;
890 }
891
892 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
893 {
894         struct tree_mod_elem *tm;
895         int level;
896         struct extent_buffer *eb_root = btrfs_root_node(root);
897
898         tm = tree_mod_log_oldest_root(eb_root, time_seq);
899         if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
900                 level = tm->old_root.level;
901         else
902                 level = btrfs_header_level(eb_root);
903
904         free_extent_buffer(eb_root);
905
906         return level;
907 }
908
909 /*
910  * Return the lowest sequence number in the tree modification log.
911  *
912  * Return the sequence number of the oldest tree modification log user, which
913  * corresponds to the lowest sequence number of all existing users. If there are
914  * no users it returns 0.
915  */
916 u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
917 {
918         u64 ret = 0;
919
920         read_lock(&fs_info->tree_mod_log_lock);
921         if (!list_empty(&fs_info->tree_mod_seq_list)) {
922                 struct btrfs_seq_list *elem;
923
924                 elem = list_first_entry(&fs_info->tree_mod_seq_list,
925                                         struct btrfs_seq_list, list);
926                 ret = elem->seq;
927         }
928         read_unlock(&fs_info->tree_mod_log_lock);
929
930         return ret;
931 }