ext4: prevent used blocks from being allocated during fast commit replay
// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV records some delta of the file system state;
 * these deltas are replayed TLV by TLV during the recovery phase. For the
 * scenarios for which we currently don't have replay code, fast commit falls
 * back to full commits.
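 *
 * For reference, each TLV starts with a small header, a 16-bit tag followed
 * by a 16-bit length, both little-endian; the value bytes follow. A sketch
 * of the header (see struct ext4_fc_tl):
 *
 *	struct ext4_fc_tl {
 *		__le16 fc_tag;
 *		__le16 fc_len;
 *	};
 *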
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
 * - EXT4_FC_TAG_LINK           - records directory entry link
 * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE          - records the inode that should be replayed
 *                                during recovery. Note that the iblocks field
 *                                is not replayed and is instead derived during
 *                                replay.
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes that
 * need to be committed during a fast commit in another in-memory queue.
 * During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All inode updates must be wrapped by ext4_fc_start_update() before starting
 * an update and marked done with ext4_fc_stop_update(). If such an ongoing
 * update is present, the fast commit waits for it to complete.
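 *
 * A typical update path thus looks roughly like this (a sketch, not a
 * verbatim call sequence from any specific VFS hook):
 *
 *	ext4_fc_start_update(inode);
 *	(... perform the inode update ...)
 *	ext4_fc_stop_update(inode);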
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag, which marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as the procedure "rename a to b", we store the resulting file
 * system state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is, then the replay is not idempotent. Let's say
 * that while in replay, we crash at (z). During the second replay, file A
 * (which was actually created as a result of the "mv B A" operation) would get
 * deleted. Thus, the file named A would be absent when we try to read A. So,
 * this sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure, fast commits store the outcome of each
 * procedure. Thus the fast commit log for the above procedure would be as
 * follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 * (w)          (x)                    (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the second
 * replay, we will remove file A (inode 11). But we will then create it again
 * and make it point to inode 11. We won't find B, so we'll just skip that step.
 * At this point, the refcount for inode 11 is not reliable, but that gets fixed
 * by the replay of the last inode 11 tag. Crashes at points (w), (x) and (y)
 * are handled similarly. Thus, by converting a non-idempotent procedure into a
 * series of idempotent outcomes, fast commits ensure idempotence during replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay and
 *    then attempt recovery again, we would find a file system where the fast
 *    commit area is invalid (because a new full commit would be found). In
 *    order to deal with that, fast commit replay code should ensure that the
 *    "FC_REPLAY" superblock state is persisted before starting the replay, so
 *    that after the crash, fast commit recovery code can look at that flag and
 *    perform fast commit recovery even if that area is invalidated by later
 *    full commits.
 *
 * 1) Fast commit's commit path locks the entire file system during fast
 *    commit. This has a significant performance penalty. Instead of that, we
 *    should use the ext4_fc_start/stop_update functions to start inode level
 *    updates from ext4_journal_start/stop. Once we do that, we can drop file
 *    system locking during the commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by high level VFS callbacks before performing any
 * inode update. This function blocks if there's an ongoing fast commit on the
 * inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark the file system as fast commit ineligible. This means that the next
 * commit operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
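 *
 * A __fc_track_fn implementation has this shape (__track_inode() below is
 * the simplest example; "__track_foo" here is a made-up name):
 *
 *	static int __track_foo(struct inode *inode, void *args, bool update);
 *
 * It returns 0 on success or a negative error, which ext4_fc_track_template()
 * propagates to its caller.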
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			  struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

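	/*
	 * When updating an existing range, grow it to cover both the old
	 * and the new range: e.g. tracking blocks [2, 5] and then [4, 9]
	 * yields i_fc_lblk_start = 2, i_fc_lblk_len = 8 (blocks 2..9).
	 */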
	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* Add REQ_FUA | REQ_PREFLUSH only if this is the tail block */
	if (test_opt(sb, BARRIER) && is_tail)
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is allocated from jbd2 and the CRC is updated to reflect
 * the padding we added.
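 *
 * A fast commit block that had to be padded thus ends up looking roughly
 * like this (a sketch, sizes not to scale):
 *
 *	[TLV][TLV]...[TLV][PAD tag][zeroed remainder]
 *
 * and the requested bytes are then handed out from the start of a fresh
 * block.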
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb, false);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag's
 * length spans the remaining space on the block.
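 *
 * A rough sketch of the last block of a fast commit (the tail's fc_len
 * accounts for the tail body plus the unused remainder of the block):
 *
 *	[TLV]...[TLV][TAIL tag][fc_tid][fc_crc][-- unused till block end --]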
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb, true);

	return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			   u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}

/*
 * Same as above, but adds a dentry tlv: the value is the parent inode number
 * and the inode number, followed by the dentry name.
 */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
				   struct ext4_fc_dentry_update *fc_dentry)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	int dlen = fc_dentry->fcd_name.len;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);

	return true;
}

/*
 * Writes the inode in the fast commit space as an EXT4_FC_TAG_INODE TLV.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		inode_len = EXT4_INODE_SIZE(inode->i_sb);
	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}


/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei, *ei_n;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
					 i_fc_list) {
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find the inode in our list, then it was deleted,
		 * in which case we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If file system device is different from journal device, issue a cache
	 * flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

static void ext4_fc_update_stats(struct super_block *sb, int status,
				 u64 commit_time, int nblks)
{
	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;

	jbd_debug(1, "Fast commit ended with status = %d", status);
	if (status == EXT4_FC_STATUS_OK) {
		stats->fc_num_commits++;
		stats->fc_numblks += nblks;
		if (likely(stats->s_fc_avg_commit_time))
			stats->s_fc_avg_commit_time =
				(commit_time +
				 stats->s_fc_avg_commit_time * 3) / 4;
		else
			stats->s_fc_avg_commit_time = commit_time;
	} else if (status == EXT4_FC_STATUS_FAILED ||
		   status == EXT4_FC_STATUS_INELIGIBLE) {
		if (status == EXT4_FC_STATUS_FAILED)
			stats->fc_failed_commits++;
		stats->fc_ineligible_commits++;
	} else {
		stats->fc_skipped_commits++;
	}
	trace_ext4_fc_commit_stop(sb, nblks, status);
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return jbd2_complete_transaction(journal, commit_tid);

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0);
		return 0;
	} else if (ret) {
		/*
		 * Commit couldn't start. Just update stats and perform a
		 * full commit.
		 */
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0);
		return jbd2_complete_transaction(journal, commit_tid);
	}

	/*
	 * After establishing journal barrier via jbd2_fc_begin_commit(), check
	 * if we are fast commit ineligible.
	 */
	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
		status = EXT4_FC_STATUS_INELIGIBLE;
		goto fallback;
	}

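	/*
	 * Fast commit blocks consumed so far, rounded up; the difference
	 * after ext4_fc_perform_commit() below is the number of blocks this
	 * fast commit wrote, which we then wait on.
	 */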
	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	atomic_inc(&sbi->s_fc_subtid);
	ret = jbd2_fc_end_commit(journal);
	/*
	 * weight the commit time higher than the average time so we
	 * don't react too strongly to vast changes in the commit time
	 */
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	ext4_fc_update_stats(sb, status, commit_time, nblks);
	return ret;

fallback:
	ret = jbd2_fc_end_commit_fallback(journal);
	ext4_fc_update_stats(sb, status, 0, 0);
	return ret;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter, *iter_n;
	struct ext4_fc_dentry_update *fc_dentry;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
				 i_fc_list) {
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

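/*
 * Decode a dentry TLV into dentry_info_args. Note that darg->dname points
 * into the TLV value buffer and is not NUL-terminated; dname_len carries
 * its length.
 */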
static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_dentry_info fcd;

	memcpy(&fcd, val, sizeof(fcd));

	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
	darg->ino = le32_to_cpu(fcd.fc_ino);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
	darg->dname_len = le16_to_cpu(tl->fc_len) -
		sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
				EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is ok because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
			       u8 *val)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_modified_inodes = krealloc(
					state->fc_modified_inodes, sizeof(int) *
					state->fc_modified_inodes_size,
					GFP_KERNEL);
		if (!state->fc_modified_inodes)
			return -ENOMEM;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
				u8 *val)
{
	struct ext4_fc_inode fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	memcpy(&fc_inode, val, sizeof(fc_inode));

	ino = le32_to_cpu(fc_inode.fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ext4_fc_record_modified_inode(sb, ino);

	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
		inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

1481         /*
1482          * Our allocator could have made different decisions than before
1483          * crashing. This should be fixed but until then, we calculate
1484          * the number of blocks the inode.
1485          */
1486         if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1487                 ext4_ext_replay_set_iblocks(inode);
1488
1489         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1490         ext4_reset_inode_seed(inode);
1491
1492         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1493         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1494         sync_dirty_buffer(iloc.bh);
1495         brelse(iloc.bh);
1496 out:
1497         iput(inode);
1498         if (!ret)
1499                 blkdev_issue_flush(sb->s_bdev);
1500
1501         return 0;
1502 }
1503
/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL, which means the
 * inode for which we are trying to create a dentry here should already
 * have been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are set up properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.parent_ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

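/*
 * Each excluded region recorded below carries (ino, lblk, pblk, len), i.e.
 * a physical block range [pblk, pblk + len) together with the inode and
 * logical offset it was seen at. ext4_fc_replay_check_excluded() skips
 * entries whose ino or len is zero and otherwise only compares against the
 * physical range.
 */
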
/*
 * Record physical disk regions which are in use as per the fast commit area,
 * and used by inodes during the replay phase. Our simple replay phase
 * allocator excludes these regions from allocation.
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	/*
	 * During the replay phase, fc_regions_valid may not be the same as
	 * fc_regions_used; bring them back in sync before recording new
	 * additions.
	 */
	if (replay && state->fc_regions_used != state->fc_regions_valid)
		state->fc_regions_used = state->fc_regions_valid;
	if (state->fc_regions_used == state->fc_regions_size) {
		struct ext4_fc_alloc_region *fc_regions;

		/*
		 * As above, go through a temporary pointer so that a failed
		 * krealloc() doesn't leak the old regions array.
		 */
		fc_regions = krealloc(state->fc_regions,
				sizeof(struct ext4_fc_alloc_region) *
				(state->fc_regions_size +
				 EXT4_FC_REPLAY_REALLOC_INCREMENT),
				GFP_KERNEL);
		if (!fc_regions)
			return -ENOMEM;
		state->fc_regions = fc_regions;
		state->fc_regions_size +=
				EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	if (replay)
		state->fc_regions_valid++;

	return 0;
}

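/*
 * Replaying an ADD_RANGE tag (below) walks the recorded extent in chunks
 * returned by ext4_map_blocks() and handles three cases per chunk:
 *
 *  1. Not mapped: insert a fresh extent pointing at the recorded physical
 *     blocks.
 *  2. Mapped, but to different physical blocks: rewrite the mapping to the
 *     recorded blocks and release the stale ones in the block bitmap.
 *  3. Mapped to the same physical blocks: only the written/unwritten state
 *     may differ, so toggle it in place.
 */
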
/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				    struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_add_range fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0)
			goto out;

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path))
				goto out;
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret)
				goto out;
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret)
				goto out;
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				!!(map.m_flags & EXT4_MAP_UNWRITTEN),
				ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret)
			goto out;
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
out:
	iput(inode);
	return 0;
}

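/*
 * Replaying a DEL_RANGE tag (below) is a two step operation: first walk the
 * mapped chunks of the range and release them in the block bitmap view,
 * then punch the whole logical range out of the extent tree with
 * ext4_ext_remove_space().
 */
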
/* Replay DEL_RANGE tag */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
			 u8 *val)
{
	struct inode *inode;
	struct ext4_fc_del_range lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	memcpy(&lrange, val, sizeof(lrange));
	cur = le32_to_cpu(lrange.fc_lblk);
	remaining = le32_to_cpu(lrange.fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange.fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
			le32_to_cpu(lrange.fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	down_write(&EXT4_I(inode)->i_data_sem);
	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
				le32_to_cpu(lrange.fc_lblk) +
				le32_to_cpu(lrange.fc_len) - 1);
	up_write(&EXT4_I(inode)->i_data_sem);
	if (ret)
		goto out;
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);

	return 0;
}

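/*
 * The final fixup pass below re-derives block allocation state: for every
 * inode recorded as modified during replay, each mapped chunk is marked
 * in-use in the bitmaps, and so is every extent tree index block on the
 * path to it (path[j].p_block), since those metadata blocks are in use by
 * the inode as well.
 */
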
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
			iput(inode);
			continue;
		}
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
							map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function to
 * see if it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
			state->fc_regions[i].len == 0)
			continue;
		if (blk >= state->fc_regions[i].pblk &&
		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

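/*
 * Both the scan and replay loops below walk the fast commit area with the
 * same TLV cursor idiom: read a tag/length header, point "val" at the
 * value, then advance by the header size plus the value length. A minimal
 * sketch of the walk:
 *
 *	for (cur = start; cur < end;
 *	     cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
 *		memcpy(&tl, cur, sizeof(tl));
 *		val = cur + sizeof(tl);
 *		... dispatch on le16_to_cpu(tl.fc_tag) ...
 *	}
 */
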
/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify the CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that scan has finished and JBD2 can now start the replay phase.
 * It returns a negative error to indicate that there was an error. At the end
 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
 * to indicate the number of tags that need to be replayed during the replay
 * phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex), 0);
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			memcpy(&tail, val, sizeof(tail));
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
						sizeof(tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			if (le32_to_cpu(head.fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					    sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}

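/*
 * Note on pass ordering: JBD2 drives recovery in passes. Every fast commit
 * block is first fed to the callback below with PASS_SCAN, which is handled
 * entirely by ext4_fc_replay_scan() above; only once scanning has validated
 * the area does JBD2 come back with a later pass, at which point the tags
 * are dispatched to the per-tag replay handlers.
 */
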
/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as described above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);

		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl.fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     le16_to_cpu(tl.fc_len), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					     le16_to_cpu(tl.fc_len), 0);
			memcpy(&tail, val, sizeof(tail));
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
					     le16_to_cpu(tl.fc_len), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled because
	 * we could still have fast commit blocks that need to be replayed
	 * even if fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

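/*
 * The strings below are indexed by the EXT4_FC_REASON_* values when
 * ext4_fc_info_show() prints the ineligibility counters, so their order
 * must match the order of that enum.
 */
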
static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(stats->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}

void ext4_fc_destroy_dentry_cache(void)
{
	kmem_cache_destroy(ext4_fc_dentry_cachep);
}