ext4: use s_mount_flags instead of s_mount_state for fast commit state
[linux-2.6-block.git] / fs / ext4 / fast_commit.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For the scenarios for which we currently
24  * don't have replay code, fast commit falls back to full commits.
25  * Fast commits record delta in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
30  * - EXT4_FC_TAG_LINK           - records directory entry link
31  * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE          - record the inode that should be replayed
41  *                                during recovery. Note that iblocks field is
42  *                                not replayed and instead derived during
43  *                                replay.
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes
49  * that need to be committed during a fast commit in another in memory queue of
50  * inodes. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All the inode updates must call ext4_fc_start_update() before starting an
62  * update. If such an ongoing update is present, fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  * Not all operations are supported by fast commits today (e.g. extended
69  * attributes). Fast commit ineligibility is marked by calling one of the
70  * two following functions:
71  *
72  * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73  *   back to full commit. This is useful in case of transient errors.
74  *
75  * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76  *   the fast commits happening between ext4_fc_start_ineligible() and
77  *   ext4_fc_stop_ineligible() and one fast commit after the call to
78  *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79  *   make one more fast commit to fall back to full commit after stop call so
80  *   that it is guaranteed that the fast commit ineligible operation contained
81  *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82  *   followed by at least 1 full commit.
83  *
84  * Atomicity of commits
85  * --------------------
86  * In order to guarantee atomicity during the commit operation, fast commit
87  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88  * tag contains CRC of the contents and TID of the transaction after which
89  * this fast commit should be applied. Recovery code replays fast commit
90  * logs only if there's at least 1 valid tail present. For every fast commit
91  * operation, there is 1 tail. This means, we may end up with multiple tails
92  * in the fast commit space. Here's an example:
93  *
94  * - Create a new file A and remove existing file B
95  * - fsync()
96  * - Append contents to file A
97  * - Truncate file A
98  * - fsync()
99  *
100  * The fast commit space at the end of above operations would look like this:
101  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
103  *
104  * Replay code should thus check for all the valid tails in the FC area.
105  *
106  * TODOs
107  * -----
108  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
109  *    eligible update must be protected within ext4_fc_start_update() and
110  *    ext4_fc_stop_update(). These routines are called from much higher
111  *    level routines. This can be made more fine grained by combining with
112  *    ext4_journal_start().
113  *
114  * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
115  *
116  * 3) Handle more ineligible cases.
117  */
118
119 #include <trace/events/ext4.h>
120 static struct kmem_cache *ext4_fc_dentry_cachep;
121
122 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
123 {
124         BUFFER_TRACE(bh, "");
125         if (uptodate) {
126                 ext4_debug("%s: Block %lld up-to-date",
127                            __func__, bh->b_blocknr);
128                 set_buffer_uptodate(bh);
129         } else {
130                 ext4_debug("%s: Block %lld not up-to-date",
131                            __func__, bh->b_blocknr);
132                 clear_buffer_uptodate(bh);
133         }
134
135         unlock_buffer(bh);
136 }
137
138 static inline void ext4_fc_reset_inode(struct inode *inode)
139 {
140         struct ext4_inode_info *ei = EXT4_I(inode);
141
142         ei->i_fc_lblk_start = 0;
143         ei->i_fc_lblk_len = 0;
144 }
145
146 void ext4_fc_init_inode(struct inode *inode)
147 {
148         struct ext4_inode_info *ei = EXT4_I(inode);
149
150         ext4_fc_reset_inode(inode);
151         ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
152         INIT_LIST_HEAD(&ei->i_fc_list);
153         init_waitqueue_head(&ei->i_fc_wait);
154         atomic_set(&ei->i_fc_updates, 0);
155         ei->i_fc_committed_subtid = 0;
156 }
157
158 /*
159  * Inform Ext4's fast about start of an inode update
160  *
161  * This function is called by the high level call VFS callbacks before
162  * performing any inode update. This function blocks if there's an ongoing
163  * fast commit on the inode in question.
164  */
165 void ext4_fc_start_update(struct inode *inode)
166 {
167         struct ext4_inode_info *ei = EXT4_I(inode);
168
169         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
170             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
171                 return;
172
173 restart:
174         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
175         if (list_empty(&ei->i_fc_list))
176                 goto out;
177
178         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
179                 wait_queue_head_t *wq;
180 #if (BITS_PER_LONG < 64)
181                 DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
182                                 EXT4_STATE_FC_COMMITTING);
183                 wq = bit_waitqueue(&ei->i_state_flags,
184                                    EXT4_STATE_FC_COMMITTING);
185 #else
186                 DEFINE_WAIT_BIT(wait, &ei->i_flags,
187                                 EXT4_STATE_FC_COMMITTING);
188                 wq = bit_waitqueue(&ei->i_flags,
189                                    EXT4_STATE_FC_COMMITTING);
190 #endif
191                 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
192                 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
193                 schedule();
194                 finish_wait(wq, &wait.wq_entry);
195                 goto restart;
196         }
197 out:
198         atomic_inc(&ei->i_fc_updates);
199         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
200 }
201
202 /*
203  * Stop inode update and wake up waiting fast commits if any.
204  */
205 void ext4_fc_stop_update(struct inode *inode)
206 {
207         struct ext4_inode_info *ei = EXT4_I(inode);
208
209         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
210             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
211                 return;
212
213         if (atomic_dec_and_test(&ei->i_fc_updates))
214                 wake_up_all(&ei->i_fc_wait);
215 }
216
217 /*
218  * Remove inode from fast commit list. If the inode is being committed
219  * we wait until inode commit is done.
220  */
221 void ext4_fc_del(struct inode *inode)
222 {
223         struct ext4_inode_info *ei = EXT4_I(inode);
224
225         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
226             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
227                 return;
228
229 restart:
230         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
231         if (list_empty(&ei->i_fc_list)) {
232                 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
233                 return;
234         }
235
236         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
237                 wait_queue_head_t *wq;
238 #if (BITS_PER_LONG < 64)
239                 DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
240                                 EXT4_STATE_FC_COMMITTING);
241                 wq = bit_waitqueue(&ei->i_state_flags,
242                                    EXT4_STATE_FC_COMMITTING);
243 #else
244                 DEFINE_WAIT_BIT(wait, &ei->i_flags,
245                                 EXT4_STATE_FC_COMMITTING);
246                 wq = bit_waitqueue(&ei->i_flags,
247                                    EXT4_STATE_FC_COMMITTING);
248 #endif
249                 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
250                 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
251                 schedule();
252                 finish_wait(wq, &wait.wq_entry);
253                 goto restart;
254         }
255         if (!list_empty(&ei->i_fc_list))
256                 list_del_init(&ei->i_fc_list);
257         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
258 }
259
260 /*
261  * Mark file system as fast commit ineligible. This means that next commit
262  * operation would result in a full jbd2 commit.
263  */
264 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
265 {
266         struct ext4_sb_info *sbi = EXT4_SB(sb);
267
268         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
269             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
270                 return;
271
272         sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
273         WARN_ON(reason >= EXT4_FC_REASON_MAX);
274         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
275 }
276
277 /*
278  * Start a fast commit ineligible update. Any commits that happen while
279  * such an operation is in progress fall back to full commits.
280  */
281 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
282 {
283         struct ext4_sb_info *sbi = EXT4_SB(sb);
284
285         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
286             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
287                 return;
288
289         WARN_ON(reason >= EXT4_FC_REASON_MAX);
290         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
291         atomic_inc(&sbi->s_fc_ineligible_updates);
292 }
293
294 /*
295  * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
296  * to ensure that after stopping the ineligible update, at least one full
297  * commit takes place.
298  */
299 void ext4_fc_stop_ineligible(struct super_block *sb)
300 {
301         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
302             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
303                 return;
304
305         EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
306         atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
307 }
308
309 static inline int ext4_fc_is_ineligible(struct super_block *sb)
310 {
311         return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) ||
312                 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
313 }
314
315 /*
316  * Generic fast commit tracking function. If this is the first time this we are
317  * called after a full commit, we initialize fast commit fields and then call
318  * __fc_track_fn() with update = 0. If we have already been called after a full
319  * commit, we pass update = 1. Based on that, the track function can determine
320  * if it needs to track a field for the first time or if it needs to just
321  * update the previously tracked value.
322  *
323  * If enqueue is set, this function enqueues the inode in fast commit list.
324  */
325 static int ext4_fc_track_template(
326         struct inode *inode, int (*__fc_track_fn)(struct inode *, void *, bool),
327         void *args, int enqueue)
328 {
329         tid_t running_txn_tid;
330         bool update = false;
331         struct ext4_inode_info *ei = EXT4_I(inode);
332         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
333         int ret;
334
335         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
336             (sbi->s_mount_state & EXT4_FC_REPLAY))
337                 return -EOPNOTSUPP;
338
339         if (ext4_fc_is_ineligible(inode->i_sb))
340                 return -EINVAL;
341
342         running_txn_tid = sbi->s_journal ?
343                 sbi->s_journal->j_commit_sequence + 1 : 0;
344
345         mutex_lock(&ei->i_fc_lock);
346         if (running_txn_tid == ei->i_sync_tid) {
347                 update = true;
348         } else {
349                 ext4_fc_reset_inode(inode);
350                 ei->i_sync_tid = running_txn_tid;
351         }
352         ret = __fc_track_fn(inode, args, update);
353         mutex_unlock(&ei->i_fc_lock);
354
355         if (!enqueue)
356                 return ret;
357
358         spin_lock(&sbi->s_fc_lock);
359         if (list_empty(&EXT4_I(inode)->i_fc_list))
360                 list_add_tail(&EXT4_I(inode)->i_fc_list,
361                                 (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ?
362                                 &sbi->s_fc_q[FC_Q_STAGING] :
363                                 &sbi->s_fc_q[FC_Q_MAIN]);
364         spin_unlock(&sbi->s_fc_lock);
365
366         return ret;
367 }
368
369 struct __track_dentry_update_args {
370         struct dentry *dentry;
371         int op;
372 };
373
374 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
375 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
376 {
377         struct ext4_fc_dentry_update *node;
378         struct ext4_inode_info *ei = EXT4_I(inode);
379         struct __track_dentry_update_args *dentry_update =
380                 (struct __track_dentry_update_args *)arg;
381         struct dentry *dentry = dentry_update->dentry;
382         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
383
384         mutex_unlock(&ei->i_fc_lock);
385         node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
386         if (!node) {
387                 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MEM);
388                 mutex_lock(&ei->i_fc_lock);
389                 return -ENOMEM;
390         }
391
392         node->fcd_op = dentry_update->op;
393         node->fcd_parent = dentry->d_parent->d_inode->i_ino;
394         node->fcd_ino = inode->i_ino;
395         if (dentry->d_name.len > DNAME_INLINE_LEN) {
396                 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
397                 if (!node->fcd_name.name) {
398                         kmem_cache_free(ext4_fc_dentry_cachep, node);
399                         ext4_fc_mark_ineligible(inode->i_sb,
400                                 EXT4_FC_REASON_MEM);
401                         mutex_lock(&ei->i_fc_lock);
402                         return -ENOMEM;
403                 }
404                 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
405                         dentry->d_name.len);
406         } else {
407                 memcpy(node->fcd_iname, dentry->d_name.name,
408                         dentry->d_name.len);
409                 node->fcd_name.name = node->fcd_iname;
410         }
411         node->fcd_name.len = dentry->d_name.len;
412
413         spin_lock(&sbi->s_fc_lock);
414         if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING)
415                 list_add_tail(&node->fcd_list,
416                                 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
417         else
418                 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
419         spin_unlock(&sbi->s_fc_lock);
420         mutex_lock(&ei->i_fc_lock);
421
422         return 0;
423 }
424
425 void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry)
426 {
427         struct __track_dentry_update_args args;
428         int ret;
429
430         args.dentry = dentry;
431         args.op = EXT4_FC_TAG_UNLINK;
432
433         ret = ext4_fc_track_template(inode, __track_dentry_update,
434                                         (void *)&args, 0);
435         trace_ext4_fc_track_unlink(inode, dentry, ret);
436 }
437
438 void ext4_fc_track_link(struct inode *inode, struct dentry *dentry)
439 {
440         struct __track_dentry_update_args args;
441         int ret;
442
443         args.dentry = dentry;
444         args.op = EXT4_FC_TAG_LINK;
445
446         ret = ext4_fc_track_template(inode, __track_dentry_update,
447                                         (void *)&args, 0);
448         trace_ext4_fc_track_link(inode, dentry, ret);
449 }
450
451 void ext4_fc_track_create(struct inode *inode, struct dentry *dentry)
452 {
453         struct __track_dentry_update_args args;
454         int ret;
455
456         args.dentry = dentry;
457         args.op = EXT4_FC_TAG_CREAT;
458
459         ret = ext4_fc_track_template(inode, __track_dentry_update,
460                                         (void *)&args, 0);
461         trace_ext4_fc_track_create(inode, dentry, ret);
462 }
463
464 /* __track_fn for inode tracking */
465 static int __track_inode(struct inode *inode, void *arg, bool update)
466 {
467         if (update)
468                 return -EEXIST;
469
470         EXT4_I(inode)->i_fc_lblk_len = 0;
471
472         return 0;
473 }
474
475 void ext4_fc_track_inode(struct inode *inode)
476 {
477         int ret;
478
479         if (S_ISDIR(inode->i_mode))
480                 return;
481
482         ret = ext4_fc_track_template(inode, __track_inode, NULL, 1);
483         trace_ext4_fc_track_inode(inode, ret);
484 }
485
486 struct __track_range_args {
487         ext4_lblk_t start, end;
488 };
489
490 /* __track_fn for tracking data updates */
491 static int __track_range(struct inode *inode, void *arg, bool update)
492 {
493         struct ext4_inode_info *ei = EXT4_I(inode);
494         ext4_lblk_t oldstart;
495         struct __track_range_args *__arg =
496                 (struct __track_range_args *)arg;
497
498         if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
499                 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
500                 return -ECANCELED;
501         }
502
503         oldstart = ei->i_fc_lblk_start;
504
505         if (update && ei->i_fc_lblk_len > 0) {
506                 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
507                 ei->i_fc_lblk_len =
508                         max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
509                                 ei->i_fc_lblk_start + 1;
510         } else {
511                 ei->i_fc_lblk_start = __arg->start;
512                 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
513         }
514
515         return 0;
516 }
517
518 void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start,
519                          ext4_lblk_t end)
520 {
521         struct __track_range_args args;
522         int ret;
523
524         if (S_ISDIR(inode->i_mode))
525                 return;
526
527         args.start = start;
528         args.end = end;
529
530         ret = ext4_fc_track_template(inode,  __track_range, &args, 1);
531
532         trace_ext4_fc_track_range(inode, start, end, ret);
533 }
534
535 static void ext4_fc_submit_bh(struct super_block *sb)
536 {
537         int write_flags = REQ_SYNC;
538         struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
539
540         if (test_opt(sb, BARRIER))
541                 write_flags |= REQ_FUA | REQ_PREFLUSH;
542         lock_buffer(bh);
543         clear_buffer_dirty(bh);
544         set_buffer_uptodate(bh);
545         bh->b_end_io = ext4_end_buffer_io_sync;
546         submit_bh(REQ_OP_WRITE, write_flags, bh);
547         EXT4_SB(sb)->s_fc_bh = NULL;
548 }
549
550 /* Ext4 commit path routines */
551
552 /* memzero and update CRC */
553 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
554                                 u32 *crc)
555 {
556         void *ret;
557
558         ret = memset(dst, 0, len);
559         if (crc)
560                 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
561         return ret;
562 }
563
564 /*
565  * Allocate len bytes on a fast commit buffer.
566  *
567  * During the commit time this function is used to manage fast commit
568  * block space. We don't split a fast commit log onto different
569  * blocks. So this function makes sure that if there's not enough space
570  * on the current block, the remaining space in the current block is
571  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
572  * new block is from jbd2 and CRC is updated to reflect the padding
573  * we added.
574  */
575 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
576 {
577         struct ext4_fc_tl *tl;
578         struct ext4_sb_info *sbi = EXT4_SB(sb);
579         struct buffer_head *bh;
580         int bsize = sbi->s_journal->j_blocksize;
581         int ret, off = sbi->s_fc_bytes % bsize;
582         int pad_len;
583
584         /*
585          * After allocating len, we should have space at least for a 0 byte
586          * padding.
587          */
588         if (len + sizeof(struct ext4_fc_tl) > bsize)
589                 return NULL;
590
591         if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
592                 /*
593                  * Only allocate from current buffer if we have enough space for
594                  * this request AND we have space to add a zero byte padding.
595                  */
596                 if (!sbi->s_fc_bh) {
597                         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
598                         if (ret)
599                                 return NULL;
600                         sbi->s_fc_bh = bh;
601                 }
602                 sbi->s_fc_bytes += len;
603                 return sbi->s_fc_bh->b_data + off;
604         }
605         /* Need to add PAD tag */
606         tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
607         tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
608         pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
609         tl->fc_len = cpu_to_le16(pad_len);
610         if (crc)
611                 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
612         if (pad_len > 0)
613                 ext4_fc_memzero(sb, tl + 1, pad_len, crc);
614         ext4_fc_submit_bh(sb);
615
616         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
617         if (ret)
618                 return NULL;
619         sbi->s_fc_bh = bh;
620         sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
621         return sbi->s_fc_bh->b_data;
622 }
623
624 /* memcpy to fc reserved space and update CRC */
625 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
626                                 int len, u32 *crc)
627 {
628         if (crc)
629                 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
630         return memcpy(dst, src, len);
631 }
632
633 /*
634  * Complete a fast commit by writing tail tag.
635  *
636  * Writing tail tag marks the end of a fast commit. In order to guarantee
637  * atomicity, after writing tail tag, even if there's space remaining
638  * in the block, next commit shouldn't use it. That's why tail tag
639  * has the length as that of the remaining space on the block.
640  */
641 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
642 {
643         struct ext4_sb_info *sbi = EXT4_SB(sb);
644         struct ext4_fc_tl tl;
645         struct ext4_fc_tail tail;
646         int off, bsize = sbi->s_journal->j_blocksize;
647         u8 *dst;
648
649         /*
650          * ext4_fc_reserve_space takes care of allocating an extra block if
651          * there's no enough space on this block for accommodating this tail.
652          */
653         dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
654         if (!dst)
655                 return -ENOSPC;
656
657         off = sbi->s_fc_bytes % bsize;
658
659         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
660         tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
661         sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
662
663         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
664         dst += sizeof(tl);
665         tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
666         ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
667         dst += sizeof(tail.fc_tid);
668         tail.fc_crc = cpu_to_le32(crc);
669         ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
670
671         ext4_fc_submit_bh(sb);
672
673         return 0;
674 }
675
676 /*
677  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
678  * Returns false if there's not enough space.
679  */
680 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
681                            u32 *crc)
682 {
683         struct ext4_fc_tl tl;
684         u8 *dst;
685
686         dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
687         if (!dst)
688                 return false;
689
690         tl.fc_tag = cpu_to_le16(tag);
691         tl.fc_len = cpu_to_le16(len);
692
693         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
694         ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
695
696         return true;
697 }
698
699 /* Same as above, but adds dentry tlv. */
700 static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
701                                         int parent_ino, int ino, int dlen,
702                                         const unsigned char *dname,
703                                         u32 *crc)
704 {
705         struct ext4_fc_dentry_info fcd;
706         struct ext4_fc_tl tl;
707         u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
708                                         crc);
709
710         if (!dst)
711                 return false;
712
713         fcd.fc_parent_ino = cpu_to_le32(parent_ino);
714         fcd.fc_ino = cpu_to_le32(ino);
715         tl.fc_tag = cpu_to_le16(tag);
716         tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
717         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
718         dst += sizeof(tl);
719         ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
720         dst += sizeof(fcd);
721         ext4_fc_memcpy(sb, dst, dname, dlen, crc);
722         dst += dlen;
723
724         return true;
725 }
726
727 /*
728  * Writes inode in the fast commit space under TLV with tag @tag.
729  * Returns 0 on success, error on failure.
730  */
731 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
732 {
733         struct ext4_inode_info *ei = EXT4_I(inode);
734         int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
735         int ret;
736         struct ext4_iloc iloc;
737         struct ext4_fc_inode fc_inode;
738         struct ext4_fc_tl tl;
739         u8 *dst;
740
741         ret = ext4_get_inode_loc(inode, &iloc);
742         if (ret)
743                 return ret;
744
745         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
746                 inode_len += ei->i_extra_isize;
747
748         fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
749         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
750         tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
751
752         dst = ext4_fc_reserve_space(inode->i_sb,
753                         sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
754         if (!dst)
755                 return -ECANCELED;
756
757         if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
758                 return -ECANCELED;
759         dst += sizeof(tl);
760         if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
761                 return -ECANCELED;
762         dst += sizeof(fc_inode);
763         if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
764                                         inode_len, crc))
765                 return -ECANCELED;
766
767         return 0;
768 }
769
770 /*
771  * Writes updated data ranges for the inode in question. Updates CRC.
772  * Returns 0 on success, error otherwise.
773  */
774 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
775 {
776         ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
777         struct ext4_inode_info *ei = EXT4_I(inode);
778         struct ext4_map_blocks map;
779         struct ext4_fc_add_range fc_ext;
780         struct ext4_fc_del_range lrange;
781         struct ext4_extent *ex;
782         int ret;
783
784         mutex_lock(&ei->i_fc_lock);
785         if (ei->i_fc_lblk_len == 0) {
786                 mutex_unlock(&ei->i_fc_lock);
787                 return 0;
788         }
789         old_blk_size = ei->i_fc_lblk_start;
790         new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
791         ei->i_fc_lblk_len = 0;
792         mutex_unlock(&ei->i_fc_lock);
793
794         cur_lblk_off = old_blk_size;
795         jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
796                   __func__, cur_lblk_off, new_blk_size, inode->i_ino);
797
798         while (cur_lblk_off <= new_blk_size) {
799                 map.m_lblk = cur_lblk_off;
800                 map.m_len = new_blk_size - cur_lblk_off + 1;
801                 ret = ext4_map_blocks(NULL, inode, &map, 0);
802                 if (ret < 0)
803                         return -ECANCELED;
804
805                 if (map.m_len == 0) {
806                         cur_lblk_off++;
807                         continue;
808                 }
809
810                 if (ret == 0) {
811                         lrange.fc_ino = cpu_to_le32(inode->i_ino);
812                         lrange.fc_lblk = cpu_to_le32(map.m_lblk);
813                         lrange.fc_len = cpu_to_le32(map.m_len);
814                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
815                                             sizeof(lrange), (u8 *)&lrange, crc))
816                                 return -ENOSPC;
817                 } else {
818                         fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
819                         ex = (struct ext4_extent *)&fc_ext.fc_ex;
820                         ex->ee_block = cpu_to_le32(map.m_lblk);
821                         ex->ee_len = cpu_to_le16(map.m_len);
822                         ext4_ext_store_pblock(ex, map.m_pblk);
823                         if (map.m_flags & EXT4_MAP_UNWRITTEN)
824                                 ext4_ext_mark_unwritten(ex);
825                         else
826                                 ext4_ext_mark_initialized(ex);
827                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
828                                             sizeof(fc_ext), (u8 *)&fc_ext, crc))
829                                 return -ENOSPC;
830                 }
831
832                 cur_lblk_off += map.m_len;
833         }
834
835         return 0;
836 }
837
838
839 /* Submit data for all the fast commit inodes */
840 static int ext4_fc_submit_inode_data_all(journal_t *journal)
841 {
842         struct super_block *sb = (struct super_block *)(journal->j_private);
843         struct ext4_sb_info *sbi = EXT4_SB(sb);
844         struct ext4_inode_info *ei;
845         struct list_head *pos;
846         int ret = 0;
847
848         spin_lock(&sbi->s_fc_lock);
849         sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING;
850         list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
851                 ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
852                 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
853                 while (atomic_read(&ei->i_fc_updates)) {
854                         DEFINE_WAIT(wait);
855
856                         prepare_to_wait(&ei->i_fc_wait, &wait,
857                                                 TASK_UNINTERRUPTIBLE);
858                         if (atomic_read(&ei->i_fc_updates)) {
859                                 spin_unlock(&sbi->s_fc_lock);
860                                 schedule();
861                                 spin_lock(&sbi->s_fc_lock);
862                         }
863                         finish_wait(&ei->i_fc_wait, &wait);
864                 }
865                 spin_unlock(&sbi->s_fc_lock);
866                 ret = jbd2_submit_inode_data(ei->jinode);
867                 if (ret)
868                         return ret;
869                 spin_lock(&sbi->s_fc_lock);
870         }
871         spin_unlock(&sbi->s_fc_lock);
872
873         return ret;
874 }
875
876 /* Wait for completion of data for all the fast commit inodes */
877 static int ext4_fc_wait_inode_data_all(journal_t *journal)
878 {
879         struct super_block *sb = (struct super_block *)(journal->j_private);
880         struct ext4_sb_info *sbi = EXT4_SB(sb);
881         struct ext4_inode_info *pos, *n;
882         int ret = 0;
883
884         spin_lock(&sbi->s_fc_lock);
885         list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
886                 if (!ext4_test_inode_state(&pos->vfs_inode,
887                                            EXT4_STATE_FC_COMMITTING))
888                         continue;
889                 spin_unlock(&sbi->s_fc_lock);
890
891                 ret = jbd2_wait_inode_data(journal, pos->jinode);
892                 if (ret)
893                         return ret;
894                 spin_lock(&sbi->s_fc_lock);
895         }
896         spin_unlock(&sbi->s_fc_lock);
897
898         return 0;
899 }
900
901 /* Commit all the directory entry updates */
902 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
903 {
904         struct super_block *sb = (struct super_block *)(journal->j_private);
905         struct ext4_sb_info *sbi = EXT4_SB(sb);
906         struct ext4_fc_dentry_update *fc_dentry;
907         struct inode *inode;
908         struct list_head *pos, *n, *fcd_pos, *fcd_n;
909         struct ext4_inode_info *ei;
910         int ret;
911
912         if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
913                 return 0;
914         list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
915                 fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
916                                         fcd_list);
917                 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
918                         spin_unlock(&sbi->s_fc_lock);
919                         if (!ext4_fc_add_dentry_tlv(
920                                 sb, fc_dentry->fcd_op,
921                                 fc_dentry->fcd_parent, fc_dentry->fcd_ino,
922                                 fc_dentry->fcd_name.len,
923                                 fc_dentry->fcd_name.name, crc)) {
924                                 ret = -ENOSPC;
925                                 goto lock_and_exit;
926                         }
927                         spin_lock(&sbi->s_fc_lock);
928                         continue;
929                 }
930
931                 inode = NULL;
932                 list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
933                         ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
934                         if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
935                                 inode = &ei->vfs_inode;
936                                 break;
937                         }
938                 }
939                 /*
940                  * If we don't find inode in our list, then it was deleted,
941                  * in which case, we don't need to record it's create tag.
942                  */
943                 if (!inode)
944                         continue;
945                 spin_unlock(&sbi->s_fc_lock);
946
947                 /*
948                  * We first write the inode and then the create dirent. This
949                  * allows the recovery code to create an unnamed inode first
950                  * and then link it to a directory entry. This allows us
951                  * to use namei.c routines almost as is and simplifies
952                  * the recovery code.
953                  */
954                 ret = ext4_fc_write_inode(inode, crc);
955                 if (ret)
956                         goto lock_and_exit;
957
958                 ret = ext4_fc_write_inode_data(inode, crc);
959                 if (ret)
960                         goto lock_and_exit;
961
962                 if (!ext4_fc_add_dentry_tlv(
963                         sb, fc_dentry->fcd_op,
964                         fc_dentry->fcd_parent, fc_dentry->fcd_ino,
965                         fc_dentry->fcd_name.len,
966                         fc_dentry->fcd_name.name, crc)) {
967                         ret = -ENOSPC;
968                         goto lock_and_exit;
969                 }
970
971                 spin_lock(&sbi->s_fc_lock);
972         }
973         return 0;
974 lock_and_exit:
975         spin_lock(&sbi->s_fc_lock);
976         return ret;
977 }
978
979 static int ext4_fc_perform_commit(journal_t *journal)
980 {
981         struct super_block *sb = (struct super_block *)(journal->j_private);
982         struct ext4_sb_info *sbi = EXT4_SB(sb);
983         struct ext4_inode_info *iter;
984         struct ext4_fc_head head;
985         struct list_head *pos;
986         struct inode *inode;
987         struct blk_plug plug;
988         int ret = 0;
989         u32 crc = 0;
990
991         ret = ext4_fc_submit_inode_data_all(journal);
992         if (ret)
993                 return ret;
994
995         ret = ext4_fc_wait_inode_data_all(journal);
996         if (ret)
997                 return ret;
998
999         blk_start_plug(&plug);
1000         if (sbi->s_fc_bytes == 0) {
1001                 /*
1002                  * Add a head tag only if this is the first fast commit
1003                  * in this TID.
1004                  */
1005                 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1006                 head.fc_tid = cpu_to_le32(
1007                         sbi->s_journal->j_running_transaction->t_tid);
1008                 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1009                         (u8 *)&head, &crc))
1010                         goto out;
1011         }
1012
1013         spin_lock(&sbi->s_fc_lock);
1014         ret = ext4_fc_commit_dentry_updates(journal, &crc);
1015         if (ret) {
1016                 spin_unlock(&sbi->s_fc_lock);
1017                 goto out;
1018         }
1019
1020         list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1021                 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1022                 inode = &iter->vfs_inode;
1023                 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1024                         continue;
1025
1026                 spin_unlock(&sbi->s_fc_lock);
1027                 ret = ext4_fc_write_inode_data(inode, &crc);
1028                 if (ret)
1029                         goto out;
1030                 ret = ext4_fc_write_inode(inode, &crc);
1031                 if (ret)
1032                         goto out;
1033                 spin_lock(&sbi->s_fc_lock);
1034                 EXT4_I(inode)->i_fc_committed_subtid =
1035                         atomic_read(&sbi->s_fc_subtid);
1036         }
1037         spin_unlock(&sbi->s_fc_lock);
1038
1039         ret = ext4_fc_write_tail(sb, crc);
1040
1041 out:
1042         blk_finish_plug(&plug);
1043         return ret;
1044 }
1045
1046 /*
1047  * The main commit entry point. Performs a fast commit for transaction
1048  * commit_tid if needed. If it's not possible to perform a fast commit
1049  * due to various reasons, we fall back to full commit. Returns 0
1050  * on success, error otherwise.
1051  */
1052 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1053 {
1054         struct super_block *sb = (struct super_block *)(journal->j_private);
1055         struct ext4_sb_info *sbi = EXT4_SB(sb);
1056         int nblks = 0, ret, bsize = journal->j_blocksize;
1057         int subtid = atomic_read(&sbi->s_fc_subtid);
1058         int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1059         ktime_t start_time, commit_time;
1060
1061         trace_ext4_fc_commit_start(sb);
1062
1063         start_time = ktime_get();
1064
1065         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1066                 (ext4_fc_is_ineligible(sb))) {
1067                 reason = EXT4_FC_REASON_INELIGIBLE;
1068                 goto out;
1069         }
1070
1071 restart_fc:
1072         ret = jbd2_fc_begin_commit(journal, commit_tid);
1073         if (ret == -EALREADY) {
1074                 /* There was an ongoing commit, check if we need to restart */
1075                 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1076                         commit_tid > journal->j_commit_sequence)
1077                         goto restart_fc;
1078                 reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1079                 goto out;
1080         } else if (ret) {
1081                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1082                 reason = EXT4_FC_REASON_FC_START_FAILED;
1083                 goto out;
1084         }
1085
1086         fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1087         ret = ext4_fc_perform_commit(journal);
1088         if (ret < 0) {
1089                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1090                 reason = EXT4_FC_REASON_FC_FAILED;
1091                 goto out;
1092         }
1093         nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1094         ret = jbd2_fc_wait_bufs(journal, nblks);
1095         if (ret < 0) {
1096                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1097                 reason = EXT4_FC_REASON_FC_FAILED;
1098                 goto out;
1099         }
1100         atomic_inc(&sbi->s_fc_subtid);
1101         jbd2_fc_end_commit(journal);
1102 out:
1103         /* Has any ineligible update happened since we started? */
1104         if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1105                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1106                 reason = EXT4_FC_REASON_INELIGIBLE;
1107         }
1108
1109         spin_lock(&sbi->s_fc_lock);
1110         if (reason != EXT4_FC_REASON_OK &&
1111                 reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1112                 sbi->s_fc_stats.fc_ineligible_commits++;
1113         } else {
1114                 sbi->s_fc_stats.fc_num_commits++;
1115                 sbi->s_fc_stats.fc_numblks += nblks;
1116         }
1117         spin_unlock(&sbi->s_fc_lock);
1118         nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1119         trace_ext4_fc_commit_stop(sb, nblks, reason);
1120         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1121         /*
1122          * weight the commit time higher than the average time so we don't
1123          * react too strongly to vast changes in the commit time
1124          */
1125         if (likely(sbi->s_fc_avg_commit_time))
1126                 sbi->s_fc_avg_commit_time = (commit_time +
1127                                 sbi->s_fc_avg_commit_time * 3) / 4;
1128         else
1129                 sbi->s_fc_avg_commit_time = commit_time;
1130         jbd_debug(1,
1131                 "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1132                 nblks, reason, subtid);
1133         if (reason == EXT4_FC_REASON_FC_FAILED)
1134                 return jbd2_fc_end_commit_fallback(journal, commit_tid);
1135         if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1136                 reason == EXT4_FC_REASON_INELIGIBLE)
1137                 return jbd2_complete_transaction(journal, commit_tid);
1138         return 0;
1139 }
1140
1141 /*
1142  * Fast commit cleanup routine. This is called after every fast commit and
1143  * full commit. full is true if we are called after a full commit.
1144  */
1145 static void ext4_fc_cleanup(journal_t *journal, int full)
1146 {
1147         struct super_block *sb = journal->j_private;
1148         struct ext4_sb_info *sbi = EXT4_SB(sb);
1149         struct ext4_inode_info *iter;
1150         struct ext4_fc_dentry_update *fc_dentry;
1151         struct list_head *pos, *n;
1152
1153         if (full && sbi->s_fc_bh)
1154                 sbi->s_fc_bh = NULL;
1155
1156         jbd2_fc_release_bufs(journal);
1157
1158         spin_lock(&sbi->s_fc_lock);
1159         list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1160                 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1161                 list_del_init(&iter->i_fc_list);
1162                 ext4_clear_inode_state(&iter->vfs_inode,
1163                                        EXT4_STATE_FC_COMMITTING);
1164                 ext4_fc_reset_inode(&iter->vfs_inode);
1165                 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1166                 smp_mb();
1167 #if (BITS_PER_LONG < 64)
1168                 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1169 #else
1170                 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1171 #endif
1172         }
1173
1174         while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1175                 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1176                                              struct ext4_fc_dentry_update,
1177                                              fcd_list);
1178                 list_del_init(&fc_dentry->fcd_list);
1179                 spin_unlock(&sbi->s_fc_lock);
1180
1181                 if (fc_dentry->fcd_name.name &&
1182                         fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1183                         kfree(fc_dentry->fcd_name.name);
1184                 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1185                 spin_lock(&sbi->s_fc_lock);
1186         }
1187
1188         list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1189                                 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1190         list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1191                                 &sbi->s_fc_q[FC_Q_STAGING]);
1192
1193         sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
1194         sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;
1195
1196         if (full)
1197                 sbi->s_fc_bytes = 0;
1198         spin_unlock(&sbi->s_fc_lock);
1199         trace_ext4_fc_stats(sb);
1200 }
1201
1202 /* Ext4 Replay Path Routines */
1203
1204 /* Get length of a particular tlv */
1205 static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
1206 {
1207         return le16_to_cpu(tl->fc_len);
1208 }
1209
1210 /* Get a pointer to "value" of a tlv */
1211 static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
1212 {
1213         return (u8 *)tl + sizeof(*tl);
1214 }
1215
/* Decoded form of a dentry tlv, used by the dentry replay routines */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of dname in bytes */
	int ino;		/* inode number the dentry refers to */
	int inode_len;
	char *dname;		/* entry name, points into the tlv payload */
};
1221
1222 static inline void tl_to_darg(struct dentry_info_args *darg,
1223                                 struct  ext4_fc_tl *tl)
1224 {
1225         struct ext4_fc_dentry_info *fcd;
1226
1227         fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1228
1229         darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1230         darg->ino = le32_to_cpu(fcd->fc_ino);
1231         darg->dname = fcd->fc_dname;
1232         darg->dname_len = ext4_fc_tag_len(tl) -
1233                         sizeof(struct ext4_fc_dentry_info);
1234 }
1235
1236 /* Unlink replay function */
1237 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
1238 {
1239         struct inode *inode, *old_parent;
1240         struct qstr entry;
1241         struct dentry_info_args darg;
1242         int ret = 0;
1243
1244         tl_to_darg(&darg, tl);
1245
1246         trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1247                         darg.parent_ino, darg.dname_len);
1248
1249         entry.name = darg.dname;
1250         entry.len = darg.dname_len;
1251         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1252
1253         if (IS_ERR_OR_NULL(inode)) {
1254                 jbd_debug(1, "Inode %d not found", darg.ino);
1255                 return 0;
1256         }
1257
1258         old_parent = ext4_iget(sb, darg.parent_ino,
1259                                 EXT4_IGET_NORMAL);
1260         if (IS_ERR_OR_NULL(old_parent)) {
1261                 jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1262                 iput(inode);
1263                 return 0;
1264         }
1265
1266         ret = __ext4_unlink(old_parent, &entry, inode);
1267         /* -ENOENT ok coz it might not exist anymore. */
1268         if (ret == -ENOENT)
1269                 ret = 0;
1270         iput(old_parent);
1271         iput(inode);
1272         return ret;
1273 }
1274
1275 static int ext4_fc_replay_link_internal(struct super_block *sb,
1276                                 struct dentry_info_args *darg,
1277                                 struct inode *inode)
1278 {
1279         struct inode *dir = NULL;
1280         struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1281         struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1282         int ret = 0;
1283
1284         dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1285         if (IS_ERR(dir)) {
1286                 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1287                 dir = NULL;
1288                 goto out;
1289         }
1290
1291         dentry_dir = d_obtain_alias(dir);
1292         if (IS_ERR(dentry_dir)) {
1293                 jbd_debug(1, "Failed to obtain dentry");
1294                 dentry_dir = NULL;
1295                 goto out;
1296         }
1297
1298         dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1299         if (!dentry_inode) {
1300                 jbd_debug(1, "Inode dentry not created.");
1301                 ret = -ENOMEM;
1302                 goto out;
1303         }
1304
1305         ret = __ext4_link(dir, inode, dentry_inode);
1306         /*
1307          * It's possible that link already existed since data blocks
1308          * for the dir in question got persisted before we crashed OR
1309          * we replayed this tag and crashed before the entire replay
1310          * could complete.
1311          */
1312         if (ret && ret != -EEXIST) {
1313                 jbd_debug(1, "Failed to link\n");
1314                 goto out;
1315         }
1316
1317         ret = 0;
1318 out:
1319         if (dentry_dir) {
1320                 d_drop(dentry_dir);
1321                 dput(dentry_dir);
1322         } else if (dir) {
1323                 iput(dir);
1324         }
1325         if (dentry_inode) {
1326                 d_drop(dentry_inode);
1327                 dput(dentry_inode);
1328         }
1329
1330         return ret;
1331 }
1332
1333 /* Link replay function */
1334 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1335 {
1336         struct inode *inode;
1337         struct dentry_info_args darg;
1338         int ret = 0;
1339
1340         tl_to_darg(&darg, tl);
1341         trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1342                         darg.parent_ino, darg.dname_len);
1343
1344         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1345         if (IS_ERR_OR_NULL(inode)) {
1346                 jbd_debug(1, "Inode not found.");
1347                 return 0;
1348         }
1349
1350         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1351         iput(inode);
1352         return ret;
1353 }
1354
1355 /*
1356  * Record all the modified inodes during replay. We use this later to setup
1357  * block bitmaps correctly.
1358  */
1359 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1360 {
1361         struct ext4_fc_replay_state *state;
1362         int i;
1363
1364         state = &EXT4_SB(sb)->s_fc_replay_state;
1365         for (i = 0; i < state->fc_modified_inodes_used; i++)
1366                 if (state->fc_modified_inodes[i] == ino)
1367                         return 0;
1368         if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1369                 state->fc_modified_inodes_size +=
1370                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1371                 state->fc_modified_inodes = krealloc(
1372                                         state->fc_modified_inodes, sizeof(int) *
1373                                         state->fc_modified_inodes_size,
1374                                         GFP_KERNEL);
1375                 if (!state->fc_modified_inodes)
1376                         return -ENOMEM;
1377         }
1378         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1379         return 0;
1380 }
1381
1382 /*
1383  * Inode replay function
1384  */
1385 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1386 {
1387         struct ext4_fc_inode *fc_inode;
1388         struct ext4_inode *raw_inode;
1389         struct ext4_inode *raw_fc_inode;
1390         struct inode *inode = NULL;
1391         struct ext4_iloc iloc;
1392         int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1393         struct ext4_extent_header *eh;
1394
1395         fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1396
1397         ino = le32_to_cpu(fc_inode->fc_ino);
1398         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1399
1400         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1401         if (!IS_ERR_OR_NULL(inode)) {
1402                 ext4_ext_clear_bb(inode);
1403                 iput(inode);
1404         }
1405
1406         ext4_fc_record_modified_inode(sb, ino);
1407
1408         raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1409         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1410         if (ret)
1411                 goto out;
1412
1413         inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1414         raw_inode = ext4_raw_inode(&iloc);
1415
1416         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1417         memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1418                 inode_len - offsetof(struct ext4_inode, i_generation));
1419         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1420                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1421                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1422                         memset(eh, 0, sizeof(*eh));
1423                         eh->eh_magic = EXT4_EXT_MAGIC;
1424                         eh->eh_max = cpu_to_le16(
1425                                 (sizeof(raw_inode->i_block) -
1426                                  sizeof(struct ext4_extent_header))
1427                                  / sizeof(struct ext4_extent));
1428                 }
1429         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1430                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1431                         sizeof(raw_inode->i_block));
1432         }
1433
1434         /* Immediately update the inode on disk. */
1435         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1436         if (ret)
1437                 goto out;
1438         ret = sync_dirty_buffer(iloc.bh);
1439         if (ret)
1440                 goto out;
1441         ret = ext4_mark_inode_used(sb, ino);
1442         if (ret)
1443                 goto out;
1444
1445         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1446         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1447         if (IS_ERR_OR_NULL(inode)) {
1448                 jbd_debug(1, "Inode not found.");
1449                 return -EFSCORRUPTED;
1450         }
1451
1452         /*
1453          * Our allocator could have made different decisions than before
1454          * crashing. This should be fixed but until then, we calculate
1455          * the number of blocks the inode.
1456          */
1457         ext4_ext_replay_set_iblocks(inode);
1458
1459         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1460         ext4_reset_inode_seed(inode);
1461
1462         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1463         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1464         sync_dirty_buffer(iloc.bh);
1465         brelse(iloc.bh);
1466 out:
1467         iput(inode);
1468         if (!ret)
1469                 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1470
1471         return 0;
1472 }
1473
1474 /*
1475  * Dentry create replay function.
1476  *
1477  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1478  * inode for which we are trying to create a dentry here, should already have
1479  * been replayed before we start here.
1480  */
1481 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1482 {
1483         int ret = 0;
1484         struct inode *inode = NULL;
1485         struct inode *dir = NULL;
1486         struct dentry_info_args darg;
1487
1488         tl_to_darg(&darg, tl);
1489
1490         trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1491                         darg.parent_ino, darg.dname_len);
1492
1493         /* This takes care of update group descriptor and other metadata */
1494         ret = ext4_mark_inode_used(sb, darg.ino);
1495         if (ret)
1496                 goto out;
1497
1498         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1499         if (IS_ERR_OR_NULL(inode)) {
1500                 jbd_debug(1, "inode %d not found.", darg.ino);
1501                 inode = NULL;
1502                 ret = -EINVAL;
1503                 goto out;
1504         }
1505
1506         if (S_ISDIR(inode->i_mode)) {
1507                 /*
1508                  * If we are creating a directory, we need to make sure that the
1509                  * dot and dot dot dirents are setup properly.
1510                  */
1511                 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1512                 if (IS_ERR_OR_NULL(dir)) {
1513                         jbd_debug(1, "Dir %d not found.", darg.ino);
1514                         goto out;
1515                 }
1516                 ret = ext4_init_new_dir(NULL, dir, inode);
1517                 iput(dir);
1518                 if (ret) {
1519                         ret = 0;
1520                         goto out;
1521                 }
1522         }
1523         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1524         if (ret)
1525                 goto out;
1526         set_nlink(inode, 1);
1527         ext4_mark_inode_dirty(NULL, inode);
1528 out:
1529         if (inode)
1530                 iput(inode);
1531         return ret;
1532 }
1533
1534 /*
1535  * Record physical disk regions which are in use as per fast commit area. Our
1536  * simple replay phase allocator excludes these regions from allocation.
1537  */
1538 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1539                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1540 {
1541         struct ext4_fc_replay_state *state;
1542         struct ext4_fc_alloc_region *region;
1543
1544         state = &EXT4_SB(sb)->s_fc_replay_state;
1545         if (state->fc_regions_used == state->fc_regions_size) {
1546                 state->fc_regions_size +=
1547                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1548                 state->fc_regions = krealloc(
1549                                         state->fc_regions,
1550                                         state->fc_regions_size *
1551                                         sizeof(struct ext4_fc_alloc_region),
1552                                         GFP_KERNEL);
1553                 if (!state->fc_regions)
1554                         return -ENOMEM;
1555         }
1556         region = &state->fc_regions[state->fc_regions_used++];
1557         region->ino = ino;
1558         region->lblk = lblk;
1559         region->pblk = pblk;
1560         region->len = len;
1561
1562         return 0;
1563 }
1564
1565 /* Replay add range tag */
1566 static int ext4_fc_replay_add_range(struct super_block *sb,
1567                                 struct ext4_fc_tl *tl)
1568 {
1569         struct ext4_fc_add_range *fc_add_ex;
1570         struct ext4_extent newex, *ex;
1571         struct inode *inode;
1572         ext4_lblk_t start, cur;
1573         int remaining, len;
1574         ext4_fsblk_t start_pblk;
1575         struct ext4_map_blocks map;
1576         struct ext4_ext_path *path = NULL;
1577         int ret;
1578
1579         fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1580         ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
1581
1582         trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1583                 le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
1584                 ext4_ext_get_actual_len(ex));
1585
1586         inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
1587                                 EXT4_IGET_NORMAL);
1588         if (IS_ERR_OR_NULL(inode)) {
1589                 jbd_debug(1, "Inode not found.");
1590                 return 0;
1591         }
1592
1593         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1594
1595         start = le32_to_cpu(ex->ee_block);
1596         start_pblk = ext4_ext_pblock(ex);
1597         len = ext4_ext_get_actual_len(ex);
1598
1599         cur = start;
1600         remaining = len;
1601         jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1602                   start, start_pblk, len, ext4_ext_is_unwritten(ex),
1603                   inode->i_ino);
1604
1605         while (remaining > 0) {
1606                 map.m_lblk = cur;
1607                 map.m_len = remaining;
1608                 map.m_pblk = 0;
1609                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1610
1611                 if (ret < 0) {
1612                         iput(inode);
1613                         return 0;
1614                 }
1615
1616                 if (ret == 0) {
1617                         /* Range is not mapped */
1618                         path = ext4_find_extent(inode, cur, NULL, 0);
1619                         if (!path)
1620                                 continue;
1621                         memset(&newex, 0, sizeof(newex));
1622                         newex.ee_block = cpu_to_le32(cur);
1623                         ext4_ext_store_pblock(
1624                                 &newex, start_pblk + cur - start);
1625                         newex.ee_len = cpu_to_le16(map.m_len);
1626                         if (ext4_ext_is_unwritten(ex))
1627                                 ext4_ext_mark_unwritten(&newex);
1628                         down_write(&EXT4_I(inode)->i_data_sem);
1629                         ret = ext4_ext_insert_extent(
1630                                 NULL, inode, &path, &newex, 0);
1631                         up_write((&EXT4_I(inode)->i_data_sem));
1632                         ext4_ext_drop_refs(path);
1633                         kfree(path);
1634                         if (ret) {
1635                                 iput(inode);
1636                                 return 0;
1637                         }
1638                         goto next;
1639                 }
1640
1641                 if (start_pblk + cur - start != map.m_pblk) {
1642                         /*
1643                          * Logical to physical mapping changed. This can happen
1644                          * if this range was removed and then reallocated to
1645                          * map to new physical blocks during a fast commit.
1646                          */
1647                         ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1648                                         ext4_ext_is_unwritten(ex),
1649                                         start_pblk + cur - start);
1650                         if (ret) {
1651                                 iput(inode);
1652                                 return 0;
1653                         }
1654                         /*
1655                          * Mark the old blocks as free since they aren't used
1656                          * anymore. We maintain an array of all the modified
1657                          * inodes. In case these blocks are still used at either
1658                          * a different logical range in the same inode or in
1659                          * some different inode, we will mark them as allocated
1660                          * at the end of the FC replay using our array of
1661                          * modified inodes.
1662                          */
1663                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1664                         goto next;
1665                 }
1666
1667                 /* Range is mapped and needs a state change */
1668                 jbd_debug(1, "Converting from %d to %d %lld",
1669                                 map.m_flags & EXT4_MAP_UNWRITTEN,
1670                         ext4_ext_is_unwritten(ex), map.m_pblk);
1671                 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1672                                         ext4_ext_is_unwritten(ex), map.m_pblk);
1673                 if (ret) {
1674                         iput(inode);
1675                         return 0;
1676                 }
1677                 /*
1678                  * We may have split the extent tree while toggling the state.
1679                  * Try to shrink the extent tree now.
1680                  */
1681                 ext4_ext_replay_shrink_inode(inode, start + len);
1682 next:
1683                 cur += map.m_len;
1684                 remaining -= map.m_len;
1685         }
1686         ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1687                                         sb->s_blocksize_bits);
1688         iput(inode);
1689         return 0;
1690 }
1691
1692 /* Replay DEL_RANGE tag */
1693 static int
1694 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1695 {
1696         struct inode *inode;
1697         struct ext4_fc_del_range *lrange;
1698         struct ext4_map_blocks map;
1699         ext4_lblk_t cur, remaining;
1700         int ret;
1701
1702         lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1703         cur = le32_to_cpu(lrange->fc_lblk);
1704         remaining = le32_to_cpu(lrange->fc_len);
1705
1706         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1707                 le32_to_cpu(lrange->fc_ino), cur, remaining);
1708
1709         inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1710         if (IS_ERR_OR_NULL(inode)) {
1711                 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1712                 return 0;
1713         }
1714
1715         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1716
1717         jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1718                         inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1719                         le32_to_cpu(lrange->fc_len));
1720         while (remaining > 0) {
1721                 map.m_lblk = cur;
1722                 map.m_len = remaining;
1723
1724                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1725                 if (ret < 0) {
1726                         iput(inode);
1727                         return 0;
1728                 }
1729                 if (ret > 0) {
1730                         remaining -= ret;
1731                         cur += ret;
1732                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1733                 } else {
1734                         remaining -= map.m_len;
1735                         cur += map.m_len;
1736                 }
1737         }
1738
1739         ret = ext4_punch_hole(inode,
1740                 le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1741                 le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
1742         if (ret)
1743                 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1744         ext4_ext_replay_shrink_inode(inode,
1745                 i_size_read(inode) >> sb->s_blocksize_bits);
1746         ext4_mark_inode_dirty(NULL, inode);
1747         iput(inode);
1748
1749         return 0;
1750 }
1751
1752 static inline const char *tag2str(u16 tag)
1753 {
1754         switch (tag) {
1755         case EXT4_FC_TAG_LINK:
1756                 return "TAG_ADD_ENTRY";
1757         case EXT4_FC_TAG_UNLINK:
1758                 return "TAG_DEL_ENTRY";
1759         case EXT4_FC_TAG_ADD_RANGE:
1760                 return "TAG_ADD_RANGE";
1761         case EXT4_FC_TAG_CREAT:
1762                 return "TAG_CREAT_DENTRY";
1763         case EXT4_FC_TAG_DEL_RANGE:
1764                 return "TAG_DEL_RANGE";
1765         case EXT4_FC_TAG_INODE:
1766                 return "TAG_INODE";
1767         case EXT4_FC_TAG_PAD:
1768                 return "TAG_PAD";
1769         case EXT4_FC_TAG_TAIL:
1770                 return "TAG_TAIL";
1771         case EXT4_FC_TAG_HEAD:
1772                 return "TAG_HEAD";
1773         default:
1774                 return "TAG_ERROR";
1775         }
1776 }
1777
1778 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1779 {
1780         struct ext4_fc_replay_state *state;
1781         struct inode *inode;
1782         struct ext4_ext_path *path = NULL;
1783         struct ext4_map_blocks map;
1784         int i, ret, j;
1785         ext4_lblk_t cur, end;
1786
1787         state = &EXT4_SB(sb)->s_fc_replay_state;
1788         for (i = 0; i < state->fc_modified_inodes_used; i++) {
1789                 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1790                         EXT4_IGET_NORMAL);
1791                 if (IS_ERR_OR_NULL(inode)) {
1792                         jbd_debug(1, "Inode %d not found.",
1793                                 state->fc_modified_inodes[i]);
1794                         continue;
1795                 }
1796                 cur = 0;
1797                 end = EXT_MAX_BLOCKS;
1798                 while (cur < end) {
1799                         map.m_lblk = cur;
1800                         map.m_len = end - cur;
1801
1802                         ret = ext4_map_blocks(NULL, inode, &map, 0);
1803                         if (ret < 0)
1804                                 break;
1805
1806                         if (ret > 0) {
1807                                 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1808                                 if (!IS_ERR_OR_NULL(path)) {
1809                                         for (j = 0; j < path->p_depth; j++)
1810                                                 ext4_mb_mark_bb(inode->i_sb,
1811                                                         path[j].p_block, 1, 1);
1812                                         ext4_ext_drop_refs(path);
1813                                         kfree(path);
1814                                 }
1815                                 cur += ret;
1816                                 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1817                                                         map.m_len, 1);
1818                         } else {
1819                                 cur = cur + (map.m_len ? map.m_len : 1);
1820                         }
1821                 }
1822                 iput(inode);
1823         }
1824 }
1825
1826 /*
1827  * Check if block is in excluded regions for block allocation. The simple
1828  * allocator that runs during replay phase is calls this function to see
1829  * if it is okay to use a block.
1830  */
1831 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1832 {
1833         int i;
1834         struct ext4_fc_replay_state *state;
1835
1836         state = &EXT4_SB(sb)->s_fc_replay_state;
1837         for (i = 0; i < state->fc_regions_valid; i++) {
1838                 if (state->fc_regions[i].ino == 0 ||
1839                         state->fc_regions[i].len == 0)
1840                         continue;
1841                 if (blk >= state->fc_regions[i].pblk &&
1842                     blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1843                         return true;
1844         }
1845         return false;
1846 }
1847
1848 /* Cleanup function called after replay */
1849 void ext4_fc_replay_cleanup(struct super_block *sb)
1850 {
1851         struct ext4_sb_info *sbi = EXT4_SB(sb);
1852
1853         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1854         kfree(sbi->s_fc_replay_state.fc_regions);
1855         kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1856 }
1857
1858 /*
1859  * Recovery Scan phase handler
1860  *
1861  * This function is called during the scan phase and is responsible
1862  * for doing following things:
1863  * - Make sure the fast commit area has valid tags for replay
1864  * - Count number of tags that need to be replayed by the replay handler
1865  * - Verify CRC
1866  * - Create a list of excluded blocks for allocation during replay phase
1867  *
1868  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1869  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1870  * to indicate that scan has finished and JBD2 can now start replay phase.
1871  * It returns a negative error to indicate that there was an error. At the end
1872  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1873  * to indicate the number of tags that need to replayed during the replay phase.
1874  */
1875 static int ext4_fc_replay_scan(journal_t *journal,
1876                                 struct buffer_head *bh, int off,
1877                                 tid_t expected_tid)
1878 {
1879         struct super_block *sb = journal->j_private;
1880         struct ext4_sb_info *sbi = EXT4_SB(sb);
1881         struct ext4_fc_replay_state *state;
1882         int ret = JBD2_FC_REPLAY_CONTINUE;
1883         struct ext4_fc_add_range *ext;
1884         struct ext4_fc_tl *tl;
1885         struct ext4_fc_tail *tail;
1886         __u8 *start, *end;
1887         struct ext4_fc_head *head;
1888         struct ext4_extent *ex;
1889
1890         state = &sbi->s_fc_replay_state;
1891
1892         start = (u8 *)bh->b_data;
1893         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1894
1895         if (state->fc_replay_expected_off == 0) {
1896                 state->fc_cur_tag = 0;
1897                 state->fc_replay_num_tags = 0;
1898                 state->fc_crc = 0;
1899                 state->fc_regions = NULL;
1900                 state->fc_regions_valid = state->fc_regions_used =
1901                         state->fc_regions_size = 0;
1902                 /* Check if we can stop early */
1903                 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1904                         != EXT4_FC_TAG_HEAD)
1905                         return 0;
1906         }
1907
1908         if (off != state->fc_replay_expected_off) {
1909                 ret = -EFSCORRUPTED;
1910                 goto out_err;
1911         }
1912
1913         state->fc_replay_expected_off++;
1914         fc_for_each_tl(start, end, tl) {
1915                 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1916                           tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
1917                 switch (le16_to_cpu(tl->fc_tag)) {
1918                 case EXT4_FC_TAG_ADD_RANGE:
1919                         ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1920                         ex = (struct ext4_extent *)&ext->fc_ex;
1921                         ret = ext4_fc_record_regions(sb,
1922                                 le32_to_cpu(ext->fc_ino),
1923                                 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1924                                 ext4_ext_get_actual_len(ex));
1925                         if (ret < 0)
1926                                 break;
1927                         ret = JBD2_FC_REPLAY_CONTINUE;
1928                         fallthrough;
1929                 case EXT4_FC_TAG_DEL_RANGE:
1930                 case EXT4_FC_TAG_LINK:
1931                 case EXT4_FC_TAG_UNLINK:
1932                 case EXT4_FC_TAG_CREAT:
1933                 case EXT4_FC_TAG_INODE:
1934                 case EXT4_FC_TAG_PAD:
1935                         state->fc_cur_tag++;
1936                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1937                                         sizeof(*tl) + ext4_fc_tag_len(tl));
1938                         break;
1939                 case EXT4_FC_TAG_TAIL:
1940                         state->fc_cur_tag++;
1941                         tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
1942                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1943                                                 sizeof(*tl) +
1944                                                 offsetof(struct ext4_fc_tail,
1945                                                 fc_crc));
1946                         if (le32_to_cpu(tail->fc_tid) == expected_tid &&
1947                                 le32_to_cpu(tail->fc_crc) == state->fc_crc) {
1948                                 state->fc_replay_num_tags = state->fc_cur_tag;
1949                                 state->fc_regions_valid =
1950                                         state->fc_regions_used;
1951                         } else {
1952                                 ret = state->fc_replay_num_tags ?
1953                                         JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1954                         }
1955                         state->fc_crc = 0;
1956                         break;
1957                 case EXT4_FC_TAG_HEAD:
1958                         head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
1959                         if (le32_to_cpu(head->fc_features) &
1960                                 ~EXT4_FC_SUPPORTED_FEATURES) {
1961                                 ret = -EOPNOTSUPP;
1962                                 break;
1963                         }
1964                         if (le32_to_cpu(head->fc_tid) != expected_tid) {
1965                                 ret = JBD2_FC_REPLAY_STOP;
1966                                 break;
1967                         }
1968                         state->fc_cur_tag++;
1969                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1970                                         sizeof(*tl) + ext4_fc_tag_len(tl));
1971                         break;
1972                 default:
1973                         ret = state->fc_replay_num_tags ?
1974                                 JBD2_FC_REPLAY_STOP : -ECANCELED;
1975                 }
1976                 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
1977                         break;
1978         }
1979
1980 out_err:
1981         trace_ext4_fc_replay_scan(sb, ret, off);
1982         return ret;
1983 }
1984
1985 /*
1986  * Main recovery path entry point.
1987  * The meaning of return codes is similar as above.
1988  */
1989 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
1990                                 enum passtype pass, int off, tid_t expected_tid)
1991 {
1992         struct super_block *sb = journal->j_private;
1993         struct ext4_sb_info *sbi = EXT4_SB(sb);
1994         struct ext4_fc_tl *tl;
1995         __u8 *start, *end;
1996         int ret = JBD2_FC_REPLAY_CONTINUE;
1997         struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
1998         struct ext4_fc_tail *tail;
1999
2000         if (pass == PASS_SCAN) {
2001                 state->fc_current_pass = PASS_SCAN;
2002                 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2003         }
2004
2005         if (state->fc_current_pass != pass) {
2006                 state->fc_current_pass = pass;
2007                 sbi->s_mount_state |= EXT4_FC_REPLAY;
2008         }
2009         if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2010                 jbd_debug(1, "Replay stops\n");
2011                 ext4_fc_set_bitmaps_and_counters(sb);
2012                 return 0;
2013         }
2014
2015 #ifdef CONFIG_EXT4_DEBUG
2016         if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2017                 pr_warn("Dropping fc block %d because max_replay set\n", off);
2018                 return JBD2_FC_REPLAY_STOP;
2019         }
2020 #endif
2021
2022         start = (u8 *)bh->b_data;
2023         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2024
2025         fc_for_each_tl(start, end, tl) {
2026                 if (state->fc_replay_num_tags == 0) {
2027                         ret = JBD2_FC_REPLAY_STOP;
2028                         ext4_fc_set_bitmaps_and_counters(sb);
2029                         break;
2030                 }
2031                 jbd_debug(3, "Replay phase, tag:%s\n",
2032                                 tag2str(le16_to_cpu(tl->fc_tag)));
2033                 state->fc_replay_num_tags--;
2034                 switch (le16_to_cpu(tl->fc_tag)) {
2035                 case EXT4_FC_TAG_LINK:
2036                         ret = ext4_fc_replay_link(sb, tl);
2037                         break;
2038                 case EXT4_FC_TAG_UNLINK:
2039                         ret = ext4_fc_replay_unlink(sb, tl);
2040                         break;
2041                 case EXT4_FC_TAG_ADD_RANGE:
2042                         ret = ext4_fc_replay_add_range(sb, tl);
2043                         break;
2044                 case EXT4_FC_TAG_CREAT:
2045                         ret = ext4_fc_replay_create(sb, tl);
2046                         break;
2047                 case EXT4_FC_TAG_DEL_RANGE:
2048                         ret = ext4_fc_replay_del_range(sb, tl);
2049                         break;
2050                 case EXT4_FC_TAG_INODE:
2051                         ret = ext4_fc_replay_inode(sb, tl);
2052                         break;
2053                 case EXT4_FC_TAG_PAD:
2054                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2055                                 ext4_fc_tag_len(tl), 0);
2056                         break;
2057                 case EXT4_FC_TAG_TAIL:
2058                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2059                                 ext4_fc_tag_len(tl), 0);
2060                         tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2061                         WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
2062                         break;
2063                 case EXT4_FC_TAG_HEAD:
2064                         break;
2065                 default:
2066                         trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
2067                                 ext4_fc_tag_len(tl), 0);
2068                         ret = -ECANCELED;
2069                         break;
2070                 }
2071                 if (ret < 0)
2072                         break;
2073                 ret = JBD2_FC_REPLAY_CONTINUE;
2074         }
2075         return ret;
2076 }
2077
2078 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2079 {
2080         int num_fc_blocks;
2081
2082         /*
2083          * We set replay callback even if fast commit disabled because we may
2084          * could still have fast commit blocks that need to be replayed even if
2085          * fast commit has now been turned off.
2086          */
2087         journal->j_fc_replay_callback = ext4_fc_replay;
2088         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2089                 return;
2090         journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2091         if (!buffer_uptodate(journal->j_sb_buffer)
2092                 && ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO,
2093                                         true)) {
2094                 ext4_msg(sb, KERN_ERR, "I/O error on journal");
2095                 return;
2096         }
2097         num_fc_blocks = be32_to_cpu(journal->j_superblock->s_num_fc_blks);
2098         if (jbd2_fc_init(journal, num_fc_blocks ? num_fc_blocks :
2099                                         EXT4_NUM_FC_BLKS)) {
2100                 pr_warn("Error while enabling fast commits, turning off.");
2101                 ext4_clear_feature_fast_commit(sb);
2102         }
2103 }
2104
/*
 * Human readable descriptions of why a fast commit was ineligible.
 * ext4_fc_info_show() prints the first EXT4_FC_REASON_MAX entries next to
 * the matching per-reason counters, so the order here must follow the
 * reason codes.
 */
const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"FC Commit Failed",
};
2116
2117 int ext4_fc_info_show(struct seq_file *seq, void *v)
2118 {
2119         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2120         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2121         int i;
2122
2123         if (v != SEQ_START_TOKEN)
2124                 return 0;
2125
2126         seq_printf(seq,
2127                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2128                    stats->fc_num_commits, stats->fc_ineligible_commits,
2129                    stats->fc_numblks,
2130                    div_u64(sbi->s_fc_avg_commit_time, 1000));
2131         seq_puts(seq, "Ineligible reasons:\n");
2132         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2133                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2134                         stats->fc_ineligible_reason_count[i]);
2135
2136         return 0;
2137 }
2138
2139 int __init ext4_fc_init_dentry_cache(void)
2140 {
2141         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2142                                            SLAB_RECLAIM_ACCOUNT);
2143
2144         if (ext4_fc_dentry_cachep == NULL)
2145                 return -ENOMEM;
2146
2147         return 0;
2148 }