// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commits routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that iblocks field is
 *				  not replayed and instead derived during
 *				  replay.
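 *
 * Illustrative sketch of the TLV wire format (field widths inferred from the
 * cpu_to_le16() conversions in ext4_fc_add_tlv() below; the alignment here is
 * schematic only):
 *
 *	+---------------+---------------+----------------------+
 *	| fc_tag (le16) | fc_len (le16) | value (fc_len bytes) |
 *	+---------------+---------------+----------------------+
 *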
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue
 * of inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the
 *     following section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
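 *
 * A minimal usage sketch (illustrative only; do_update() below is a
 * hypothetical stand-in for whatever modification the caller performs):
 *
 *	ext4_fc_start_update(inode);	// blocks while inode is COMMITTING
 *	do_update(inode);		// hypothetical inode modification
 *	ext4_fc_stop_update(inode);	// wakes up any waiting fast commit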
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---- Fast Commit 1 ---->|<------ Fast Commit 2 ------>|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *		rm A;    mv B A;    read A
 *		(x)      (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which
 * was actually created as a result of the "mv B A" operation) would get
 * deleted. Thus, file named A would be absent when we try to read A. So, this
 * sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure fast commits store the outcome of each
 * procedure. Thus the fast commit log for above procedure would be as follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *	[Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 *	    (w)              (x)                (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the
 * second replay, we will remove file A (inode 11). But we will create it back
 * and make it point to inode 11. We won't find B, so we'll just skip that
 * step. At this point, the refcount for inode 11 is not reliable, but that
 * gets fixed by the replay of the last inode 11 tag. Crashes at points (w),
 * (x) and (y) get handled similarly. Thus, by converting a non-idempotent
 * procedure into a series of idempotent outcomes, fast commits ensure
 * idempotence during the replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay and
 *    then try to do recovery again, we will find a file system where the fast
 *    commit area is invalid (because a new full commit would be found). In
 *    order to deal with that, fast commit replay code should ensure that the
 *    "FC_REPLAY" superblock state is persisted before starting the replay, so
 *    that after the crash, fast commit recovery code can look at that flag
 *    and perform fast commit recovery even if that area is invalidated by
 *    later full commits.
 *
 * 1) Fast commit's commit path locks the entire file system during fast
 *    commit. This has a significant performance penalty. Instead of that, we
 *    should use ext4_fc_start/stop_update functions to start inode level
 *    updates from ext4_journal_start/stop. Once we do that we can drop file
 *    system locking during commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	INIT_LIST_HEAD(&ei->i_fc_dilist);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}
206
f6634e26
HS
207/* This function must be called with sbi->s_fc_lock held. */
208static void ext4_fc_wait_committing_inode(struct inode *inode)
fa329e27 209__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
f6634e26
HS
210{
211 wait_queue_head_t *wq;
212 struct ext4_inode_info *ei = EXT4_I(inode);
213
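	/*
	 * Note (added for clarity): on 64-bit kernels ext4 keeps the
	 * EXT4_STATE_* bits in the upper half of ei->i_flags rather than in
	 * a separate i_state_flags field, hence the BITS_PER_LONG split
	 * below when picking the bit waitqueue.
	 */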
#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop an inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_fc_dentry_update *fc_dentry;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}

	if (!list_empty(&ei->i_fc_list))
		list_del_init(&ei->i_fc_list);

	/*
	 * Since this inode is getting removed, let's also remove all FC
	 * dentry create references, since there is no need to log them
	 * anyway.
	 */
	if (list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&sbi->s_fc_lock);
		return;
	}

	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
	list_del_init(&fc_dentry->fcd_list);
	list_del_init(&fc_dentry->fcd_dilist);

	WARN_ON(!list_empty(&ei->i_fc_dilist));
	spin_unlock(&sbi->s_fc_lock);

	if (fc_dentry->fcd_name.name &&
	    fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
		kfree(fc_dentry->fcd_name.name);
	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
}

/*
 * Mark the file system as fast commit ineligible, and record the latest
 * ineligible transaction tid. This means that, until the recorded
 * transaction, a commit operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	tid_t tid;

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	if (handle && !IS_ERR(handle))
		tid = handle->h_transaction->t_tid;
	else {
		read_lock(&sbi->s_journal->j_state_lock);
		tid = sbi->s_journal->j_running_transaction ?
				sbi->s_journal->j_running_transaction->t_tid : 0;
		read_unlock(&sbi->s_journal->j_state_lock);
	}
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_fc_ineligible_tid < tid)
		sbi->s_fc_ineligible_tid = tid;
	spin_unlock(&sbi->s_fc_lock);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
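
/*
 * Note on ext4_fc_track_template() above: if a jbd2 full or fast commit is
 * already in flight, newly tracked inodes are parked on the STAGING queue so
 * that the in-progress commit's MAIN queue is not modified underneath it;
 * ext4_fc_cleanup() later splices STAGING back into MAIN.
 */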

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
						EXT4_FC_REASON_NOMEM, NULL);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;
	INIT_LIST_HEAD(&node->fcd_dilist);
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
	    sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);

	/*
	 * This helps us keep track of all the fc_dentry updates that are part
	 * of this ext4 inode. So in case the inode is getting unlinked before
	 * we even get a chance to fsync, we can remove all fc_dentry
	 * references while evicting the inode in ext4_fc_del().
	 * Also with this, we don't need to loop over all the inodes in
	 * sbi->s_fc_q to get the corresponding inode in
	 * ext4_fc_commit_dentry_updates().
	 */
	if (dentry_update->op == EXT4_FC_TAG_CREAT) {
		WARN_ON(!list_empty(&ei->i_fc_dilist));
		list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
	}
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}
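
/*
 * Note on __track_dentry_update() above: it is entered with ei->i_fc_lock
 * held (taken by ext4_fc_track_template()), drops the lock around its
 * allocations so they may sleep, and re-acquires it before returning,
 * preserving the caller's locking expectations.
 */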

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_unlink(handle, inode, dentry);
}

void __ext4_fc_track_link(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(handle, inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_link(handle, inode, dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			    struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(handle, inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_create(handle, inode, dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
		return;
	}

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(handle, inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

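	/*
	 * Grow the tracked range to the union of the old and the new range;
	 * e.g. (illustrative numbers) a tracked range [10, 20] updated with
	 * [15, 30] becomes [10, 30].
	 */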
	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(handle, inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* Add REQ_FUA | REQ_PREFLUSH only if it's the tail */
	if (test_opt(sb, BARRIER) && is_tail)
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, a new
 * block is requested from jbd2 and the CRC is updated to reflect the
 * padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb, false);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
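
/*
 * Worked example for ext4_fc_reserve_space() (illustrative numbers, assuming
 * a 4-byte struct ext4_fc_tl made of two le16 fields): with bsize = 4096 and
 * off = 4060, only 35 usable bytes remain (one byte is always held back for
 * a zero-length pad). A request of len = 60 cannot fit, so those 35 bytes are
 * consumed by an EXT4_FC_TAG_PAD TLV (pad_len = 31) and the 60 bytes are
 * served from the start of a fresh jbd2 block.
 */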

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb, true);

	return 0;
}
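
/*
 * Resulting tail block layout (a sketch): the TAIL TLV's fc_len claims
 * everything up to the end of the block, so a later fast commit cannot
 * append into this block:
 *
 *	| fc_tag = TAIL | fc_len | fc_tid (le32) | fc_crc (le32) | unused |
 */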

/*
 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}

/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
				   struct ext4_fc_dentry_update *fc_dentry)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	int dlen = fc_dentry->fcd_name.len;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);

	return true;
}

/*
 * Writes inode in the fast commit space under TLV with tag @tag.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		inode_len = EXT4_INODE_SIZE(inode->i_sb);
	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}
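
/*
 * Note on ext4_fc_write_inode_data() above: a zero return from
 * ext4_map_blocks() means the logical range is unmapped (a hole), which is
 * logged as a DEL_RANGE tag; a positive return is a mapped extent, logged as
 * an ADD_RANGE tag.
 */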

/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}
		/*
		 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
		 * corresponding inode pointer
		 */
		WARN_ON(list_empty(&fc_dentry->fcd_dilist));
		ei = list_first_entry(&fc_dentry->fcd_dilist,
				struct ext4_inode_info, i_fc_dilist);
		inode = &ei->vfs_inode;
		WARN_ON(inode->i_ino != fc_dentry->fcd_ino);

		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If file system device is different from journal device, issue a cache
	 * flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

static void ext4_fc_update_stats(struct super_block *sb, int status,
				 u64 commit_time, int nblks, tid_t commit_tid)
{
	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;

	jbd_debug(1, "Fast commit ended with status = %d for tid %u",
		  status, commit_tid);
	if (status == EXT4_FC_STATUS_OK) {
		stats->fc_num_commits++;
		stats->fc_numblks += nblks;
		if (likely(stats->s_fc_avg_commit_time))
			stats->s_fc_avg_commit_time =
				(commit_time +
				 stats->s_fc_avg_commit_time * 3) / 4;
		else
			stats->s_fc_avg_commit_time = commit_time;
	} else if (status == EXT4_FC_STATUS_FAILED ||
		   status == EXT4_FC_STATUS_INELIGIBLE) {
		if (status == EXT4_FC_STATUS_FAILED)
			stats->fc_failed_commits++;
		stats->fc_ineligible_commits++;
	} else {
		stats->fc_skipped_commits++;
	}
	trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
}
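
/*
 * The running average above is an exponential moving average that weights
 * the new sample by 1/4: avg' = (commit_time + 3 * avg) / 4. For example
 * (illustrative numbers), avg = 800ns and commit_time = 1200ns give
 * avg' = (1200 + 3 * 800) / 4 = 900ns.
 */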

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return jbd2_complete_transaction(journal, commit_tid);

	trace_ext4_fc_commit_start(sb, commit_tid);

	start_time = ktime_get();

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
		    commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
				     commit_tid);
		return 0;
	} else if (ret) {
		/*
		 * Commit couldn't start. Just update stats and perform a
		 * full commit.
		 */
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
				     commit_tid);
		return jbd2_complete_transaction(journal, commit_tid);
	}

	/*
	 * After establishing journal barrier via jbd2_fc_begin_commit(), check
	 * if we are fast commit ineligible.
	 */
	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
		status = EXT4_FC_STATUS_INELIGIBLE;
		goto fallback;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	atomic_inc(&sbi->s_fc_subtid);
	ret = jbd2_fc_end_commit(journal);
	/*
	 * weight the commit time higher than the average time so we
	 * don't react too strongly to vast changes in the commit time
	 */
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
	return ret;

fallback:
	ret = jbd2_fc_end_commit_fallback(journal);
	ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
	return ret;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter, *iter_n;
	struct ext4_fc_dentry_update *fc_dentry;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	trace_ext4_fc_cleanup(journal, full, tid);
	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
				 i_fc_list) {
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		if (iter->i_sync_tid <= tid)
			ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		list_del_init(&fc_dentry->fcd_dilist);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
		    fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
			 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
			 &sbi->s_fc_q[FC_Q_MAIN]);

	if (tid >= sbi->s_fc_ineligible_tid) {
		sbi->s_fc_ineligible_tid = 0;
		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	}

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_dentry_info fcd;

	memcpy(&fcd, val, sizeof(fcd));

	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
	darg->ino = le32_to_cpu(fcd.fc_ino);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
	darg->dname_len = le16_to_cpu(tl->fc_len) -
		sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			       EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is ok because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
			       u8 *val)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		state->fc_modified_inodes = krealloc(
				state->fc_modified_inodes,
				sizeof(int) * (state->fc_modified_inodes_size +
				EXT4_FC_REPLAY_REALLOC_INCREMENT),
				GFP_KERNEL);
		if (!state->fc_modified_inodes)
			return -ENOMEM;
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
				u8 *val)
{
	struct ext4_fc_inode fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	memcpy(&fc_inode, val, sizeof(fc_inode));

	ino = le32_to_cpu(fc_inode.fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ret = ext4_fc_record_modified_inode(sb, ino);
	if (ret)
		goto out;

	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
		inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we recalculate
	 * the number of blocks used by the inode.
	 */
	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL, which means the
 * inode for which we are trying to create a dentry here should already have
 * been replayed before we get here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are setup properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.parent_ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return ret;
}
1665
1666/*
599ea31d
XY
1667 * Record physical disk regions which are in use as per fast commit area,
1668 * and used by inodes during replay phase. Our simple replay phase
1669 * allocator excludes these regions from allocation.
8016e29f 1670 */
599ea31d
XY
1671int ext4_fc_record_regions(struct super_block *sb, int ino,
1672 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
8016e29f
HS
1673{
1674 struct ext4_fc_replay_state *state;
1675 struct ext4_fc_alloc_region *region;
1676
1677 state = &EXT4_SB(sb)->s_fc_replay_state;
1678 /*
1679 * During the replay phase, fc_regions_valid may not be the same as
1680 * fc_regions_used; update it when making new additions.
1681 */
1682 if (replay && state->fc_regions_used != state->fc_regions_valid)
1683 state->fc_regions_used = state->fc_regions_valid;
1684 if (state->fc_regions_used == state->fc_regions_size) {
1685 struct ext4_fc_alloc_region *fc_regions;
1686 /* Use a temporary so the old array is not leaked if krealloc fails */
1687 fc_regions = krealloc(state->fc_regions,
1688 (state->fc_regions_size + EXT4_FC_REPLAY_REALLOC_INCREMENT) *
1689 sizeof(struct ext4_fc_alloc_region), GFP_KERNEL);
1690 if (!fc_regions)
1691 return -ENOMEM;
1692 state->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT;
1693 state->fc_regions = fc_regions;
1694 }
1695 region = &state->fc_regions[state->fc_regions_used++];
1696 region->ino = ino;
1697 region->lblk = lblk;
1698 region->pblk = pblk;
1699 region->len = len;
1700
1701 if (replay)
1702 state->fc_regions_valid++;
1703
1704 return 0;
1705}
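/*
 * A minimal usage sketch (illustration only, not part of the kernel
 * sources; the helper name and all numbers are made up). During the
 * scan phase (replay == 0) only fc_regions_used grows; fc_regions_valid
 * catches up when a valid tail tag is seen. Additions made during the
 * replay phase (replay == 1) bump fc_regions_valid immediately.
 */
static void __maybe_unused ext4_fc_record_regions_example(struct super_block *sb)
{
	/* Scan phase: inode 12 maps lblk 0..7 to pblk 2048..2055 */
	ext4_fc_record_regions(sb, 12, 0, 2048, 8, 0);
	/* Replay phase: the new region is valid for exclusion right away */
	ext4_fc_record_regions(sb, 12, 8, 4096, 4, 1);
}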
1706
1707/* Replay add range tag */
1708static int ext4_fc_replay_add_range(struct super_block *sb,
1709 struct ext4_fc_tl *tl, u8 *val)
1710{
1711 struct ext4_fc_add_range fc_add_ex;
1712 struct ext4_extent newex, *ex;
1713 struct inode *inode;
1714 ext4_lblk_t start, cur;
1715 int remaining, len;
1716 ext4_fsblk_t start_pblk;
1717 struct ext4_map_blocks map;
1718 struct ext4_ext_path *path = NULL;
1719 int ret;
1720
1721 memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1722 ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1723
1724 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1725 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1726 ext4_ext_get_actual_len(ex));
1727
1728 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1729 if (IS_ERR(inode)) {
1730 jbd_debug(1, "Inode not found.");
1731 return 0;
1732 }
1733
1734 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1735 if (ret)
1736 goto out;
1737
1738 start = le32_to_cpu(ex->ee_block);
1739 start_pblk = ext4_ext_pblock(ex);
1740 len = ext4_ext_get_actual_len(ex);
1741
1742 cur = start;
1743 remaining = len;
1744 jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1745 start, start_pblk, len, ext4_ext_is_unwritten(ex),
1746 inode->i_ino);
1747
1748 while (remaining > 0) {
1749 map.m_lblk = cur;
1750 map.m_len = remaining;
1751 map.m_pblk = 0;
1752 ret = ext4_map_blocks(NULL, inode, &map, 0);
1753
1754 if (ret < 0)
1755 goto out;
1756
1757 if (ret == 0) {
1758 /* Range is not mapped */
1759 path = ext4_find_extent(inode, cur, NULL, 0);
1760 if (IS_ERR(path))
1761 goto out;
1762 memset(&newex, 0, sizeof(newex));
1763 newex.ee_block = cpu_to_le32(cur);
1764 ext4_ext_store_pblock(
1765 &newex, start_pblk + cur - start);
1766 newex.ee_len = cpu_to_le16(map.m_len);
1767 if (ext4_ext_is_unwritten(ex))
1768 ext4_ext_mark_unwritten(&newex);
1769 down_write(&EXT4_I(inode)->i_data_sem);
1770 ret = ext4_ext_insert_extent(
1771 NULL, inode, &path, &newex, 0);
1772 up_write((&EXT4_I(inode)->i_data_sem));
1773 ext4_ext_drop_refs(path);
1774 kfree(path);
1775 if (ret)
1776 goto out;
1777 goto next;
1778 }
1779
1780 if (start_pblk + cur - start != map.m_pblk) {
1781 /*
1782 * Logical to physical mapping changed. This can happen
1783 * if this range was removed and then reallocated to
1784 * map to new physical blocks during a fast commit.
1785 */
1786 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1787 ext4_ext_is_unwritten(ex),
1788 start_pblk + cur - start);
1789 if (ret)
1790 goto out;
1791 /*
1792 * Mark the old blocks as free since they aren't used
1793 * anymore. We maintain an array of all the modified
1794 * inodes. In case these blocks are still used at either
1795 * a different logical range in the same inode or in
1796 * some different inode, we will mark them as allocated
1797 * at the end of the FC replay using our array of
1798 * modified inodes.
1799 */
1800 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1801 goto next;
1802 }
1803
1804 /* Range is mapped and needs a state change */
1805 jbd_debug(1, "Converting from %ld to %d %lld",
1806 map.m_flags & EXT4_MAP_UNWRITTEN,
1807 ext4_ext_is_unwritten(ex), map.m_pblk);
1808 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1809 ext4_ext_is_unwritten(ex), map.m_pblk);
1810 if (ret)
1811 goto out;
1812 /*
1813 * We may have split the extent tree while toggling the state.
1814 * Try to shrink the extent tree now.
1815 */
1816 ext4_ext_replay_shrink_inode(inode, start + len);
1817next:
1818 cur += map.m_len;
1819 remaining -= map.m_len;
1820 }
1821 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1822 sb->s_blocksize_bits);
1823out:
1824 iput(inode);
1825 return 0;
1826}
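/*
 * To recap the three per-chunk cases handled above: if the range is not
 * mapped at all, a fresh extent pointing at the logged physical blocks
 * is inserted; if it is mapped but to different physical blocks, the
 * mapping is rewritten and the old blocks are freed (to be re-marked at
 * the end of replay if they are still in use elsewhere); if the mapping
 * matches, only the written/unwritten state is toggled.
 */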
1827
1828/* Replay DEL_RANGE tag */
1829static int
1830ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1831 u8 *val)
1832{
1833 struct inode *inode;
1834 struct ext4_fc_del_range lrange;
1835 struct ext4_map_blocks map;
1836 ext4_lblk_t cur, remaining;
1837 int ret;
1838
1839 memcpy(&lrange, val, sizeof(lrange));
1840 cur = le32_to_cpu(lrange.fc_lblk);
1841 remaining = le32_to_cpu(lrange.fc_len);
1842
1843 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1844 le32_to_cpu(lrange.fc_ino), cur, remaining);
1845
1846 inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1847 if (IS_ERR(inode)) {
1848 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1849 return 0;
1850 }
1851
1852 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1853 if (ret)
1854 goto out;
1855
1856 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1857 inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1858 le32_to_cpu(lrange.fc_len));
1859 while (remaining > 0) {
1860 map.m_lblk = cur;
1861 map.m_len = remaining;
1862
1863 ret = ext4_map_blocks(NULL, inode, &map, 0);
1864 if (ret < 0)
1865 goto out;
1866 if (ret > 0) {
1867 remaining -= ret;
1868 cur += ret;
1869 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1870 } else {
1871 remaining -= map.m_len;
1872 cur += map.m_len;
1873 }
1874 }
1875
1876 down_write(&EXT4_I(inode)->i_data_sem);
1877 ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1878 le32_to_cpu(lrange.fc_lblk) +
1879 le32_to_cpu(lrange.fc_len) - 1);
1880 up_write(&EXT4_I(inode)->i_data_sem);
1881 if (ret)
1882 goto out;
8016e29f
HS
1883 ext4_ext_replay_shrink_inode(inode,
1884 i_size_read(inode) >> sb->s_blocksize_bits);
1885 ext4_mark_inode_dirty(NULL, inode);
1886out:
1887 iput(inode);
1888 return 0;
1889}
1890
1891static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1892{
1893 struct ext4_fc_replay_state *state;
1894 struct inode *inode;
1895 struct ext4_ext_path *path = NULL;
1896 struct ext4_map_blocks map;
1897 int i, ret, j;
1898 ext4_lblk_t cur, end;
1899
1900 state = &EXT4_SB(sb)->s_fc_replay_state;
1901 for (i = 0; i < state->fc_modified_inodes_used; i++) {
1902 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1903 EXT4_IGET_NORMAL);
1904 if (IS_ERR(inode)) {
1905 jbd_debug(1, "Inode %d not found.",
1906 state->fc_modified_inodes[i]);
1907 continue;
1908 }
1909 cur = 0;
1910 end = EXT_MAX_BLOCKS;
1911 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1912 iput(inode);
1913 continue;
1914 }
1915 while (cur < end) {
1916 map.m_lblk = cur;
1917 map.m_len = end - cur;
1918
1919 ret = ext4_map_blocks(NULL, inode, &map, 0);
1920 if (ret < 0)
1921 break;
1922
1923 if (ret > 0) {
1924 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1925 if (!IS_ERR(path)) {
1926 for (j = 0; j < path->p_depth; j++)
1927 ext4_mb_mark_bb(inode->i_sb,
1928 path[j].p_block, 1, 1);
1929 ext4_ext_drop_refs(path);
1930 kfree(path);
1931 }
1932 cur += ret;
1933 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1934 map.m_len, 1);
1935 } else {
1936 cur = cur + (map.m_len ? map.m_len : 1);
1937 }
1938 }
1939 iput(inode);
1940 }
1941}
1942
1943/*
1944 * Check if a block is in the excluded regions for block allocation. The
1945 * simple allocator that runs during the replay phase calls this function
1946 * to see if it is okay to use a block.
1947 */
1948bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1949{
1950 int i;
1951 struct ext4_fc_replay_state *state;
1952
1953 state = &EXT4_SB(sb)->s_fc_replay_state;
1954 for (i = 0; i < state->fc_regions_valid; i++) {
1955 if (state->fc_regions[i].ino == 0 ||
1956 state->fc_regions[i].len == 0)
1957 continue;
1958 if (in_range(blk, state->fc_regions[i].pblk,
1959 state->fc_regions[i].len))
1960 return true;
1961 }
1962 return false;
1963}
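/*
 * A minimal sketch of a caller (illustration only; the helper below is
 * hypothetical and not part of the kernel): a naive replay-phase
 * allocator could scan forward from a goal block and skip any block
 * that falls inside a recorded region.
 */
static ext4_fsblk_t __maybe_unused
ext4_fc_replay_find_free_blk(struct super_block *sb, ext4_fsblk_t goal)
{
	ext4_fsblk_t blk = goal;

	/* Skip blocks claimed by the fast commit area or replayed inodes */
	while (ext4_fc_replay_check_excluded(sb, blk))
		blk++;
	return blk;
}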
1964
1965/* Cleanup function called after replay */
1966void ext4_fc_replay_cleanup(struct super_block *sb)
1967{
1968 struct ext4_sb_info *sbi = EXT4_SB(sb);
1969
1970 sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1971 kfree(sbi->s_fc_replay_state.fc_regions);
1972 kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1973}
1974
1975/*
1976 * Recovery Scan phase handler
1977 *
1978 * This function is called during the scan phase and is responsible
1979 * for doing the following things:
1980 * - Make sure the fast commit area has valid tags for replay
1981 * - Count number of tags that need to be replayed by the replay handler
1982 * - Verify CRC
1983 * - Create a list of excluded blocks for allocation during replay phase
1984 *
1985 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1986 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1987 * to indicate that scan has finished and JBD2 can now start replay phase.
1988 * It returns a negative error to indicate that there was an error. At the end
1989 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1990 * to indicate the number of tags that need to be replayed during the replay phase.
1991 */
1992static int ext4_fc_replay_scan(journal_t *journal,
1993 struct buffer_head *bh, int off,
1994 tid_t expected_tid)
1995{
1996 struct super_block *sb = journal->j_private;
1997 struct ext4_sb_info *sbi = EXT4_SB(sb);
1998 struct ext4_fc_replay_state *state;
1999 int ret = JBD2_FC_REPLAY_CONTINUE;
2000 struct ext4_fc_add_range ext;
2001 struct ext4_fc_tl tl;
2002 struct ext4_fc_tail tail;
2003 __u8 *start, *end, *cur, *val;
2004 struct ext4_fc_head head;
2005 struct ext4_extent *ex;
2006
2007 state = &sbi->s_fc_replay_state;
2008
2009 start = (u8 *)bh->b_data;
2010 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2011
2012 if (state->fc_replay_expected_off == 0) {
2013 state->fc_cur_tag = 0;
2014 state->fc_replay_num_tags = 0;
2015 state->fc_crc = 0;
2016 state->fc_regions = NULL;
2017 state->fc_regions_valid = state->fc_regions_used =
2018 state->fc_regions_size = 0;
2019 /* Check if we can stop early */
2020 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2021 != EXT4_FC_TAG_HEAD)
2022 return 0;
2023 }
2024
2025 if (off != state->fc_replay_expected_off) {
2026 ret = -EFSCORRUPTED;
2027 goto out_err;
2028 }
2029
2030 state->fc_replay_expected_off++;
2031 for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2032 memcpy(&tl, cur, sizeof(tl));
2033 val = cur + sizeof(tl);
2034 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
2035 tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
2036 switch (le16_to_cpu(tl.fc_tag)) {
2037 case EXT4_FC_TAG_ADD_RANGE:
2038 memcpy(&ext, val, sizeof(ext));
2039 ex = (struct ext4_extent *)&ext.fc_ex;
2040 ret = ext4_fc_record_regions(sb,
2041 le32_to_cpu(ext.fc_ino),
2042 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2043 ext4_ext_get_actual_len(ex), 0);
2044 if (ret < 0)
2045 break;
2046 ret = JBD2_FC_REPLAY_CONTINUE;
2047 fallthrough;
2048 case EXT4_FC_TAG_DEL_RANGE:
2049 case EXT4_FC_TAG_LINK:
2050 case EXT4_FC_TAG_UNLINK:
2051 case EXT4_FC_TAG_CREAT:
2052 case EXT4_FC_TAG_INODE:
2053 case EXT4_FC_TAG_PAD:
2054 state->fc_cur_tag++;
2055 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2056 sizeof(tl) + le16_to_cpu(tl.fc_len));
2057 break;
2058 case EXT4_FC_TAG_TAIL:
2059 state->fc_cur_tag++;
2060 memcpy(&tail, val, sizeof(tail));
2061 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2062 sizeof(tl) +
2063 offsetof(struct ext4_fc_tail,
2064 fc_crc));
2065 if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2066 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2067 state->fc_replay_num_tags = state->fc_cur_tag;
2068 state->fc_regions_valid =
2069 state->fc_regions_used;
2070 } else {
2071 ret = state->fc_replay_num_tags ?
2072 JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2073 }
2074 state->fc_crc = 0;
2075 break;
2076 case EXT4_FC_TAG_HEAD:
2077 memcpy(&head, val, sizeof(head));
2078 if (le32_to_cpu(head.fc_features) &
2079 ~EXT4_FC_SUPPORTED_FEATURES) {
2080 ret = -EOPNOTSUPP;
2081 break;
2082 }
2083 if (le32_to_cpu(head.fc_tid) != expected_tid) {
2084 ret = JBD2_FC_REPLAY_STOP;
2085 break;
2086 }
2087 state->fc_cur_tag++;
2088 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2089 sizeof(tl) + le16_to_cpu(tl.fc_len));
2090 break;
2091 default:
2092 ret = state->fc_replay_num_tags ?
2093 JBD2_FC_REPLAY_STOP : -ECANCELED;
2094 }
2095 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2096 break;
2097 }
2098
2099out_err:
2100 trace_ext4_fc_replay_scan(sb, ret, off);
2101 return ret;
2102}
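/*
 * For reference, the TLV layout walked by the scan loop above and the
 * replay loop below (per struct ext4_fc_tl): each record is a fixed
 * tag/length header followed by fc_len bytes of payload, so the next
 * record starts at cur + sizeof(tl) + le16_to_cpu(tl.fc_len).
 *
 *   +----------------+----------------+------------------------+
 *   | fc_tag (le16)  | fc_len (le16)  | fc_len bytes of value  |
 *   +----------------+----------------+------------------------+
 */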
2103
2104/*
2105 * Main recovery path entry point.
2106 * The meaning of the return codes is the same as above.
2107 */
2108static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2109 enum passtype pass, int off, tid_t expected_tid)
2110{
2111 struct super_block *sb = journal->j_private;
2112 struct ext4_sb_info *sbi = EXT4_SB(sb);
2113 struct ext4_fc_tl tl;
2114 __u8 *start, *end, *cur, *val;
2115 int ret = JBD2_FC_REPLAY_CONTINUE;
2116 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2117 struct ext4_fc_tail tail;
2118
2119 if (pass == PASS_SCAN) {
2120 state->fc_current_pass = PASS_SCAN;
2121 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2122 }
2123
2124 if (state->fc_current_pass != pass) {
2125 state->fc_current_pass = pass;
2126 sbi->s_mount_state |= EXT4_FC_REPLAY;
2127 }
2128 if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2129 jbd_debug(1, "Replay stops\n");
2130 ext4_fc_set_bitmaps_and_counters(sb);
2131 return 0;
2132 }
2133
2134#ifdef CONFIG_EXT4_DEBUG
2135 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2136 pr_warn("Dropping fc block %d because max_replay set\n", off);
2137 return JBD2_FC_REPLAY_STOP;
2138 }
2139#endif
2140
2141 start = (u8 *)bh->b_data;
2142 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2143
2144 for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2145 memcpy(&tl, cur, sizeof(tl));
2146 val = cur + sizeof(tl);
2147
2148 if (state->fc_replay_num_tags == 0) {
2149 ret = JBD2_FC_REPLAY_STOP;
2150 ext4_fc_set_bitmaps_and_counters(sb);
2151 break;
2152 }
2153 jbd_debug(3, "Replay phase, tag:%s\n",
2154 tag2str(le16_to_cpu(tl.fc_tag)));
2155 state->fc_replay_num_tags--;
2156 switch (le16_to_cpu(tl.fc_tag)) {
2157 case EXT4_FC_TAG_LINK:
2158 ret = ext4_fc_replay_link(sb, &tl, val);
2159 break;
2160 case EXT4_FC_TAG_UNLINK:
2161 ret = ext4_fc_replay_unlink(sb, &tl, val);
2162 break;
2163 case EXT4_FC_TAG_ADD_RANGE:
2164 ret = ext4_fc_replay_add_range(sb, &tl, val);
2165 break;
2166 case EXT4_FC_TAG_CREAT:
2167 ret = ext4_fc_replay_create(sb, &tl, val);
2168 break;
2169 case EXT4_FC_TAG_DEL_RANGE:
2170 ret = ext4_fc_replay_del_range(sb, &tl, val);
2171 break;
2172 case EXT4_FC_TAG_INODE:
2173 ret = ext4_fc_replay_inode(sb, &tl, val);
2174 break;
2175 case EXT4_FC_TAG_PAD:
2176 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2177 le16_to_cpu(tl.fc_len), 0);
2178 break;
2179 case EXT4_FC_TAG_TAIL:
2180 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2181 le16_to_cpu(tl.fc_len), 0);
2182 memcpy(&tail, val, sizeof(tail));
2183 WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2184 break;
2185 case EXT4_FC_TAG_HEAD:
2186 break;
2187 default:
2188 trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2189 le16_to_cpu(tl.fc_len), 0);
2190 ret = -ECANCELED;
2191 break;
2192 }
2193 if (ret < 0)
2194 break;
2195 ret = JBD2_FC_REPLAY_CONTINUE;
2196 }
2197 return ret;
2198}
2199
2200void ext4_fc_init(struct super_block *sb, journal_t *journal)
2201{
2202 /*
2203 * We set the replay callback even if fast commit is disabled because
2204 * we could still have fast commit blocks that need to be replayed even
2205 * if fast commit has now been turned off.
2206 */
2207 journal->j_fc_replay_callback = ext4_fc_replay;
2208 if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2209 return;
2210 journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2211}
2212
2213static const char *fc_ineligible_reasons[] = {
2214 "Extended attributes changed",
2215 "Cross rename",
2216 "Journal flag changed",
2217 "Insufficient memory",
2218 "Swap boot",
2219 "Resize",
2220 "Dir renamed",
2221 "Falloc range op",
2222 "Data journalling",
2223 "FC Commit Failed"
2224};
2225
2226int ext4_fc_info_show(struct seq_file *seq, void *v)
2227{
2228 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2229 struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2230 int i;
2231
2232 if (v != SEQ_START_TOKEN)
2233 return 0;
2234
2235 seq_printf(seq,
2236 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2237 stats->fc_num_commits, stats->fc_ineligible_commits,
2238 stats->fc_numblks,
2239 div_u64(stats->s_fc_avg_commit_time, 1000));
2240 seq_puts(seq, "Ineligible reasons:\n");
2241 for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2242 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2243 stats->fc_ineligible_reason_count[i]);
2244
2245 return 0;
2246}
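/*
 * Example of the resulting seq_file output (all values below are made
 * up for illustration):
 *
 *   fc stats:
 *   128 commits
 *   3 ineligible
 *   512 numblks
 *   250us avg_commit_time
 *   Ineligible reasons:
 *   "Extended attributes changed":	1
 *   "Cross rename":	0
 *   ...
 */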
2247
2248int __init ext4_fc_init_dentry_cache(void)
2249{
2250 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2251 SLAB_RECLAIM_ACCOUNT);
2252
2253 if (ext4_fc_dentry_cachep == NULL)
2254 return -ENOMEM;
2255
2256 return 0;
2257}
2258
2259void ext4_fc_destroy_dentry_cache(void)
2260{
2261 kmem_cache_destroy(ext4_fc_dentry_cachep);
2262}