ext4: disable fast commit with data journalling
[linux-2.6-block.git] / fs / ext4 / fast_commit.c
CommitLineData
6866d7b3
HS
1// SPDX-License-Identifier: GPL-2.0
2
3/*
4 * fs/ext4/fast_commit.c
5 *
6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7 *
8 * Ext4 fast commits routines.
9 */
aa75f4d3 10#include "ext4.h"
6866d7b3 11#include "ext4_jbd2.h"
aa75f4d3
HS
12#include "ext4_extents.h"
13#include "mballoc.h"
14
15/*
16 * Ext4 Fast Commits
17 * -----------------
18 *
19 * Ext4 fast commits implement fine grained journalling for Ext4.
20 *
21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23 * TLV during the recovery phase. For the scenarios for which we currently
24 * don't have replay code, fast commit falls back to full commits.
25 * Fast commits record delta in one of the following three categories.
26 *
27 * (A) Directory entry updates:
28 *
29 * - EXT4_FC_TAG_UNLINK - records directory entry unlink
30 * - EXT4_FC_TAG_LINK - records directory entry link
31 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
32 *
33 * (B) File specific data range updates:
34 *
35 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
36 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
37 *
38 * (C) Inode metadata (mtime / ctime etc):
39 *
40 * - EXT4_FC_TAG_INODE - record the inode that should be replayed
41 * during recovery. Note that iblocks field is
42 * not replayed and instead derived during
43 * replay.
44 * Commit Operation
45 * ----------------
46 * With fast commits, we maintain all the directory entry operations in the
47 * order in which they are issued in an in-memory queue. This queue is flushed
48 * to disk during the commit operation. We also maintain a list of inodes
49 * that need to be committed during a fast commit in another in memory queue of
50 * inodes. During the commit operation, we commit in the following order:
51 *
52 * [1] Lock inodes for any further data updates by setting COMMITTING state
53 * [2] Submit data buffers of all the inodes
54 * [3] Wait for [2] to complete
55 * [4] Commit all the directory entry updates in the fast commit space
56 * [5] Commit all the changed inode structures
57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
58 * section for more details).
59 * [7] Wait for [4], [5] and [6] to complete.
60 *
61 * All the inode updates must call ext4_fc_start_update() before starting an
62 * update. If such an ongoing update is present, fast commit waits for it to
63 * complete. The completion of such an update is marked by
64 * ext4_fc_stop_update().
65 *
66 * Fast Commit Ineligibility
67 * -------------------------
68 * Not all operations are supported by fast commits today (e.g extended
 69 * attributes). Fast commit ineligibility is marked by calling one of the
70 * two following functions:
71 *
72 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73 * back to full commit. This is useful in case of transient errors.
74 *
75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76 * the fast commits happening between ext4_fc_start_ineligible() and
77 * ext4_fc_stop_ineligible() and one fast commit after the call to
78 * ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79 * make one more fast commit to fall back to full commit after stop call so
 80 * that it is guaranteed that the fast commit ineligible operation contained
81 * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82 * followed by at least 1 full commit.
83 *
84 * Atomicity of commits
85 * --------------------
a740762f 86 * In order to guarantee atomicity during the commit operation, fast commit
aa75f4d3
HS
87 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88 * tag contains CRC of the contents and TID of the transaction after which
89 * this fast commit should be applied. Recovery code replays fast commit
90 * logs only if there's at least 1 valid tail present. For every fast commit
91 * operation, there is 1 tail. This means, we may end up with multiple tails
92 * in the fast commit space. Here's an example:
93 *
94 * - Create a new file A and remove existing file B
95 * - fsync()
96 * - Append contents to file A
97 * - Truncate file A
98 * - fsync()
99 *
100 * The fast commit space at the end of above operations would look like this:
101 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102 * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
103 *
104 * Replay code should thus check for all the valid tails in the FC area.
105 *
106 * TODOs
107 * -----
108 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
109 * eligible update must be protected within ext4_fc_start_update() and
110 * ext4_fc_stop_update(). These routines are called from much higher
111 * level routines. This can be made more fine grained by combining with
112 * ext4_journal_start().
113 *
114 * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
115 *
116 * 3) Handle more ineligible cases.
117 */
118
119#include <trace/events/ext4.h>
120static struct kmem_cache *ext4_fc_dentry_cachep;
121
122static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
123{
124 BUFFER_TRACE(bh, "");
125 if (uptodate) {
126 ext4_debug("%s: Block %lld up-to-date",
127 __func__, bh->b_blocknr);
128 set_buffer_uptodate(bh);
129 } else {
130 ext4_debug("%s: Block %lld not up-to-date",
131 __func__, bh->b_blocknr);
132 clear_buffer_uptodate(bh);
133 }
134
135 unlock_buffer(bh);
136}
137
138static inline void ext4_fc_reset_inode(struct inode *inode)
139{
140 struct ext4_inode_info *ei = EXT4_I(inode);
141
142 ei->i_fc_lblk_start = 0;
143 ei->i_fc_lblk_len = 0;
144}
145
146void ext4_fc_init_inode(struct inode *inode)
147{
148 struct ext4_inode_info *ei = EXT4_I(inode);
149
150 ext4_fc_reset_inode(inode);
151 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
152 INIT_LIST_HEAD(&ei->i_fc_list);
153 init_waitqueue_head(&ei->i_fc_wait);
154 atomic_set(&ei->i_fc_updates, 0);
aa75f4d3
HS
155}
156
/*
 * Sleep until @inode's EXT4_STATE_FC_COMMITTING bit is cleared.
 *
 * This function must be called with sbi->s_fc_lock held; it drops the
 * lock before sleeping and returns WITHOUT re-taking it, so callers
 * must re-acquire s_fc_lock and re-check state afterwards.
 */
static void ext4_fc_wait_committing_inode(struct inode *inode)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

	/*
	 * The word holding the dynamic state bits differs by word size
	 * (i_state_flags on 32-bit, i_flags on 64-bit) — presumably
	 * mirroring how ext4.h stores dynamic inode state; pick the
	 * matching word for the bit wait.
	 */
#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Register on the wait queue before dropping the lock. */
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}
180
aa75f4d3
HS
/*
 * Inform Ext4's fast commit subsystem about the start of an inode update
 *
 * This function is called by the high level call VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/* No-op when fast commits are off or we are replaying a log. */
	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Not queued for a fast commit: nothing to wait for. */
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		/* Drops s_fc_lock while sleeping — start over afterwards. */
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	/* Balanced by ext4_fc_stop_update(). */
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
209
210/*
211 * Stop inode update and wake up waiting fast commits if any.
212 */
213void ext4_fc_stop_update(struct inode *inode)
214{
215 struct ext4_inode_info *ei = EXT4_I(inode);
216
8016e29f
HS
217 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
218 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
aa75f4d3
HS
219 return;
220
221 if (atomic_dec_and_test(&ei->i_fc_updates))
222 wake_up_all(&ei->i_fc_wait);
223}
224
/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Already off the list — nothing to do. */
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		/* Sleeps and drops s_fc_lock; state may have changed. */
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
251
252/*
253 * Mark file system as fast commit ineligible. This means that next commit
254 * operation would result in a full jbd2 commit.
255 */
256void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
257{
258 struct ext4_sb_info *sbi = EXT4_SB(sb);
259
8016e29f
HS
260 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
261 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
262 return;
263
ababea77 264 sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
aa75f4d3
HS
265 WARN_ON(reason >= EXT4_FC_REASON_MAX);
266 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
267}
268
269/*
270 * Start a fast commit ineligible update. Any commits that happen while
271 * such an operation is in progress fall back to full commits.
272 */
273void ext4_fc_start_ineligible(struct super_block *sb, int reason)
274{
275 struct ext4_sb_info *sbi = EXT4_SB(sb);
276
8016e29f
HS
277 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
278 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
279 return;
280
aa75f4d3
HS
281 WARN_ON(reason >= EXT4_FC_REASON_MAX);
282 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
283 atomic_inc(&sbi->s_fc_ineligible_updates);
284}
285
286/*
ababea77 287 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
aa75f4d3
HS
288 * to ensure that after stopping the ineligible update, at least one full
289 * commit takes place.
290 */
291void ext4_fc_stop_ineligible(struct super_block *sb)
292{
8016e29f
HS
293 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
294 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
295 return;
296
ababea77 297 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
aa75f4d3
HS
298 atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
299}
300
301static inline int ext4_fc_is_ineligible(struct super_block *sb)
302{
ababea77 303 return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) ||
aa75f4d3
HS
304 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
305}
306
/*
 * Generic fast commit tracking function. If this is the first time this we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 *
 * Returns -EOPNOTSUPP when fast commits are unusable (disabled or in
 * replay), -EINVAL when the fs is currently ineligible, otherwise the
 * value returned by __fc_track_fn().
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	/*
	 * A tid different from the last tracked one means a (full)
	 * commit happened in between: restart tracking from scratch.
	 */
	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	/*
	 * While a fast commit is running, newly tracked inodes are
	 * parked on the staging queue instead of the main queue.
	 */
	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
359
/* Arguments for __track_dentry_update(), via ext4_fc_track_template(). */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* dentry being created/linked/unlinked */
	int op;			/* EXT4_FC_TAG_{CREAT,LINK,UNLINK} */
};
364
/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	/* Drop i_fc_lock around the allocation; re-taken before return. */
	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		/* Can't log this update — force the next commit to be full. */
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	/* Short names fit the inline buffer; longer ones are kmalloc'd. */
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	/* Queue on staging if a fast commit is running right now. */
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING)
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}
415
a80f7fcf
HS
416void __ext4_fc_track_unlink(handle_t *handle,
417 struct inode *inode, struct dentry *dentry)
aa75f4d3
HS
418{
419 struct __track_dentry_update_args args;
420 int ret;
421
422 args.dentry = dentry;
423 args.op = EXT4_FC_TAG_UNLINK;
424
a80f7fcf 425 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
aa75f4d3
HS
426 (void *)&args, 0);
427 trace_ext4_fc_track_unlink(inode, dentry, ret);
428}
429
a80f7fcf
HS
/* Track an unlink; the affected inode is derived from @dentry itself. */
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}
434
435void __ext4_fc_track_link(handle_t *handle,
436 struct inode *inode, struct dentry *dentry)
aa75f4d3
HS
437{
438 struct __track_dentry_update_args args;
439 int ret;
440
441 args.dentry = dentry;
442 args.op = EXT4_FC_TAG_LINK;
443
a80f7fcf 444 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
aa75f4d3
HS
445 (void *)&args, 0);
446 trace_ext4_fc_track_link(inode, dentry, ret);
447}
448
a80f7fcf
HS
/* Track a link; the affected inode is derived from @dentry itself. */
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}
453
454void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
aa75f4d3
HS
455{
456 struct __track_dentry_update_args args;
a80f7fcf 457 struct inode *inode = d_inode(dentry);
aa75f4d3
HS
458 int ret;
459
460 args.dentry = dentry;
461 args.op = EXT4_FC_TAG_CREAT;
462
a80f7fcf 463 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
aa75f4d3
HS
464 (void *)&args, 0);
465 trace_ext4_fc_track_create(inode, dentry, ret);
466}
467
468/* __track_fn for inode tracking */
469static int __track_inode(struct inode *inode, void *arg, bool update)
470{
471 if (update)
472 return -EEXIST;
473
474 EXT4_I(inode)->i_fc_lblk_len = 0;
475
476 return 0;
477}
478
/*
 * Track that @inode was modified under @handle so it is written out by
 * the next fast commit (or mark the fs ineligible if it can't be).
 */
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	/* Directory changes are tracked via dentry updates, not here. */
	if (S_ISDIR(inode->i_mode))
		return;

	/*
	 * Fast commits cannot handle data journalling for this inode;
	 * fall back to a full commit.
	 */
	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}
495
/* Inclusive logical block range handed to __track_range(). */
struct __track_range_args {
	ext4_lblk_t start, end;
};
499
500/* __track_fn for tracking data updates */
501static int __track_range(struct inode *inode, void *arg, bool update)
502{
503 struct ext4_inode_info *ei = EXT4_I(inode);
504 ext4_lblk_t oldstart;
505 struct __track_range_args *__arg =
506 (struct __track_range_args *)arg;
507
508 if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
509 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
510 return -ECANCELED;
511 }
512
513 oldstart = ei->i_fc_lblk_start;
514
515 if (update && ei->i_fc_lblk_len > 0) {
516 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
517 ei->i_fc_lblk_len =
518 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
519 ei->i_fc_lblk_start + 1;
520 } else {
521 ei->i_fc_lblk_start = __arg->start;
522 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
523 }
524
525 return 0;
526}
527
a80f7fcf 528void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
aa75f4d3
HS
529 ext4_lblk_t end)
530{
531 struct __track_range_args args;
532 int ret;
533
534 if (S_ISDIR(inode->i_mode))
535 return;
536
537 args.start = start;
538 args.end = end;
539
a80f7fcf 540 ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
aa75f4d3
HS
541
542 trace_ext4_fc_track_range(inode, start, end, ret);
543}
544
/*
 * Submit the current fast commit buffer (sbi->s_fc_bh) as a sync write
 * and forget it; completion is signalled via ext4_end_buffer_io_sync().
 */
static void ext4_fc_submit_bh(struct super_block *sb)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
	if (test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	/* Ownership passed to the block layer; drop our reference slot. */
	EXT4_SB(sb)->s_fc_bh = NULL;
}
560
561/* Ext4 commit path routines */
562
563/* memzero and update CRC */
564static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
565 u32 *crc)
566{
567 void *ret;
568
569 ret = memset(dst, 0, len);
570 if (crc)
571 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
572 return ret;
573}
574
/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 * new block is from jbd2 and CRC is updated to reflect the padding
 * we added.
 *
 * Returns a pointer to the reserved region, or NULL on failure.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			/* Lazily grab a fast commit block from jbd2. */
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	/* Flush the padded block and start afresh on a new jbd2 block. */
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	/* Account: skip to the next block boundary, then reserve len. */
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
634
635/* memcpy to fc reserved space and update CRC */
636static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
637 int len, u32 *crc)
638{
639 if (crc)
640 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
641 return memcpy(dst, src, len);
642}
643
/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	/* Tail length claims the rest of the block (see comment above). */
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	/* CRC covers everything up to — but not including — this field. */
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}
686
687/*
688 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
689 * Returns false if there's not enough space.
690 */
691static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
692 u32 *crc)
693{
694 struct ext4_fc_tl tl;
695 u8 *dst;
696
697 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
698 if (!dst)
699 return false;
700
701 tl.fc_tag = cpu_to_le16(tag);
702 tl.fc_len = cpu_to_le16(len);
703
704 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
705 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
706
707 return true;
708}
709
710/* Same as above, but adds dentry tlv. */
711static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
712 int parent_ino, int ino, int dlen,
713 const unsigned char *dname,
714 u32 *crc)
715{
716 struct ext4_fc_dentry_info fcd;
717 struct ext4_fc_tl tl;
718 u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
719 crc);
720
721 if (!dst)
722 return false;
723
724 fcd.fc_parent_ino = cpu_to_le32(parent_ino);
725 fcd.fc_ino = cpu_to_le32(ino);
726 tl.fc_tag = cpu_to_le16(tag);
727 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
728 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
729 dst += sizeof(tl);
730 ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
731 dst += sizeof(fcd);
732 ext4_fc_memcpy(sb, dst, dname, dlen, crc);
733 dst += dlen;
734
735 return true;
736}
737
/*
 * Writes inode in the fast commit space under a TLV with tag
 * EXT4_FC_TAG_INODE. The value is the inode number followed by the raw
 * on-disk inode image. Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	/* Larger on-disk inodes carry i_extra_isize bytes of extra fields. */
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}
780
781/*
782 * Writes updated data ranges for the inode in question. Updates CRC.
783 * Returns 0 on success, error otherwise.
784 */
785static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
786{
787 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
788 struct ext4_inode_info *ei = EXT4_I(inode);
789 struct ext4_map_blocks map;
790 struct ext4_fc_add_range fc_ext;
791 struct ext4_fc_del_range lrange;
792 struct ext4_extent *ex;
793 int ret;
794
795 mutex_lock(&ei->i_fc_lock);
796 if (ei->i_fc_lblk_len == 0) {
797 mutex_unlock(&ei->i_fc_lock);
798 return 0;
799 }
800 old_blk_size = ei->i_fc_lblk_start;
801 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
802 ei->i_fc_lblk_len = 0;
803 mutex_unlock(&ei->i_fc_lock);
804
805 cur_lblk_off = old_blk_size;
806 jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
807 __func__, cur_lblk_off, new_blk_size, inode->i_ino);
808
809 while (cur_lblk_off <= new_blk_size) {
810 map.m_lblk = cur_lblk_off;
811 map.m_len = new_blk_size - cur_lblk_off + 1;
812 ret = ext4_map_blocks(NULL, inode, &map, 0);
813 if (ret < 0)
814 return -ECANCELED;
815
816 if (map.m_len == 0) {
817 cur_lblk_off++;
818 continue;
819 }
820
821 if (ret == 0) {
822 lrange.fc_ino = cpu_to_le32(inode->i_ino);
823 lrange.fc_lblk = cpu_to_le32(map.m_lblk);
824 lrange.fc_len = cpu_to_le32(map.m_len);
825 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
826 sizeof(lrange), (u8 *)&lrange, crc))
827 return -ENOSPC;
828 } else {
829 fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
830 ex = (struct ext4_extent *)&fc_ext.fc_ex;
831 ex->ee_block = cpu_to_le32(map.m_lblk);
832 ex->ee_len = cpu_to_le16(map.m_len);
833 ext4_ext_store_pblock(ex, map.m_pblk);
834 if (map.m_flags & EXT4_MAP_UNWRITTEN)
835 ext4_ext_mark_unwritten(ex);
836 else
837 ext4_ext_mark_initialized(ex);
838 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
839 sizeof(fc_ext), (u8 *)&fc_ext, crc))
840 return -ENOSPC;
841 }
842
843 cur_lblk_off += map.m_len;
844 }
845
846 return 0;
847}
848
849
/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	/* From here on, new trackers go to the staging queues instead. */
	sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING;
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		/* Block new updates and drain in-flight ones on this inode. */
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			/* Re-check after queuing to avoid a missed wakeup. */
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
886
/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		/* Only inodes marked by ext4_fc_submit_inode_data_all(). */
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		/* Drop the lock while sleeping on I/O completion. */
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}
911
/*
 * Commit all the directory entry updates queued on the main dentry
 * queue. Called with sbi->s_fc_lock held; the lock is dropped around
 * the actual fast commit writes and is held again on return.
 */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
				fcd_list);
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			/* LINK/UNLINK: the dentry TLV alone suffices. */
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		/* CREAT: locate the inode this dentry refers to, if live. */
		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
			sb, fc_dentry->fcd_op,
			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
			fc_dentry->fcd_name.len,
			fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	/* Re-take the lock so the caller's locking contract holds. */
	spin_lock(&sbi->s_fc_lock);
	return ret;
}
989
990static int ext4_fc_perform_commit(journal_t *journal)
991{
992 struct super_block *sb = (struct super_block *)(journal->j_private);
993 struct ext4_sb_info *sbi = EXT4_SB(sb);
994 struct ext4_inode_info *iter;
995 struct ext4_fc_head head;
996 struct list_head *pos;
997 struct inode *inode;
998 struct blk_plug plug;
999 int ret = 0;
1000 u32 crc = 0;
1001
1002 ret = ext4_fc_submit_inode_data_all(journal);
1003 if (ret)
1004 return ret;
1005
1006 ret = ext4_fc_wait_inode_data_all(journal);
1007 if (ret)
1008 return ret;
1009
1010 blk_start_plug(&plug);
1011 if (sbi->s_fc_bytes == 0) {
1012 /*
1013 * Add a head tag only if this is the first fast commit
1014 * in this TID.
1015 */
1016 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1017 head.fc_tid = cpu_to_le32(
1018 sbi->s_journal->j_running_transaction->t_tid);
1019 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1020 (u8 *)&head, &crc))
1021 goto out;
1022 }
1023
1024 spin_lock(&sbi->s_fc_lock);
1025 ret = ext4_fc_commit_dentry_updates(journal, &crc);
1026 if (ret) {
1027 spin_unlock(&sbi->s_fc_lock);
1028 goto out;
1029 }
1030
1031 list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1032 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1033 inode = &iter->vfs_inode;
1034 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1035 continue;
1036
1037 spin_unlock(&sbi->s_fc_lock);
1038 ret = ext4_fc_write_inode_data(inode, &crc);
1039 if (ret)
1040 goto out;
1041 ret = ext4_fc_write_inode(inode, &crc);
1042 if (ret)
1043 goto out;
1044 spin_lock(&sbi->s_fc_lock);
aa75f4d3
HS
1045 }
1046 spin_unlock(&sbi->s_fc_lock);
1047
1048 ret = ext4_fc_write_tail(sb, crc);
1049
1050out:
1051 blk_finish_plug(&plug);
1052 return ret;
1053}
1054
1055/*
1056 * The main commit entry point. Performs a fast commit for transaction
1057 * commit_tid if needed. If it's not possible to perform a fast commit
1058 * due to various reasons, we fall back to full commit. Returns 0
1059 * on success, error otherwise.
1060 */
1061int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1062{
1063 struct super_block *sb = (struct super_block *)(journal->j_private);
1064 struct ext4_sb_info *sbi = EXT4_SB(sb);
1065 int nblks = 0, ret, bsize = journal->j_blocksize;
1066 int subtid = atomic_read(&sbi->s_fc_subtid);
1067 int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1068 ktime_t start_time, commit_time;
1069
1070 trace_ext4_fc_commit_start(sb);
1071
1072 start_time = ktime_get();
1073
1074 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1075 (ext4_fc_is_ineligible(sb))) {
1076 reason = EXT4_FC_REASON_INELIGIBLE;
1077 goto out;
1078 }
1079
1080restart_fc:
1081 ret = jbd2_fc_begin_commit(journal, commit_tid);
1082 if (ret == -EALREADY) {
1083 /* There was an ongoing commit, check if we need to restart */
1084 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1085 commit_tid > journal->j_commit_sequence)
1086 goto restart_fc;
1087 reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1088 goto out;
1089 } else if (ret) {
1090 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1091 reason = EXT4_FC_REASON_FC_START_FAILED;
1092 goto out;
1093 }
1094
1095 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1096 ret = ext4_fc_perform_commit(journal);
1097 if (ret < 0) {
1098 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1099 reason = EXT4_FC_REASON_FC_FAILED;
1100 goto out;
1101 }
1102 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1103 ret = jbd2_fc_wait_bufs(journal, nblks);
1104 if (ret < 0) {
1105 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1106 reason = EXT4_FC_REASON_FC_FAILED;
1107 goto out;
1108 }
1109 atomic_inc(&sbi->s_fc_subtid);
1110 jbd2_fc_end_commit(journal);
1111out:
1112 /* Has any ineligible update happened since we started? */
1113 if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1114 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1115 reason = EXT4_FC_REASON_INELIGIBLE;
1116 }
1117
1118 spin_lock(&sbi->s_fc_lock);
1119 if (reason != EXT4_FC_REASON_OK &&
1120 reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1121 sbi->s_fc_stats.fc_ineligible_commits++;
1122 } else {
1123 sbi->s_fc_stats.fc_num_commits++;
1124 sbi->s_fc_stats.fc_numblks += nblks;
1125 }
1126 spin_unlock(&sbi->s_fc_lock);
1127 nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1128 trace_ext4_fc_commit_stop(sb, nblks, reason);
1129 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1130 /*
1131 * weight the commit time higher than the average time so we don't
1132 * react too strongly to vast changes in the commit time
1133 */
1134 if (likely(sbi->s_fc_avg_commit_time))
1135 sbi->s_fc_avg_commit_time = (commit_time +
1136 sbi->s_fc_avg_commit_time * 3) / 4;
1137 else
1138 sbi->s_fc_avg_commit_time = commit_time;
1139 jbd_debug(1,
1140 "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1141 nblks, reason, subtid);
1142 if (reason == EXT4_FC_REASON_FC_FAILED)
0bce577b 1143 return jbd2_fc_end_commit_fallback(journal);
aa75f4d3
HS
1144 if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1145 reason == EXT4_FC_REASON_INELIGIBLE)
1146 return jbd2_complete_transaction(journal, commit_tid);
1147 return 0;
1148}
1149
ff780b91
HS
1150/*
1151 * Fast commit cleanup routine. This is called after every fast commit and
1152 * full commit. full is true if we are called after a full commit.
1153 */
1154static void ext4_fc_cleanup(journal_t *journal, int full)
1155{
aa75f4d3
HS
1156 struct super_block *sb = journal->j_private;
1157 struct ext4_sb_info *sbi = EXT4_SB(sb);
1158 struct ext4_inode_info *iter;
1159 struct ext4_fc_dentry_update *fc_dentry;
1160 struct list_head *pos, *n;
1161
1162 if (full && sbi->s_fc_bh)
1163 sbi->s_fc_bh = NULL;
1164
1165 jbd2_fc_release_bufs(journal);
1166
1167 spin_lock(&sbi->s_fc_lock);
1168 list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1169 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1170 list_del_init(&iter->i_fc_list);
1171 ext4_clear_inode_state(&iter->vfs_inode,
1172 EXT4_STATE_FC_COMMITTING);
1173 ext4_fc_reset_inode(&iter->vfs_inode);
1174 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1175 smp_mb();
1176#if (BITS_PER_LONG < 64)
1177 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1178#else
1179 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1180#endif
1181 }
1182
1183 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1184 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1185 struct ext4_fc_dentry_update,
1186 fcd_list);
1187 list_del_init(&fc_dentry->fcd_list);
1188 spin_unlock(&sbi->s_fc_lock);
1189
1190 if (fc_dentry->fcd_name.name &&
1191 fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1192 kfree(fc_dentry->fcd_name.name);
1193 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1194 spin_lock(&sbi->s_fc_lock);
1195 }
1196
1197 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1198 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1199 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1200 &sbi->s_fc_q[FC_Q_STAGING]);
1201
ababea77
HS
1202 sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
1203 sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;
aa75f4d3
HS
1204
1205 if (full)
1206 sbi->s_fc_bytes = 0;
1207 spin_unlock(&sbi->s_fc_lock);
1208 trace_ext4_fc_stats(sb);
ff780b91 1209}
6866d7b3 1210
8016e29f
HS
1211/* Ext4 Replay Path Routines */
1212
/* Get length of a particular tlv */
/* Returns the host-endian length of the TLV's value (fc_len is on-disk LE16). */
static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
{
	return le16_to_cpu(tl->fc_len);
}
1218
/* Get a pointer to "value" of a tlv */
/* The value bytes start immediately after the fixed-size tag/length header. */
static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
{
	return (u8 *)tl + sizeof(*tl);
}
1224
/* Helper struct for dentry replay routines */
struct dentry_info_args {
	/* parent_ino: directory inode; ino: target inode; dname: entry name
	 * (points into the TLV buffer, not a copy); inode_len: raw inode size. */
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};
1230
1231static inline void tl_to_darg(struct dentry_info_args *darg,
1232 struct ext4_fc_tl *tl)
1233{
1234 struct ext4_fc_dentry_info *fcd;
1235
1236 fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1237
1238 darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1239 darg->ino = le32_to_cpu(fcd->fc_ino);
1240 darg->dname = fcd->fc_dname;
1241 darg->dname_len = ext4_fc_tag_len(tl) -
1242 sizeof(struct ext4_fc_dentry_info);
1243}
1244
1245/* Unlink replay function */
1246static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
1247{
1248 struct inode *inode, *old_parent;
1249 struct qstr entry;
1250 struct dentry_info_args darg;
1251 int ret = 0;
1252
1253 tl_to_darg(&darg, tl);
1254
1255 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1256 darg.parent_ino, darg.dname_len);
1257
1258 entry.name = darg.dname;
1259 entry.len = darg.dname_len;
1260 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1261
1262 if (IS_ERR_OR_NULL(inode)) {
1263 jbd_debug(1, "Inode %d not found", darg.ino);
1264 return 0;
1265 }
1266
1267 old_parent = ext4_iget(sb, darg.parent_ino,
1268 EXT4_IGET_NORMAL);
1269 if (IS_ERR_OR_NULL(old_parent)) {
1270 jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
1271 iput(inode);
1272 return 0;
1273 }
1274
a80f7fcf 1275 ret = __ext4_unlink(NULL, old_parent, &entry, inode);
8016e29f
HS
1276 /* -ENOENT ok coz it might not exist anymore. */
1277 if (ret == -ENOENT)
1278 ret = 0;
1279 iput(old_parent);
1280 iput(inode);
1281 return ret;
1282}
1283
/*
 * Shared worker for link/create replay: looks up the parent directory,
 * manufactures dentries for it and for @inode, and re-creates the directory
 * entry darg->dname via __ext4_link(). Missing parent and pre-existing
 * entries are tolerated (return 0) since replay may be re-run after a crash.
 */
static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		/* d_obtain_alias consumed (or errored on) dir's reference. */
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	/*
	 * dput(dentry_dir) drops the inode reference d_obtain_alias took;
	 * iput(dir) is only needed when no dentry was ever attached.
	 */
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}
1341
1342/* Link replay function */
1343static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1344{
1345 struct inode *inode;
1346 struct dentry_info_args darg;
1347 int ret = 0;
1348
1349 tl_to_darg(&darg, tl);
1350 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1351 darg.parent_ino, darg.dname_len);
1352
1353 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1354 if (IS_ERR_OR_NULL(inode)) {
1355 jbd_debug(1, "Inode not found.");
1356 return 0;
1357 }
1358
1359 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1360 iput(inode);
1361 return ret;
1362}
1363
1364/*
1365 * Record all the modified inodes during replay. We use this later to setup
1366 * block bitmaps correctly.
1367 */
1368static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1369{
1370 struct ext4_fc_replay_state *state;
1371 int i;
1372
1373 state = &EXT4_SB(sb)->s_fc_replay_state;
1374 for (i = 0; i < state->fc_modified_inodes_used; i++)
1375 if (state->fc_modified_inodes[i] == ino)
1376 return 0;
1377 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1378 state->fc_modified_inodes_size +=
1379 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1380 state->fc_modified_inodes = krealloc(
1381 state->fc_modified_inodes, sizeof(int) *
1382 state->fc_modified_inodes_size,
1383 GFP_KERNEL);
1384 if (!state->fc_modified_inodes)
1385 return -ENOMEM;
1386 }
1387 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1388 return 0;
1389}
1390
1391/*
1392 * Inode replay function
1393 */
1394static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1395{
1396 struct ext4_fc_inode *fc_inode;
1397 struct ext4_inode *raw_inode;
1398 struct ext4_inode *raw_fc_inode;
1399 struct inode *inode = NULL;
1400 struct ext4_iloc iloc;
1401 int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1402 struct ext4_extent_header *eh;
1403
1404 fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1405
1406 ino = le32_to_cpu(fc_inode->fc_ino);
1407 trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1408
1409 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1410 if (!IS_ERR_OR_NULL(inode)) {
1411 ext4_ext_clear_bb(inode);
1412 iput(inode);
1413 }
1414
1415 ext4_fc_record_modified_inode(sb, ino);
1416
1417 raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1418 ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1419 if (ret)
1420 goto out;
1421
1422 inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1423 raw_inode = ext4_raw_inode(&iloc);
1424
1425 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1426 memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1427 inode_len - offsetof(struct ext4_inode, i_generation));
1428 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1429 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1430 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1431 memset(eh, 0, sizeof(*eh));
1432 eh->eh_magic = EXT4_EXT_MAGIC;
1433 eh->eh_max = cpu_to_le16(
1434 (sizeof(raw_inode->i_block) -
1435 sizeof(struct ext4_extent_header))
1436 / sizeof(struct ext4_extent));
1437 }
1438 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1439 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1440 sizeof(raw_inode->i_block));
1441 }
1442
1443 /* Immediately update the inode on disk. */
1444 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1445 if (ret)
1446 goto out;
1447 ret = sync_dirty_buffer(iloc.bh);
1448 if (ret)
1449 goto out;
1450 ret = ext4_mark_inode_used(sb, ino);
1451 if (ret)
1452 goto out;
1453
1454 /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1455 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1456 if (IS_ERR_OR_NULL(inode)) {
1457 jbd_debug(1, "Inode not found.");
1458 return -EFSCORRUPTED;
1459 }
1460
1461 /*
1462 * Our allocator could have made different decisions than before
1463 * crashing. This should be fixed but until then, we calculate
1464 * the number of blocks the inode.
1465 */
1466 ext4_ext_replay_set_iblocks(inode);
1467
1468 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1469 ext4_reset_inode_seed(inode);
1470
1471 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1472 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1473 sync_dirty_buffer(iloc.bh);
1474 brelse(iloc.bh);
1475out:
1476 iput(inode);
1477 if (!ret)
1478 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1479
1480 return 0;
1481}
1482
1483/*
1484 * Dentry create replay function.
1485 *
1486 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1487 * inode for which we are trying to create a dentry here, should already have
1488 * been replayed before we start here.
1489 */
1490static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1491{
1492 int ret = 0;
1493 struct inode *inode = NULL;
1494 struct inode *dir = NULL;
1495 struct dentry_info_args darg;
1496
1497 tl_to_darg(&darg, tl);
1498
1499 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1500 darg.parent_ino, darg.dname_len);
1501
1502 /* This takes care of update group descriptor and other metadata */
1503 ret = ext4_mark_inode_used(sb, darg.ino);
1504 if (ret)
1505 goto out;
1506
1507 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1508 if (IS_ERR_OR_NULL(inode)) {
1509 jbd_debug(1, "inode %d not found.", darg.ino);
1510 inode = NULL;
1511 ret = -EINVAL;
1512 goto out;
1513 }
1514
1515 if (S_ISDIR(inode->i_mode)) {
1516 /*
1517 * If we are creating a directory, we need to make sure that the
1518 * dot and dot dot dirents are setup properly.
1519 */
1520 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1521 if (IS_ERR_OR_NULL(dir)) {
1522 jbd_debug(1, "Dir %d not found.", darg.ino);
1523 goto out;
1524 }
1525 ret = ext4_init_new_dir(NULL, dir, inode);
1526 iput(dir);
1527 if (ret) {
1528 ret = 0;
1529 goto out;
1530 }
1531 }
1532 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1533 if (ret)
1534 goto out;
1535 set_nlink(inode, 1);
1536 ext4_mark_inode_dirty(NULL, inode);
1537out:
1538 if (inode)
1539 iput(inode);
1540 return ret;
1541}
1542
1543/*
1544 * Record physical disk regions which are in use as per fast commit area. Our
1545 * simple replay phase allocator excludes these regions from allocation.
1546 */
1547static int ext4_fc_record_regions(struct super_block *sb, int ino,
1548 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1549{
1550 struct ext4_fc_replay_state *state;
1551 struct ext4_fc_alloc_region *region;
1552
1553 state = &EXT4_SB(sb)->s_fc_replay_state;
1554 if (state->fc_regions_used == state->fc_regions_size) {
1555 state->fc_regions_size +=
1556 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1557 state->fc_regions = krealloc(
1558 state->fc_regions,
1559 state->fc_regions_size *
1560 sizeof(struct ext4_fc_alloc_region),
1561 GFP_KERNEL);
1562 if (!state->fc_regions)
1563 return -ENOMEM;
1564 }
1565 region = &state->fc_regions[state->fc_regions_used++];
1566 region->ino = ino;
1567 region->lblk = lblk;
1568 region->pblk = pblk;
1569 region->len = len;
1570
1571 return 0;
1572}
1573
/* Replay add range tag */
/*
 * Walks the logical range recorded in the TLV and reconciles it with the
 * inode's current extent tree: unmapped pieces get a fresh extent inserted,
 * pieces mapped to a different physical location are remapped (and the old
 * blocks freed in the bitmaps), and pieces mapped in place only have their
 * written/unwritten state synced. All failures bail out returning 0 —
 * replay is best-effort per tag.
 */
static int ext4_fc_replay_add_range(struct super_block *sb,
				struct ext4_fc_tl *tl)
{
	struct ext4_fc_add_range *fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	/* NOTE(review): -ENOMEM from this call is ignored here — bitmap
	 * fixup at the end of replay would then miss this inode. */
	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	/* Final shrink against i_size, then drop our reference. */
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
1702
/* Replay DEL_RANGE tag */
/*
 * Frees (in the in-memory bitmaps) every block currently mapped inside the
 * recorded logical range, then punches the corresponding hole in the inode
 * and shrinks its extent tree back to i_size. Always returns 0; per-tag
 * replay is best-effort.
 */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode;
	struct ext4_fc_del_range *lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
	cur = le32_to_cpu(lrange->fc_lblk);
	remaining = le32_to_cpu(lrange->fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange->fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange->fc_lblk),
			le32_to_cpu(lrange->fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0) {
			iput(inode);
			return 0;
		}
		if (ret > 0) {
			/* ret is the number of blocks actually mapped here. */
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			/* Hole: skip past it. */
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	/* NOTE(review): the u32 lblk/len are shifted before widening —
	 * looks like this could overflow 32 bits for ranges beyond 4G
	 * bytes; confirm against upstream. */
	ret = ext4_punch_hole(inode,
		le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
		le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
	if (ret)
		jbd_debug(1, "ext4_punch_hole returned %d", ret);
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
	iput(inode);

	return 0;
}
1762
1763static inline const char *tag2str(u16 tag)
1764{
1765 switch (tag) {
1766 case EXT4_FC_TAG_LINK:
1767 return "TAG_ADD_ENTRY";
1768 case EXT4_FC_TAG_UNLINK:
1769 return "TAG_DEL_ENTRY";
1770 case EXT4_FC_TAG_ADD_RANGE:
1771 return "TAG_ADD_RANGE";
1772 case EXT4_FC_TAG_CREAT:
1773 return "TAG_CREAT_DENTRY";
1774 case EXT4_FC_TAG_DEL_RANGE:
1775 return "TAG_DEL_RANGE";
1776 case EXT4_FC_TAG_INODE:
1777 return "TAG_INODE";
1778 case EXT4_FC_TAG_PAD:
1779 return "TAG_PAD";
1780 case EXT4_FC_TAG_TAIL:
1781 return "TAG_TAIL";
1782 case EXT4_FC_TAG_HEAD:
1783 return "TAG_HEAD";
1784 default:
1785 return "TAG_ERROR";
1786 }
1787}
1788
/*
 * After all tags are replayed, walk every inode recorded as modified and
 * mark all of its currently-mapped blocks — data blocks and extent-tree
 * index blocks — as allocated in the in-memory bitmaps/counters, undoing
 * any premature frees done during per-tag replay.
 */
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				/* Mark the extent-tree index blocks on the path. */
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR_OR_NULL(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				/* ...and the mapped data blocks themselves. */
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
					map.m_len, 1);
			} else {
				/* Hole; advance by at least one block. */
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}
1836
1837/*
1838 * Check if block is in excluded regions for block allocation. The simple
1839 * allocator that runs during replay phase is calls this function to see
1840 * if it is okay to use a block.
1841 */
1842bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1843{
1844 int i;
1845 struct ext4_fc_replay_state *state;
1846
1847 state = &EXT4_SB(sb)->s_fc_replay_state;
1848 for (i = 0; i < state->fc_regions_valid; i++) {
1849 if (state->fc_regions[i].ino == 0 ||
1850 state->fc_regions[i].len == 0)
1851 continue;
1852 if (blk >= state->fc_regions[i].pblk &&
1853 blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1854 return true;
1855 }
1856 return false;
1857}
1858
1859/* Cleanup function called after replay */
1860void ext4_fc_replay_cleanup(struct super_block *sb)
1861{
1862 struct ext4_sb_info *sbi = EXT4_SB(sb);
1863
1864 sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1865 kfree(sbi->s_fc_replay_state.fc_regions);
1866 kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1867}
1868
1869/*
1870 * Recovery Scan phase handler
1871 *
1872 * This function is called during the scan phase and is responsible
1873 * for doing following things:
1874 * - Make sure the fast commit area has valid tags for replay
1875 * - Count number of tags that need to be replayed by the replay handler
1876 * - Verify CRC
1877 * - Create a list of excluded blocks for allocation during replay phase
1878 *
1879 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1880 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1881 * to indicate that scan has finished and JBD2 can now start replay phase.
1882 * It returns a negative error to indicate that there was an error. At the end
1883 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1884 * to indicate the number of tags that need to replayed during the replay phase.
1885 */
1886static int ext4_fc_replay_scan(journal_t *journal,
1887 struct buffer_head *bh, int off,
1888 tid_t expected_tid)
1889{
1890 struct super_block *sb = journal->j_private;
1891 struct ext4_sb_info *sbi = EXT4_SB(sb);
1892 struct ext4_fc_replay_state *state;
1893 int ret = JBD2_FC_REPLAY_CONTINUE;
1894 struct ext4_fc_add_range *ext;
1895 struct ext4_fc_tl *tl;
1896 struct ext4_fc_tail *tail;
1897 __u8 *start, *end;
1898 struct ext4_fc_head *head;
1899 struct ext4_extent *ex;
1900
1901 state = &sbi->s_fc_replay_state;
1902
1903 start = (u8 *)bh->b_data;
1904 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1905
1906 if (state->fc_replay_expected_off == 0) {
1907 state->fc_cur_tag = 0;
1908 state->fc_replay_num_tags = 0;
1909 state->fc_crc = 0;
1910 state->fc_regions = NULL;
1911 state->fc_regions_valid = state->fc_regions_used =
1912 state->fc_regions_size = 0;
1913 /* Check if we can stop early */
1914 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1915 != EXT4_FC_TAG_HEAD)
1916 return 0;
1917 }
1918
1919 if (off != state->fc_replay_expected_off) {
1920 ret = -EFSCORRUPTED;
1921 goto out_err;
1922 }
1923
1924 state->fc_replay_expected_off++;
1925 fc_for_each_tl(start, end, tl) {
1926 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1927 tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
1928 switch (le16_to_cpu(tl->fc_tag)) {
1929 case EXT4_FC_TAG_ADD_RANGE:
1930 ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1931 ex = (struct ext4_extent *)&ext->fc_ex;
1932 ret = ext4_fc_record_regions(sb,
1933 le32_to_cpu(ext->fc_ino),
1934 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1935 ext4_ext_get_actual_len(ex));
1936 if (ret < 0)
1937 break;
1938 ret = JBD2_FC_REPLAY_CONTINUE;
1939 fallthrough;
1940 case EXT4_FC_TAG_DEL_RANGE:
1941 case EXT4_FC_TAG_LINK:
1942 case EXT4_FC_TAG_UNLINK:
1943 case EXT4_FC_TAG_CREAT:
1944 case EXT4_FC_TAG_INODE:
1945 case EXT4_FC_TAG_PAD:
1946 state->fc_cur_tag++;
1947 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1948 sizeof(*tl) + ext4_fc_tag_len(tl));
1949 break;
1950 case EXT4_FC_TAG_TAIL:
1951 state->fc_cur_tag++;
1952 tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
1953 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1954 sizeof(*tl) +
1955 offsetof(struct ext4_fc_tail,
1956 fc_crc));
1957 if (le32_to_cpu(tail->fc_tid) == expected_tid &&
1958 le32_to_cpu(tail->fc_crc) == state->fc_crc) {
1959 state->fc_replay_num_tags = state->fc_cur_tag;
1960 state->fc_regions_valid =
1961 state->fc_regions_used;
1962 } else {
1963 ret = state->fc_replay_num_tags ?
1964 JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1965 }
1966 state->fc_crc = 0;
1967 break;
1968 case EXT4_FC_TAG_HEAD:
1969 head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
1970 if (le32_to_cpu(head->fc_features) &
1971 ~EXT4_FC_SUPPORTED_FEATURES) {
1972 ret = -EOPNOTSUPP;
1973 break;
1974 }
1975 if (le32_to_cpu(head->fc_tid) != expected_tid) {
1976 ret = JBD2_FC_REPLAY_STOP;
1977 break;
1978 }
1979 state->fc_cur_tag++;
1980 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1981 sizeof(*tl) + ext4_fc_tag_len(tl));
1982 break;
1983 default:
1984 ret = state->fc_replay_num_tags ?
1985 JBD2_FC_REPLAY_STOP : -ECANCELED;
1986 }
1987 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
1988 break;
1989 }
1990
1991out_err:
1992 trace_ext4_fc_replay_scan(sb, ret, off);
1993 return ret;
1994}
1995
5b849b5f
HS
1996/*
1997 * Main recovery path entry point.
8016e29f 1998 * The meaning of return codes is similar as above.
5b849b5f
HS
1999 */
2000static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2001 enum passtype pass, int off, tid_t expected_tid)
2002{
8016e29f
HS
2003 struct super_block *sb = journal->j_private;
2004 struct ext4_sb_info *sbi = EXT4_SB(sb);
2005 struct ext4_fc_tl *tl;
2006 __u8 *start, *end;
2007 int ret = JBD2_FC_REPLAY_CONTINUE;
2008 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2009 struct ext4_fc_tail *tail;
2010
2011 if (pass == PASS_SCAN) {
2012 state->fc_current_pass = PASS_SCAN;
2013 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2014 }
2015
2016 if (state->fc_current_pass != pass) {
2017 state->fc_current_pass = pass;
2018 sbi->s_mount_state |= EXT4_FC_REPLAY;
2019 }
2020 if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2021 jbd_debug(1, "Replay stops\n");
2022 ext4_fc_set_bitmaps_and_counters(sb);
2023 return 0;
2024 }
2025
2026#ifdef CONFIG_EXT4_DEBUG
2027 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2028 pr_warn("Dropping fc block %d because max_replay set\n", off);
2029 return JBD2_FC_REPLAY_STOP;
2030 }
2031#endif
2032
2033 start = (u8 *)bh->b_data;
2034 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2035
2036 fc_for_each_tl(start, end, tl) {
2037 if (state->fc_replay_num_tags == 0) {
2038 ret = JBD2_FC_REPLAY_STOP;
2039 ext4_fc_set_bitmaps_and_counters(sb);
2040 break;
2041 }
2042 jbd_debug(3, "Replay phase, tag:%s\n",
2043 tag2str(le16_to_cpu(tl->fc_tag)));
2044 state->fc_replay_num_tags--;
2045 switch (le16_to_cpu(tl->fc_tag)) {
2046 case EXT4_FC_TAG_LINK:
2047 ret = ext4_fc_replay_link(sb, tl);
2048 break;
2049 case EXT4_FC_TAG_UNLINK:
2050 ret = ext4_fc_replay_unlink(sb, tl);
2051 break;
2052 case EXT4_FC_TAG_ADD_RANGE:
2053 ret = ext4_fc_replay_add_range(sb, tl);
2054 break;
2055 case EXT4_FC_TAG_CREAT:
2056 ret = ext4_fc_replay_create(sb, tl);
2057 break;
2058 case EXT4_FC_TAG_DEL_RANGE:
2059 ret = ext4_fc_replay_del_range(sb, tl);
2060 break;
2061 case EXT4_FC_TAG_INODE:
2062 ret = ext4_fc_replay_inode(sb, tl);
2063 break;
2064 case EXT4_FC_TAG_PAD:
2065 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2066 ext4_fc_tag_len(tl), 0);
2067 break;
2068 case EXT4_FC_TAG_TAIL:
2069 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2070 ext4_fc_tag_len(tl), 0);
2071 tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2072 WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
2073 break;
2074 case EXT4_FC_TAG_HEAD:
2075 break;
2076 default:
2077 trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
2078 ext4_fc_tag_len(tl), 0);
2079 ret = -ECANCELED;
2080 break;
2081 }
2082 if (ret < 0)
2083 break;
2084 ret = JBD2_FC_REPLAY_CONTINUE;
2085 }
2086 return ret;
5b849b5f
HS
2087}
2088
6866d7b3
HS
2089void ext4_fc_init(struct super_block *sb, journal_t *journal)
2090{
5b849b5f
HS
2091 /*
2092 * We set replay callback even if fast commit disabled because we may
2093 * could still have fast commit blocks that need to be replayed even if
2094 * fast commit has now been turned off.
2095 */
2096 journal->j_fc_replay_callback = ext4_fc_replay;
6866d7b3
HS
2097 if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2098 return;
ff780b91 2099 journal->j_fc_cleanup_callback = ext4_fc_cleanup;
6866d7b3 2100}
aa75f4d3 2101
/*
 * Human-readable labels for the fast commit ineligibility reasons printed
 * by ext4_fc_info_show(). Entries are indexed 0..EXT4_FC_REASON_MAX-1 in
 * parallel with stats->fc_ineligible_reason_count[], so the order here
 * must match the corresponding EXT4_FC_REASON_* values — presumably
 * declared in ext4.h; verify before reordering.
 */
const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2114
2115int ext4_fc_info_show(struct seq_file *seq, void *v)
2116{
2117 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2118 struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2119 int i;
2120
2121 if (v != SEQ_START_TOKEN)
2122 return 0;
2123
2124 seq_printf(seq,
2125 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2126 stats->fc_num_commits, stats->fc_ineligible_commits,
2127 stats->fc_numblks,
2128 div_u64(sbi->s_fc_avg_commit_time, 1000));
2129 seq_puts(seq, "Ineligible reasons:\n");
2130 for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2131 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2132 stats->fc_ineligible_reason_count[i]);
2133
2134 return 0;
2135}
2136
aa75f4d3
HS
2137int __init ext4_fc_init_dentry_cache(void)
2138{
2139 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2140 SLAB_RECLAIM_ACCOUNT);
2141
2142 if (ext4_fc_dentry_cachep == NULL)
2143 return -ENOMEM;
2144
2145 return 0;
2146}