Commit | Line | Data |
---|---|---|
6866d7b3 HS |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | /* | |
4 | * fs/ext4/fast_commit.c | |
5 | * | |
6 | * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> | |
7 | * | |
8 | * Ext4 fast commits routines. | |
9 | */ | |
aa75f4d3 | 10 | #include "ext4.h" |
6866d7b3 | 11 | #include "ext4_jbd2.h" |
aa75f4d3 HS |
12 | #include "ext4_extents.h" |
13 | #include "mballoc.h" | |
14 | ||
15 | /* | |
16 | * Ext4 Fast Commits | |
17 | * ----------------- | |
18 | * | |
19 | * Ext4 fast commits implement fine grained journalling for Ext4. | |
20 | * | |
21 | * Fast commits are organized as a log of tag-length-value (TLV) structs. (See | |
22 | * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by | |
23 | * TLV during the recovery phase. For the scenarios for which we currently | |
24 | * don't have replay code, fast commit falls back to full commits. | |
25 | * Fast commits record delta in one of the following three categories. | |
26 | * | |
27 | * (A) Directory entry updates: | |
28 | * | |
29 | * - EXT4_FC_TAG_UNLINK - records directory entry unlink | |
30 | * - EXT4_FC_TAG_LINK - records directory entry link | |
31 | * - EXT4_FC_TAG_CREAT - records inode and directory entry creation | |
32 | * | |
33 | * (B) File specific data range updates: | |
34 | * | |
35 | * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode | |
36 | * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode | |
37 | * | |
38 | * (C) Inode metadata (mtime / ctime etc): | |
39 | * | |
40 | * - EXT4_FC_TAG_INODE - record the inode that should be replayed | |
41 | * during recovery. Note that iblocks field is | |
42 | * not replayed and instead derived during | |
43 | * replay. | |
44 | * Commit Operation | |
45 | * ---------------- | |
46 | * With fast commits, we maintain all the directory entry operations in the | |
47 | * order in which they are issued in an in-memory queue. This queue is flushed | |
48 | * to disk during the commit operation. We also maintain a list of inodes | |
49 | * that need to be committed during a fast commit in another in memory queue of | |
50 | * inodes. During the commit operation, we commit in the following order: | |
51 | * | |
52 | * [1] Lock inodes for any further data updates by setting COMMITTING state | |
53 | * [2] Submit data buffers of all the inodes | |
54 | * [3] Wait for [2] to complete | |
55 | * [4] Commit all the directory entry updates in the fast commit space | |
56 | * [5] Commit all the changed inode structures | |
57 | * [6] Write tail tag (this tag ensures the atomicity, please read the following | |
58 | * section for more details). | |
59 | * [7] Wait for [4], [5] and [6] to complete. | |
60 | * | |
61 | * All the inode updates must call ext4_fc_start_update() before starting an | |
62 | * update. If such an ongoing update is present, fast commit waits for it to | |
63 | * complete. The completion of such an update is marked by | |
64 | * ext4_fc_stop_update(). | |
65 | * | |
66 | * Fast Commit Ineligibility | |
67 | * ------------------------- | |
68 | * Not all operations are supported by fast commits today (e.g extended | |
69 | * attributes). Fast commit ineligibility is marked by calling one of the | |
70 | * two following functions: | |
71 | * | |
72 | * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall | |
73 | * back to full commit. This is useful in case of transient errors. | |
74 | * | |
75 | * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all | |
76 | * the fast commits happening between ext4_fc_start_ineligible() and | |
77 | * ext4_fc_stop_ineligible() and one fast commit after the call to | |
78 | * ext4_fc_stop_ineligible() to fall back to full commits. It is important to | |
79 | * make one more fast commit to fall back to full commit after stop call so | |
80 | * that it is guaranteed that the fast commit ineligible operation contained | |
81 | * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is | |
82 | * followed by at least 1 full commit. | |
83 | * | |
84 | * Atomicity of commits | |
85 | * -------------------- | |
a740762f | 86 | * In order to guarantee atomicity during the commit operation, fast commit |
aa75f4d3 HS |
87 | * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail |
88 | * tag contains CRC of the contents and TID of the transaction after which | |
89 | * this fast commit should be applied. Recovery code replays fast commit | |
90 | * logs only if there's at least 1 valid tail present. For every fast commit | |
91 | * operation, there is 1 tail. This means, we may end up with multiple tails | |
92 | * in the fast commit space. Here's an example: | |
93 | * | |
94 | * - Create a new file A and remove existing file B | |
95 | * - fsync() | |
96 | * - Append contents to file A | |
97 | * - Truncate file A | |
98 | * - fsync() | |
99 | * | |
100 | * The fast commit space at the end of above operations would look like this: | |
101 | * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL] | |
102 | * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->| | |
103 | * | |
104 | * Replay code should thus check for all the valid tails in the FC area. | |
105 | * | |
106 | * TODOs | |
107 | * ----- | |
108 | * 1) Make fast commit atomic updates more fine grained. Today, a fast commit | |
109 | * eligible update must be protected within ext4_fc_start_update() and | |
110 | * ext4_fc_stop_update(). These routines are called at a much higher | |
111 | * level. This can be made more fine grained by combining with | |
112 | * ext4_journal_start(). | |
113 | * | |
114 | * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() | |
115 | * | |
116 | * 3) Handle more ineligible cases. | |
117 | */ | |
118 | ||
119 | #include <trace/events/ext4.h> | |
120 | static struct kmem_cache *ext4_fc_dentry_cachep; | |
121 | ||
122 | static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | |
123 | { | |
124 | BUFFER_TRACE(bh, ""); | |
125 | if (uptodate) { | |
126 | ext4_debug("%s: Block %lld up-to-date", | |
127 | __func__, bh->b_blocknr); | |
128 | set_buffer_uptodate(bh); | |
129 | } else { | |
130 | ext4_debug("%s: Block %lld not up-to-date", | |
131 | __func__, bh->b_blocknr); | |
132 | clear_buffer_uptodate(bh); | |
133 | } | |
134 | ||
135 | unlock_buffer(bh); | |
136 | } | |
137 | ||
138 | static inline void ext4_fc_reset_inode(struct inode *inode) | |
139 | { | |
140 | struct ext4_inode_info *ei = EXT4_I(inode); | |
141 | ||
142 | ei->i_fc_lblk_start = 0; | |
143 | ei->i_fc_lblk_len = 0; | |
144 | } | |
145 | ||
/*
 * Initialize the fast commit bookkeeping of an in-memory inode: clear the
 * tracked block range and the COMMITTING state, and reset the fast commit
 * list linkage, the update wait queue and the pending-update counter.
 */
void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}
156 | ||
/*
 * Sleep until EXT4_STATE_FC_COMMITTING is cleared for @inode.
 *
 * This function must be called with sbi->s_fc_lock held; it drops the lock
 * before sleeping and does NOT retake it, so callers must re-acquire it
 * (callers typically loop and restart their check).
 */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

	/*
	 * The COMMITTING state bit is kept in i_state_flags on 32-bit
	 * builds and in i_flags on 64-bit builds, so wait on whichever
	 * word actually holds the bit.
	 */
#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Must queue ourselves before dropping the lock to avoid lost wakeups. */
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}
181 | ||
/*
 * Inform Ext4's fast commits about the start of an inode update
 *
 * This function is called by the high level call VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/* Nothing to do if fast commits are disabled or we are replaying. */
	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		/* The wait helper drops s_fc_lock, hence the full restart. */
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	/* Counted under s_fc_lock; ext4_fc_stop_update() decrements it. */
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
210 | ||
211 | /* | |
212 | * Stop inode update and wake up waiting fast commits if any. | |
213 | */ | |
214 | void ext4_fc_stop_update(struct inode *inode) | |
215 | { | |
216 | struct ext4_inode_info *ei = EXT4_I(inode); | |
217 | ||
8016e29f HS |
218 | if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || |
219 | (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) | |
aa75f4d3 HS |
220 | return; |
221 | ||
222 | if (atomic_dec_and_test(&ei->i_fc_updates)) | |
223 | wake_up_all(&ei->i_fc_wait); | |
224 | } | |
225 | ||
/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		/* The wait helper drops s_fc_lock, hence the full restart. */
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	/* list_del_init so a later list_empty() check sees it as unlinked. */
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
252 | ||
253 | /* | |
254 | * Mark file system as fast commit ineligible. This means that next commit | |
255 | * operation would result in a full jbd2 commit. | |
256 | */ | |
257 | void ext4_fc_mark_ineligible(struct super_block *sb, int reason) | |
258 | { | |
259 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
260 | ||
8016e29f HS |
261 | if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || |
262 | (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) | |
263 | return; | |
264 | ||
9b5f6c9b | 265 | ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); |
aa75f4d3 HS |
266 | WARN_ON(reason >= EXT4_FC_REASON_MAX); |
267 | sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; | |
268 | } | |
269 | ||
270 | /* | |
271 | * Start a fast commit ineligible update. Any commits that happen while | |
272 | * such an operation is in progress fall back to full commits. | |
273 | */ | |
274 | void ext4_fc_start_ineligible(struct super_block *sb, int reason) | |
275 | { | |
276 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
277 | ||
8016e29f HS |
278 | if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || |
279 | (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) | |
280 | return; | |
281 | ||
aa75f4d3 HS |
282 | WARN_ON(reason >= EXT4_FC_REASON_MAX); |
283 | sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; | |
284 | atomic_inc(&sbi->s_fc_ineligible_updates); | |
285 | } | |
286 | ||
287 | /* | |
ababea77 | 288 | * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here |
aa75f4d3 HS |
289 | * to ensure that after stopping the ineligible update, at least one full |
290 | * commit takes place. | |
291 | */ | |
292 | void ext4_fc_stop_ineligible(struct super_block *sb) | |
293 | { | |
8016e29f HS |
294 | if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || |
295 | (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) | |
296 | return; | |
297 | ||
9b5f6c9b | 298 | ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); |
aa75f4d3 HS |
299 | atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates); |
300 | } | |
301 | ||
302 | static inline int ext4_fc_is_ineligible(struct super_block *sb) | |
303 | { | |
9b5f6c9b HS |
304 | return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) || |
305 | atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates)); | |
aa75f4d3 HS |
306 | } |
307 | ||
/*
 * Generic fast commit tracking function. If this is the first time this we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		/* Same transaction as last tracked: refine existing state. */
		update = true;
	} else {
		/* New transaction: discard stale state and start fresh. */
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		/*
		 * While a fast commit is running (MF_FC_COMMITTING set), new
		 * inodes are parked on the staging queue, otherwise they go
		 * straight to the main queue.
		 */
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
360 | ||
/* Arguments handed to __track_dentry_update() via ext4_fc_track_template(). */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* dentry being created/linked/unlinked */
	int op;			/* EXT4_FC_TAG_{CREAT,LINK,UNLINK} */
};
365 | ||
/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	/* Drop i_fc_lock around allocation; it is retaken on every exit path. */
	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		/* Can't record this change: force the next commit to be full. */
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	/* Long names get a heap copy; short ones use the inline buffer. */
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	/* Staging queue while a fast commit is running, else the main queue. */
	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}
416 | ||
a80f7fcf HS |
417 | void __ext4_fc_track_unlink(handle_t *handle, |
418 | struct inode *inode, struct dentry *dentry) | |
aa75f4d3 HS |
419 | { |
420 | struct __track_dentry_update_args args; | |
421 | int ret; | |
422 | ||
423 | args.dentry = dentry; | |
424 | args.op = EXT4_FC_TAG_UNLINK; | |
425 | ||
a80f7fcf | 426 | ret = ext4_fc_track_template(handle, inode, __track_dentry_update, |
aa75f4d3 HS |
427 | (void *)&args, 0); |
428 | trace_ext4_fc_track_unlink(inode, dentry, ret); | |
429 | } | |
430 | ||
/* Convenience wrapper: track unlink of @dentry against its own inode. */
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}
435 | ||
436 | void __ext4_fc_track_link(handle_t *handle, | |
437 | struct inode *inode, struct dentry *dentry) | |
aa75f4d3 HS |
438 | { |
439 | struct __track_dentry_update_args args; | |
440 | int ret; | |
441 | ||
442 | args.dentry = dentry; | |
443 | args.op = EXT4_FC_TAG_LINK; | |
444 | ||
a80f7fcf | 445 | ret = ext4_fc_track_template(handle, inode, __track_dentry_update, |
aa75f4d3 HS |
446 | (void *)&args, 0); |
447 | trace_ext4_fc_track_link(inode, dentry, ret); | |
448 | } | |
449 | ||
/* Convenience wrapper: track link of @dentry against its own inode. */
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

455 | void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) | |
aa75f4d3 HS |
456 | { |
457 | struct __track_dentry_update_args args; | |
a80f7fcf | 458 | struct inode *inode = d_inode(dentry); |
aa75f4d3 HS |
459 | int ret; |
460 | ||
461 | args.dentry = dentry; | |
462 | args.op = EXT4_FC_TAG_CREAT; | |
463 | ||
a80f7fcf | 464 | ret = ext4_fc_track_template(handle, inode, __track_dentry_update, |
aa75f4d3 HS |
465 | (void *)&args, 0); |
466 | trace_ext4_fc_track_create(inode, dentry, ret); | |
467 | } | |
468 | ||
469 | /* __track_fn for inode tracking */ | |
470 | static int __track_inode(struct inode *inode, void *arg, bool update) | |
471 | { | |
472 | if (update) | |
473 | return -EEXIST; | |
474 | ||
475 | EXT4_I(inode)->i_fc_lblk_len = 0; | |
476 | ||
477 | return 0; | |
478 | } | |
479 | ||
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	/* Directories are skipped here; only non-directory inodes tracked. */
	if (S_ISDIR(inode->i_mode))
		return;

	/* data=journal inodes can't be fast committed: force a full commit. */
	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
			EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}
496 | ||
/* Arguments handed to __track_range(): inclusive logical block range. */
struct __track_range_args {
	ext4_lblk_t start, end;	/* inclusive [start, end] */
};
500 | ||
/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	/* Reserved/special inodes are not fast commit trackable. */
	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		/* Widen the tracked range to the union of old and new. */
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		/* First range this transaction: record it as-is. */
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}
528 | ||
a80f7fcf | 529 | void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, |
aa75f4d3 HS |
530 | ext4_lblk_t end) |
531 | { | |
532 | struct __track_range_args args; | |
533 | int ret; | |
534 | ||
535 | if (S_ISDIR(inode->i_mode)) | |
536 | return; | |
537 | ||
538 | args.start = start; | |
539 | args.end = end; | |
540 | ||
a80f7fcf | 541 | ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); |
aa75f4d3 HS |
542 | |
543 | trace_ext4_fc_track_range(inode, start, end, ret); | |
544 | } | |
545 | ||
/* Submit the current fast commit buffer as a synchronous write. */
static void ext4_fc_submit_bh(struct super_block *sb)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
	if (test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	/* The buffer is in flight now; force a fresh one on the next reserve. */
	EXT4_SB(sb)->s_fc_bh = NULL;
}
561 | ||
562 | /* Ext4 commit path routines */ | |
563 | ||
564 | /* memzero and update CRC */ | |
565 | static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, | |
566 | u32 *crc) | |
567 | { | |
568 | void *ret; | |
569 | ||
570 | ret = memset(dst, 0, len); | |
571 | if (crc) | |
572 | *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len); | |
573 | return ret; | |
574 | } | |
575 | ||
/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 * new block is from jbd2 and CRC is updated to reflect the padding
 * we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	/* Flush the padded block and start allocation in a fresh jbd2 block. */
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	/* Advance the byte count to the start of the new block, plus len. */
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
635 | ||
636 | /* memcpy to fc reserved space and update CRC */ | |
637 | static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, | |
638 | int len, u32 *crc) | |
639 | { | |
640 | if (crc) | |
641 | *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); | |
642 | return memcpy(dst, src, len); | |
643 | } | |
644 | ||
/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	/* The tail's length claims everything remaining in this block. */
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	/* Next commit starts on a fresh block boundary. */
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	/* CRC covers everything up to, but not including, the CRC field. */
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}
687 | ||
688 | /* | |
689 | * Adds tag, length, value and updates CRC. Returns true if tlv was added. | |
690 | * Returns false if there's not enough space. | |
691 | */ | |
692 | static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, | |
693 | u32 *crc) | |
694 | { | |
695 | struct ext4_fc_tl tl; | |
696 | u8 *dst; | |
697 | ||
698 | dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc); | |
699 | if (!dst) | |
700 | return false; | |
701 | ||
702 | tl.fc_tag = cpu_to_le16(tag); | |
703 | tl.fc_len = cpu_to_le16(len); | |
704 | ||
705 | ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); | |
706 | ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc); | |
707 | ||
708 | return true; | |
709 | } | |
710 | ||
/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
				   int parent_ino, int ino, int dlen,
				   const unsigned char *dname,
				   u32 *crc)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	/* Reserve space for the TLV header, dentry info and the name bytes. */
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
	fcd.fc_ino = cpu_to_le32(ino);
	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
	/* NOTE(review): dst is not read after this final increment. */
	dst += dlen;

	return true;
}
738 | ||
/*
 * Writes inode in the fast commit space under TLV with tag @tag.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	/* Large on-disk inodes also carry the extra isize area. */
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	/* Raw on-disk inode bytes follow the TLV header and inode number. */
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}
781 | ||
/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 *
 * The dirty logical-block range [i_fc_lblk_start, i_fc_lblk_start +
 * i_fc_lblk_len) accumulated on the inode is consumed here (the length is
 * reset to 0 under i_fc_lock) and emitted as a sequence of ADD_RANGE /
 * DEL_RANGE tags, one per contiguous mapped/unmapped chunk.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	/* Snapshot and clear the tracked range atomically w.r.t. new updates */
	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		/* Nothing dirty to record for this inode */
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	/* Walk the range chunk by chunk as ext4_map_blocks resolves it */
	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			/* Chunk is a hole: record it as a deleted range */
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			/* Chunk is mapped: encode it as an on-disk extent */
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}
849 | ||
850 | ||
/*
 * Submit data for all the fast commit inodes.
 *
 * Marks the filesystem as committing, then for each inode on the main fast
 * commit queue: marks it EXT4_STATE_FC_COMMITTING, waits for all in-flight
 * fast commit updates (i_fc_updates) to drain, and submits its data via
 * jbd2. s_fc_lock is dropped around sleeping/submission and reacquired, so
 * the queue may be modified concurrently; iteration relies on entries not
 * being removed from the MAIN queue while committing.
 */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		/*
		 * Wait for ongoing updates on this inode to finish. The
		 * recheck after prepare_to_wait() avoids a lost wakeup.
		 */
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		/* Drop the spinlock: jbd2_submit_inode_data() may sleep */
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
887 | ||
/*
 * Wait for completion of data for all the fast commit inodes.
 *
 * Companion of ext4_fc_submit_inode_data_all(): walks the main queue and
 * waits for the previously submitted data of every inode that is marked
 * EXT4_STATE_FC_COMMITTING. s_fc_lock is dropped around the (possibly
 * sleeping) jbd2 wait and reacquired afterwards.
 */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}
912 | ||
/*
 * Commit all the directory entry updates.
 *
 * Called with sbi->s_fc_lock held; the lock is dropped while writing TLVs
 * (which may block) and reacquired before touching the queues again, hence
 * the sparse annotations below. For CREAT entries the referenced inode and
 * its data are written first so that replay can create an unnamed inode
 * and then link it. Returns 0 on success, negative error otherwise; in
 * both cases the lock is held on return.
 */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
					fcd_list);
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			/* LINK/UNLINK: emit the dentry tag directly */
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		/* CREAT: look up the inode on the main fast commit queue */
		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find inode in our list, then it was deleted,
		 * in which case, we don't need to record it's create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
			sb, fc_dentry->fcd_op,
			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
			fc_dentry->fcd_name.len,
			fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	/* Error path: reacquire the lock the caller expects to hold */
	spin_lock(&sbi->s_fc_lock);
	return ret;
}
992 | ||
993 | static int ext4_fc_perform_commit(journal_t *journal) | |
994 | { | |
995 | struct super_block *sb = (struct super_block *)(journal->j_private); | |
996 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
997 | struct ext4_inode_info *iter; | |
998 | struct ext4_fc_head head; | |
999 | struct list_head *pos; | |
1000 | struct inode *inode; | |
1001 | struct blk_plug plug; | |
1002 | int ret = 0; | |
1003 | u32 crc = 0; | |
1004 | ||
1005 | ret = ext4_fc_submit_inode_data_all(journal); | |
1006 | if (ret) | |
1007 | return ret; | |
1008 | ||
1009 | ret = ext4_fc_wait_inode_data_all(journal); | |
1010 | if (ret) | |
1011 | return ret; | |
1012 | ||
da0c5d26 HS |
1013 | /* |
1014 | * If file system device is different from journal device, issue a cache | |
1015 | * flush before we start writing fast commit blocks. | |
1016 | */ | |
1017 | if (journal->j_fs_dev != journal->j_dev) | |
1018 | blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); | |
1019 | ||
aa75f4d3 HS |
1020 | blk_start_plug(&plug); |
1021 | if (sbi->s_fc_bytes == 0) { | |
1022 | /* | |
1023 | * Add a head tag only if this is the first fast commit | |
1024 | * in this TID. | |
1025 | */ | |
1026 | head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); | |
1027 | head.fc_tid = cpu_to_le32( | |
1028 | sbi->s_journal->j_running_transaction->t_tid); | |
1029 | if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), | |
1030 | (u8 *)&head, &crc)) | |
1031 | goto out; | |
1032 | } | |
1033 | ||
1034 | spin_lock(&sbi->s_fc_lock); | |
1035 | ret = ext4_fc_commit_dentry_updates(journal, &crc); | |
1036 | if (ret) { | |
1037 | spin_unlock(&sbi->s_fc_lock); | |
1038 | goto out; | |
1039 | } | |
1040 | ||
1041 | list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { | |
1042 | iter = list_entry(pos, struct ext4_inode_info, i_fc_list); | |
1043 | inode = &iter->vfs_inode; | |
1044 | if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) | |
1045 | continue; | |
1046 | ||
1047 | spin_unlock(&sbi->s_fc_lock); | |
1048 | ret = ext4_fc_write_inode_data(inode, &crc); | |
1049 | if (ret) | |
1050 | goto out; | |
1051 | ret = ext4_fc_write_inode(inode, &crc); | |
1052 | if (ret) | |
1053 | goto out; | |
1054 | spin_lock(&sbi->s_fc_lock); | |
aa75f4d3 HS |
1055 | } |
1056 | spin_unlock(&sbi->s_fc_lock); | |
1057 | ||
1058 | ret = ext4_fc_write_tail(sb, crc); | |
1059 | ||
1060 | out: | |
1061 | blk_finish_plug(&plug); | |
1062 | return ret; | |
1063 | } | |
1064 | ||
/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	/* Fast commit disabled or an ineligible op occurred: full commit */
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	/* Record buffer usage before/after to know how many blocks we wrote */
	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	/* Update commit statistics under the fast commit lock */
	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	/* Fall back to a full jbd2 commit when fast commit could not run */
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}
1159 | ||
ff780b91 HS |
1160 | /* |
1161 | * Fast commit cleanup routine. This is called after every fast commit and | |
1162 | * full commit. full is true if we are called after a full commit. | |
1163 | */ | |
1164 | static void ext4_fc_cleanup(journal_t *journal, int full) | |
1165 | { | |
aa75f4d3 HS |
1166 | struct super_block *sb = journal->j_private; |
1167 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
1168 | struct ext4_inode_info *iter; | |
1169 | struct ext4_fc_dentry_update *fc_dentry; | |
1170 | struct list_head *pos, *n; | |
1171 | ||
1172 | if (full && sbi->s_fc_bh) | |
1173 | sbi->s_fc_bh = NULL; | |
1174 | ||
1175 | jbd2_fc_release_bufs(journal); | |
1176 | ||
1177 | spin_lock(&sbi->s_fc_lock); | |
1178 | list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) { | |
1179 | iter = list_entry(pos, struct ext4_inode_info, i_fc_list); | |
1180 | list_del_init(&iter->i_fc_list); | |
1181 | ext4_clear_inode_state(&iter->vfs_inode, | |
1182 | EXT4_STATE_FC_COMMITTING); | |
1183 | ext4_fc_reset_inode(&iter->vfs_inode); | |
1184 | /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ | |
1185 | smp_mb(); | |
1186 | #if (BITS_PER_LONG < 64) | |
1187 | wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); | |
1188 | #else | |
1189 | wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); | |
1190 | #endif | |
1191 | } | |
1192 | ||
1193 | while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { | |
1194 | fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], | |
1195 | struct ext4_fc_dentry_update, | |
1196 | fcd_list); | |
1197 | list_del_init(&fc_dentry->fcd_list); | |
1198 | spin_unlock(&sbi->s_fc_lock); | |
1199 | ||
1200 | if (fc_dentry->fcd_name.name && | |
1201 | fc_dentry->fcd_name.len > DNAME_INLINE_LEN) | |
1202 | kfree(fc_dentry->fcd_name.name); | |
1203 | kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); | |
1204 | spin_lock(&sbi->s_fc_lock); | |
1205 | } | |
1206 | ||
1207 | list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], | |
1208 | &sbi->s_fc_dentry_q[FC_Q_MAIN]); | |
1209 | list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], | |
1210 | &sbi->s_fc_q[FC_Q_STAGING]); | |
1211 | ||
9b5f6c9b HS |
1212 | ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); |
1213 | ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); | |
aa75f4d3 HS |
1214 | |
1215 | if (full) | |
1216 | sbi->s_fc_bytes = 0; | |
1217 | spin_unlock(&sbi->s_fc_lock); | |
1218 | trace_ext4_fc_stats(sb); | |
ff780b91 | 1219 | } |
6866d7b3 | 1220 | |
8016e29f HS |
1221 | /* Ext4 Replay Path Routines */ |
1222 | ||
1223 | /* Get length of a particular tlv */ | |
1224 | static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl) | |
1225 | { | |
1226 | return le16_to_cpu(tl->fc_len); | |
1227 | } | |
1228 | ||
1229 | /* Get a pointer to "value" of a tlv */ | |
1230 | static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl) | |
1231 | { | |
1232 | return (u8 *)tl + sizeof(*tl); | |
1233 | } | |
1234 | ||
/* Helper struct for dentry replay routines */
struct dentry_info_args {
	/* parent_ino: directory inode; ino: target inode;
	 * dname_len: name length; inode_len: on-disk inode size */
	int parent_ino, dname_len, ino, inode_len;
	/* Points into the fast commit TLV buffer — not a copy */
	char *dname;
};
1240 | ||
1241 | static inline void tl_to_darg(struct dentry_info_args *darg, | |
1242 | struct ext4_fc_tl *tl) | |
1243 | { | |
1244 | struct ext4_fc_dentry_info *fcd; | |
1245 | ||
1246 | fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl); | |
1247 | ||
1248 | darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino); | |
1249 | darg->ino = le32_to_cpu(fcd->fc_ino); | |
1250 | darg->dname = fcd->fc_dname; | |
1251 | darg->dname_len = ext4_fc_tag_len(tl) - | |
1252 | sizeof(struct ext4_fc_dentry_info); | |
1253 | } | |
1254 | ||
/*
 * Unlink replay function.
 *
 * Removes the directory entry recorded in the TLV. Missing inode or
 * missing parent directory is not an error during replay (later tags or
 * prior crashes may have removed them), so those cases return 0.
 */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* entry.name aliases the TLV buffer via darg.dname */
	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT ok coz it might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}
1293 | ||
/*
 * Link @inode into its parent directory under the name in @darg, using the
 * regular namei link path (__ext4_link). Builds the required dentries with
 * d_obtain_alias()/d_alloc(). -EEXIST is tolerated: the link may already be
 * present if the directory data made it to disk before the crash, or if a
 * previous replay attempt got that far.
 *
 * Returns 0 on success (or benign failure), negative error otherwise.
 */
static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	/* d_obtain_alias() consumes the dir reference on success */
	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	/*
	 * Release whichever references we hold: the dir dentry owns the
	 * dir inode reference once d_obtain_alias() succeeded; otherwise
	 * we still own the raw inode reference.
	 */
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}
1351 | ||
1352 | /* Link replay function */ | |
1353 | static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl) | |
1354 | { | |
1355 | struct inode *inode; | |
1356 | struct dentry_info_args darg; | |
1357 | int ret = 0; | |
1358 | ||
1359 | tl_to_darg(&darg, tl); | |
1360 | trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, | |
1361 | darg.parent_ino, darg.dname_len); | |
1362 | ||
1363 | inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); | |
1364 | if (IS_ERR_OR_NULL(inode)) { | |
1365 | jbd_debug(1, "Inode not found."); | |
1366 | return 0; | |
1367 | } | |
1368 | ||
1369 | ret = ext4_fc_replay_link_internal(sb, &darg, inode); | |
1370 | iput(inode); | |
1371 | return ret; | |
1372 | } | |
1373 | ||
1374 | /* | |
1375 | * Record all the modified inodes during replay. We use this later to setup | |
1376 | * block bitmaps correctly. | |
1377 | */ | |
1378 | static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) | |
1379 | { | |
1380 | struct ext4_fc_replay_state *state; | |
1381 | int i; | |
1382 | ||
1383 | state = &EXT4_SB(sb)->s_fc_replay_state; | |
1384 | for (i = 0; i < state->fc_modified_inodes_used; i++) | |
1385 | if (state->fc_modified_inodes[i] == ino) | |
1386 | return 0; | |
1387 | if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { | |
1388 | state->fc_modified_inodes_size += | |
1389 | EXT4_FC_REPLAY_REALLOC_INCREMENT; | |
1390 | state->fc_modified_inodes = krealloc( | |
1391 | state->fc_modified_inodes, sizeof(int) * | |
1392 | state->fc_modified_inodes_size, | |
1393 | GFP_KERNEL); | |
1394 | if (!state->fc_modified_inodes) | |
1395 | return -ENOMEM; | |
1396 | } | |
1397 | state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; | |
1398 | return 0; | |
1399 | } | |
1400 | ||
1401 | /* | |
1402 | * Inode replay function | |
1403 | */ | |
1404 | static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl) | |
1405 | { | |
1406 | struct ext4_fc_inode *fc_inode; | |
1407 | struct ext4_inode *raw_inode; | |
1408 | struct ext4_inode *raw_fc_inode; | |
1409 | struct inode *inode = NULL; | |
1410 | struct ext4_iloc iloc; | |
1411 | int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag); | |
1412 | struct ext4_extent_header *eh; | |
1413 | ||
1414 | fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl); | |
1415 | ||
1416 | ino = le32_to_cpu(fc_inode->fc_ino); | |
1417 | trace_ext4_fc_replay(sb, tag, ino, 0, 0); | |
1418 | ||
1419 | inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); | |
1420 | if (!IS_ERR_OR_NULL(inode)) { | |
1421 | ext4_ext_clear_bb(inode); | |
1422 | iput(inode); | |
1423 | } | |
1424 | ||
1425 | ext4_fc_record_modified_inode(sb, ino); | |
1426 | ||
1427 | raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode; | |
1428 | ret = ext4_get_fc_inode_loc(sb, ino, &iloc); | |
1429 | if (ret) | |
1430 | goto out; | |
1431 | ||
1432 | inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode); | |
1433 | raw_inode = ext4_raw_inode(&iloc); | |
1434 | ||
1435 | memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); | |
1436 | memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation, | |
1437 | inode_len - offsetof(struct ext4_inode, i_generation)); | |
1438 | if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { | |
1439 | eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); | |
1440 | if (eh->eh_magic != EXT4_EXT_MAGIC) { | |
1441 | memset(eh, 0, sizeof(*eh)); | |
1442 | eh->eh_magic = EXT4_EXT_MAGIC; | |
1443 | eh->eh_max = cpu_to_le16( | |
1444 | (sizeof(raw_inode->i_block) - | |
1445 | sizeof(struct ext4_extent_header)) | |
1446 | / sizeof(struct ext4_extent)); | |
1447 | } | |
1448 | } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { | |
1449 | memcpy(raw_inode->i_block, raw_fc_inode->i_block, | |
1450 | sizeof(raw_inode->i_block)); | |
1451 | } | |
1452 | ||
1453 | /* Immediately update the inode on disk. */ | |
1454 | ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); | |
1455 | if (ret) | |
1456 | goto out; | |
1457 | ret = sync_dirty_buffer(iloc.bh); | |
1458 | if (ret) | |
1459 | goto out; | |
1460 | ret = ext4_mark_inode_used(sb, ino); | |
1461 | if (ret) | |
1462 | goto out; | |
1463 | ||
1464 | /* Given that we just wrote the inode on disk, this SHOULD succeed. */ | |
1465 | inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); | |
1466 | if (IS_ERR_OR_NULL(inode)) { | |
1467 | jbd_debug(1, "Inode not found."); | |
1468 | return -EFSCORRUPTED; | |
1469 | } | |
1470 | ||
1471 | /* | |
1472 | * Our allocator could have made different decisions than before | |
1473 | * crashing. This should be fixed but until then, we calculate | |
1474 | * the number of blocks the inode. | |
1475 | */ | |
1476 | ext4_ext_replay_set_iblocks(inode); | |
1477 | ||
1478 | inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); | |
1479 | ext4_reset_inode_seed(inode); | |
1480 | ||
1481 | ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); | |
1482 | ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); | |
1483 | sync_dirty_buffer(iloc.bh); | |
1484 | brelse(iloc.bh); | |
1485 | out: | |
1486 | iput(inode); | |
1487 | if (!ret) | |
1488 | blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); | |
1489 | ||
1490 | return 0; | |
1491 | } | |
1492 | ||
1493 | /* | |
1494 | * Dentry create replay function. | |
1495 | * | |
1496 | * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the | |
1497 | * inode for which we are trying to create a dentry here, should already have | |
1498 | * been replayed before we start here. | |
1499 | */ | |
1500 | static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl) | |
1501 | { | |
1502 | int ret = 0; | |
1503 | struct inode *inode = NULL; | |
1504 | struct inode *dir = NULL; | |
1505 | struct dentry_info_args darg; | |
1506 | ||
1507 | tl_to_darg(&darg, tl); | |
1508 | ||
1509 | trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, | |
1510 | darg.parent_ino, darg.dname_len); | |
1511 | ||
1512 | /* This takes care of update group descriptor and other metadata */ | |
1513 | ret = ext4_mark_inode_used(sb, darg.ino); | |
1514 | if (ret) | |
1515 | goto out; | |
1516 | ||
1517 | inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); | |
1518 | if (IS_ERR_OR_NULL(inode)) { | |
1519 | jbd_debug(1, "inode %d not found.", darg.ino); | |
1520 | inode = NULL; | |
1521 | ret = -EINVAL; | |
1522 | goto out; | |
1523 | } | |
1524 | ||
1525 | if (S_ISDIR(inode->i_mode)) { | |
1526 | /* | |
1527 | * If we are creating a directory, we need to make sure that the | |
1528 | * dot and dot dot dirents are setup properly. | |
1529 | */ | |
1530 | dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); | |
1531 | if (IS_ERR_OR_NULL(dir)) { | |
1532 | jbd_debug(1, "Dir %d not found.", darg.ino); | |
1533 | goto out; | |
1534 | } | |
1535 | ret = ext4_init_new_dir(NULL, dir, inode); | |
1536 | iput(dir); | |
1537 | if (ret) { | |
1538 | ret = 0; | |
1539 | goto out; | |
1540 | } | |
1541 | } | |
1542 | ret = ext4_fc_replay_link_internal(sb, &darg, inode); | |
1543 | if (ret) | |
1544 | goto out; | |
1545 | set_nlink(inode, 1); | |
1546 | ext4_mark_inode_dirty(NULL, inode); | |
1547 | out: | |
1548 | if (inode) | |
1549 | iput(inode); | |
1550 | return ret; | |
1551 | } | |
1552 | ||
1553 | /* | |
1554 | * Record physical disk regions which are in use as per fast commit area. Our | |
1555 | * simple replay phase allocator excludes these regions from allocation. | |
1556 | */ | |
1557 | static int ext4_fc_record_regions(struct super_block *sb, int ino, | |
1558 | ext4_lblk_t lblk, ext4_fsblk_t pblk, int len) | |
1559 | { | |
1560 | struct ext4_fc_replay_state *state; | |
1561 | struct ext4_fc_alloc_region *region; | |
1562 | ||
1563 | state = &EXT4_SB(sb)->s_fc_replay_state; | |
1564 | if (state->fc_regions_used == state->fc_regions_size) { | |
1565 | state->fc_regions_size += | |
1566 | EXT4_FC_REPLAY_REALLOC_INCREMENT; | |
1567 | state->fc_regions = krealloc( | |
1568 | state->fc_regions, | |
1569 | state->fc_regions_size * | |
1570 | sizeof(struct ext4_fc_alloc_region), | |
1571 | GFP_KERNEL); | |
1572 | if (!state->fc_regions) | |
1573 | return -ENOMEM; | |
1574 | } | |
1575 | region = &state->fc_regions[state->fc_regions_used++]; | |
1576 | region->ino = ino; | |
1577 | region->lblk = lblk; | |
1578 | region->pblk = pblk; | |
1579 | region->len = len; | |
1580 | ||
1581 | return 0; | |
1582 | } | |
1583 | ||
/*
 * Replay add range tag.
 *
 * Re-establishes the logical-to-physical mapping recorded in the TLV.
 * For each chunk of the range the current mapping is probed and one of
 * three actions is taken: insert a new extent (hole), rewrite the extent
 * to the recorded physical block (mapping moved), or just toggle the
 * written/unwritten state. Errors abort replay of this tag but return 0,
 * matching the best-effort style of the other replay routines.
 */
static int ext4_fc_replay_add_range(struct super_block *sb,
				struct ext4_fc_tl *tl)
{
	struct ext4_fc_add_range *fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			/* Insert the recorded extent for this chunk */
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	/* Trim any extents beyond i_size left over from the replay edits */
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
1712 | ||
1713 | /* Replay DEL_RANGE tag */ | |
1714 | static int | |
1715 | ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl) | |
1716 | { | |
1717 | struct inode *inode; | |
1718 | struct ext4_fc_del_range *lrange; | |
1719 | struct ext4_map_blocks map; | |
1720 | ext4_lblk_t cur, remaining; | |
1721 | int ret; | |
1722 | ||
1723 | lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl); | |
1724 | cur = le32_to_cpu(lrange->fc_lblk); | |
1725 | remaining = le32_to_cpu(lrange->fc_len); | |
1726 | ||
1727 | trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, | |
1728 | le32_to_cpu(lrange->fc_ino), cur, remaining); | |
1729 | ||
1730 | inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL); | |
1731 | if (IS_ERR_OR_NULL(inode)) { | |
1732 | jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino)); | |
1733 | return 0; | |
1734 | } | |
1735 | ||
1736 | ret = ext4_fc_record_modified_inode(sb, inode->i_ino); | |
1737 | ||
1738 | jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n", | |
1739 | inode->i_ino, le32_to_cpu(lrange->fc_lblk), | |
1740 | le32_to_cpu(lrange->fc_len)); | |
1741 | while (remaining > 0) { | |
1742 | map.m_lblk = cur; | |
1743 | map.m_len = remaining; | |
1744 | ||
1745 | ret = ext4_map_blocks(NULL, inode, &map, 0); | |
1746 | if (ret < 0) { | |
1747 | iput(inode); | |
1748 | return 0; | |
1749 | } | |
1750 | if (ret > 0) { | |
1751 | remaining -= ret; | |
1752 | cur += ret; | |
1753 | ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); | |
1754 | } else { | |
1755 | remaining -= map.m_len; | |
1756 | cur += map.m_len; | |
1757 | } | |
1758 | } | |
1759 | ||
1760 | ret = ext4_punch_hole(inode, | |
1761 | le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits, | |
1762 | le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits); | |
1763 | if (ret) | |
1764 | jbd_debug(1, "ext4_punch_hole returned %d", ret); | |
1765 | ext4_ext_replay_shrink_inode(inode, | |
1766 | i_size_read(inode) >> sb->s_blocksize_bits); | |
1767 | ext4_mark_inode_dirty(NULL, inode); | |
1768 | iput(inode); | |
1769 | ||
1770 | return 0; | |
1771 | } | |
1772 | ||
1773 | static inline const char *tag2str(u16 tag) | |
1774 | { | |
1775 | switch (tag) { | |
1776 | case EXT4_FC_TAG_LINK: | |
1777 | return "TAG_ADD_ENTRY"; | |
1778 | case EXT4_FC_TAG_UNLINK: | |
1779 | return "TAG_DEL_ENTRY"; | |
1780 | case EXT4_FC_TAG_ADD_RANGE: | |
1781 | return "TAG_ADD_RANGE"; | |
1782 | case EXT4_FC_TAG_CREAT: | |
1783 | return "TAG_CREAT_DENTRY"; | |
1784 | case EXT4_FC_TAG_DEL_RANGE: | |
1785 | return "TAG_DEL_RANGE"; | |
1786 | case EXT4_FC_TAG_INODE: | |
1787 | return "TAG_INODE"; | |
1788 | case EXT4_FC_TAG_PAD: | |
1789 | return "TAG_PAD"; | |
1790 | case EXT4_FC_TAG_TAIL: | |
1791 | return "TAG_TAIL"; | |
1792 | case EXT4_FC_TAG_HEAD: | |
1793 | return "TAG_HEAD"; | |
1794 | default: | |
1795 | return "TAG_ERROR"; | |
1796 | } | |
1797 | } | |
1798 | ||
/*
 * Post-replay fixup: walk every inode recorded in the modified-inodes
 * array and mark all of its currently mapped blocks — data blocks and
 * the extent tree blocks reachable via ext4_find_extent() — as in-use
 * in the in-memory block bitmaps.  This re-asserts allocations for
 * blocks that individual replay handlers may have marked free.
 */
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(inode)) {
			/* Inode may have been deleted during replay; skip. */
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		/* Scan the whole logical block range of the inode. */
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				/*
				 * Mark the extent tree blocks on the lookup
				 * path as allocated.
				 * NOTE(review): this marks path[0..depth-1]
				 * only — confirm the block at path[p_depth]
				 * needs no marking (root lives in the inode).
				 */
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR_OR_NULL(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				/* ret is the number of blocks mapped. */
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, 1);
			} else {
				/* Hole: advance past it (at least 1 block). */
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}
1846 | ||
1847 | /* | |
1848 | * Check if block is in excluded regions for block allocation. The simple | |
1849 | * allocator that runs during replay phase is calls this function to see | |
1850 | * if it is okay to use a block. | |
1851 | */ | |
1852 | bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) | |
1853 | { | |
1854 | int i; | |
1855 | struct ext4_fc_replay_state *state; | |
1856 | ||
1857 | state = &EXT4_SB(sb)->s_fc_replay_state; | |
1858 | for (i = 0; i < state->fc_regions_valid; i++) { | |
1859 | if (state->fc_regions[i].ino == 0 || | |
1860 | state->fc_regions[i].len == 0) | |
1861 | continue; | |
1862 | if (blk >= state->fc_regions[i].pblk && | |
1863 | blk < state->fc_regions[i].pblk + state->fc_regions[i].len) | |
1864 | return true; | |
1865 | } | |
1866 | return false; | |
1867 | } | |
1868 | ||
1869 | /* Cleanup function called after replay */ | |
1870 | void ext4_fc_replay_cleanup(struct super_block *sb) | |
1871 | { | |
1872 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
1873 | ||
1874 | sbi->s_mount_state &= ~EXT4_FC_REPLAY; | |
1875 | kfree(sbi->s_fc_replay_state.fc_regions); | |
1876 | kfree(sbi->s_fc_replay_state.fc_modified_inodes); | |
1877 | } | |
1878 | ||
/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that scan has finished and JBD2 can now start replay phase.
 * It returns a negative error to indicate that there was an error. At the end
 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
 * to indicate the number of tags that need to replayed during the replay phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range *ext;
	struct ext4_fc_tl *tl;
	struct ext4_fc_tail *tail;
	__u8 *start, *end;
	struct ext4_fc_head *head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	/* TLV stream spans this whole journal block. */
	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		/* First fast commit block: reset all scan state. */
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/*
		 * Check if we can stop early: a valid fast commit area must
		 * open with a HEAD tag.
		 * NOTE(review): this reads fc_tag straight out of the block
		 * buffer — assumes the on-disk TLV layout keeps it readable
		 * with a direct le16 load; verify on strict-alignment archs.
		 */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	/* Blocks must arrive in order; anything else means corruption. */
	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	fc_for_each_tl(start, end, tl) {
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			/*
			 * Record the referenced physical range so the replay
			 * allocator treats it as excluded.
			 */
			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
			ex = (struct ext4_extent *)&ext->fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext->fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			/* Count the tag and fold it into the running CRC. */
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		case EXT4_FC_TAG_TAIL:
			/*
			 * TAIL carries the tid and CRC of everything before
			 * it (the CRC field itself is excluded from the sum).
			 * On a match, commit the counts gathered so far.
			 */
			state->fc_cur_tag++;
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
						sizeof(*tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				/*
				 * Bad tail: stop if an earlier fast commit
				 * validated, otherwise report the CRC error.
				 */
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
			if (le32_to_cpu(head->fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				/* Unknown feature bits: cannot replay. */
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head->fc_tid) != expected_tid) {
				/* Stale fast commit from an older tid. */
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		default:
			/* Unknown tag: stop or cancel, as for a bad tail. */
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}
2005 | ||
5b849b5f HS |
/*
 * Main recovery path entry point.
 * The meaning of return codes is similar as above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl *tl;
	__u8 *start, *end;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail *tail;

	/* SCAN pass is handled entirely by the scan helper. */
	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	/* First block of the replay pass: flag that replay is in progress. */
	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		/* Scan found nothing valid to replay; just do the fixups. */
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	/* Debug knob: artificially cap how many fc blocks get replayed. */
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	/* Dispatch each TLV to its tag-specific replay handler. */
	fc_for_each_tl(start, end, tl) {
		if (state->fc_replay_num_tags == 0) {
			/* Replayed everything scan validated; finish up. */
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl->fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, tl);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, tl);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, tl);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, tl);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, tl);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, tl);
			break;
		case EXT4_FC_TAG_PAD:
			/* Padding carries no state; just trace it. */
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					ext4_fc_tag_len(tl), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			/* Tail tid was already validated during scan. */
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					ext4_fc_tag_len(tl), 0);
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			/* Scan should have rejected unknown tags already. */
			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
					ext4_fc_tag_len(tl), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}
2098 | ||
6866d7b3 HS |
2099 | void ext4_fc_init(struct super_block *sb, journal_t *journal) |
2100 | { | |
5b849b5f HS |
2101 | /* |
2102 | * We set replay callback even if fast commit disabled because we may | |
2103 | * could still have fast commit blocks that need to be replayed even if | |
2104 | * fast commit has now been turned off. | |
2105 | */ | |
2106 | journal->j_fc_replay_callback = ext4_fc_replay; | |
6866d7b3 HS |
2107 | if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) |
2108 | return; | |
ff780b91 | 2109 | journal->j_fc_cleanup_callback = ext4_fc_cleanup; |
6866d7b3 | 2110 | } |
aa75f4d3 | 2111 | |
/*
 * Printable names for fast commit ineligibility reasons, reported by
 * ext4_fc_info_show().
 * NOTE(review): indexed 0..EXT4_FC_REASON_MAX-1 by the reason code, so
 * the order here must match the EXT4_FC_REASON_* enum — verify when
 * adding or reordering entries.
 */
static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2124 | ||
2125 | int ext4_fc_info_show(struct seq_file *seq, void *v) | |
2126 | { | |
2127 | struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private); | |
2128 | struct ext4_fc_stats *stats = &sbi->s_fc_stats; | |
2129 | int i; | |
2130 | ||
2131 | if (v != SEQ_START_TOKEN) | |
2132 | return 0; | |
2133 | ||
2134 | seq_printf(seq, | |
2135 | "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", | |
2136 | stats->fc_num_commits, stats->fc_ineligible_commits, | |
2137 | stats->fc_numblks, | |
2138 | div_u64(sbi->s_fc_avg_commit_time, 1000)); | |
2139 | seq_puts(seq, "Ineligible reasons:\n"); | |
2140 | for (i = 0; i < EXT4_FC_REASON_MAX; i++) | |
2141 | seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], | |
2142 | stats->fc_ineligible_reason_count[i]); | |
2143 | ||
2144 | return 0; | |
2145 | } | |
2146 | ||
aa75f4d3 HS |
2147 | int __init ext4_fc_init_dentry_cache(void) |
2148 | { | |
2149 | ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, | |
2150 | SLAB_RECLAIM_ACCOUNT); | |
2151 | ||
2152 | if (ext4_fc_dentry_cachep == NULL) | |
2153 | return -ENOMEM; | |
2154 | ||
2155 | return 0; | |
2156 | } |