Commit | Line | Data |
---|---|---|
6866d7b3 HS |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | /* | |
4 | * fs/ext4/fast_commit.c | |
5 | * | |
6 | * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> | |
7 | * | |
8 | * Ext4 fast commits routines. | |
9 | */ | |
aa75f4d3 | 10 | #include "ext4.h" |
6866d7b3 | 11 | #include "ext4_jbd2.h" |
aa75f4d3 HS |
12 | #include "ext4_extents.h" |
13 | #include "mballoc.h" | |
14 | ||
15 | /* | |
16 | * Ext4 Fast Commits | |
17 | * ----------------- | |
18 | * | |
19 | * Ext4 fast commits implement fine grained journalling for Ext4. | |
20 | * | |
21 | * Fast commits are organized as a log of tag-length-value (TLV) structs. (See | |
22 | * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by | |
23 | * TLV during the recovery phase. For the scenarios for which we currently | |
24 | * don't have replay code, fast commit falls back to full commits. | |
25 | * Fast commits record delta in one of the following three categories. | |
26 | * | |
27 | * (A) Directory entry updates: | |
28 | * | |
29 | * - EXT4_FC_TAG_UNLINK - records directory entry unlink | |
30 | * - EXT4_FC_TAG_LINK - records directory entry link | |
31 | * - EXT4_FC_TAG_CREAT - records inode and directory entry creation | |
32 | * | |
33 | * (B) File specific data range updates: | |
34 | * | |
35 | * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode | |
36 | * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode | |
37 | * | |
38 | * (C) Inode metadata (mtime / ctime etc): | |
39 | * | |
40 | * - EXT4_FC_TAG_INODE - record the inode that should be replayed | |
41 | * during recovery. Note that iblocks field is | |
42 | * not replayed and instead derived during | |
43 | * replay. | |
44 | * Commit Operation | |
45 | * ---------------- | |
46 | * With fast commits, we maintain all the directory entry operations in the | |
47 | * order in which they are issued in an in-memory queue. This queue is flushed | |
48 | * to disk during the commit operation. We also maintain a list of inodes | |
49 | * that need to be committed during a fast commit in another in memory queue of | |
50 | * inodes. During the commit operation, we commit in the following order: | |
51 | * | |
52 | * [1] Lock inodes for any further data updates by setting COMMITTING state | |
53 | * [2] Submit data buffers of all the inodes | |
54 | * [3] Wait for [2] to complete | |
55 | * [4] Commit all the directory entry updates in the fast commit space | |
56 | * [5] Commit all the changed inode structures | |
57 | * [6] Write tail tag (this tag ensures the atomicity, please read the following | |
58 | * section for more details). | |
59 | * [7] Wait for [4], [5] and [6] to complete. | |
60 | * | |
61 | * All the inode updates must call ext4_fc_start_update() before starting an | |
62 | * update. If such an ongoing update is present, fast commit waits for it to | |
63 | * complete. The completion of such an update is marked by | |
64 | * ext4_fc_stop_update(). | |
65 | * | |
66 | * Fast Commit Ineligibility | |
67 | * ------------------------- | |
68 | * Not all operations are supported by fast commits today (e.g extended | |
69 | * attributes). Fast commit ineligibility is marked by calling one of the |
70 | * two following functions: | |
71 | * | |
72 | * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall | |
73 | * back to full commit. This is useful in case of transient errors. | |
74 | * | |
75 | * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all | |
76 | * the fast commits happening between ext4_fc_start_ineligible() and | |
77 | * ext4_fc_stop_ineligible() and one fast commit after the call to | |
78 | * ext4_fc_stop_ineligible() to fall back to full commits. It is important to | |
79 | * make one more fast commit to fall back to full commit after stop call so | |
80 | * that it is guaranteed that the fast commit ineligible operation contained |
81 | * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is | |
82 | * followed by at least 1 full commit. | |
83 | * | |
84 | * Atomicity of commits | |
85 | * -------------------- | |
a740762f | 86 | * In order to guarantee atomicity during the commit operation, fast commit |
aa75f4d3 HS |
87 | * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail |
88 | * tag contains CRC of the contents and TID of the transaction after which | |
89 | * this fast commit should be applied. Recovery code replays fast commit | |
90 | * logs only if there's at least 1 valid tail present. For every fast commit | |
91 | * operation, there is 1 tail. This means, we may end up with multiple tails | |
92 | * in the fast commit space. Here's an example: | |
93 | * | |
94 | * - Create a new file A and remove existing file B | |
95 | * - fsync() | |
96 | * - Append contents to file A | |
97 | * - Truncate file A | |
98 | * - fsync() | |
99 | * | |
100 | * The fast commit space at the end of above operations would look like this: | |
101 | * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL] | |
102 | * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->| | |
103 | * | |
104 | * Replay code should thus check for all the valid tails in the FC area. | |
105 | * | |
106 | * TODOs | |
107 | * ----- | |
108 | * 1) Make fast commit atomic updates more fine grained. Today, a fast commit | |
109 | * eligible update must be protected within ext4_fc_start_update() and | |
110 | * ext4_fc_stop_update(). These routines are called from much higher |
111 | * level functions. This can be made more fine grained by combining with |
112 | * ext4_journal_start(). | |
113 | * | |
114 | * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() | |
115 | * | |
116 | * 3) Handle more ineligible cases. | |
117 | */ | |
118 | ||
119 | #include <trace/events/ext4.h> | |
120 | static struct kmem_cache *ext4_fc_dentry_cachep; | |
121 | ||
122 | static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | |
123 | { | |
124 | BUFFER_TRACE(bh, ""); | |
125 | if (uptodate) { | |
126 | ext4_debug("%s: Block %lld up-to-date", | |
127 | __func__, bh->b_blocknr); | |
128 | set_buffer_uptodate(bh); | |
129 | } else { | |
130 | ext4_debug("%s: Block %lld not up-to-date", | |
131 | __func__, bh->b_blocknr); | |
132 | clear_buffer_uptodate(bh); | |
133 | } | |
134 | ||
135 | unlock_buffer(bh); | |
136 | } | |
137 | ||
138 | static inline void ext4_fc_reset_inode(struct inode *inode) | |
139 | { | |
140 | struct ext4_inode_info *ei = EXT4_I(inode); | |
141 | ||
142 | ei->i_fc_lblk_start = 0; | |
143 | ei->i_fc_lblk_len = 0; | |
144 | } | |
145 | ||
146 | void ext4_fc_init_inode(struct inode *inode) | |
147 | { | |
148 | struct ext4_inode_info *ei = EXT4_I(inode); | |
149 | ||
150 | ext4_fc_reset_inode(inode); | |
151 | ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); | |
152 | INIT_LIST_HEAD(&ei->i_fc_list); | |
153 | init_waitqueue_head(&ei->i_fc_wait); | |
154 | atomic_set(&ei->i_fc_updates, 0); | |
aa75f4d3 HS |
155 | } |
156 | ||
f6634e26 HS |
157 | /* This function must be called with sbi->s_fc_lock held. */ |
158 | static void ext4_fc_wait_committing_inode(struct inode *inode) | |
159 | { | |
160 | wait_queue_head_t *wq; | |
161 | struct ext4_inode_info *ei = EXT4_I(inode); | |
162 | ||
163 | #if (BITS_PER_LONG < 64) | |
164 | DEFINE_WAIT_BIT(wait, &ei->i_state_flags, | |
165 | EXT4_STATE_FC_COMMITTING); | |
166 | wq = bit_waitqueue(&ei->i_state_flags, | |
167 | EXT4_STATE_FC_COMMITTING); | |
168 | #else | |
169 | DEFINE_WAIT_BIT(wait, &ei->i_flags, | |
170 | EXT4_STATE_FC_COMMITTING); | |
171 | wq = bit_waitqueue(&ei->i_flags, | |
172 | EXT4_STATE_FC_COMMITTING); | |
173 | #endif | |
174 | lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); | |
175 | prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); | |
176 | spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); | |
177 | schedule(); | |
178 | finish_wait(wq, &wait.wq_entry); | |
179 | } | |
180 | ||
aa75f4d3 HS |
181 | /* |
182 | * Inform Ext4's fast about start of an inode update | |
183 | * | |
184 | * This function is called by the high level call VFS callbacks before | |
185 | * performing any inode update. This function blocks if there's an ongoing | |
186 | * fast commit on the inode in question. | |
187 | */ | |
188 | void ext4_fc_start_update(struct inode *inode) | |
189 | { | |
190 | struct ext4_inode_info *ei = EXT4_I(inode); | |
191 | ||
8016e29f HS |
192 | if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || |
193 | (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) | |
aa75f4d3 HS |
194 | return; |
195 | ||
196 | restart: | |
197 | spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); | |
198 | if (list_empty(&ei->i_fc_list)) | |
199 | goto out; | |
200 | ||
201 | if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { | |
f6634e26 | 202 | ext4_fc_wait_committing_inode(inode); |
aa75f4d3 HS |
203 | goto restart; |
204 | } | |
205 | out: | |
206 | atomic_inc(&ei->i_fc_updates); | |
207 | spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); | |
208 | } | |
209 | ||
210 | /* | |
211 | * Stop inode update and wake up waiting fast commits if any. | |
212 | */ | |
213 | void ext4_fc_stop_update(struct inode *inode) | |
214 | { | |
215 | struct ext4_inode_info *ei = EXT4_I(inode); | |
216 | ||
8016e29f HS |
217 | if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || |
218 | (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) | |
aa75f4d3 HS |
219 | return; |
220 | ||
221 | if (atomic_dec_and_test(&ei->i_fc_updates)) | |
222 | wake_up_all(&ei->i_fc_wait); | |
223 | } | |
224 | ||
225 | /* | |
226 | * Remove inode from fast commit list. If the inode is being committed | |
227 | * we wait until inode commit is done. | |
228 | */ | |
229 | void ext4_fc_del(struct inode *inode) | |
230 | { | |
231 | struct ext4_inode_info *ei = EXT4_I(inode); | |
232 | ||
8016e29f HS |
233 | if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || |
234 | (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) | |
aa75f4d3 HS |
235 | return; |
236 | ||
237 | restart: | |
238 | spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); | |
239 | if (list_empty(&ei->i_fc_list)) { | |
240 | spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); | |
241 | return; | |
242 | } | |
243 | ||
244 | if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { | |
f6634e26 | 245 | ext4_fc_wait_committing_inode(inode); |
aa75f4d3 HS |
246 | goto restart; |
247 | } | |
f6634e26 | 248 | list_del_init(&ei->i_fc_list); |
aa75f4d3 HS |
249 | spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); |
250 | } | |
251 | ||
252 | /* | |
253 | * Mark file system as fast commit ineligible. This means that next commit | |
254 | * operation would result in a full jbd2 commit. | |
255 | */ | |
256 | void ext4_fc_mark_ineligible(struct super_block *sb, int reason) | |
257 | { | |
258 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
259 | ||
8016e29f HS |
260 | if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || |
261 | (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) | |
262 | return; | |
263 | ||
ababea77 | 264 | sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE; |
aa75f4d3 HS |
265 | WARN_ON(reason >= EXT4_FC_REASON_MAX); |
266 | sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; | |
267 | } | |
268 | ||
269 | /* | |
270 | * Start a fast commit ineligible update. Any commits that happen while | |
271 | * such an operation is in progress fall back to full commits. | |
272 | */ | |
273 | void ext4_fc_start_ineligible(struct super_block *sb, int reason) | |
274 | { | |
275 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
276 | ||
8016e29f HS |
277 | if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || |
278 | (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) | |
279 | return; | |
280 | ||
aa75f4d3 HS |
281 | WARN_ON(reason >= EXT4_FC_REASON_MAX); |
282 | sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; | |
283 | atomic_inc(&sbi->s_fc_ineligible_updates); | |
284 | } | |
285 | ||
286 | /* | |
ababea77 | 287 | * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here |
aa75f4d3 HS |
288 | * to ensure that after stopping the ineligible update, at least one full |
289 | * commit takes place. | |
290 | */ | |
291 | void ext4_fc_stop_ineligible(struct super_block *sb) | |
292 | { | |
8016e29f HS |
293 | if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || |
294 | (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) | |
295 | return; | |
296 | ||
ababea77 | 297 | EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE; |
aa75f4d3 HS |
298 | atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates); |
299 | } | |
300 | ||
301 | static inline int ext4_fc_is_ineligible(struct super_block *sb) | |
302 | { | |
ababea77 | 303 | return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) || |
aa75f4d3 HS |
304 | atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates); |
305 | } | |
306 | ||
307 | /* | |
308 | * Generic fast commit tracking function. If this is the first time this we are | |
309 | * called after a full commit, we initialize fast commit fields and then call | |
310 | * __fc_track_fn() with update = 0. If we have already been called after a full | |
311 | * commit, we pass update = 1. Based on that, the track function can determine | |
312 | * if it needs to track a field for the first time or if it needs to just | |
313 | * update the previously tracked value. | |
314 | * | |
315 | * If enqueue is set, this function enqueues the inode in fast commit list. | |
316 | */ | |
317 | static int ext4_fc_track_template( | |
a80f7fcf HS |
318 | handle_t *handle, struct inode *inode, |
319 | int (*__fc_track_fn)(struct inode *, void *, bool), | |
aa75f4d3 HS |
320 | void *args, int enqueue) |
321 | { | |
aa75f4d3 HS |
322 | bool update = false; |
323 | struct ext4_inode_info *ei = EXT4_I(inode); | |
324 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | |
a80f7fcf | 325 | tid_t tid = 0; |
aa75f4d3 HS |
326 | int ret; |
327 | ||
8016e29f HS |
328 | if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || |
329 | (sbi->s_mount_state & EXT4_FC_REPLAY)) | |
aa75f4d3 HS |
330 | return -EOPNOTSUPP; |
331 | ||
332 | if (ext4_fc_is_ineligible(inode->i_sb)) | |
333 | return -EINVAL; | |
334 | ||
a80f7fcf | 335 | tid = handle->h_transaction->t_tid; |
aa75f4d3 | 336 | mutex_lock(&ei->i_fc_lock); |
a80f7fcf | 337 | if (tid == ei->i_sync_tid) { |
aa75f4d3 HS |
338 | update = true; |
339 | } else { | |
340 | ext4_fc_reset_inode(inode); | |
a80f7fcf | 341 | ei->i_sync_tid = tid; |
aa75f4d3 HS |
342 | } |
343 | ret = __fc_track_fn(inode, args, update); | |
344 | mutex_unlock(&ei->i_fc_lock); | |
345 | ||
346 | if (!enqueue) | |
347 | return ret; | |
348 | ||
349 | spin_lock(&sbi->s_fc_lock); | |
350 | if (list_empty(&EXT4_I(inode)->i_fc_list)) | |
351 | list_add_tail(&EXT4_I(inode)->i_fc_list, | |
ababea77 | 352 | (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ? |
aa75f4d3 HS |
353 | &sbi->s_fc_q[FC_Q_STAGING] : |
354 | &sbi->s_fc_q[FC_Q_MAIN]); | |
355 | spin_unlock(&sbi->s_fc_lock); | |
356 | ||
357 | return ret; | |
358 | } | |
359 | ||
360 | struct __track_dentry_update_args { | |
361 | struct dentry *dentry; | |
362 | int op; | |
363 | }; | |
364 | ||
365 | /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */ | |
366 | static int __track_dentry_update(struct inode *inode, void *arg, bool update) | |
367 | { | |
368 | struct ext4_fc_dentry_update *node; | |
369 | struct ext4_inode_info *ei = EXT4_I(inode); | |
370 | struct __track_dentry_update_args *dentry_update = | |
371 | (struct __track_dentry_update_args *)arg; | |
372 | struct dentry *dentry = dentry_update->dentry; | |
373 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | |
374 | ||
375 | mutex_unlock(&ei->i_fc_lock); | |
376 | node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); | |
377 | if (!node) { | |
b21ebf14 | 378 | ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); |
aa75f4d3 HS |
379 | mutex_lock(&ei->i_fc_lock); |
380 | return -ENOMEM; | |
381 | } | |
382 | ||
383 | node->fcd_op = dentry_update->op; | |
384 | node->fcd_parent = dentry->d_parent->d_inode->i_ino; | |
385 | node->fcd_ino = inode->i_ino; | |
386 | if (dentry->d_name.len > DNAME_INLINE_LEN) { | |
387 | node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); | |
388 | if (!node->fcd_name.name) { | |
389 | kmem_cache_free(ext4_fc_dentry_cachep, node); | |
390 | ext4_fc_mark_ineligible(inode->i_sb, | |
b21ebf14 | 391 | EXT4_FC_REASON_NOMEM); |
aa75f4d3 HS |
392 | mutex_lock(&ei->i_fc_lock); |
393 | return -ENOMEM; | |
394 | } | |
395 | memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, | |
396 | dentry->d_name.len); | |
397 | } else { | |
398 | memcpy(node->fcd_iname, dentry->d_name.name, | |
399 | dentry->d_name.len); | |
400 | node->fcd_name.name = node->fcd_iname; | |
401 | } | |
402 | node->fcd_name.len = dentry->d_name.len; | |
403 | ||
404 | spin_lock(&sbi->s_fc_lock); | |
ababea77 | 405 | if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) |
aa75f4d3 HS |
406 | list_add_tail(&node->fcd_list, |
407 | &sbi->s_fc_dentry_q[FC_Q_STAGING]); | |
408 | else | |
409 | list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); | |
410 | spin_unlock(&sbi->s_fc_lock); | |
411 | mutex_lock(&ei->i_fc_lock); | |
412 | ||
413 | return 0; | |
414 | } | |
415 | ||
a80f7fcf HS |
416 | void __ext4_fc_track_unlink(handle_t *handle, |
417 | struct inode *inode, struct dentry *dentry) | |
aa75f4d3 HS |
418 | { |
419 | struct __track_dentry_update_args args; | |
420 | int ret; | |
421 | ||
422 | args.dentry = dentry; | |
423 | args.op = EXT4_FC_TAG_UNLINK; | |
424 | ||
a80f7fcf | 425 | ret = ext4_fc_track_template(handle, inode, __track_dentry_update, |
aa75f4d3 HS |
426 | (void *)&args, 0); |
427 | trace_ext4_fc_track_unlink(inode, dentry, ret); | |
428 | } | |
429 | ||
a80f7fcf HS |
430 | void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) |
431 | { | |
432 | __ext4_fc_track_unlink(handle, d_inode(dentry), dentry); | |
433 | } | |
434 | ||
435 | void __ext4_fc_track_link(handle_t *handle, | |
436 | struct inode *inode, struct dentry *dentry) | |
aa75f4d3 HS |
437 | { |
438 | struct __track_dentry_update_args args; | |
439 | int ret; | |
440 | ||
441 | args.dentry = dentry; | |
442 | args.op = EXT4_FC_TAG_LINK; | |
443 | ||
a80f7fcf | 444 | ret = ext4_fc_track_template(handle, inode, __track_dentry_update, |
aa75f4d3 HS |
445 | (void *)&args, 0); |
446 | trace_ext4_fc_track_link(inode, dentry, ret); | |
447 | } | |
448 | ||
a80f7fcf HS |
449 | void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) |
450 | { | |
451 | __ext4_fc_track_link(handle, d_inode(dentry), dentry); | |
452 | } | |
453 | ||
454 | void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) | |
aa75f4d3 HS |
455 | { |
456 | struct __track_dentry_update_args args; | |
a80f7fcf | 457 | struct inode *inode = d_inode(dentry); |
aa75f4d3 HS |
458 | int ret; |
459 | ||
460 | args.dentry = dentry; | |
461 | args.op = EXT4_FC_TAG_CREAT; | |
462 | ||
a80f7fcf | 463 | ret = ext4_fc_track_template(handle, inode, __track_dentry_update, |
aa75f4d3 HS |
464 | (void *)&args, 0); |
465 | trace_ext4_fc_track_create(inode, dentry, ret); | |
466 | } | |
467 | ||
468 | /* __track_fn for inode tracking */ | |
469 | static int __track_inode(struct inode *inode, void *arg, bool update) | |
470 | { | |
471 | if (update) | |
472 | return -EEXIST; | |
473 | ||
474 | EXT4_I(inode)->i_fc_lblk_len = 0; | |
475 | ||
476 | return 0; | |
477 | } | |
478 | ||
a80f7fcf | 479 | void ext4_fc_track_inode(handle_t *handle, struct inode *inode) |
aa75f4d3 HS |
480 | { |
481 | int ret; | |
482 | ||
483 | if (S_ISDIR(inode->i_mode)) | |
484 | return; | |
485 | ||
556e0319 HS |
486 | if (ext4_should_journal_data(inode)) { |
487 | ext4_fc_mark_ineligible(inode->i_sb, | |
488 | EXT4_FC_REASON_INODE_JOURNAL_DATA); | |
489 | return; | |
490 | } | |
491 | ||
a80f7fcf | 492 | ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); |
aa75f4d3 HS |
493 | trace_ext4_fc_track_inode(inode, ret); |
494 | } | |
495 | ||
496 | struct __track_range_args { | |
497 | ext4_lblk_t start, end; | |
498 | }; | |
499 | ||
500 | /* __track_fn for tracking data updates */ | |
501 | static int __track_range(struct inode *inode, void *arg, bool update) | |
502 | { | |
503 | struct ext4_inode_info *ei = EXT4_I(inode); | |
504 | ext4_lblk_t oldstart; | |
505 | struct __track_range_args *__arg = | |
506 | (struct __track_range_args *)arg; | |
507 | ||
508 | if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { | |
509 | ext4_debug("Special inode %ld being modified\n", inode->i_ino); | |
510 | return -ECANCELED; | |
511 | } | |
512 | ||
513 | oldstart = ei->i_fc_lblk_start; | |
514 | ||
515 | if (update && ei->i_fc_lblk_len > 0) { | |
516 | ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); | |
517 | ei->i_fc_lblk_len = | |
518 | max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - | |
519 | ei->i_fc_lblk_start + 1; | |
520 | } else { | |
521 | ei->i_fc_lblk_start = __arg->start; | |
522 | ei->i_fc_lblk_len = __arg->end - __arg->start + 1; | |
523 | } | |
524 | ||
525 | return 0; | |
526 | } | |
527 | ||
a80f7fcf | 528 | void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, |
aa75f4d3 HS |
529 | ext4_lblk_t end) |
530 | { | |
531 | struct __track_range_args args; | |
532 | int ret; | |
533 | ||
534 | if (S_ISDIR(inode->i_mode)) | |
535 | return; | |
536 | ||
537 | args.start = start; | |
538 | args.end = end; | |
539 | ||
a80f7fcf | 540 | ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); |
aa75f4d3 HS |
541 | |
542 | trace_ext4_fc_track_range(inode, start, end, ret); | |
543 | } | |
544 | ||
545 | static void ext4_fc_submit_bh(struct super_block *sb) | |
546 | { | |
547 | int write_flags = REQ_SYNC; | |
548 | struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; | |
549 | ||
a740762f | 550 | /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */ |
aa75f4d3 HS |
551 | if (test_opt(sb, BARRIER)) |
552 | write_flags |= REQ_FUA | REQ_PREFLUSH; | |
553 | lock_buffer(bh); | |
764b3fd3 | 554 | set_buffer_dirty(bh); |
aa75f4d3 HS |
555 | set_buffer_uptodate(bh); |
556 | bh->b_end_io = ext4_end_buffer_io_sync; | |
557 | submit_bh(REQ_OP_WRITE, write_flags, bh); | |
558 | EXT4_SB(sb)->s_fc_bh = NULL; | |
559 | } | |
560 | ||
561 | /* Ext4 commit path routines */ | |
562 | ||
563 | /* memzero and update CRC */ | |
564 | static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, | |
565 | u32 *crc) | |
566 | { | |
567 | void *ret; | |
568 | ||
569 | ret = memset(dst, 0, len); | |
570 | if (crc) | |
571 | *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len); | |
572 | return ret; | |
573 | } | |
574 | ||
575 | /* | |
576 | * Allocate len bytes on a fast commit buffer. | |
577 | * | |
578 | * During the commit time this function is used to manage fast commit | |
579 | * block space. We don't split a fast commit log onto different | |
580 | * blocks. So this function makes sure that if there's not enough space | |
581 | * on the current block, the remaining space in the current block is | |
582 | * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, | |
583 | * new block is from jbd2 and CRC is updated to reflect the padding | |
584 | * we added. | |
585 | */ | |
586 | static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) | |
587 | { | |
588 | struct ext4_fc_tl *tl; | |
589 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
590 | struct buffer_head *bh; | |
591 | int bsize = sbi->s_journal->j_blocksize; | |
592 | int ret, off = sbi->s_fc_bytes % bsize; | |
593 | int pad_len; | |
594 | ||
595 | /* | |
596 | * After allocating len, we should have space at least for a 0 byte | |
597 | * padding. | |
598 | */ | |
599 | if (len + sizeof(struct ext4_fc_tl) > bsize) | |
600 | return NULL; | |
601 | ||
602 | if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) { | |
603 | /* | |
604 | * Only allocate from current buffer if we have enough space for | |
605 | * this request AND we have space to add a zero byte padding. | |
606 | */ | |
607 | if (!sbi->s_fc_bh) { | |
608 | ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); | |
609 | if (ret) | |
610 | return NULL; | |
611 | sbi->s_fc_bh = bh; | |
612 | } | |
613 | sbi->s_fc_bytes += len; | |
614 | return sbi->s_fc_bh->b_data + off; | |
615 | } | |
616 | /* Need to add PAD tag */ | |
617 | tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off); | |
618 | tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); | |
619 | pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl); | |
620 | tl->fc_len = cpu_to_le16(pad_len); | |
621 | if (crc) | |
622 | *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl)); | |
623 | if (pad_len > 0) | |
624 | ext4_fc_memzero(sb, tl + 1, pad_len, crc); | |
625 | ext4_fc_submit_bh(sb); | |
626 | ||
627 | ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); | |
628 | if (ret) | |
629 | return NULL; | |
630 | sbi->s_fc_bh = bh; | |
631 | sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len; | |
632 | return sbi->s_fc_bh->b_data; | |
633 | } | |
634 | ||
635 | /* memcpy to fc reserved space and update CRC */ | |
636 | static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, | |
637 | int len, u32 *crc) | |
638 | { | |
639 | if (crc) | |
640 | *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); | |
641 | return memcpy(dst, src, len); | |
642 | } | |
643 | ||
644 | /* | |
645 | * Complete a fast commit by writing tail tag. | |
646 | * | |
647 | * Writing tail tag marks the end of a fast commit. In order to guarantee | |
648 | * atomicity, after writing tail tag, even if there's space remaining | |
649 | * in the block, next commit shouldn't use it. That's why tail tag | |
650 | * has the length as that of the remaining space on the block. | |
651 | */ | |
652 | static int ext4_fc_write_tail(struct super_block *sb, u32 crc) | |
653 | { | |
654 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
655 | struct ext4_fc_tl tl; | |
656 | struct ext4_fc_tail tail; | |
657 | int off, bsize = sbi->s_journal->j_blocksize; | |
658 | u8 *dst; | |
659 | ||
660 | /* | |
661 | * ext4_fc_reserve_space takes care of allocating an extra block if | |
662 | * there's no enough space on this block for accommodating this tail. | |
663 | */ | |
664 | dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc); | |
665 | if (!dst) | |
666 | return -ENOSPC; | |
667 | ||
668 | off = sbi->s_fc_bytes % bsize; | |
669 | ||
670 | tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); | |
671 | tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); | |
672 | sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); | |
673 | ||
674 | ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc); | |
675 | dst += sizeof(tl); | |
676 | tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); | |
677 | ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc); | |
678 | dst += sizeof(tail.fc_tid); | |
679 | tail.fc_crc = cpu_to_le32(crc); | |
680 | ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); | |
681 | ||
682 | ext4_fc_submit_bh(sb); | |
683 | ||
684 | return 0; | |
685 | } | |
686 | ||
687 | /* | |
688 | * Adds tag, length, value and updates CRC. Returns true if tlv was added. | |
689 | * Returns false if there's not enough space. | |
690 | */ | |
691 | static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, | |
692 | u32 *crc) | |
693 | { | |
694 | struct ext4_fc_tl tl; | |
695 | u8 *dst; | |
696 | ||
697 | dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc); | |
698 | if (!dst) | |
699 | return false; | |
700 | ||
701 | tl.fc_tag = cpu_to_le16(tag); | |
702 | tl.fc_len = cpu_to_le16(len); | |
703 | ||
704 | ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); | |
705 | ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc); | |
706 | ||
707 | return true; | |
708 | } | |
709 | ||
710 | /* Same as above, but adds dentry tlv. */ | |
711 | static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag, | |
712 | int parent_ino, int ino, int dlen, | |
713 | const unsigned char *dname, | |
714 | u32 *crc) | |
715 | { | |
716 | struct ext4_fc_dentry_info fcd; | |
717 | struct ext4_fc_tl tl; | |
718 | u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen, | |
719 | crc); | |
720 | ||
721 | if (!dst) | |
722 | return false; | |
723 | ||
724 | fcd.fc_parent_ino = cpu_to_le32(parent_ino); | |
725 | fcd.fc_ino = cpu_to_le32(ino); | |
726 | tl.fc_tag = cpu_to_le16(tag); | |
727 | tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); | |
728 | ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); | |
729 | dst += sizeof(tl); | |
730 | ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); | |
731 | dst += sizeof(fcd); | |
732 | ext4_fc_memcpy(sb, dst, dname, dlen, crc); | |
733 | dst += dlen; | |
734 | ||
735 | return true; | |
736 | } | |
737 | ||
738 | /* | |
739 | * Writes inode in the fast commit space under TLV with tag @tag. | |
740 | * Returns 0 on success, error on failure. | |
741 | */ | |
742 | static int ext4_fc_write_inode(struct inode *inode, u32 *crc) | |
743 | { | |
744 | struct ext4_inode_info *ei = EXT4_I(inode); | |
745 | int inode_len = EXT4_GOOD_OLD_INODE_SIZE; | |
746 | int ret; | |
747 | struct ext4_iloc iloc; | |
748 | struct ext4_fc_inode fc_inode; | |
749 | struct ext4_fc_tl tl; | |
750 | u8 *dst; | |
751 | ||
752 | ret = ext4_get_inode_loc(inode, &iloc); | |
753 | if (ret) | |
754 | return ret; | |
755 | ||
756 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) | |
757 | inode_len += ei->i_extra_isize; | |
758 | ||
759 | fc_inode.fc_ino = cpu_to_le32(inode->i_ino); | |
760 | tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); | |
761 | tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); | |
762 | ||
763 | dst = ext4_fc_reserve_space(inode->i_sb, | |
764 | sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc); | |
765 | if (!dst) | |
766 | return -ECANCELED; | |
767 | ||
768 | if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc)) | |
769 | return -ECANCELED; | |
770 | dst += sizeof(tl); | |
771 | if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc)) | |
772 | return -ECANCELED; | |
773 | dst += sizeof(fc_inode); | |
774 | if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc), | |
775 | inode_len, crc)) | |
776 | return -ECANCELED; | |
777 | ||
778 | return 0; | |
779 | } | |
780 | ||
/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	/*
	 * Snapshot the tracked dirty logical-block range and reset it under
	 * i_fc_lock, so updates racing with this commit start accumulating a
	 * fresh range.
	 */
	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		/* Nothing dirty to log for this inode. */
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		__func__, cur_lblk_off, new_blk_size, inode->i_ino);

	/*
	 * Walk the snapshotted range one mapping lookup at a time, emitting a
	 * DEL_RANGE tag for unmapped chunks and an ADD_RANGE tag (carrying a
	 * full on-disk extent) for mapped ones.
	 */
	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			/* No mapping info at all; skip this block. */
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			/* Hole: record the deletion of these blocks. */
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			/*
			 * Mapped extent: log it verbatim, preserving the
			 * written/unwritten state from the mapping flags.
			 */
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}
848 | ||
849 | ||
/*
 * Submit data for all the fast commit inodes.
 *
 * Marks each inode on the main fast-commit queue as COMMITTING, waits for
 * its in-flight updates to drain, and submits its data via jbd2.
 * Returns 0 on success or the first jbd2_submit_inode_data() error.
 */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING;
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		/*
		 * Wait for in-flight updates (i_fc_updates) on this inode to
		 * drain. The classic prepare_to_wait/recheck/schedule pattern
		 * avoids missing a wakeup; s_fc_lock is dropped only while
		 * actually sleeping.
		 */
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		/*
		 * NOTE(review): s_fc_lock is dropped around the jbd2 submit,
		 * so this assumes the current list node stays linked while
		 * the commit is in progress — confirm against the removal
		 * paths.
		 */
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
886 | ||
/*
 * Wait for completion of data for all the fast commit inodes.
 *
 * Only inodes previously marked EXT4_STATE_FC_COMMITTING (by
 * ext4_fc_submit_inode_data_all()) are waited on. Returns 0 on success or
 * the first jbd2_wait_inode_data() error.
 */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		/* Drop the spinlock around the (sleeping) jbd2 wait. */
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}
911 | ||
/*
 * Commit all the directory entry updates.
 *
 * Called with sbi->s_fc_lock held (see ext4_fc_perform_commit()); the lock
 * is dropped around every on-disk TLV write and re-acquired afterwards, and
 * is held again on return regardless of success or failure.
 */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
				       fcd_list);
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			/* Link/unlink: a bare dentry tag is sufficient. */
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		/* CREAT: locate the in-memory inode for this ino. */
		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
			sb, fc_dentry->fcd_op,
			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
			fc_dentry->fcd_name.len,
			fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	/* Error paths arrive with the lock dropped; restore caller's state. */
	spin_lock(&sbi->s_fc_lock);
	return ret;
}
989 | ||
990 | static int ext4_fc_perform_commit(journal_t *journal) | |
991 | { | |
992 | struct super_block *sb = (struct super_block *)(journal->j_private); | |
993 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
994 | struct ext4_inode_info *iter; | |
995 | struct ext4_fc_head head; | |
996 | struct list_head *pos; | |
997 | struct inode *inode; | |
998 | struct blk_plug plug; | |
999 | int ret = 0; | |
1000 | u32 crc = 0; | |
1001 | ||
1002 | ret = ext4_fc_submit_inode_data_all(journal); | |
1003 | if (ret) | |
1004 | return ret; | |
1005 | ||
1006 | ret = ext4_fc_wait_inode_data_all(journal); | |
1007 | if (ret) | |
1008 | return ret; | |
1009 | ||
1010 | blk_start_plug(&plug); | |
1011 | if (sbi->s_fc_bytes == 0) { | |
1012 | /* | |
1013 | * Add a head tag only if this is the first fast commit | |
1014 | * in this TID. | |
1015 | */ | |
1016 | head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); | |
1017 | head.fc_tid = cpu_to_le32( | |
1018 | sbi->s_journal->j_running_transaction->t_tid); | |
1019 | if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), | |
1020 | (u8 *)&head, &crc)) | |
1021 | goto out; | |
1022 | } | |
1023 | ||
1024 | spin_lock(&sbi->s_fc_lock); | |
1025 | ret = ext4_fc_commit_dentry_updates(journal, &crc); | |
1026 | if (ret) { | |
1027 | spin_unlock(&sbi->s_fc_lock); | |
1028 | goto out; | |
1029 | } | |
1030 | ||
1031 | list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { | |
1032 | iter = list_entry(pos, struct ext4_inode_info, i_fc_list); | |
1033 | inode = &iter->vfs_inode; | |
1034 | if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) | |
1035 | continue; | |
1036 | ||
1037 | spin_unlock(&sbi->s_fc_lock); | |
1038 | ret = ext4_fc_write_inode_data(inode, &crc); | |
1039 | if (ret) | |
1040 | goto out; | |
1041 | ret = ext4_fc_write_inode(inode, &crc); | |
1042 | if (ret) | |
1043 | goto out; | |
1044 | spin_lock(&sbi->s_fc_lock); | |
aa75f4d3 HS |
1045 | } |
1046 | spin_unlock(&sbi->s_fc_lock); | |
1047 | ||
1048 | ret = ext4_fc_write_tail(sb, crc); | |
1049 | ||
1050 | out: | |
1051 | blk_finish_plug(&plug); | |
1052 | return ret; | |
1053 | } | |
1054 | ||
/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	/* Fast commits must be enabled and the fs currently eligible. */
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/*
		 * There was an ongoing commit; retry only if our subtid has
		 * not advanced and the target TID is still uncommitted.
		 */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	/* Record buffer usage before, so nblks below is this commit's delta. */
	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	/* On failure or ineligibility, fall back to a full jbd2 commit. */
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}
1149 | ||
ff780b91 HS |
1150 | /* |
1151 | * Fast commit cleanup routine. This is called after every fast commit and | |
1152 | * full commit. full is true if we are called after a full commit. | |
1153 | */ | |
1154 | static void ext4_fc_cleanup(journal_t *journal, int full) | |
1155 | { | |
aa75f4d3 HS |
1156 | struct super_block *sb = journal->j_private; |
1157 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
1158 | struct ext4_inode_info *iter; | |
1159 | struct ext4_fc_dentry_update *fc_dentry; | |
1160 | struct list_head *pos, *n; | |
1161 | ||
1162 | if (full && sbi->s_fc_bh) | |
1163 | sbi->s_fc_bh = NULL; | |
1164 | ||
1165 | jbd2_fc_release_bufs(journal); | |
1166 | ||
1167 | spin_lock(&sbi->s_fc_lock); | |
1168 | list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) { | |
1169 | iter = list_entry(pos, struct ext4_inode_info, i_fc_list); | |
1170 | list_del_init(&iter->i_fc_list); | |
1171 | ext4_clear_inode_state(&iter->vfs_inode, | |
1172 | EXT4_STATE_FC_COMMITTING); | |
1173 | ext4_fc_reset_inode(&iter->vfs_inode); | |
1174 | /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ | |
1175 | smp_mb(); | |
1176 | #if (BITS_PER_LONG < 64) | |
1177 | wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); | |
1178 | #else | |
1179 | wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); | |
1180 | #endif | |
1181 | } | |
1182 | ||
1183 | while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { | |
1184 | fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], | |
1185 | struct ext4_fc_dentry_update, | |
1186 | fcd_list); | |
1187 | list_del_init(&fc_dentry->fcd_list); | |
1188 | spin_unlock(&sbi->s_fc_lock); | |
1189 | ||
1190 | if (fc_dentry->fcd_name.name && | |
1191 | fc_dentry->fcd_name.len > DNAME_INLINE_LEN) | |
1192 | kfree(fc_dentry->fcd_name.name); | |
1193 | kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); | |
1194 | spin_lock(&sbi->s_fc_lock); | |
1195 | } | |
1196 | ||
1197 | list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], | |
1198 | &sbi->s_fc_dentry_q[FC_Q_MAIN]); | |
1199 | list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], | |
1200 | &sbi->s_fc_q[FC_Q_STAGING]); | |
1201 | ||
ababea77 HS |
1202 | sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING; |
1203 | sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE; | |
aa75f4d3 HS |
1204 | |
1205 | if (full) | |
1206 | sbi->s_fc_bytes = 0; | |
1207 | spin_unlock(&sbi->s_fc_lock); | |
1208 | trace_ext4_fc_stats(sb); | |
ff780b91 | 1209 | } |
6866d7b3 | 1210 | |
8016e29f HS |
1211 | /* Ext4 Replay Path Routines */ |
1212 | ||
1213 | /* Get length of a particular tlv */ | |
1214 | static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl) | |
1215 | { | |
1216 | return le16_to_cpu(tl->fc_len); | |
1217 | } | |
1218 | ||
1219 | /* Get a pointer to "value" of a tlv */ | |
1220 | static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl) | |
1221 | { | |
1222 | return (u8 *)tl + sizeof(*tl); | |
1223 | } | |
1224 | ||
/*
 * Helper struct for dentry replay routines: decoded form of an
 * ext4_fc_dentry_info TLV payload (see tl_to_darg()).
 */
struct dentry_info_args {
	/* parent dir ino, name length, target ino, on-disk inode length */
	int parent_ino, dname_len, ino, inode_len;
	char *dname;	/* dentry name; points into the TLV payload */
};
1230 | ||
1231 | static inline void tl_to_darg(struct dentry_info_args *darg, | |
1232 | struct ext4_fc_tl *tl) | |
1233 | { | |
1234 | struct ext4_fc_dentry_info *fcd; | |
1235 | ||
1236 | fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl); | |
1237 | ||
1238 | darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino); | |
1239 | darg->ino = le32_to_cpu(fcd->fc_ino); | |
1240 | darg->dname = fcd->fc_dname; | |
1241 | darg->dname_len = ext4_fc_tag_len(tl) - | |
1242 | sizeof(struct ext4_fc_dentry_info); | |
1243 | } | |
1244 | ||
/*
 * Unlink replay function.
 *
 * Removes the directory entry named in the tag from its parent directory.
 * Missing inodes or parents are treated as "already gone" and return 0.
 */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* Build a qstr referencing the name inside the TLV payload. */
	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT ok coz it might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}
1283 | ||
/*
 * Link @inode into its parent directory under the name described by @darg.
 *
 * Builds a dentry for the parent and the new name, then calls
 * __ext4_link(). -EEXIST is tolerated because the link may have already
 * been persisted (or replayed) before a crash. Returns 0 on success or
 * when the parent is missing; negative error otherwise.
 */
static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	/* d_obtain_alias() consumes the dir reference on success. */
	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	/* dir's reference is owned by dentry_dir once the alias exists. */
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}
1341 | ||
1342 | /* Link replay function */ | |
1343 | static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl) | |
1344 | { | |
1345 | struct inode *inode; | |
1346 | struct dentry_info_args darg; | |
1347 | int ret = 0; | |
1348 | ||
1349 | tl_to_darg(&darg, tl); | |
1350 | trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, | |
1351 | darg.parent_ino, darg.dname_len); | |
1352 | ||
1353 | inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); | |
1354 | if (IS_ERR_OR_NULL(inode)) { | |
1355 | jbd_debug(1, "Inode not found."); | |
1356 | return 0; | |
1357 | } | |
1358 | ||
1359 | ret = ext4_fc_replay_link_internal(sb, &darg, inode); | |
1360 | iput(inode); | |
1361 | return ret; | |
1362 | } | |
1363 | ||
1364 | /* | |
1365 | * Record all the modified inodes during replay. We use this later to setup | |
1366 | * block bitmaps correctly. | |
1367 | */ | |
1368 | static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) | |
1369 | { | |
1370 | struct ext4_fc_replay_state *state; | |
1371 | int i; | |
1372 | ||
1373 | state = &EXT4_SB(sb)->s_fc_replay_state; | |
1374 | for (i = 0; i < state->fc_modified_inodes_used; i++) | |
1375 | if (state->fc_modified_inodes[i] == ino) | |
1376 | return 0; | |
1377 | if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { | |
1378 | state->fc_modified_inodes_size += | |
1379 | EXT4_FC_REPLAY_REALLOC_INCREMENT; | |
1380 | state->fc_modified_inodes = krealloc( | |
1381 | state->fc_modified_inodes, sizeof(int) * | |
1382 | state->fc_modified_inodes_size, | |
1383 | GFP_KERNEL); | |
1384 | if (!state->fc_modified_inodes) | |
1385 | return -ENOMEM; | |
1386 | } | |
1387 | state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; | |
1388 | return 0; | |
1389 | } | |
1390 | ||
1391 | /* | |
1392 | * Inode replay function | |
1393 | */ | |
1394 | static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl) | |
1395 | { | |
1396 | struct ext4_fc_inode *fc_inode; | |
1397 | struct ext4_inode *raw_inode; | |
1398 | struct ext4_inode *raw_fc_inode; | |
1399 | struct inode *inode = NULL; | |
1400 | struct ext4_iloc iloc; | |
1401 | int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag); | |
1402 | struct ext4_extent_header *eh; | |
1403 | ||
1404 | fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl); | |
1405 | ||
1406 | ino = le32_to_cpu(fc_inode->fc_ino); | |
1407 | trace_ext4_fc_replay(sb, tag, ino, 0, 0); | |
1408 | ||
1409 | inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); | |
1410 | if (!IS_ERR_OR_NULL(inode)) { | |
1411 | ext4_ext_clear_bb(inode); | |
1412 | iput(inode); | |
1413 | } | |
1414 | ||
1415 | ext4_fc_record_modified_inode(sb, ino); | |
1416 | ||
1417 | raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode; | |
1418 | ret = ext4_get_fc_inode_loc(sb, ino, &iloc); | |
1419 | if (ret) | |
1420 | goto out; | |
1421 | ||
1422 | inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode); | |
1423 | raw_inode = ext4_raw_inode(&iloc); | |
1424 | ||
1425 | memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); | |
1426 | memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation, | |
1427 | inode_len - offsetof(struct ext4_inode, i_generation)); | |
1428 | if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { | |
1429 | eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); | |
1430 | if (eh->eh_magic != EXT4_EXT_MAGIC) { | |
1431 | memset(eh, 0, sizeof(*eh)); | |
1432 | eh->eh_magic = EXT4_EXT_MAGIC; | |
1433 | eh->eh_max = cpu_to_le16( | |
1434 | (sizeof(raw_inode->i_block) - | |
1435 | sizeof(struct ext4_extent_header)) | |
1436 | / sizeof(struct ext4_extent)); | |
1437 | } | |
1438 | } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { | |
1439 | memcpy(raw_inode->i_block, raw_fc_inode->i_block, | |
1440 | sizeof(raw_inode->i_block)); | |
1441 | } | |
1442 | ||
1443 | /* Immediately update the inode on disk. */ | |
1444 | ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); | |
1445 | if (ret) | |
1446 | goto out; | |
1447 | ret = sync_dirty_buffer(iloc.bh); | |
1448 | if (ret) | |
1449 | goto out; | |
1450 | ret = ext4_mark_inode_used(sb, ino); | |
1451 | if (ret) | |
1452 | goto out; | |
1453 | ||
1454 | /* Given that we just wrote the inode on disk, this SHOULD succeed. */ | |
1455 | inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); | |
1456 | if (IS_ERR_OR_NULL(inode)) { | |
1457 | jbd_debug(1, "Inode not found."); | |
1458 | return -EFSCORRUPTED; | |
1459 | } | |
1460 | ||
1461 | /* | |
1462 | * Our allocator could have made different decisions than before | |
1463 | * crashing. This should be fixed but until then, we calculate | |
1464 | * the number of blocks the inode. | |
1465 | */ | |
1466 | ext4_ext_replay_set_iblocks(inode); | |
1467 | ||
1468 | inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); | |
1469 | ext4_reset_inode_seed(inode); | |
1470 | ||
1471 | ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); | |
1472 | ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); | |
1473 | sync_dirty_buffer(iloc.bh); | |
1474 | brelse(iloc.bh); | |
1475 | out: | |
1476 | iput(inode); | |
1477 | if (!ret) | |
1478 | blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); | |
1479 | ||
1480 | return 0; | |
1481 | } | |
1482 | ||
/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
 * inode for which we are trying to create a dentry here, should already have
 * been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of update group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are setup properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		/*
		 * NOTE(review): a failure from ext4_init_new_dir() is
		 * deliberately swallowed (ret reset to 0) — presumably
		 * best-effort replay; confirm this is intentional.
		 */
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}
1542 | ||
1543 | /* | |
1544 | * Record physical disk regions which are in use as per fast commit area. Our | |
1545 | * simple replay phase allocator excludes these regions from allocation. | |
1546 | */ | |
1547 | static int ext4_fc_record_regions(struct super_block *sb, int ino, | |
1548 | ext4_lblk_t lblk, ext4_fsblk_t pblk, int len) | |
1549 | { | |
1550 | struct ext4_fc_replay_state *state; | |
1551 | struct ext4_fc_alloc_region *region; | |
1552 | ||
1553 | state = &EXT4_SB(sb)->s_fc_replay_state; | |
1554 | if (state->fc_regions_used == state->fc_regions_size) { | |
1555 | state->fc_regions_size += | |
1556 | EXT4_FC_REPLAY_REALLOC_INCREMENT; | |
1557 | state->fc_regions = krealloc( | |
1558 | state->fc_regions, | |
1559 | state->fc_regions_size * | |
1560 | sizeof(struct ext4_fc_alloc_region), | |
1561 | GFP_KERNEL); | |
1562 | if (!state->fc_regions) | |
1563 | return -ENOMEM; | |
1564 | } | |
1565 | region = &state->fc_regions[state->fc_regions_used++]; | |
1566 | region->ino = ino; | |
1567 | region->lblk = lblk; | |
1568 | region->pblk = pblk; | |
1569 | region->len = len; | |
1570 | ||
1571 | return 0; | |
1572 | } | |
1573 | ||
/*
 * Replay add range tag.
 *
 * Walks the logical range recorded in the tag and reconciles the on-disk
 * extent tree with it: inserts extents for unmapped chunks, rewrites
 * mappings whose physical location changed, and toggles the
 * written/unwritten state where only that differs. Replay errors are
 * swallowed (returns 0) so the rest of the replay can proceed.
 */
static int ext4_fc_replay_add_range(struct super_block *sb,
				struct ext4_fc_tl *tl)
{
	struct ext4_fc_add_range *fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	/* NOTE(review): return value ignored here — -ENOMEM goes unnoticed. */
	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			/* Insert the recorded mapping for this chunk. */
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
				ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
1702 | ||
1703 | /* Replay DEL_RANGE tag */ | |
1704 | static int | |
1705 | ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl) | |
1706 | { | |
1707 | struct inode *inode; | |
1708 | struct ext4_fc_del_range *lrange; | |
1709 | struct ext4_map_blocks map; | |
1710 | ext4_lblk_t cur, remaining; | |
1711 | int ret; | |
1712 | ||
1713 | lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl); | |
1714 | cur = le32_to_cpu(lrange->fc_lblk); | |
1715 | remaining = le32_to_cpu(lrange->fc_len); | |
1716 | ||
1717 | trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, | |
1718 | le32_to_cpu(lrange->fc_ino), cur, remaining); | |
1719 | ||
1720 | inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL); | |
1721 | if (IS_ERR_OR_NULL(inode)) { | |
1722 | jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino)); | |
1723 | return 0; | |
1724 | } | |
1725 | ||
1726 | ret = ext4_fc_record_modified_inode(sb, inode->i_ino); | |
1727 | ||
1728 | jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n", | |
1729 | inode->i_ino, le32_to_cpu(lrange->fc_lblk), | |
1730 | le32_to_cpu(lrange->fc_len)); | |
1731 | while (remaining > 0) { | |
1732 | map.m_lblk = cur; | |
1733 | map.m_len = remaining; | |
1734 | ||
1735 | ret = ext4_map_blocks(NULL, inode, &map, 0); | |
1736 | if (ret < 0) { | |
1737 | iput(inode); | |
1738 | return 0; | |
1739 | } | |
1740 | if (ret > 0) { | |
1741 | remaining -= ret; | |
1742 | cur += ret; | |
1743 | ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); | |
1744 | } else { | |
1745 | remaining -= map.m_len; | |
1746 | cur += map.m_len; | |
1747 | } | |
1748 | } | |
1749 | ||
1750 | ret = ext4_punch_hole(inode, | |
1751 | le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits, | |
1752 | le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits); | |
1753 | if (ret) | |
1754 | jbd_debug(1, "ext4_punch_hole returned %d", ret); | |
1755 | ext4_ext_replay_shrink_inode(inode, | |
1756 | i_size_read(inode) >> sb->s_blocksize_bits); | |
1757 | ext4_mark_inode_dirty(NULL, inode); | |
1758 | iput(inode); | |
1759 | ||
1760 | return 0; | |
1761 | } | |
1762 | ||
1763 | static inline const char *tag2str(u16 tag) | |
1764 | { | |
1765 | switch (tag) { | |
1766 | case EXT4_FC_TAG_LINK: | |
1767 | return "TAG_ADD_ENTRY"; | |
1768 | case EXT4_FC_TAG_UNLINK: | |
1769 | return "TAG_DEL_ENTRY"; | |
1770 | case EXT4_FC_TAG_ADD_RANGE: | |
1771 | return "TAG_ADD_RANGE"; | |
1772 | case EXT4_FC_TAG_CREAT: | |
1773 | return "TAG_CREAT_DENTRY"; | |
1774 | case EXT4_FC_TAG_DEL_RANGE: | |
1775 | return "TAG_DEL_RANGE"; | |
1776 | case EXT4_FC_TAG_INODE: | |
1777 | return "TAG_INODE"; | |
1778 | case EXT4_FC_TAG_PAD: | |
1779 | return "TAG_PAD"; | |
1780 | case EXT4_FC_TAG_TAIL: | |
1781 | return "TAG_TAIL"; | |
1782 | case EXT4_FC_TAG_HEAD: | |
1783 | return "TAG_HEAD"; | |
1784 | default: | |
1785 | return "TAG_ERROR"; | |
1786 | } | |
1787 | } | |
1788 | ||
/*
 * Final replay pass: walk every inode recorded as modified during replay and
 * mark all of its currently-mapped blocks -- both data blocks and the extent
 * tree index blocks leading to them -- as in-use in the block bitmaps.
 * Individual replay handlers may have freed blocks that are still referenced
 * elsewhere; this pass re-establishes the authoritative allocated state.
 */
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(inode)) {
			/* Inode deleted during replay; nothing to mark. */
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		/* Scan the whole logical block space of the inode. */
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				/*
				 * Mark the extent tree index blocks on the
				 * path to this extent as allocated, then the
				 * extent's data blocks themselves.
				 */
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR_OR_NULL(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
					map.m_len, 1);
			} else {
				/*
				 * Hole: advance past it (at least one block,
				 * in case m_len came back zero).
				 */
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}
1836 | ||
1837 | /* | |
1838 | * Check if block is in excluded regions for block allocation. The simple | |
1839 | * allocator that runs during replay phase is calls this function to see | |
1840 | * if it is okay to use a block. | |
1841 | */ | |
1842 | bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) | |
1843 | { | |
1844 | int i; | |
1845 | struct ext4_fc_replay_state *state; | |
1846 | ||
1847 | state = &EXT4_SB(sb)->s_fc_replay_state; | |
1848 | for (i = 0; i < state->fc_regions_valid; i++) { | |
1849 | if (state->fc_regions[i].ino == 0 || | |
1850 | state->fc_regions[i].len == 0) | |
1851 | continue; | |
1852 | if (blk >= state->fc_regions[i].pblk && | |
1853 | blk < state->fc_regions[i].pblk + state->fc_regions[i].len) | |
1854 | return true; | |
1855 | } | |
1856 | return false; | |
1857 | } | |
1858 | ||
1859 | /* Cleanup function called after replay */ | |
1860 | void ext4_fc_replay_cleanup(struct super_block *sb) | |
1861 | { | |
1862 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
1863 | ||
1864 | sbi->s_mount_state &= ~EXT4_FC_REPLAY; | |
1865 | kfree(sbi->s_fc_replay_state.fc_regions); | |
1866 | kfree(sbi->s_fc_replay_state.fc_modified_inodes); | |
1867 | } | |
1868 | ||
/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that scan has finished and JBD2 can now start replay phase.
 * It returns a negative error to indicate that there was an error. At the end
 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is
 * set to indicate the number of tags that need to be replayed during the
 * replay phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range *ext;
	struct ext4_fc_tl *tl;
	struct ext4_fc_tail *tail;
	__u8 *start, *end;
	struct ext4_fc_head *head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	/* First fast commit block of this scan: reset all per-scan state. */
	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	/* Fast commit blocks must arrive strictly in order. */
	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	fc_for_each_tl(start, end, tl) {
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			/*
			 * Record the physical range so the replay-phase
			 * allocator treats it as excluded; then fall through
			 * to the common tag accounting below.
			 */
			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
			ex = (struct ext4_extent *)&ext->fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext->fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			/* Count the tag and fold it into the running CRC. */
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		case EXT4_FC_TAG_TAIL:
			/*
			 * Tail carries the tid and the CRC of everything up
			 * to (but excluding) its own fc_crc field.  A match
			 * commits the tag count and exclusion regions seen
			 * so far; a mismatch either stops the scan (if an
			 * earlier tail already validated) or fails it.
			 */
			state->fc_cur_tag++;
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
						sizeof(*tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			/* Head must advertise supported features and tid. */
			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
			if (le32_to_cpu(head->fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head->fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		default:
			/* Unknown tag: stop or cancel depending on progress. */
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}
1995 | ||
5b849b5f HS |
/*
 * Main recovery path entry point, registered as the JBD2 fast commit replay
 * callback.  Dispatches to the scan handler during PASS_SCAN and to the
 * per-tag replay handlers during the replay pass.
 * The meaning of return codes is similar to ext4_fc_replay_scan() above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl *tl;
	__u8 *start, *end;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail *tail;

	/* Scan pass is handled entirely by the scan handler. */
	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	/* First block of a new pass: flag that replay mode is active. */
	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		/* Scan found nothing valid; just finalize the bitmaps. */
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	/* Debug knob: artificially cap how many fc blocks get replayed. */
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	/* Replay every TLV in this block, up to the count scan validated. */
	fc_for_each_tl(start, end, tl) {
		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl->fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, tl);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, tl);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, tl);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, tl);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, tl);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, tl);
			break;
		case EXT4_FC_TAG_PAD:
			/* Padding carries no state; just trace it. */
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     ext4_fc_tag_len(tl), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			/* Tail tid was already CRC-validated during scan. */
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					     ext4_fc_tag_len(tl), 0);
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
					     ext4_fc_tag_len(tl), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}
2088 | ||
6866d7b3 HS |
2089 | void ext4_fc_init(struct super_block *sb, journal_t *journal) |
2090 | { | |
5b849b5f HS |
2091 | /* |
2092 | * We set replay callback even if fast commit disabled because we may | |
2093 | * could still have fast commit blocks that need to be replayed even if | |
2094 | * fast commit has now been turned off. | |
2095 | */ | |
2096 | journal->j_fc_replay_callback = ext4_fc_replay; | |
6866d7b3 HS |
2097 | if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) |
2098 | return; | |
ff780b91 | 2099 | journal->j_fc_cleanup_callback = ext4_fc_cleanup; |
6866d7b3 | 2100 | } |
aa75f4d3 | 2101 | |
ce8c59d1 HS |
/*
 * Human-readable descriptions of why a fast commit was declared ineligible,
 * printed by ext4_fc_info_show().  NOTE(review): entries are indexed by the
 * EXT4_FC_REASON_* values and iterated up to EXT4_FC_REASON_MAX, so the
 * order and count here must stay in sync with that enum — confirm when
 * adding a reason.
 */
const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2114 | ||
2115 | int ext4_fc_info_show(struct seq_file *seq, void *v) | |
2116 | { | |
2117 | struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private); | |
2118 | struct ext4_fc_stats *stats = &sbi->s_fc_stats; | |
2119 | int i; | |
2120 | ||
2121 | if (v != SEQ_START_TOKEN) | |
2122 | return 0; | |
2123 | ||
2124 | seq_printf(seq, | |
2125 | "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", | |
2126 | stats->fc_num_commits, stats->fc_ineligible_commits, | |
2127 | stats->fc_numblks, | |
2128 | div_u64(sbi->s_fc_avg_commit_time, 1000)); | |
2129 | seq_puts(seq, "Ineligible reasons:\n"); | |
2130 | for (i = 0; i < EXT4_FC_REASON_MAX; i++) | |
2131 | seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], | |
2132 | stats->fc_ineligible_reason_count[i]); | |
2133 | ||
2134 | return 0; | |
2135 | } | |
2136 | ||
aa75f4d3 HS |
2137 | int __init ext4_fc_init_dentry_cache(void) |
2138 | { | |
2139 | ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, | |
2140 | SLAB_RECLAIM_ACCOUNT); | |
2141 | ||
2142 | if (ext4_fc_dentry_cachep == NULL) | |
2143 | return -ENOMEM; | |
2144 | ||
2145 | return 0; | |
2146 | } |