Commit | Line | Data |
---|---|---|
6866d7b3 HS |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | /* | |
4 | * fs/ext4/fast_commit.c | |
5 | * | |
6 | * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> | |
7 | * | |
8 | * Ext4 fast commits routines. | |
9 | */ | |
aa75f4d3 | 10 | #include "ext4.h" |
6866d7b3 | 11 | #include "ext4_jbd2.h" |
aa75f4d3 HS |
12 | #include "ext4_extents.h" |
13 | #include "mballoc.h" | |
14 | ||
15 | /* | |
16 | * Ext4 Fast Commits | |
17 | * ----------------- | |
18 | * | |
19 | * Ext4 fast commits implement fine grained journalling for Ext4. | |
20 | * | |
21 | * Fast commits are organized as a log of tag-length-value (TLV) structs. (See | |
22 | * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by | |
23 | * TLV during the recovery phase. For the scenarios for which we currently | |
24 | * don't have replay code, fast commit falls back to full commits. | |
25 | * Fast commits record delta in one of the following three categories. | |
26 | * | |
27 | * (A) Directory entry updates: | |
28 | * | |
29 | * - EXT4_FC_TAG_UNLINK - records directory entry unlink | |
30 | * - EXT4_FC_TAG_LINK - records directory entry link | |
31 | * - EXT4_FC_TAG_CREAT - records inode and directory entry creation | |
32 | * | |
33 | * (B) File specific data range updates: | |
34 | * | |
35 | * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode | |
36 | * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode | |
37 | * | |
38 | * (C) Inode metadata (mtime / ctime etc): | |
39 | * | |
40 | * - EXT4_FC_TAG_INODE - record the inode that should be replayed | |
41 | * during recovery. Note that iblocks field is | |
42 | * not replayed and instead derived during | |
43 | * replay. | |
44 | * Commit Operation | |
45 | * ---------------- | |
46 | * With fast commits, we maintain all the directory entry operations in the | |
47 | * order in which they are issued in an in-memory queue. This queue is flushed | |
48 | * to disk during the commit operation. We also maintain a list of inodes | |
49 | * that need to be committed during a fast commit in another in memory queue of | |
50 | * inodes. During the commit operation, we commit in the following order: | |
51 | * | |
52 | * [1] Lock inodes for any further data updates by setting COMMITTING state | |
53 | * [2] Submit data buffers of all the inodes | |
54 | * [3] Wait for [2] to complete | |
55 | * [4] Commit all the directory entry updates in the fast commit space | |
56 | * [5] Commit all the changed inode structures | |
57 | * [6] Write tail tag (this tag ensures the atomicity, please read the following | |
58 | * section for more details). | |
59 | * [7] Wait for [4], [5] and [6] to complete. | |
60 | * | |
61 | * All the inode updates must call ext4_fc_start_update() before starting an | |
62 | * update. If such an ongoing update is present, fast commit waits for it to | |
63 | * complete. The completion of such an update is marked by | |
64 | * ext4_fc_stop_update(). | |
65 | * | |
66 | * Fast Commit Ineligibility | |
67 | * ------------------------- | |
68 | * Not all operations are supported by fast commits today (e.g extended | |
69 | * attributes). Fast commit ineligibility is marked by calling one of the | |
70 | * two following functions: | |
71 | * | |
72 | * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall | |
73 | * back to full commit. This is useful in case of transient errors. | |
74 | * | |
75 | * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all | |
76 | * the fast commits happening between ext4_fc_start_ineligible() and | |
77 | * ext4_fc_stop_ineligible() and one fast commit after the call to | |
78 | * ext4_fc_stop_ineligible() to fall back to full commits. It is important to | |
79 | * make one more fast commit to fall back to full commit after stop call so | |
80 | * that it is guaranteed that the fast commit ineligible operation contained | |
81 | * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is | |
82 | * followed by at least 1 full commit. | |
83 | * | |
84 | * Atomicity of commits | |
85 | * -------------------- | |
86 | * In order to guarantee atomicity during the commit operation, fast commit | |
87 | * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail | |
88 | * tag contains CRC of the contents and TID of the transaction after which | |
89 | * this fast commit should be applied. Recovery code replays fast commit | |
90 | * logs only if there's at least 1 valid tail present. For every fast commit | |
91 | * operation, there is 1 tail. This means, we may end up with multiple tails | |
92 | * in the fast commit space. Here's an example: | |
93 | * | |
94 | * - Create a new file A and remove existing file B | |
95 | * - fsync() | |
96 | * - Append contents to file A | |
97 | * - Truncate file A | |
98 | * - fsync() | |
99 | * | |
100 | * The fast commit space at the end of above operations would look like this: | |
101 | * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL] | |
102 | * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->| | |
103 | * | |
104 | * Replay code should thus check for all the valid tails in the FC area. | |
105 | * | |
106 | * TODOs | |
107 | * ----- | |
108 | * 1) Make fast commit atomic updates more fine grained. Today, a fast commit | |
109 | * eligible update must be protected within ext4_fc_start_update() and | |
110 | * ext4_fc_stop_update(). These routines are called at much higher | |
111 | * routines. This can be made more fine grained by combining with | |
112 | * ext4_journal_start(). | |
113 | * | |
114 | * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() | |
115 | * | |
116 | * 3) Handle more ineligible cases. | |
117 | */ | |
118 | ||
119 | #include <trace/events/ext4.h> | |
/* Slab cache from which struct ext4_fc_dentry_update nodes are allocated. */
static struct kmem_cache *ext4_fc_dentry_cachep;
121 | ||
122 | static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | |
123 | { | |
124 | BUFFER_TRACE(bh, ""); | |
125 | if (uptodate) { | |
126 | ext4_debug("%s: Block %lld up-to-date", | |
127 | __func__, bh->b_blocknr); | |
128 | set_buffer_uptodate(bh); | |
129 | } else { | |
130 | ext4_debug("%s: Block %lld not up-to-date", | |
131 | __func__, bh->b_blocknr); | |
132 | clear_buffer_uptodate(bh); | |
133 | } | |
134 | ||
135 | unlock_buffer(bh); | |
136 | } | |
137 | ||
138 | static inline void ext4_fc_reset_inode(struct inode *inode) | |
139 | { | |
140 | struct ext4_inode_info *ei = EXT4_I(inode); | |
141 | ||
142 | ei->i_fc_lblk_start = 0; | |
143 | ei->i_fc_lblk_len = 0; | |
144 | } | |
145 | ||
146 | void ext4_fc_init_inode(struct inode *inode) | |
147 | { | |
148 | struct ext4_inode_info *ei = EXT4_I(inode); | |
149 | ||
150 | ext4_fc_reset_inode(inode); | |
151 | ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); | |
152 | INIT_LIST_HEAD(&ei->i_fc_list); | |
153 | init_waitqueue_head(&ei->i_fc_wait); | |
154 | atomic_set(&ei->i_fc_updates, 0); | |
155 | ei->i_fc_committed_subtid = 0; | |
156 | } | |
157 | ||
158 | /* | |
159 | * Inform Ext4's fast about start of an inode update | |
160 | * | |
161 | * This function is called by the high level call VFS callbacks before | |
162 | * performing any inode update. This function blocks if there's an ongoing | |
163 | * fast commit on the inode in question. | |
164 | */ | |
165 | void ext4_fc_start_update(struct inode *inode) | |
166 | { | |
167 | struct ext4_inode_info *ei = EXT4_I(inode); | |
168 | ||
169 | if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) | |
170 | return; | |
171 | ||
172 | restart: | |
173 | spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); | |
174 | if (list_empty(&ei->i_fc_list)) | |
175 | goto out; | |
176 | ||
177 | if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { | |
178 | wait_queue_head_t *wq; | |
179 | #if (BITS_PER_LONG < 64) | |
180 | DEFINE_WAIT_BIT(wait, &ei->i_state_flags, | |
181 | EXT4_STATE_FC_COMMITTING); | |
182 | wq = bit_waitqueue(&ei->i_state_flags, | |
183 | EXT4_STATE_FC_COMMITTING); | |
184 | #else | |
185 | DEFINE_WAIT_BIT(wait, &ei->i_flags, | |
186 | EXT4_STATE_FC_COMMITTING); | |
187 | wq = bit_waitqueue(&ei->i_flags, | |
188 | EXT4_STATE_FC_COMMITTING); | |
189 | #endif | |
190 | prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); | |
191 | spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); | |
192 | schedule(); | |
193 | finish_wait(wq, &wait.wq_entry); | |
194 | goto restart; | |
195 | } | |
196 | out: | |
197 | atomic_inc(&ei->i_fc_updates); | |
198 | spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); | |
199 | } | |
200 | ||
201 | /* | |
202 | * Stop inode update and wake up waiting fast commits if any. | |
203 | */ | |
204 | void ext4_fc_stop_update(struct inode *inode) | |
205 | { | |
206 | struct ext4_inode_info *ei = EXT4_I(inode); | |
207 | ||
208 | if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) | |
209 | return; | |
210 | ||
211 | if (atomic_dec_and_test(&ei->i_fc_updates)) | |
212 | wake_up_all(&ei->i_fc_wait); | |
213 | } | |
214 | ||
215 | /* | |
216 | * Remove inode from fast commit list. If the inode is being committed | |
217 | * we wait until inode commit is done. | |
218 | */ | |
219 | void ext4_fc_del(struct inode *inode) | |
220 | { | |
221 | struct ext4_inode_info *ei = EXT4_I(inode); | |
222 | ||
223 | if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) | |
224 | return; | |
225 | ||
226 | ||
227 | if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) | |
228 | return; | |
229 | ||
230 | restart: | |
231 | spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); | |
232 | if (list_empty(&ei->i_fc_list)) { | |
233 | spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); | |
234 | return; | |
235 | } | |
236 | ||
237 | if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { | |
238 | wait_queue_head_t *wq; | |
239 | #if (BITS_PER_LONG < 64) | |
240 | DEFINE_WAIT_BIT(wait, &ei->i_state_flags, | |
241 | EXT4_STATE_FC_COMMITTING); | |
242 | wq = bit_waitqueue(&ei->i_state_flags, | |
243 | EXT4_STATE_FC_COMMITTING); | |
244 | #else | |
245 | DEFINE_WAIT_BIT(wait, &ei->i_flags, | |
246 | EXT4_STATE_FC_COMMITTING); | |
247 | wq = bit_waitqueue(&ei->i_flags, | |
248 | EXT4_STATE_FC_COMMITTING); | |
249 | #endif | |
250 | prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); | |
251 | spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); | |
252 | schedule(); | |
253 | finish_wait(wq, &wait.wq_entry); | |
254 | goto restart; | |
255 | } | |
256 | if (!list_empty(&ei->i_fc_list)) | |
257 | list_del_init(&ei->i_fc_list); | |
258 | spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); | |
259 | } | |
260 | ||
261 | /* | |
262 | * Mark file system as fast commit ineligible. This means that next commit | |
263 | * operation would result in a full jbd2 commit. | |
264 | */ | |
265 | void ext4_fc_mark_ineligible(struct super_block *sb, int reason) | |
266 | { | |
267 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
268 | ||
269 | sbi->s_mount_state |= EXT4_FC_INELIGIBLE; | |
270 | WARN_ON(reason >= EXT4_FC_REASON_MAX); | |
271 | sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; | |
272 | } | |
273 | ||
274 | /* | |
275 | * Start a fast commit ineligible update. Any commits that happen while | |
276 | * such an operation is in progress fall back to full commits. | |
277 | */ | |
278 | void ext4_fc_start_ineligible(struct super_block *sb, int reason) | |
279 | { | |
280 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
281 | ||
282 | WARN_ON(reason >= EXT4_FC_REASON_MAX); | |
283 | sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; | |
284 | atomic_inc(&sbi->s_fc_ineligible_updates); | |
285 | } | |
286 | ||
287 | /* | |
288 | * Stop a fast commit ineligible update. We set EXT4_FC_INELIGIBLE flag here | |
289 | * to ensure that after stopping the ineligible update, at least one full | |
290 | * commit takes place. | |
291 | */ | |
292 | void ext4_fc_stop_ineligible(struct super_block *sb) | |
293 | { | |
294 | EXT4_SB(sb)->s_mount_state |= EXT4_FC_INELIGIBLE; | |
295 | atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates); | |
296 | } | |
297 | ||
298 | static inline int ext4_fc_is_ineligible(struct super_block *sb) | |
299 | { | |
300 | return (EXT4_SB(sb)->s_mount_state & EXT4_FC_INELIGIBLE) || | |
301 | atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates); | |
302 | } | |
303 | ||
304 | /* | |
305 | * Generic fast commit tracking function. If this is the first time this we are | |
306 | * called after a full commit, we initialize fast commit fields and then call | |
307 | * __fc_track_fn() with update = 0. If we have already been called after a full | |
308 | * commit, we pass update = 1. Based on that, the track function can determine | |
309 | * if it needs to track a field for the first time or if it needs to just | |
310 | * update the previously tracked value. | |
311 | * | |
312 | * If enqueue is set, this function enqueues the inode in fast commit list. | |
313 | */ | |
314 | static int ext4_fc_track_template( | |
315 | struct inode *inode, int (*__fc_track_fn)(struct inode *, void *, bool), | |
316 | void *args, int enqueue) | |
317 | { | |
318 | tid_t running_txn_tid; | |
319 | bool update = false; | |
320 | struct ext4_inode_info *ei = EXT4_I(inode); | |
321 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | |
322 | int ret; | |
323 | ||
324 | if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) | |
325 | return -EOPNOTSUPP; | |
326 | ||
327 | if (ext4_fc_is_ineligible(inode->i_sb)) | |
328 | return -EINVAL; | |
329 | ||
330 | running_txn_tid = sbi->s_journal ? | |
331 | sbi->s_journal->j_commit_sequence + 1 : 0; | |
332 | ||
333 | mutex_lock(&ei->i_fc_lock); | |
334 | if (running_txn_tid == ei->i_sync_tid) { | |
335 | update = true; | |
336 | } else { | |
337 | ext4_fc_reset_inode(inode); | |
338 | ei->i_sync_tid = running_txn_tid; | |
339 | } | |
340 | ret = __fc_track_fn(inode, args, update); | |
341 | mutex_unlock(&ei->i_fc_lock); | |
342 | ||
343 | if (!enqueue) | |
344 | return ret; | |
345 | ||
346 | spin_lock(&sbi->s_fc_lock); | |
347 | if (list_empty(&EXT4_I(inode)->i_fc_list)) | |
348 | list_add_tail(&EXT4_I(inode)->i_fc_list, | |
349 | (sbi->s_mount_state & EXT4_FC_COMMITTING) ? | |
350 | &sbi->s_fc_q[FC_Q_STAGING] : | |
351 | &sbi->s_fc_q[FC_Q_MAIN]); | |
352 | spin_unlock(&sbi->s_fc_lock); | |
353 | ||
354 | return ret; | |
355 | } | |
356 | ||
/* Arguments handed to __track_dentry_update() through the void *arg slot. */
struct __track_dentry_update_args {
	/* Directory entry the operation applies to. */
	struct dentry *dentry;
	/* Fast commit tag of the operation (EXT4_FC_TAG_UNLINK/LINK/CREAT). */
	int op;
};
361 | ||
362 | /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */ | |
363 | static int __track_dentry_update(struct inode *inode, void *arg, bool update) | |
364 | { | |
365 | struct ext4_fc_dentry_update *node; | |
366 | struct ext4_inode_info *ei = EXT4_I(inode); | |
367 | struct __track_dentry_update_args *dentry_update = | |
368 | (struct __track_dentry_update_args *)arg; | |
369 | struct dentry *dentry = dentry_update->dentry; | |
370 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | |
371 | ||
372 | mutex_unlock(&ei->i_fc_lock); | |
373 | node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); | |
374 | if (!node) { | |
375 | ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MEM); | |
376 | mutex_lock(&ei->i_fc_lock); | |
377 | return -ENOMEM; | |
378 | } | |
379 | ||
380 | node->fcd_op = dentry_update->op; | |
381 | node->fcd_parent = dentry->d_parent->d_inode->i_ino; | |
382 | node->fcd_ino = inode->i_ino; | |
383 | if (dentry->d_name.len > DNAME_INLINE_LEN) { | |
384 | node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); | |
385 | if (!node->fcd_name.name) { | |
386 | kmem_cache_free(ext4_fc_dentry_cachep, node); | |
387 | ext4_fc_mark_ineligible(inode->i_sb, | |
388 | EXT4_FC_REASON_MEM); | |
389 | mutex_lock(&ei->i_fc_lock); | |
390 | return -ENOMEM; | |
391 | } | |
392 | memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, | |
393 | dentry->d_name.len); | |
394 | } else { | |
395 | memcpy(node->fcd_iname, dentry->d_name.name, | |
396 | dentry->d_name.len); | |
397 | node->fcd_name.name = node->fcd_iname; | |
398 | } | |
399 | node->fcd_name.len = dentry->d_name.len; | |
400 | ||
401 | spin_lock(&sbi->s_fc_lock); | |
402 | if (sbi->s_mount_state & EXT4_FC_COMMITTING) | |
403 | list_add_tail(&node->fcd_list, | |
404 | &sbi->s_fc_dentry_q[FC_Q_STAGING]); | |
405 | else | |
406 | list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); | |
407 | spin_unlock(&sbi->s_fc_lock); | |
408 | mutex_lock(&ei->i_fc_lock); | |
409 | ||
410 | return 0; | |
411 | } | |
412 | ||
413 | void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry) | |
414 | { | |
415 | struct __track_dentry_update_args args; | |
416 | int ret; | |
417 | ||
418 | args.dentry = dentry; | |
419 | args.op = EXT4_FC_TAG_UNLINK; | |
420 | ||
421 | ret = ext4_fc_track_template(inode, __track_dentry_update, | |
422 | (void *)&args, 0); | |
423 | trace_ext4_fc_track_unlink(inode, dentry, ret); | |
424 | } | |
425 | ||
426 | void ext4_fc_track_link(struct inode *inode, struct dentry *dentry) | |
427 | { | |
428 | struct __track_dentry_update_args args; | |
429 | int ret; | |
430 | ||
431 | args.dentry = dentry; | |
432 | args.op = EXT4_FC_TAG_LINK; | |
433 | ||
434 | ret = ext4_fc_track_template(inode, __track_dentry_update, | |
435 | (void *)&args, 0); | |
436 | trace_ext4_fc_track_link(inode, dentry, ret); | |
437 | } | |
438 | ||
439 | void ext4_fc_track_create(struct inode *inode, struct dentry *dentry) | |
440 | { | |
441 | struct __track_dentry_update_args args; | |
442 | int ret; | |
443 | ||
444 | args.dentry = dentry; | |
445 | args.op = EXT4_FC_TAG_CREAT; | |
446 | ||
447 | ret = ext4_fc_track_template(inode, __track_dentry_update, | |
448 | (void *)&args, 0); | |
449 | trace_ext4_fc_track_create(inode, dentry, ret); | |
450 | } | |
451 | ||
452 | /* __track_fn for inode tracking */ | |
453 | static int __track_inode(struct inode *inode, void *arg, bool update) | |
454 | { | |
455 | if (update) | |
456 | return -EEXIST; | |
457 | ||
458 | EXT4_I(inode)->i_fc_lblk_len = 0; | |
459 | ||
460 | return 0; | |
461 | } | |
462 | ||
463 | void ext4_fc_track_inode(struct inode *inode) | |
464 | { | |
465 | int ret; | |
466 | ||
467 | if (S_ISDIR(inode->i_mode)) | |
468 | return; | |
469 | ||
470 | ret = ext4_fc_track_template(inode, __track_inode, NULL, 1); | |
471 | trace_ext4_fc_track_inode(inode, ret); | |
472 | } | |
473 | ||
/*
 * Arguments handed to __track_range() through the void *arg slot: the first
 * and last logical block of the range (both inclusive, since the tracked
 * length is computed as end - start + 1).
 */
struct __track_range_args {
	ext4_lblk_t start, end;
};
477 | ||
478 | /* __track_fn for tracking data updates */ | |
479 | static int __track_range(struct inode *inode, void *arg, bool update) | |
480 | { | |
481 | struct ext4_inode_info *ei = EXT4_I(inode); | |
482 | ext4_lblk_t oldstart; | |
483 | struct __track_range_args *__arg = | |
484 | (struct __track_range_args *)arg; | |
485 | ||
486 | if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { | |
487 | ext4_debug("Special inode %ld being modified\n", inode->i_ino); | |
488 | return -ECANCELED; | |
489 | } | |
490 | ||
491 | oldstart = ei->i_fc_lblk_start; | |
492 | ||
493 | if (update && ei->i_fc_lblk_len > 0) { | |
494 | ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); | |
495 | ei->i_fc_lblk_len = | |
496 | max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - | |
497 | ei->i_fc_lblk_start + 1; | |
498 | } else { | |
499 | ei->i_fc_lblk_start = __arg->start; | |
500 | ei->i_fc_lblk_len = __arg->end - __arg->start + 1; | |
501 | } | |
502 | ||
503 | return 0; | |
504 | } | |
505 | ||
506 | void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start, | |
507 | ext4_lblk_t end) | |
508 | { | |
509 | struct __track_range_args args; | |
510 | int ret; | |
511 | ||
512 | if (S_ISDIR(inode->i_mode)) | |
513 | return; | |
514 | ||
515 | args.start = start; | |
516 | args.end = end; | |
517 | ||
518 | ret = ext4_fc_track_template(inode, __track_range, &args, 1); | |
519 | ||
520 | trace_ext4_fc_track_range(inode, start, end, ret); | |
521 | } | |
522 | ||
523 | static void ext4_fc_submit_bh(struct super_block *sb) | |
524 | { | |
525 | int write_flags = REQ_SYNC; | |
526 | struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; | |
527 | ||
528 | if (test_opt(sb, BARRIER)) | |
529 | write_flags |= REQ_FUA | REQ_PREFLUSH; | |
530 | lock_buffer(bh); | |
531 | clear_buffer_dirty(bh); | |
532 | set_buffer_uptodate(bh); | |
533 | bh->b_end_io = ext4_end_buffer_io_sync; | |
534 | submit_bh(REQ_OP_WRITE, write_flags, bh); | |
535 | EXT4_SB(sb)->s_fc_bh = NULL; | |
536 | } | |
537 | ||
538 | /* Ext4 commit path routines */ | |
539 | ||
540 | /* memzero and update CRC */ | |
541 | static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, | |
542 | u32 *crc) | |
543 | { | |
544 | void *ret; | |
545 | ||
546 | ret = memset(dst, 0, len); | |
547 | if (crc) | |
548 | *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len); | |
549 | return ret; | |
550 | } | |
551 | ||
552 | /* | |
553 | * Allocate len bytes on a fast commit buffer. | |
554 | * | |
555 | * During the commit time this function is used to manage fast commit | |
556 | * block space. We don't split a fast commit log onto different | |
557 | * blocks. So this function makes sure that if there's not enough space | |
558 | * on the current block, the remaining space in the current block is | |
559 | * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, | |
560 | * new block is from jbd2 and CRC is updated to reflect the padding | |
561 | * we added. | |
562 | */ | |
563 | static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) | |
564 | { | |
565 | struct ext4_fc_tl *tl; | |
566 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
567 | struct buffer_head *bh; | |
568 | int bsize = sbi->s_journal->j_blocksize; | |
569 | int ret, off = sbi->s_fc_bytes % bsize; | |
570 | int pad_len; | |
571 | ||
572 | /* | |
573 | * After allocating len, we should have space at least for a 0 byte | |
574 | * padding. | |
575 | */ | |
576 | if (len + sizeof(struct ext4_fc_tl) > bsize) | |
577 | return NULL; | |
578 | ||
579 | if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) { | |
580 | /* | |
581 | * Only allocate from current buffer if we have enough space for | |
582 | * this request AND we have space to add a zero byte padding. | |
583 | */ | |
584 | if (!sbi->s_fc_bh) { | |
585 | ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); | |
586 | if (ret) | |
587 | return NULL; | |
588 | sbi->s_fc_bh = bh; | |
589 | } | |
590 | sbi->s_fc_bytes += len; | |
591 | return sbi->s_fc_bh->b_data + off; | |
592 | } | |
593 | /* Need to add PAD tag */ | |
594 | tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off); | |
595 | tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); | |
596 | pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl); | |
597 | tl->fc_len = cpu_to_le16(pad_len); | |
598 | if (crc) | |
599 | *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl)); | |
600 | if (pad_len > 0) | |
601 | ext4_fc_memzero(sb, tl + 1, pad_len, crc); | |
602 | ext4_fc_submit_bh(sb); | |
603 | ||
604 | ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); | |
605 | if (ret) | |
606 | return NULL; | |
607 | sbi->s_fc_bh = bh; | |
608 | sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len; | |
609 | return sbi->s_fc_bh->b_data; | |
610 | } | |
611 | ||
612 | /* memcpy to fc reserved space and update CRC */ | |
613 | static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, | |
614 | int len, u32 *crc) | |
615 | { | |
616 | if (crc) | |
617 | *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); | |
618 | return memcpy(dst, src, len); | |
619 | } | |
620 | ||
621 | /* | |
622 | * Complete a fast commit by writing tail tag. | |
623 | * | |
624 | * Writing tail tag marks the end of a fast commit. In order to guarantee | |
625 | * atomicity, after writing tail tag, even if there's space remaining | |
626 | * in the block, next commit shouldn't use it. That's why tail tag | |
627 | * has the length as that of the remaining space on the block. | |
628 | */ | |
629 | static int ext4_fc_write_tail(struct super_block *sb, u32 crc) | |
630 | { | |
631 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
632 | struct ext4_fc_tl tl; | |
633 | struct ext4_fc_tail tail; | |
634 | int off, bsize = sbi->s_journal->j_blocksize; | |
635 | u8 *dst; | |
636 | ||
637 | /* | |
638 | * ext4_fc_reserve_space takes care of allocating an extra block if | |
639 | * there's no enough space on this block for accommodating this tail. | |
640 | */ | |
641 | dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc); | |
642 | if (!dst) | |
643 | return -ENOSPC; | |
644 | ||
645 | off = sbi->s_fc_bytes % bsize; | |
646 | ||
647 | tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); | |
648 | tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); | |
649 | sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); | |
650 | ||
651 | ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc); | |
652 | dst += sizeof(tl); | |
653 | tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); | |
654 | ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc); | |
655 | dst += sizeof(tail.fc_tid); | |
656 | tail.fc_crc = cpu_to_le32(crc); | |
657 | ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); | |
658 | ||
659 | ext4_fc_submit_bh(sb); | |
660 | ||
661 | return 0; | |
662 | } | |
663 | ||
664 | /* | |
665 | * Adds tag, length, value and updates CRC. Returns true if tlv was added. | |
666 | * Returns false if there's not enough space. | |
667 | */ | |
668 | static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, | |
669 | u32 *crc) | |
670 | { | |
671 | struct ext4_fc_tl tl; | |
672 | u8 *dst; | |
673 | ||
674 | dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc); | |
675 | if (!dst) | |
676 | return false; | |
677 | ||
678 | tl.fc_tag = cpu_to_le16(tag); | |
679 | tl.fc_len = cpu_to_le16(len); | |
680 | ||
681 | ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); | |
682 | ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc); | |
683 | ||
684 | return true; | |
685 | } | |
686 | ||
687 | /* Same as above, but adds dentry tlv. */ | |
688 | static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag, | |
689 | int parent_ino, int ino, int dlen, | |
690 | const unsigned char *dname, | |
691 | u32 *crc) | |
692 | { | |
693 | struct ext4_fc_dentry_info fcd; | |
694 | struct ext4_fc_tl tl; | |
695 | u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen, | |
696 | crc); | |
697 | ||
698 | if (!dst) | |
699 | return false; | |
700 | ||
701 | fcd.fc_parent_ino = cpu_to_le32(parent_ino); | |
702 | fcd.fc_ino = cpu_to_le32(ino); | |
703 | tl.fc_tag = cpu_to_le16(tag); | |
704 | tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); | |
705 | ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); | |
706 | dst += sizeof(tl); | |
707 | ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); | |
708 | dst += sizeof(fcd); | |
709 | ext4_fc_memcpy(sb, dst, dname, dlen, crc); | |
710 | dst += dlen; | |
711 | ||
712 | return true; | |
713 | } | |
714 | ||
715 | /* | |
716 | * Writes inode in the fast commit space under TLV with tag @tag. | |
717 | * Returns 0 on success, error on failure. | |
718 | */ | |
719 | static int ext4_fc_write_inode(struct inode *inode, u32 *crc) | |
720 | { | |
721 | struct ext4_inode_info *ei = EXT4_I(inode); | |
722 | int inode_len = EXT4_GOOD_OLD_INODE_SIZE; | |
723 | int ret; | |
724 | struct ext4_iloc iloc; | |
725 | struct ext4_fc_inode fc_inode; | |
726 | struct ext4_fc_tl tl; | |
727 | u8 *dst; | |
728 | ||
729 | ret = ext4_get_inode_loc(inode, &iloc); | |
730 | if (ret) | |
731 | return ret; | |
732 | ||
733 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) | |
734 | inode_len += ei->i_extra_isize; | |
735 | ||
736 | fc_inode.fc_ino = cpu_to_le32(inode->i_ino); | |
737 | tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); | |
738 | tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); | |
739 | ||
740 | dst = ext4_fc_reserve_space(inode->i_sb, | |
741 | sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc); | |
742 | if (!dst) | |
743 | return -ECANCELED; | |
744 | ||
745 | if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc)) | |
746 | return -ECANCELED; | |
747 | dst += sizeof(tl); | |
748 | if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc)) | |
749 | return -ECANCELED; | |
750 | dst += sizeof(fc_inode); | |
751 | if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc), | |
752 | inode_len, crc)) | |
753 | return -ECANCELED; | |
754 | ||
755 | return 0; | |
756 | } | |
757 | ||
758 | /* | |
759 | * Writes updated data ranges for the inode in question. Updates CRC. | |
760 | * Returns 0 on success, error otherwise. | |
761 | */ | |
762 | static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) | |
763 | { | |
764 | ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; | |
765 | struct ext4_inode_info *ei = EXT4_I(inode); | |
766 | struct ext4_map_blocks map; | |
767 | struct ext4_fc_add_range fc_ext; | |
768 | struct ext4_fc_del_range lrange; | |
769 | struct ext4_extent *ex; | |
770 | int ret; | |
771 | ||
772 | mutex_lock(&ei->i_fc_lock); | |
773 | if (ei->i_fc_lblk_len == 0) { | |
774 | mutex_unlock(&ei->i_fc_lock); | |
775 | return 0; | |
776 | } | |
777 | old_blk_size = ei->i_fc_lblk_start; | |
778 | new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; | |
779 | ei->i_fc_lblk_len = 0; | |
780 | mutex_unlock(&ei->i_fc_lock); | |
781 | ||
782 | cur_lblk_off = old_blk_size; | |
783 | jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n", | |
784 | __func__, cur_lblk_off, new_blk_size, inode->i_ino); | |
785 | ||
786 | while (cur_lblk_off <= new_blk_size) { | |
787 | map.m_lblk = cur_lblk_off; | |
788 | map.m_len = new_blk_size - cur_lblk_off + 1; | |
789 | ret = ext4_map_blocks(NULL, inode, &map, 0); | |
790 | if (ret < 0) | |
791 | return -ECANCELED; | |
792 | ||
793 | if (map.m_len == 0) { | |
794 | cur_lblk_off++; | |
795 | continue; | |
796 | } | |
797 | ||
798 | if (ret == 0) { | |
799 | lrange.fc_ino = cpu_to_le32(inode->i_ino); | |
800 | lrange.fc_lblk = cpu_to_le32(map.m_lblk); | |
801 | lrange.fc_len = cpu_to_le32(map.m_len); | |
802 | if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, | |
803 | sizeof(lrange), (u8 *)&lrange, crc)) | |
804 | return -ENOSPC; | |
805 | } else { | |
806 | fc_ext.fc_ino = cpu_to_le32(inode->i_ino); | |
807 | ex = (struct ext4_extent *)&fc_ext.fc_ex; | |
808 | ex->ee_block = cpu_to_le32(map.m_lblk); | |
809 | ex->ee_len = cpu_to_le16(map.m_len); | |
810 | ext4_ext_store_pblock(ex, map.m_pblk); | |
811 | if (map.m_flags & EXT4_MAP_UNWRITTEN) | |
812 | ext4_ext_mark_unwritten(ex); | |
813 | else | |
814 | ext4_ext_mark_initialized(ex); | |
815 | if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, | |
816 | sizeof(fc_ext), (u8 *)&fc_ext, crc)) | |
817 | return -ENOSPC; | |
818 | } | |
819 | ||
820 | cur_lblk_off += map.m_len; | |
821 | } | |
822 | ||
823 | return 0; | |
824 | } | |
825 | ||
826 | ||
827 | /* Submit data for all the fast commit inodes */ | |
828 | static int ext4_fc_submit_inode_data_all(journal_t *journal) | |
829 | { | |
830 | struct super_block *sb = (struct super_block *)(journal->j_private); | |
831 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
832 | struct ext4_inode_info *ei; | |
833 | struct list_head *pos; | |
834 | int ret = 0; | |
835 | ||
836 | spin_lock(&sbi->s_fc_lock); | |
837 | sbi->s_mount_state |= EXT4_FC_COMMITTING; | |
838 | list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { | |
839 | ei = list_entry(pos, struct ext4_inode_info, i_fc_list); | |
840 | ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); | |
841 | while (atomic_read(&ei->i_fc_updates)) { | |
842 | DEFINE_WAIT(wait); | |
843 | ||
844 | prepare_to_wait(&ei->i_fc_wait, &wait, | |
845 | TASK_UNINTERRUPTIBLE); | |
846 | if (atomic_read(&ei->i_fc_updates)) { | |
847 | spin_unlock(&sbi->s_fc_lock); | |
848 | schedule(); | |
849 | spin_lock(&sbi->s_fc_lock); | |
850 | } | |
851 | finish_wait(&ei->i_fc_wait, &wait); | |
852 | } | |
853 | spin_unlock(&sbi->s_fc_lock); | |
854 | ret = jbd2_submit_inode_data(ei->jinode); | |
855 | if (ret) | |
856 | return ret; | |
857 | spin_lock(&sbi->s_fc_lock); | |
858 | } | |
859 | spin_unlock(&sbi->s_fc_lock); | |
860 | ||
861 | return ret; | |
862 | } | |
863 | ||
864 | /* Wait for completion of data for all the fast commit inodes */ | |
865 | static int ext4_fc_wait_inode_data_all(journal_t *journal) | |
866 | { | |
867 | struct super_block *sb = (struct super_block *)(journal->j_private); | |
868 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
869 | struct ext4_inode_info *pos, *n; | |
870 | int ret = 0; | |
871 | ||
872 | spin_lock(&sbi->s_fc_lock); | |
873 | list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { | |
874 | if (!ext4_test_inode_state(&pos->vfs_inode, | |
875 | EXT4_STATE_FC_COMMITTING)) | |
876 | continue; | |
877 | spin_unlock(&sbi->s_fc_lock); | |
878 | ||
879 | ret = jbd2_wait_inode_data(journal, pos->jinode); | |
880 | if (ret) | |
881 | return ret; | |
882 | spin_lock(&sbi->s_fc_lock); | |
883 | } | |
884 | spin_unlock(&sbi->s_fc_lock); | |
885 | ||
886 | return 0; | |
887 | } | |
888 | ||
889 | /* Commit all the directory entry updates */ | |
890 | static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) | |
891 | { | |
892 | struct super_block *sb = (struct super_block *)(journal->j_private); | |
893 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
894 | struct ext4_fc_dentry_update *fc_dentry; | |
895 | struct inode *inode; | |
896 | struct list_head *pos, *n, *fcd_pos, *fcd_n; | |
897 | struct ext4_inode_info *ei; | |
898 | int ret; | |
899 | ||
900 | if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) | |
901 | return 0; | |
902 | list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) { | |
903 | fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update, | |
904 | fcd_list); | |
905 | if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { | |
906 | spin_unlock(&sbi->s_fc_lock); | |
907 | if (!ext4_fc_add_dentry_tlv( | |
908 | sb, fc_dentry->fcd_op, | |
909 | fc_dentry->fcd_parent, fc_dentry->fcd_ino, | |
910 | fc_dentry->fcd_name.len, | |
911 | fc_dentry->fcd_name.name, crc)) { | |
912 | ret = -ENOSPC; | |
913 | goto lock_and_exit; | |
914 | } | |
915 | spin_lock(&sbi->s_fc_lock); | |
916 | continue; | |
917 | } | |
918 | ||
919 | inode = NULL; | |
920 | list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) { | |
921 | ei = list_entry(pos, struct ext4_inode_info, i_fc_list); | |
922 | if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) { | |
923 | inode = &ei->vfs_inode; | |
924 | break; | |
925 | } | |
926 | } | |
927 | /* | |
928 | * If we don't find inode in our list, then it was deleted, | |
929 | * in which case, we don't need to record it's create tag. | |
930 | */ | |
931 | if (!inode) | |
932 | continue; | |
933 | spin_unlock(&sbi->s_fc_lock); | |
934 | ||
935 | /* | |
936 | * We first write the inode and then the create dirent. This | |
937 | * allows the recovery code to create an unnamed inode first | |
938 | * and then link it to a directory entry. This allows us | |
939 | * to use namei.c routines almost as is and simplifies | |
940 | * the recovery code. | |
941 | */ | |
942 | ret = ext4_fc_write_inode(inode, crc); | |
943 | if (ret) | |
944 | goto lock_and_exit; | |
945 | ||
946 | ret = ext4_fc_write_inode_data(inode, crc); | |
947 | if (ret) | |
948 | goto lock_and_exit; | |
949 | ||
950 | if (!ext4_fc_add_dentry_tlv( | |
951 | sb, fc_dentry->fcd_op, | |
952 | fc_dentry->fcd_parent, fc_dentry->fcd_ino, | |
953 | fc_dentry->fcd_name.len, | |
954 | fc_dentry->fcd_name.name, crc)) { | |
955 | spin_lock(&sbi->s_fc_lock); | |
956 | ret = -ENOSPC; | |
957 | goto lock_and_exit; | |
958 | } | |
959 | ||
960 | spin_lock(&sbi->s_fc_lock); | |
961 | } | |
962 | return 0; | |
963 | lock_and_exit: | |
964 | spin_lock(&sbi->s_fc_lock); | |
965 | return ret; | |
966 | } | |
967 | ||
968 | static int ext4_fc_perform_commit(journal_t *journal) | |
969 | { | |
970 | struct super_block *sb = (struct super_block *)(journal->j_private); | |
971 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
972 | struct ext4_inode_info *iter; | |
973 | struct ext4_fc_head head; | |
974 | struct list_head *pos; | |
975 | struct inode *inode; | |
976 | struct blk_plug plug; | |
977 | int ret = 0; | |
978 | u32 crc = 0; | |
979 | ||
980 | ret = ext4_fc_submit_inode_data_all(journal); | |
981 | if (ret) | |
982 | return ret; | |
983 | ||
984 | ret = ext4_fc_wait_inode_data_all(journal); | |
985 | if (ret) | |
986 | return ret; | |
987 | ||
988 | blk_start_plug(&plug); | |
989 | if (sbi->s_fc_bytes == 0) { | |
990 | /* | |
991 | * Add a head tag only if this is the first fast commit | |
992 | * in this TID. | |
993 | */ | |
994 | head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); | |
995 | head.fc_tid = cpu_to_le32( | |
996 | sbi->s_journal->j_running_transaction->t_tid); | |
997 | if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), | |
998 | (u8 *)&head, &crc)) | |
999 | goto out; | |
1000 | } | |
1001 | ||
1002 | spin_lock(&sbi->s_fc_lock); | |
1003 | ret = ext4_fc_commit_dentry_updates(journal, &crc); | |
1004 | if (ret) { | |
1005 | spin_unlock(&sbi->s_fc_lock); | |
1006 | goto out; | |
1007 | } | |
1008 | ||
1009 | list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { | |
1010 | iter = list_entry(pos, struct ext4_inode_info, i_fc_list); | |
1011 | inode = &iter->vfs_inode; | |
1012 | if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) | |
1013 | continue; | |
1014 | ||
1015 | spin_unlock(&sbi->s_fc_lock); | |
1016 | ret = ext4_fc_write_inode_data(inode, &crc); | |
1017 | if (ret) | |
1018 | goto out; | |
1019 | ret = ext4_fc_write_inode(inode, &crc); | |
1020 | if (ret) | |
1021 | goto out; | |
1022 | spin_lock(&sbi->s_fc_lock); | |
1023 | EXT4_I(inode)->i_fc_committed_subtid = | |
1024 | atomic_read(&sbi->s_fc_subtid); | |
1025 | } | |
1026 | spin_unlock(&sbi->s_fc_lock); | |
1027 | ||
1028 | ret = ext4_fc_write_tail(sb, crc); | |
1029 | ||
1030 | out: | |
1031 | blk_finish_plug(&plug); | |
1032 | return ret; | |
1033 | } | |
1034 | ||
1035 | /* | |
1036 | * The main commit entry point. Performs a fast commit for transaction | |
1037 | * commit_tid if needed. If it's not possible to perform a fast commit | |
1038 | * due to various reasons, we fall back to full commit. Returns 0 | |
1039 | * on success, error otherwise. | |
1040 | */ | |
1041 | int ext4_fc_commit(journal_t *journal, tid_t commit_tid) | |
1042 | { | |
1043 | struct super_block *sb = (struct super_block *)(journal->j_private); | |
1044 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
1045 | int nblks = 0, ret, bsize = journal->j_blocksize; | |
1046 | int subtid = atomic_read(&sbi->s_fc_subtid); | |
1047 | int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0; | |
1048 | ktime_t start_time, commit_time; | |
1049 | ||
1050 | trace_ext4_fc_commit_start(sb); | |
1051 | ||
1052 | start_time = ktime_get(); | |
1053 | ||
1054 | if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || | |
1055 | (ext4_fc_is_ineligible(sb))) { | |
1056 | reason = EXT4_FC_REASON_INELIGIBLE; | |
1057 | goto out; | |
1058 | } | |
1059 | ||
1060 | restart_fc: | |
1061 | ret = jbd2_fc_begin_commit(journal, commit_tid); | |
1062 | if (ret == -EALREADY) { | |
1063 | /* There was an ongoing commit, check if we need to restart */ | |
1064 | if (atomic_read(&sbi->s_fc_subtid) <= subtid && | |
1065 | commit_tid > journal->j_commit_sequence) | |
1066 | goto restart_fc; | |
1067 | reason = EXT4_FC_REASON_ALREADY_COMMITTED; | |
1068 | goto out; | |
1069 | } else if (ret) { | |
1070 | sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; | |
1071 | reason = EXT4_FC_REASON_FC_START_FAILED; | |
1072 | goto out; | |
1073 | } | |
1074 | ||
1075 | fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; | |
1076 | ret = ext4_fc_perform_commit(journal); | |
1077 | if (ret < 0) { | |
1078 | sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; | |
1079 | reason = EXT4_FC_REASON_FC_FAILED; | |
1080 | goto out; | |
1081 | } | |
1082 | nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; | |
1083 | ret = jbd2_fc_wait_bufs(journal, nblks); | |
1084 | if (ret < 0) { | |
1085 | sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; | |
1086 | reason = EXT4_FC_REASON_FC_FAILED; | |
1087 | goto out; | |
1088 | } | |
1089 | atomic_inc(&sbi->s_fc_subtid); | |
1090 | jbd2_fc_end_commit(journal); | |
1091 | out: | |
1092 | /* Has any ineligible update happened since we started? */ | |
1093 | if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) { | |
1094 | sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; | |
1095 | reason = EXT4_FC_REASON_INELIGIBLE; | |
1096 | } | |
1097 | ||
1098 | spin_lock(&sbi->s_fc_lock); | |
1099 | if (reason != EXT4_FC_REASON_OK && | |
1100 | reason != EXT4_FC_REASON_ALREADY_COMMITTED) { | |
1101 | sbi->s_fc_stats.fc_ineligible_commits++; | |
1102 | } else { | |
1103 | sbi->s_fc_stats.fc_num_commits++; | |
1104 | sbi->s_fc_stats.fc_numblks += nblks; | |
1105 | } | |
1106 | spin_unlock(&sbi->s_fc_lock); | |
1107 | nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0; | |
1108 | trace_ext4_fc_commit_stop(sb, nblks, reason); | |
1109 | commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); | |
1110 | /* | |
1111 | * weight the commit time higher than the average time so we don't | |
1112 | * react too strongly to vast changes in the commit time | |
1113 | */ | |
1114 | if (likely(sbi->s_fc_avg_commit_time)) | |
1115 | sbi->s_fc_avg_commit_time = (commit_time + | |
1116 | sbi->s_fc_avg_commit_time * 3) / 4; | |
1117 | else | |
1118 | sbi->s_fc_avg_commit_time = commit_time; | |
1119 | jbd_debug(1, | |
1120 | "Fast commit ended with blks = %d, reason = %d, subtid - %d", | |
1121 | nblks, reason, subtid); | |
1122 | if (reason == EXT4_FC_REASON_FC_FAILED) | |
1123 | return jbd2_fc_end_commit_fallback(journal, commit_tid); | |
1124 | if (reason == EXT4_FC_REASON_FC_START_FAILED || | |
1125 | reason == EXT4_FC_REASON_INELIGIBLE) | |
1126 | return jbd2_complete_transaction(journal, commit_tid); | |
1127 | return 0; | |
1128 | } | |
1129 | ||
ff780b91 HS |
1130 | /* |
1131 | * Fast commit cleanup routine. This is called after every fast commit and | |
1132 | * full commit. full is true if we are called after a full commit. | |
1133 | */ | |
1134 | static void ext4_fc_cleanup(journal_t *journal, int full) | |
1135 | { | |
aa75f4d3 HS |
1136 | struct super_block *sb = journal->j_private; |
1137 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
1138 | struct ext4_inode_info *iter; | |
1139 | struct ext4_fc_dentry_update *fc_dentry; | |
1140 | struct list_head *pos, *n; | |
1141 | ||
1142 | if (full && sbi->s_fc_bh) | |
1143 | sbi->s_fc_bh = NULL; | |
1144 | ||
1145 | jbd2_fc_release_bufs(journal); | |
1146 | ||
1147 | spin_lock(&sbi->s_fc_lock); | |
1148 | list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) { | |
1149 | iter = list_entry(pos, struct ext4_inode_info, i_fc_list); | |
1150 | list_del_init(&iter->i_fc_list); | |
1151 | ext4_clear_inode_state(&iter->vfs_inode, | |
1152 | EXT4_STATE_FC_COMMITTING); | |
1153 | ext4_fc_reset_inode(&iter->vfs_inode); | |
1154 | /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ | |
1155 | smp_mb(); | |
1156 | #if (BITS_PER_LONG < 64) | |
1157 | wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); | |
1158 | #else | |
1159 | wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); | |
1160 | #endif | |
1161 | } | |
1162 | ||
1163 | while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { | |
1164 | fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], | |
1165 | struct ext4_fc_dentry_update, | |
1166 | fcd_list); | |
1167 | list_del_init(&fc_dentry->fcd_list); | |
1168 | spin_unlock(&sbi->s_fc_lock); | |
1169 | ||
1170 | if (fc_dentry->fcd_name.name && | |
1171 | fc_dentry->fcd_name.len > DNAME_INLINE_LEN) | |
1172 | kfree(fc_dentry->fcd_name.name); | |
1173 | kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); | |
1174 | spin_lock(&sbi->s_fc_lock); | |
1175 | } | |
1176 | ||
1177 | list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], | |
1178 | &sbi->s_fc_dentry_q[FC_Q_MAIN]); | |
1179 | list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], | |
1180 | &sbi->s_fc_q[FC_Q_STAGING]); | |
1181 | ||
1182 | sbi->s_mount_state &= ~EXT4_FC_COMMITTING; | |
1183 | sbi->s_mount_state &= ~EXT4_FC_INELIGIBLE; | |
1184 | ||
1185 | if (full) | |
1186 | sbi->s_fc_bytes = 0; | |
1187 | spin_unlock(&sbi->s_fc_lock); | |
1188 | trace_ext4_fc_stats(sb); | |
ff780b91 | 1189 | } |
6866d7b3 | 1190 | |
5b849b5f HS |
/*
 * Main recovery path entry point.
 *
 * Installed as the journal's j_fc_replay_callback by ext4_fc_init().
 * Currently a stub: it ignores all of its arguments and always returns 0.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	return 0;
}
1199 | ||
6866d7b3 HS |
1200 | void ext4_fc_init(struct super_block *sb, journal_t *journal) |
1201 | { | |
5b849b5f HS |
1202 | /* |
1203 | * We set replay callback even if fast commit disabled because we may | |
1204 | * could still have fast commit blocks that need to be replayed even if | |
1205 | * fast commit has now been turned off. | |
1206 | */ | |
1207 | journal->j_fc_replay_callback = ext4_fc_replay; | |
6866d7b3 HS |
1208 | if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) |
1209 | return; | |
ff780b91 | 1210 | journal->j_fc_cleanup_callback = ext4_fc_cleanup; |
6866d7b3 HS |
1211 | if (jbd2_fc_init(journal, EXT4_NUM_FC_BLKS)) { |
1212 | pr_warn("Error while enabling fast commits, turning off."); | |
1213 | ext4_clear_feature_fast_commit(sb); | |
1214 | } | |
1215 | } | |
aa75f4d3 HS |
1216 | |
1217 | int __init ext4_fc_init_dentry_cache(void) | |
1218 | { | |
1219 | ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, | |
1220 | SLAB_RECLAIM_ACCOUNT); | |
1221 | ||
1222 | if (ext4_fc_dentry_cachep == NULL) | |
1223 | return -ENOMEM; | |
1224 | ||
1225 | return 0; | |
1226 | } |