xfs: journal IO cache flush reductions
[linux-block.git] / fs / xfs / xfs_log_cil.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
 */

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_extent_busy.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_trace.h"

struct workqueue_struct *xfs_discard_wq;

/*
 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
 * recover, so we don't allow failure here. Also, we allocate in a context that
 * we don't want to be issuing transactions from, so we need to tell the
 * allocation code this as well.
 *
 * We don't reserve any space for the ticket - we are going to steal whatever
 * space we require from transactions as they commit. To ensure we reserve all
 * the space required, we need to set the current reservation of the ticket to
 * zero so that we know to steal the initial transaction overhead from the
 * first transaction commit.
 */
static struct xlog_ticket *
xlog_cil_ticket_alloc(
	struct xlog	*log)
{
	struct xlog_ticket *tic;

	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0);

	/*
	 * set the current reservation to zero so we know to steal the basic
	 * transaction overhead reservation from the first transaction commit.
	 */
	tic->t_curr_res = 0;
	return tic;
}

/*
 * After the first stage of log recovery is done, we know where the head and
 * tail of the log are. We need this log initialisation done before we can
 * initialise the first CIL checkpoint context.
 *
 * Here we allocate a log ticket to track space usage during a CIL push. This
 * ticket is passed to xlog_write() directly so that we don't slowly leak log
 * space by failing to account for space used by log headers and additional
 * region headers for split regions.
 */
void
xlog_cil_init_post_recovery(
	struct xlog	*log)
{
	log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
	log->l_cilp->xc_ctx->sequence = 1;
}

static inline int
xlog_cil_iovec_space(
	uint	niovecs)
{
	return round_up((sizeof(struct xfs_log_vec) +
			niovecs * sizeof(struct xfs_log_iovec)),
			sizeof(uint64_t));
}
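
/*
 * Editor's worked example (the struct sizes are illustrative assumptions,
 * not measured from this tree): with a 56 byte struct xfs_log_vec and a
 * 16 byte struct xfs_log_iovec, niovecs = 3 gives
 * round_up(56 + 3 * 16, 8) = 104 bytes of header space, so the data
 * region that follows the iovec array always starts 64-bit aligned.
 */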

/*
 * Allocate or pin log vector buffers for CIL insertion.
 *
 * The CIL currently uses disposable buffers for copying a snapshot of the
 * modified items into the log during a push. The biggest problem with this is
 * the requirement to allocate the disposable buffer during the commit if:
 * a) it does not exist; or
 * b) it is too small
 *
 * If we do this allocation within xlog_cil_insert_format_items(), it is done
 * under the xc_ctx_lock, which means that a CIL push cannot occur during
 * the memory allocation. This means that we have a potential deadlock situation
 * under low memory conditions when we have lots of dirty metadata pinned in
 * the CIL and we need a CIL commit to occur to free memory.
 *
 * To avoid this, we need to move the memory allocation outside the
 * xc_ctx_lock, but because the log vector buffers are disposable, that opens
 * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
 * vector buffers between the check and the formatting of the item into the
 * log vector buffer within the xc_ctx_lock.
 *
 * Because the log vector buffer needs to be unchanged during the CIL push
 * process, we cannot share the buffer between the transaction commit (which
 * modifies the buffer) and the CIL push context that is writing the changes
 * into the log. This means skipping preallocation of buffer space is
 * unreliable, but we most definitely do not want to be allocating and freeing
 * buffers unnecessarily during commits when overwrites can be done safely.
 *
 * The simplest solution to this problem is to allocate a shadow buffer when a
 * log item is committed for the second time, and then to only use this buffer
 * if necessary. The buffer can remain attached to the log item until such time
 * as it is needed, and this is the buffer that is reallocated to match the
 * size of the incoming modification. Then during the formatting of the item we
 * can swap the active buffer with the new one if we can't reuse the existing
 * buffer. We don't free the old buffer as it may be reused on the next
 * modification if its size is right, otherwise we'll free and reallocate it at
 * that point.
 *
 * This function builds a vector for the changes in each log item in the
 * transaction. It then works out the length of the buffer needed for each log
 * item, allocates them and attaches the vector to the log item in preparation
 * for the formatting step which occurs under the xc_ctx_lock.
 *
 * While this means the memory footprint goes up, it avoids the repeated
 * alloc/free pattern that repeated modifications of an item would otherwise
 * cause, and hence minimises the CPU overhead of such behaviour.
 */
static void
xlog_cil_alloc_shadow_bufs(
	struct xlog		*log,
	struct xfs_trans	*tp)
{
	struct xfs_log_item	*lip;

	list_for_each_entry(lip, &tp->t_items, li_trans) {
		struct xfs_log_vec *lv;
		int	niovecs = 0;
		int	nbytes = 0;
		int	buf_size;
		bool	ordered = false;

		/* Skip items which aren't dirty in this transaction. */
		if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
			continue;

		/* get number of vecs and size of data to be stored */
		lip->li_ops->iop_size(lip, &niovecs, &nbytes);

		/*
		 * Ordered items need to be tracked but we do not wish to write
		 * them. We need a logvec to track the object, but we do not
		 * need an iovec or buffer to be allocated for copying data.
		 */
		if (niovecs == XFS_LOG_VEC_ORDERED) {
			ordered = true;
			niovecs = 0;
			nbytes = 0;
		}

		/*
		 * We 64-bit align the length of each iovec so that the start
		 * of the next one is naturally aligned. We'll need to
		 * account for that slack space here. Then round nbytes up
		 * to 64-bit alignment so that the initial buffer alignment is
		 * easy to calculate and verify.
		 */
		nbytes += niovecs * sizeof(uint64_t);
		nbytes = round_up(nbytes, sizeof(uint64_t));

		/*
		 * The data buffer needs to start 64-bit aligned, so round up
		 * that space to ensure we can align it appropriately and not
		 * overrun the buffer.
		 */
		buf_size = nbytes + xlog_cil_iovec_space(niovecs);

		/*
		 * if we have no shadow buffer, or it is too small, we need to
		 * reallocate it.
		 */
		if (!lip->li_lv_shadow ||
		    buf_size > lip->li_lv_shadow->lv_size) {
			/*
			 * We free and allocate here as a realloc would copy
			 * unnecessary data. We don't use kmem_zalloc() for the
			 * same reason - we don't need to zero the data area in
			 * the buffer, only the log vector header and the iovec
			 * storage.
			 */
			kmem_free(lip->li_lv_shadow);

			lv = kmem_alloc_large(buf_size, KM_NOFS);
			memset(lv, 0, xlog_cil_iovec_space(niovecs));

			lv->lv_item = lip;
			lv->lv_size = buf_size;
			if (ordered)
				lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
			else
				lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
			lip->li_lv_shadow = lv;
		} else {
			/* same or smaller, optimise common overwrite case */
			lv = lip->li_lv_shadow;
			if (ordered)
				lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
			else
				lv->lv_buf_len = 0;
			lv->lv_bytes = 0;
			lv->lv_next = NULL;
		}

		/* Ensure the lv is set up according to ->iop_size */
		lv->lv_niovecs = niovecs;

		/* The allocated data region lies beyond the iovec region */
		lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs);
	}
}

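/*
 * Editor's sketch of the shadow buffer layout built above (derived from
 * the code, illustrative only):
 *
 *	+----------------+------------------------+----------------------+
 *	| struct         | niovecs * struct       | data area            |
 *	| xfs_log_vec    | xfs_log_iovec          | (nbytes, 64-bit      |
 *	|                |                        |  aligned)            |
 *	+----------------+------------------------+----------------------+
 *	^ lv             ^ lv->lv_iovecp          ^ lv->lv_buf
 *
 * xlog_cil_iovec_space() covers the first two regions rounded up to
 * sizeof(uint64_t), which is why lv_buf always starts 64-bit aligned.
 */
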
/*
 * Prepare the log item for insertion into the CIL. Calculate the difference in
 * log space and vectors it will consume, and if it is a new item pin it as
 * well.
 */
STATIC void
xfs_cil_prepare_item(
	struct xlog		*log,
	struct xfs_log_vec	*lv,
	struct xfs_log_vec	*old_lv,
	int			*diff_len,
	int			*diff_iovecs)
{
	/* Account for the new LV being passed in */
	if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
		*diff_len += lv->lv_bytes;
		*diff_iovecs += lv->lv_niovecs;
	}

	/*
	 * If there is no old LV, this is the first time we've seen the item in
	 * this CIL context and so we need to pin it. If we are replacing the
	 * old_lv, then remove the space it accounts for and make it the shadow
	 * buffer for later freeing. In both cases we are now switching to the
	 * shadow buffer, so update the pointer to it appropriately.
	 */
	if (!old_lv) {
		if (lv->lv_item->li_ops->iop_pin)
			lv->lv_item->li_ops->iop_pin(lv->lv_item);
		lv->lv_item->li_lv_shadow = NULL;
	} else if (old_lv != lv) {
		ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);

		*diff_len -= old_lv->lv_bytes;
		*diff_iovecs -= old_lv->lv_niovecs;
		lv->lv_item->li_lv_shadow = old_lv;
	}

	/* attach new log vector to log item */
	lv->lv_item->li_lv = lv;

	/*
	 * If this is the first time the item is being committed to the
	 * CIL, store the sequence number on the log item so we can
	 * tell in future commits whether this is the first checkpoint
	 * the item is being committed into.
	 */
	if (!lv->lv_item->li_seq)
		lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
}

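/*
 * Editor's worked example of the delta accounting above (sizes are
 * illustrative): relogging an item whose attached lv is 512 bytes and
 * 1 iovec with a new 768 byte, 2 iovec lv yields
 * *diff_len += 768 - 512 = 256 and *diff_iovecs += 2 - 1 = 1, and the
 * displaced lv is parked as the item's shadow buffer for later reuse
 * or freeing.
 */
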
/*
 * Format log items into a flat buffer
 *
 * For delayed logging, we need to hold a formatted buffer containing all the
 * changes on the log item. This enables us to relog the item in memory and
 * write it out asynchronously without needing to relock the object that was
 * modified at the time it gets written into the iclog.
 *
 * This function takes the prepared log vectors attached to each log item, and
 * formats the changes into the log vector buffer. The buffer it uses is
 * dependent on the current state of the vector in the CIL - the shadow lv is
 * guaranteed to be large enough for the current modification, but we will only
 * use that if we can't reuse the existing lv. If we can't reuse the existing
 * lv, then simply swap it out for the shadow lv. We don't free it - that is
 * done lazily either by the next modification or the freeing of the log item.
 *
 * We don't set up region headers during this process; we simply copy the
 * regions into the flat buffer. We can do this because we still have to do a
 * formatting step to write the regions into the iclog buffer. Writing the
 * ophdrs during the iclog write means that we can support splitting large
 * regions across iclog boundaries without needing a change in the format of
 * the item/region encapsulation.
 *
 * Hence what we need to do now is rewrite the vector array to point to the
 * copied region inside the buffer we just allocated. This allows us to format
 * the regions into the iclog as though they are being formatted directly out
 * of the objects themselves.
 */
static void
xlog_cil_insert_format_items(
	struct xlog		*log,
	struct xfs_trans	*tp,
	int			*diff_len,
	int			*diff_iovecs)
{
	struct xfs_log_item	*lip;

	/* Bail out if we didn't find a log item. */
	if (list_empty(&tp->t_items)) {
		ASSERT(0);
		return;
	}

	list_for_each_entry(lip, &tp->t_items, li_trans) {
		struct xfs_log_vec *lv;
		struct xfs_log_vec *old_lv = NULL;
		struct xfs_log_vec *shadow;
		bool	ordered = false;

		/* Skip items which aren't dirty in this transaction. */
		if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
			continue;

		/*
		 * The formatting size information is already attached to
		 * the shadow lv on the log item.
		 */
		shadow = lip->li_lv_shadow;
		if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
			ordered = true;

		/* Skip items that do not have any vectors for writing */
		if (!shadow->lv_niovecs && !ordered)
			continue;

		/* compare to existing item size */
		old_lv = lip->li_lv;
		if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
			/* same or smaller, optimise common overwrite case */
			lv = lip->li_lv;
			lv->lv_next = NULL;

			if (ordered)
				goto insert;

			/*
			 * set the item up as though it is a new insertion so
			 * that the space reservation accounting is correct.
			 */
			*diff_iovecs -= lv->lv_niovecs;
			*diff_len -= lv->lv_bytes;

			/* Ensure the lv is set up according to ->iop_size */
			lv->lv_niovecs = shadow->lv_niovecs;

			/* reset the lv buffer information for new formatting */
			lv->lv_buf_len = 0;
			lv->lv_bytes = 0;
			lv->lv_buf = (char *)lv +
					xlog_cil_iovec_space(lv->lv_niovecs);
		} else {
			/* switch to shadow buffer! */
			lv = shadow;
			lv->lv_item = lip;
			if (ordered) {
				/* track as an ordered logvec */
				ASSERT(lip->li_lv == NULL);
				goto insert;
			}
		}

		ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
		lip->li_ops->iop_format(lip, lv);
insert:
		xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
	}
}

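/*
 * Editor's summary of the buffer selection above (derived from the code,
 * no new behaviour implied):
 *
 *	li_lv (existing)   shadow fits in li_lv?   result
 *	----------------   ---------------------   ----------------------
 *	NULL               -                       use shadow; item gets
 *	                                           pinned in prepare_item
 *	present            yes                     reuse li_lv in place
 *	present            no                      swap in shadow; old lv
 *	                                           becomes li_lv_shadow
 */
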
/*
 * Insert the log items into the CIL and calculate the difference in space
 * consumed by the item. Add the space to the checkpoint ticket and calculate
 * if the change requires additional log metadata. If it does, take that space
 * as well. Remove the amount of space we added to the checkpoint ticket from
 * the current transaction ticket so that the accounting works out correctly.
 */
static void
xlog_cil_insert_items(
	struct xlog		*log,
	struct xfs_trans	*tp)
{
	struct xfs_cil		*cil = log->l_cilp;
	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
	struct xfs_log_item	*lip;
	int			len = 0;
	int			diff_iovecs = 0;
	int			iclog_space;
	int			iovhdr_res = 0, split_res = 0, ctx_res = 0;

	ASSERT(tp);

	/*
	 * We can do this safely because the context can't checkpoint until we
	 * are done so it doesn't matter exactly how we update the CIL.
	 */
	xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);

	spin_lock(&cil->xc_cil_lock);

	/* account for space used by new iovec headers */
	iovhdr_res = diff_iovecs * sizeof(xlog_op_header_t);
	len += iovhdr_res;
	ctx->nvecs += diff_iovecs;

	/* attach the transaction to the CIL if it has any busy extents */
	if (!list_empty(&tp->t_busy))
		list_splice_init(&tp->t_busy, &ctx->busy_extents);

	/*
	 * Now transfer enough transaction reservation to the context ticket
	 * for the checkpoint. The context ticket is special - the unit
	 * reservation has to grow as well as the current reservation as we
	 * steal from tickets so we can correctly determine the space used
	 * during the transaction commit.
	 */
	if (ctx->ticket->t_curr_res == 0) {
		ctx_res = ctx->ticket->t_unit_res;
		ctx->ticket->t_curr_res = ctx_res;
		tp->t_ticket->t_curr_res -= ctx_res;
	}

	/* do we need space for more log record headers? */
	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
	if (len > 0 && (ctx->space_used / iclog_space !=
				(ctx->space_used + len) / iclog_space)) {
		split_res = (len + iclog_space - 1) / iclog_space;
		/* need to take into account split region headers, too */
		split_res *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
		ctx->ticket->t_unit_res += split_res;
		ctx->ticket->t_curr_res += split_res;
		tp->t_ticket->t_curr_res -= split_res;
		ASSERT(tp->t_ticket->t_curr_res >= len);
	}
	tp->t_ticket->t_curr_res -= len;
	ctx->space_used += len;

	/*
	 * If we've overrun the reservation, dump the tx details before we move
	 * the log items. Shutdown is imminent...
	 */
	if (WARN_ON(tp->t_ticket->t_curr_res < 0)) {
		xfs_warn(log->l_mp, "Transaction log reservation overrun:");
		xfs_warn(log->l_mp,
			 "  log items: %d bytes (iov hdrs: %d bytes)",
			 len, iovhdr_res);
		xfs_warn(log->l_mp, "  split region headers: %d bytes",
			 split_res);
		xfs_warn(log->l_mp, "  ctx ticket: %d bytes", ctx_res);
		xlog_print_trans(tp);
	}

	/*
	 * Now (re-)position everything modified at the tail of the CIL.
	 * We do this here so we only need to take the CIL lock once during
	 * the transaction commit.
	 */
	list_for_each_entry(lip, &tp->t_items, li_trans) {
		/* Skip items which aren't dirty in this transaction. */
		if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
			continue;

		/*
		 * Only move the item if it isn't already at the tail. This is
		 * to prevent a transient list_empty() state when reinserting
		 * an item that is already the only item in the CIL.
		 */
		if (!list_is_last(&lip->li_cil, &cil->xc_cil))
			list_move_tail(&lip->li_cil, &cil->xc_cil);
	}

	spin_unlock(&cil->xc_cil_lock);

	if (tp->t_ticket->t_curr_res < 0)
		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
}

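/*
 * Editor's worked example of the record-header accounting above (the
 * iclog geometry is an illustrative assumption, not read from this
 * tree): with a 32 kB iclog and a 512 byte header, iclog_space is
 * 32256 bytes. If ctx->space_used is 30000 and this commit adds
 * len = 5000, the checkpoint crosses an iclog boundary, so
 * split_res = (5000 + 32256 - 1) / 32256 = 1 extra record header is
 * reserved, costing l_iclog_hsize + sizeof(struct xlog_op_header)
 * bytes stolen from the committing transaction's ticket.
 */
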
static void
xlog_cil_free_logvec(
	struct xfs_log_vec	*log_vector)
{
	struct xfs_log_vec	*lv;

	for (lv = log_vector; lv; ) {
		struct xfs_log_vec *next = lv->lv_next;
		kmem_free(lv);
		lv = next;
	}
}

static void
xlog_discard_endio_work(
	struct work_struct	*work)
{
	struct xfs_cil_ctx	*ctx =
		container_of(work, struct xfs_cil_ctx, discard_endio_work);
	struct xfs_mount	*mp = ctx->cil->xc_log->l_mp;

	xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
	kmem_free(ctx);
}

/*
 * Queue up the actual completion to a thread to avoid IRQ-safe locking for
 * pagb_lock. Note that we need an unbounded workqueue, otherwise we might
 * get the execution delayed up to 30 seconds for weird reasons.
 */
static void
xlog_discard_endio(
	struct bio		*bio)
{
	struct xfs_cil_ctx	*ctx = bio->bi_private;

	INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work);
	queue_work(xfs_discard_wq, &ctx->discard_endio_work);
	bio_put(bio);
}

static void
xlog_discard_busy_extents(
	struct xfs_mount	*mp,
	struct xfs_cil_ctx	*ctx)
{
	struct list_head	*list = &ctx->busy_extents;
	struct xfs_extent_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;
	int			error = 0;

	ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);

	blk_start_plug(&plug);
	list_for_each_entry(busyp, list, list) {
		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
					 busyp->length);

		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_NOFS, 0, &bio);
		if (error && error != -EOPNOTSUPP) {
			xfs_info(mp,
				 "discard failed for extent [0x%llx,%u], error %d",
				 (unsigned long long)busyp->bno,
				 busyp->length,
				 error);
			break;
		}
	}

	if (bio) {
		bio->bi_private = ctx;
		bio->bi_end_io = xlog_discard_endio;
		submit_bio(bio);
	} else {
		xlog_discard_endio_work(&ctx->discard_endio_work);
	}
	blk_finish_plug(&plug);
}

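/*
 * Editor's note (hedged, describing generic block layer behaviour rather
 * than anything defined in this file): __blkdev_issue_discard() chains
 * the bios it builds and hands back the final bio of the chain, so
 * attaching xlog_discard_endio() to that single bio above should be
 * sufficient - it runs once the whole discard chain has completed.
 */
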
/*
 * Mark all items committed and clear busy extents. We free the log vector
 * chains in a separate pass so that we unpin the log items as quickly as
 * possible.
 */
static void
xlog_cil_committed(
	struct xfs_cil_ctx	*ctx)
{
	struct xfs_mount	*mp = ctx->cil->xc_log->l_mp;
	bool			abort = XLOG_FORCED_SHUTDOWN(ctx->cil->xc_log);

	/*
	 * If the I/O failed, we're aborting the commit and already shutdown.
	 * Wake any commit waiters before aborting the log items so we don't
	 * block async log pushers on callbacks. Async log pushers explicitly do
	 * not wait on log force completion because they may be holding locks
	 * required to unpin items.
	 */
	if (abort) {
		spin_lock(&ctx->cil->xc_push_lock);
		wake_up_all(&ctx->cil->xc_commit_wait);
		spin_unlock(&ctx->cil->xc_push_lock);
	}

	xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
					ctx->start_lsn, abort);

	xfs_extent_busy_sort(&ctx->busy_extents);
	xfs_extent_busy_clear(mp, &ctx->busy_extents,
			     (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);

	spin_lock(&ctx->cil->xc_push_lock);
	list_del(&ctx->committing);
	spin_unlock(&ctx->cil->xc_push_lock);

	xlog_cil_free_logvec(ctx->lv_chain);

	if (!list_empty(&ctx->busy_extents))
		xlog_discard_busy_extents(mp, ctx);
	else
		kmem_free(ctx);
}

void
xlog_cil_process_committed(
	struct list_head	*list)
{
	struct xfs_cil_ctx	*ctx;

	while ((ctx = list_first_entry_or_null(list,
			struct xfs_cil_ctx, iclog_entry))) {
		list_del(&ctx->iclog_entry);
		xlog_cil_committed(ctx);
	}
}

/*
 * Push the Committed Item List to the log.
 *
 * If the current sequence is the same as xc_push_seq we need to do a flush. If
 * xc_push_seq is less than the current sequence, then it has already been
 * flushed and we don't need to do anything - the caller will wait for it to
 * complete if necessary.
 *
 * xc_push_seq is checked unlocked against the sequence number for a match.
 * Hence we can allow log forces to run racily and not issue pushes for the
 * same sequence twice. If we get a race between multiple pushes for the same
 * sequence they will block on the first one and then abort, hence avoiding
 * needless pushes.
 */
static void
xlog_cil_push_work(
	struct work_struct	*work)
{
	struct xfs_cil		*cil =
		container_of(work, struct xfs_cil, xc_push_work);
	struct xlog		*log = cil->xc_log;
	struct xfs_log_vec	*lv;
	struct xfs_cil_ctx	*ctx;
	struct xfs_cil_ctx	*new_ctx;
	struct xlog_in_core	*commit_iclog;
	struct xlog_ticket	*tic;
	int			num_iovecs;
	int			error = 0;
	struct xfs_trans_header	thdr;
	struct xfs_log_iovec	lhdr;
	struct xfs_log_vec	lvhdr = { NULL };
	xfs_lsn_t		commit_lsn;
	xfs_lsn_t		push_seq;
	struct bio		bio;
	DECLARE_COMPLETION_ONSTACK(bdev_flush);

	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS);
	new_ctx->ticket = xlog_cil_ticket_alloc(log);

	down_write(&cil->xc_ctx_lock);
	ctx = cil->xc_ctx;

	spin_lock(&cil->xc_push_lock);
	push_seq = cil->xc_push_seq;
	ASSERT(push_seq <= ctx->sequence);

	/*
	 * Wake up any background push waiters now this context is being pushed.
	 */
	if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
		wake_up_all(&cil->xc_push_wait);

	/*
	 * Check if we've anything to push. If there is nothing, then we don't
	 * move on to a new sequence number and so we have to be able to push
	 * this sequence again later.
	 */
	if (list_empty(&cil->xc_cil)) {
		cil->xc_push_seq = 0;
		spin_unlock(&cil->xc_push_lock);
		goto out_skip;
	}

	/* check for a previously pushed sequence */
	if (push_seq < cil->xc_ctx->sequence) {
		spin_unlock(&cil->xc_push_lock);
		goto out_skip;
	}

	/*
	 * We are now going to push this context, so add it to the committing
	 * list before we do anything else. This ensures that anyone waiting on
	 * this push can easily detect the difference between a "push in
	 * progress" and "CIL is empty, nothing to do".
	 *
	 * IOWs, a wait loop can now check for:
	 *	the current sequence not being found on the committing list;
	 *	an empty CIL; and
	 *	an unchanged sequence number
	 * to detect a push that had nothing to do and therefore does not need
	 * waiting on. If the CIL is not empty, we get put on the committing
	 * list before emptying the CIL and bumping the sequence number. Hence
	 * an empty CIL and an unchanged sequence number means we jumped out
	 * above after doing nothing.
	 *
	 * Hence the waiter will either find the commit sequence on the
	 * committing list or the sequence number will be unchanged and the CIL
	 * still dirty. In that latter case, the push has not yet started, and
	 * so the waiter will have to continue trying to check the CIL
	 * committing list until it is found. In extreme cases of delay, the
	 * sequence may fully commit between the attempts the wait makes to wait
	 * on the commit sequence.
	 */
	list_add(&ctx->committing, &cil->xc_committing);
	spin_unlock(&cil->xc_push_lock);

	/*
	 * The CIL is stable at this point - nothing new will be added to it
	 * because we hold the flush lock exclusively. Hence we can now issue
	 * a cache flush to ensure all the completed metadata in the journal we
	 * are about to overwrite is on stable storage.
	 */
	xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev,
				&bdev_flush);
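
	/*
	 * Editor's note (hedged): the flush is submitted here, before the
	 * checkpoint is formatted, so it runs concurrently with pulling the
	 * log vectors off the CIL below; this function only waits on
	 * bdev_flush immediately before writing the first iclog. Overlapping
	 * the flush with checkpoint formatting is part of the cache flush
	 * reduction this change makes.
	 */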

	/*
	 * Pull all the log vectors off the items in the CIL, and remove the
	 * items from the CIL. We don't need the CIL lock here because it's only
	 * needed on the transaction commit side which is currently locked out
	 * by the flush lock.
	 */
	lv = NULL;
	num_iovecs = 0;
	while (!list_empty(&cil->xc_cil)) {
		struct xfs_log_item	*item;

		item = list_first_entry(&cil->xc_cil,
					struct xfs_log_item, li_cil);
		list_del_init(&item->li_cil);
		if (!ctx->lv_chain)
			ctx->lv_chain = item->li_lv;
		else
			lv->lv_next = item->li_lv;
		lv = item->li_lv;
		item->li_lv = NULL;
		num_iovecs += lv->lv_niovecs;
	}

	/*
	 * initialise the new context and attach it to the CIL. Then attach
	 * the current context to the CIL committing list so it can be found
	 * during log forces to extract the commit lsn of the sequence that
	 * needs to be forced.
	 */
	INIT_LIST_HEAD(&new_ctx->committing);
	INIT_LIST_HEAD(&new_ctx->busy_extents);
	new_ctx->sequence = ctx->sequence + 1;
	new_ctx->cil = cil;
	cil->xc_ctx = new_ctx;

	/*
	 * The switch is now done, so we can drop the context lock and move out
	 * of a shared context. We can't just go straight to the commit record,
	 * though - we need to synchronise with previous and future commits so
	 * that the commit records are correctly ordered in the log to ensure
	 * that we process items during log IO completion in the correct order.
	 *
	 * For example, if we get an EFI in one checkpoint and the EFD in the
	 * next (e.g. due to log forces), we do not want the checkpoint with
	 * the EFD to be committed before the checkpoint with the EFI. Hence
	 * we must strictly order the commit records of the checkpoints so
	 * that: a) the checkpoint callbacks are attached to the iclogs in the
	 * correct order; and b) the checkpoints are replayed in correct order
	 * in log recovery.
	 *
	 * Hence we need to add this context to the committing context list so
	 * that higher sequences will wait for us to write out a commit record
	 * before they do.
	 *
	 * xfs_log_force_lsn requires us to mirror the new sequence into the cil
	 * structure atomically with the addition of this sequence to the
	 * committing list. This also ensures that we can do unlocked checks
	 * against the current sequence in log forces without risking
	 * dereferencing a freed context pointer.
	 */
	spin_lock(&cil->xc_push_lock);
	cil->xc_current_sequence = new_ctx->sequence;
	spin_unlock(&cil->xc_push_lock);
	up_write(&cil->xc_ctx_lock);

	/*
	 * Build a checkpoint transaction header and write it to the log to
	 * begin the transaction. We need to account for the space used by the
	 * transaction header here as it is not accounted for in xlog_write().
	 *
	 * The LSN we need to pass to the log items on transaction commit is
	 * the LSN reported by the first log vector write. If we use the commit
	 * record lsn then we can move the tail beyond the grant write head.
	 */
	tic = ctx->ticket;
	thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
	thdr.th_type = XFS_TRANS_CHECKPOINT;
	thdr.th_tid = tic->t_tid;
	thdr.th_num_items = num_iovecs;
	lhdr.i_addr = &thdr;
	lhdr.i_len = sizeof(xfs_trans_header_t);
	lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
	tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);

	lvhdr.lv_niovecs = 1;
	lvhdr.lv_iovecp = &lhdr;
	lvhdr.lv_next = ctx->lv_chain;

	/*
	 * Before we format and submit the first iclog, we have to ensure that
	 * the metadata writeback ordering cache flush is complete.
	 */
	wait_for_completion(&bdev_flush);

	error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL,
				XLOG_START_TRANS);
	if (error)
		goto out_abort_free_ticket;

	/*
	 * now that we've written the checkpoint into the log, strictly
	 * order the commit records so replay will get them in the right order.
	 */
restart:
	spin_lock(&cil->xc_push_lock);
	list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
		/*
		 * Avoid getting stuck in this loop because we were woken by the
		 * shutdown, but then went back to sleep once already in the
		 * shutdown state.
		 */
		if (XLOG_FORCED_SHUTDOWN(log)) {
			spin_unlock(&cil->xc_push_lock);
			goto out_abort_free_ticket;
		}

		/*
		 * Higher sequences will wait for this one so skip them.
		 * Don't wait for our own sequence, either.
		 */
		if (new_ctx->sequence >= ctx->sequence)
			continue;
		if (!new_ctx->commit_lsn) {
			/*
			 * It is still being pushed! Wait for the push to
			 * complete, then start again from the beginning.
			 */
			xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
			goto restart;
		}
	}
	spin_unlock(&cil->xc_push_lock);

	error = xlog_commit_record(log, tic, &commit_iclog, &commit_lsn);
	if (error)
		goto out_abort_free_ticket;

	xfs_log_ticket_ungrant(log, tic);

	spin_lock(&commit_iclog->ic_callback_lock);
	if (commit_iclog->ic_state == XLOG_STATE_IOERROR) {
		spin_unlock(&commit_iclog->ic_callback_lock);
		goto out_abort;
	}
	ASSERT_ALWAYS(commit_iclog->ic_state == XLOG_STATE_ACTIVE ||
		      commit_iclog->ic_state == XLOG_STATE_WANT_SYNC);
	list_add_tail(&ctx->iclog_entry, &commit_iclog->ic_callbacks);
	spin_unlock(&commit_iclog->ic_callback_lock);

	/*
	 * now the checkpoint commit is complete and we've attached the
	 * callbacks to the iclog we can assign the commit LSN to the context
	 * and wake up anyone who is waiting for the commit to complete.
	 */
	spin_lock(&cil->xc_push_lock);
	ctx->commit_lsn = commit_lsn;
	wake_up_all(&cil->xc_commit_wait);
	spin_unlock(&cil->xc_push_lock);

	/*
	 * If the checkpoint spans multiple iclogs, wait for all previous
	 * iclogs to complete before we submit the commit_iclog. In this case,
	 * the commit_iclog write needs to issue a pre-flush so that the
	 * ordering is correctly preserved down to stable storage.
	 */
	spin_lock(&log->l_icloglock);
	if (ctx->start_lsn != commit_lsn) {
		xlog_wait_on_iclog(commit_iclog->ic_prev);
		spin_lock(&log->l_icloglock);
		commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
	}

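	/*
	 * Editor's note (hedged): the intent of this series is that
	 * XLOG_ICL_NEED_FLUSH translates to a REQ_PREFLUSH and
	 * XLOG_ICL_NEED_FUA to a REQ_FUA on the iclog bio at submission
	 * time, so the extra pre-flush is only issued when the checkpoint
	 * actually spans multiple iclogs - the "cache flush reduction" the
	 * commit title refers to.
	 */
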
	/*
	 * The commit iclog must be written to stable storage to guarantee
	 * journal IO vs metadata writeback IO is correctly ordered on stable
	 * storage.
	 */
	commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA;
	xlog_state_release_iclog(log, commit_iclog);
	spin_unlock(&log->l_icloglock);
	return;

out_skip:
	up_write(&cil->xc_ctx_lock);
	xfs_log_ticket_put(new_ctx->ticket);
	kmem_free(new_ctx);
	return;

out_abort_free_ticket:
	xfs_log_ticket_ungrant(log, tic);
out_abort:
	ASSERT(XLOG_FORCED_SHUTDOWN(log));
	xlog_cil_committed(ctx);
}

/*
 * We need to push CIL every so often so we don't cache more than we can fit in
 * the log. The limit really is that a checkpoint can't be more than half the
 * log (the current checkpoint is not allowed to overwrite the previous
 * checkpoint), but commit latency and memory usage limit this to a smaller
 * size.
 */
static void
xlog_cil_push_background(
	struct xlog	*log) __releases(cil->xc_ctx_lock)
{
	struct xfs_cil	*cil = log->l_cilp;

	/*
	 * The cil won't be empty because we are called while holding the
	 * context lock so whatever we added to the CIL will still be there.
	 */
	ASSERT(!list_empty(&cil->xc_cil));

	/*
	 * don't do a background push if we haven't used up all the
	 * space available yet.
	 */
	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
		up_read(&cil->xc_ctx_lock);
		return;
	}

	spin_lock(&cil->xc_push_lock);
	if (cil->xc_push_seq < cil->xc_current_sequence) {
		cil->xc_push_seq = cil->xc_current_sequence;
		queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
	}

	/*
	 * Drop the context lock now, we can't hold that if we need to sleep
	 * because we are over the blocking threshold. The push_lock is still
	 * held, so blocking threshold sleep/wakeup is still correctly
	 * serialised here.
	 */
	up_read(&cil->xc_ctx_lock);

	/*
	 * If we are well over the space limit, throttle the work that is being
	 * done until the push work on this context has begun.
	 */
	if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
		trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
		ASSERT(cil->xc_ctx->space_used < log->l_logsize);
		xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
		return;
	}

	spin_unlock(&cil->xc_push_lock);
}

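/*
 * Editor's note on the two thresholds used above (a summary of the code,
 * not new behaviour): past XLOG_CIL_SPACE_LIMIT a background push is
 * queued; past the higher XLOG_CIL_BLOCKING_SPACE_LIMIT committers sleep
 * on xc_push_wait until the push work starts and wakes them. The actual
 * limit values live in xfs_log_priv.h and are not repeated here.
 */
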
/*
 * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence
 * number that is passed. When it returns, the work will be queued for
 * @push_seq, but it won't be completed. The caller is expected to do any
 * waiting for push_seq to complete if it is required.
 */
static void
xlog_cil_push_now(
	struct xlog	*log,
	xfs_lsn_t	push_seq)
{
	struct xfs_cil	*cil = log->l_cilp;

	if (!cil)
		return;

	ASSERT(push_seq && push_seq <= cil->xc_current_sequence);

	/* start on any pending background push to minimise wait time on it */
	flush_work(&cil->xc_push_work);

	/*
	 * If the CIL is empty or we've already pushed the sequence then
	 * there's no work we need to do.
	 */
	spin_lock(&cil->xc_push_lock);
	if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
		spin_unlock(&cil->xc_push_lock);
		return;
	}

	cil->xc_push_seq = push_seq;
	queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
	spin_unlock(&cil->xc_push_lock);
}

bool
xlog_cil_empty(
	struct xlog	*log)
{
	struct xfs_cil	*cil = log->l_cilp;
	bool		empty = false;

	spin_lock(&cil->xc_push_lock);
	if (list_empty(&cil->xc_cil))
		empty = true;
	spin_unlock(&cil->xc_push_lock);
	return empty;
}

/*
 * Commit a transaction with the given vector to the Committed Item List.
 *
 * To do this, we need to format the item, pin it in memory if required and
 * account for the space used by the transaction. Once we have done that we
 * need to release the unused reservation for the transaction, attach the
 * transaction to the checkpoint context so we carry the busy extents through
 * to checkpoint completion, and then unlock all the items in the transaction.
 *
 * Called with the context lock already held in read mode to lock out
 * background commit, returns without it held once background commits are
 * allowed again.
 */
void
xfs_log_commit_cil(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_lsn_t		*commit_lsn,
	bool			regrant)
{
	struct xlog		*log = mp->m_log;
	struct xfs_cil		*cil = log->l_cilp;
	struct xfs_log_item	*lip, *next;
	xfs_lsn_t		xc_commit_lsn;

	/*
	 * Do all necessary memory allocation before we lock the CIL.
	 * This ensures the allocation does not deadlock with a CIL
	 * push in memory reclaim (e.g. from kswapd).
	 */
	xlog_cil_alloc_shadow_bufs(log, tp);

	/* lock out background commit */
	down_read(&cil->xc_ctx_lock);

	xlog_cil_insert_items(log, tp);

	xc_commit_lsn = cil->xc_ctx->sequence;
	if (commit_lsn)
		*commit_lsn = xc_commit_lsn;

	if (regrant && !XLOG_FORCED_SHUTDOWN(log))
		xfs_log_ticket_regrant(log, tp->t_ticket);
	else
		xfs_log_ticket_ungrant(log, tp->t_ticket);
	tp->t_ticket = NULL;
	xfs_trans_unreserve_and_mod_sb(tp);

	/*
	 * Once all the items of the transaction have been copied to the CIL,
	 * the items can be unlocked and possibly freed.
	 *
	 * This needs to be done before we drop the CIL context lock because we
	 * have to update state in the log items and unlock them before they go
	 * to disk. If we don't, then the CIL checkpoint can race with us and
	 * we can run checkpoint completion before we've updated and unlocked
	 * the log items. This affects (at least) processing of stale buffers,
	 * inodes and EFIs.
	 */
	trace_xfs_trans_commit_items(tp, _RET_IP_);
	list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
		xfs_trans_del_item(lip);
		if (lip->li_ops->iop_committing)
			lip->li_ops->iop_committing(lip, xc_commit_lsn);
	}

	/* xlog_cil_push_background() releases cil->xc_ctx_lock */
	xlog_cil_push_background(log);
}

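/*
 * Editor's summary of the commit fast path above (derived from the code,
 * no new behaviour implied):
 *
 *  1. allocate/size shadow buffers with no locks held;
 *  2. take xc_ctx_lock shared to exclude a concurrent CIL push;
 *  3. format items and steal reservation (xlog_cil_insert_items);
 *  4. regrant or ungrant the transaction ticket;
 *  5. unlock/free items via ->iop_committing;
 *  6. xlog_cil_push_background() drops xc_ctx_lock, possibly queueing
 *     or throttling on a push.
 */
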
/*
 * Conditionally push the CIL based on the sequence passed in.
 *
 * We only need to push if we haven't already pushed the sequence
 * number given. Hence the only time we will trigger a push here is
 * if the push sequence is the same as the current context.
 *
 * We return the current commit lsn to allow the callers to determine if an
 * iclog flush is necessary following this call.
 */
xfs_lsn_t
xlog_cil_force_lsn(
	struct xlog	*log,
	xfs_lsn_t	sequence)
{
	struct xfs_cil		*cil = log->l_cilp;
	struct xfs_cil_ctx	*ctx;
	xfs_lsn_t		commit_lsn = NULLCOMMITLSN;

	ASSERT(sequence <= cil->xc_current_sequence);

	/*
	 * check to see if we need to force out the current context.
	 * xlog_cil_push() handles racing pushes for the same sequence,
	 * so no need to deal with it here.
	 */
restart:
	xlog_cil_push_now(log, sequence);

	/*
	 * See if we can find a previous sequence still committing.
	 * We need to wait for all previous sequence commits to complete
	 * before allowing the force of push_seq to go ahead. Hence block
	 * on commits for those as well.
	 */
	spin_lock(&cil->xc_push_lock);
	list_for_each_entry(ctx, &cil->xc_committing, committing) {
		/*
		 * Avoid getting stuck in this loop because we were woken by the
		 * shutdown, but then went back to sleep once already in the
		 * shutdown state.
		 */
		if (XLOG_FORCED_SHUTDOWN(log))
			goto out_shutdown;
		if (ctx->sequence > sequence)
			continue;
		if (!ctx->commit_lsn) {
			/*
			 * It is still being pushed! Wait for the push to
			 * complete, then start again from the beginning.
			 */
			xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
			goto restart;
		}
		if (ctx->sequence != sequence)
			continue;
		/* found it! */
		commit_lsn = ctx->commit_lsn;
	}

	/*
	 * The call to xlog_cil_push_now() executes the push in the background.
	 * Hence by the time we have got here our sequence may not have been
	 * pushed yet. This is true if the current sequence still matches the
	 * push sequence after the above wait loop and the CIL still contains
	 * dirty objects. This is guaranteed by the push code first adding the
	 * context to the committing list before emptying the CIL.
	 *
	 * Hence if we don't find the context in the committing list and the
	 * current sequence number is unchanged then the CIL contents are
	 * significant. If the CIL is empty, it means there was nothing to push
	 * and that means there is nothing to wait for. If the CIL is not empty,
	 * it means we haven't yet started the push, because if it had started
	 * we would have found the context on the committing list.
	 */
	if (sequence == cil->xc_current_sequence &&
	    !list_empty(&cil->xc_cil)) {
		spin_unlock(&cil->xc_push_lock);
		goto restart;
	}

	spin_unlock(&cil->xc_push_lock);
	return commit_lsn;

	/*
	 * We detected a shutdown in progress. We need to trigger the log force
	 * to pass through its iclog state machine error handling, even though
	 * we are already in a shutdown state. Hence we can't return
	 * NULLCOMMITLSN here as that has special meaning to log forces (i.e.
	 * LSN is already stable), so we return a zero LSN instead.
	 */
out_shutdown:
	spin_unlock(&cil->xc_push_lock);
	return 0;
}

/*
 * Check if the current log item was first committed in this sequence.
 * We can't rely on just the log item being in the CIL, we have to check
 * the recorded commit sequence number.
 *
 * Note: for this to be used in a non-racy manner, it has to be called with
 * CIL flushing locked out. As a result, it should only be used during the
 * transaction commit process when deciding what to format into the item.
 */
bool
xfs_log_item_in_current_chkpt(
	struct xfs_log_item	*lip)
{
	struct xfs_cil_ctx	*ctx;

	if (list_empty(&lip->li_cil))
		return false;

	ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;

	/*
	 * li_seq is written on the first commit of a log item to record the
	 * first checkpoint it is written to. Hence if it is different to the
	 * current sequence, we're in a new checkpoint.
	 */
	if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
		return false;
	return true;
}

/*
 * Perform initial CIL structure initialisation.
 */
int
xlog_cil_init(
	struct xlog	*log)
{
	struct xfs_cil		*cil;
	struct xfs_cil_ctx	*ctx;

	cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
	if (!cil)
		return -ENOMEM;

	ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL);
	if (!ctx) {
		kmem_free(cil);
		return -ENOMEM;
	}

	INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
	INIT_LIST_HEAD(&cil->xc_cil);
	INIT_LIST_HEAD(&cil->xc_committing);
	spin_lock_init(&cil->xc_cil_lock);
	spin_lock_init(&cil->xc_push_lock);
	init_waitqueue_head(&cil->xc_push_wait);
	init_rwsem(&cil->xc_ctx_lock);
	init_waitqueue_head(&cil->xc_commit_wait);

	INIT_LIST_HEAD(&ctx->committing);
	INIT_LIST_HEAD(&ctx->busy_extents);
	ctx->sequence = 1;
	ctx->cil = cil;
	cil->xc_ctx = ctx;
	cil->xc_current_sequence = ctx->sequence;

	cil->xc_log = log;
	log->l_cilp = cil;
	return 0;
}

void
xlog_cil_destroy(
	struct xlog	*log)
{
	if (log->l_cilp->xc_ctx) {
		if (log->l_cilp->xc_ctx->ticket)
			xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
		kmem_free(log->l_cilp->xc_ctx);
	}

	ASSERT(list_empty(&log->l_cilp->xc_cil));
	kmem_free(log->l_cilp);
}