xfs: teach the tempfile to set up atomic file content exchanges
[linux-2.6-block.git] / fs / xfs / scrub / tempfile.c
CommitLineData
84c14ee3
DW
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_log_format.h"
13#include "xfs_trans.h"
14#include "xfs_inode.h"
15#include "xfs_ialloc.h"
16#include "xfs_quota.h"
e81ce424 17#include "xfs_bmap.h"
84c14ee3
DW
18#include "xfs_bmap_btree.h"
19#include "xfs_trans_space.h"
20#include "xfs_dir2.h"
21#include "xfs_exchrange.h"
56596d8b 22#include "xfs_exchmaps.h"
e81ce424 23#include "xfs_defer.h"
84c14ee3
DW
24#include "scrub/scrub.h"
25#include "scrub/common.h"
e81ce424 26#include "scrub/repair.h"
84c14ee3
DW
27#include "scrub/trace.h"
28#include "scrub/tempfile.h"
56596d8b 29#include "scrub/tempexch.h"
e81ce424 30#include "scrub/xfile.h"
84c14ee3
DW
31
/*
 * Create a temporary file for reconstructing metadata, with the intention of
 * atomically exchanging the temporary file's contents with the file that's
 * being repaired.
 *
 * @mode is the file mode (e.g. S_IFREG or S_IFDIR) for the new file.
 * Returns 0 and sets sc->tempip on success; returns a negative errno and
 * leaves sc->tempip NULL on failure.
 */
int
xrep_tempfile_create(
	struct xfs_scrub	*sc,
	uint16_t		mode)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_trans	*tp = NULL;
	struct xfs_dquot	*udqp = NULL;
	struct xfs_dquot	*gdqp = NULL;
	struct xfs_dquot	*pdqp = NULL;
	struct xfs_trans_res	*tres;
	struct xfs_inode	*dp = mp->m_rootip;
	xfs_ino_t		ino;
	unsigned int		resblks;
	bool			is_dir = S_ISDIR(mode);
	int			error;

	/* No point in building a new file on a dead or readonly fs. */
	if (xfs_is_shutdown(mp))
		return -EIO;
	if (xfs_is_readonly(mp))
		return -EROFS;

	/* There must not already be a scrub transaction or temp file. */
	ASSERT(sc->tp == NULL);
	ASSERT(sc->tempip == NULL);

	/*
	 * Make sure that we have allocated dquot(s) on disk. The temporary
	 * inode should be completely root owned so that we don't fail due to
	 * quota limits.
	 */
	error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
			XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp);
	if (error)
		return error;

	/* Directories need mkdir-sized reservations; files need icreate. */
	if (is_dir) {
		resblks = XFS_MKDIR_SPACE_RES(mp, 0);
		tres = &M_RES(mp)->tr_mkdir;
	} else {
		resblks = XFS_IALLOC_SPACE_RES(mp);
		tres = &M_RES(mp)->tr_create_tmpfile;
	}

	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
			&tp);
	if (error)
		goto out_release_dquots;

	/* Allocate inode, set up directory. */
	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
	if (error)
		goto out_trans_cancel;
	error = xfs_init_new_inode(&nop_mnt_idmap, tp, dp, ino, mode, 0, 0,
			0, false, &sc->tempip);
	if (error)
		goto out_trans_cancel;

	/* Change the ownership of the inode to root. */
	VFS_I(sc->tempip)->i_uid = GLOBAL_ROOT_UID;
	VFS_I(sc->tempip)->i_gid = GLOBAL_ROOT_GID;
	/*
	 * Clear the realtime flags, presumably so that rebuilt contents always
	 * live on the data device -- TODO confirm against callers.
	 */
	sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT);
	xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE);

	/*
	 * Mark our temporary file as private so that LSMs and the ACL code
	 * don't try to add their own metadata or reason about these files.
	 * The file should never be exposed to userspace.
	 */
	VFS_I(sc->tempip)->i_flags |= S_PRIVATE;
	VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR;

	if (is_dir) {
		error = xfs_dir_init(tp, sc->tempip, dp);
		if (error)
			goto out_trans_cancel;
	}

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * These ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp);

	/*
	 * Put our temp file on the unlinked list so it's purged automatically.
	 * All file-based metadata being reconstructed using this file must be
	 * atomically exchanged with the original file because the contents
	 * here will be purged when the inode is dropped or log recovery cleans
	 * out the unlinked list.
	 */
	error = xfs_iunlink(tp, sc->tempip);
	if (error)
		goto out_trans_cancel;

	error = xfs_trans_commit(tp);
	if (error)
		goto out_release_inode;

	trace_xrep_tempfile_create(sc);

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	/* Finish setting up the incore / vfs context. */
	xfs_setup_iops(sc->tempip);
	xfs_finish_inode_setup(sc->tempip);

	/* The temp file starts out unlocked. */
	sc->temp_ilock_flags = 0;
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to finish the
	 * setup of the inode and release the inode. This prevents recursive
	 * transactions and deadlocks from xfs_inactive.
	 */
	if (sc->tempip) {
		xfs_finish_inode_setup(sc->tempip);
		xchk_irele(sc, sc->tempip);
	}
out_release_dquots:
	/* xfs_qm_dqrele tolerates NULL dquots, so this is safe on all paths. */
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	return error;
}
168
169/* Take IOLOCK_EXCL on the temporary file, maybe. */
170bool
171xrep_tempfile_iolock_nowait(
172 struct xfs_scrub *sc)
173{
174 if (xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL)) {
175 sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
176 return true;
177 }
178
179 return false;
180}
181
/*
 * Take the temporary file's IOLOCK while holding a different inode's IOLOCK.
 * In theory nobody else should hold the tempfile's IOLOCK, but we use trylock
 * to avoid deadlocks and lockdep complaints.  Returns 0 once the lock is
 * held, or a negative errno if the scrub is terminated while polling.
 */
int
xrep_tempfile_iolock_polled(
	struct xfs_scrub	*sc)
{
	int			error = 0;

	for (;;) {
		if (xrep_tempfile_iolock_nowait(sc))
			return 0;
		/* Bail out if a fatal signal or shutdown interrupts us. */
		if (xchk_should_terminate(sc, &error))
			return error;
		delay(1);
	}
}
201
202/* Release IOLOCK_EXCL on the temporary file. */
203void
204xrep_tempfile_iounlock(
205 struct xfs_scrub *sc)
206{
207 xfs_iunlock(sc->tempip, XFS_IOLOCK_EXCL);
208 sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL;
209}
210
/* Prepare the temporary file for metadata updates by grabbing ILOCK_EXCL. */
void
xrep_tempfile_ilock(
	struct xfs_scrub	*sc)
{
	/*
	 * NOTE(review): the flag is recorded before the (possibly blocking)
	 * lock acquisition, unlike the nowait variants which record it after
	 * -- presumably intentional since temp_ilock_flags is private to this
	 * scrub context; confirm before reordering.
	 */
	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
	xfs_ilock(sc->tempip, XFS_ILOCK_EXCL);
}
219
220/* Try to grab ILOCK_EXCL on the temporary file. */
221bool
222xrep_tempfile_ilock_nowait(
223 struct xfs_scrub *sc)
224{
225 if (xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL)) {
226 sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
227 return true;
228 }
229
230 return false;
231}
232
233/* Unlock ILOCK_EXCL on the temporary file after an update. */
234void
235xrep_tempfile_iunlock(
236 struct xfs_scrub *sc)
237{
238 xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
239 sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL;
240}
241
242/* Release the temporary file. */
243void
244xrep_tempfile_rele(
245 struct xfs_scrub *sc)
246{
247 if (!sc->tempip)
248 return;
249
250 if (sc->temp_ilock_flags) {
251 xfs_iunlock(sc->tempip, sc->temp_ilock_flags);
252 sc->temp_ilock_flags = 0;
253 }
254
255 xchk_irele(sc, sc->tempip);
256 sc->tempip = NULL;
257}
e81ce424
DW
258
/*
 * Make sure that the given range of the data fork of the temporary file is
 * mapped to written blocks.  The caller must ensure that both inodes are
 * joined to the transaction.
 *
 * @off and @len are in units of filesystem blocks.  Returns 0 on success,
 * -EFSCORRUPTED if the fork contains holes or delalloc reservations, or
 * another negative errno.
 */
int
xrep_tempfile_prealloc(
	struct xfs_scrub	*sc,
	xfs_fileoff_t		off,
	xfs_filblks_t		len)
{
	struct xfs_bmbt_irec	map;
	xfs_fileoff_t		end = off + len;
	int			error;

	ASSERT(sc->tempip != NULL);
	ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip));

	/* Walk the range one mapping at a time. */
	for (; off < end; off = map.br_startoff + map.br_blockcount) {
		int		nmaps = 1;

		/*
		 * If we have a real extent mapping this block then we're
		 * in ok shape.
		 */
		error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps,
				XFS_DATA_FORK);
		if (error)
			return error;
		if (nmaps == 0) {
			/* Trip debug kernels; corruption error otherwise. */
			ASSERT(nmaps != 0);
			return -EFSCORRUPTED;
		}

		if (xfs_bmap_is_written_extent(&map))
			continue;

		/*
		 * If we find a delalloc reservation then something is very
		 * very wrong.  Bail out.
		 */
		if (map.br_startblock == DELAYSTARTBLOCK)
			return -EFSCORRUPTED;

		/*
		 * Make sure this block has a real zeroed extent allocated to
		 * it.
		 */
		nmaps = 1;
		error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off,
				XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map,
				&nmaps);
		if (error)
			return error;
		if (nmaps != 1)
			return -EFSCORRUPTED;

		trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map);

		/* Commit new extent and all deferred work. */
		error = xfs_defer_finish(&sc->tp);
		if (error)
			return error;
	}

	return 0;
}
326
327/*
328 * Write data to each block of a file. The given range of the tempfile's data
329 * fork must already be populated with written extents.
330 */
331int
332xrep_tempfile_copyin(
333 struct xfs_scrub *sc,
334 xfs_fileoff_t off,
335 xfs_filblks_t len,
336 xrep_tempfile_copyin_fn prep_fn,
337 void *data)
338{
339 LIST_HEAD(buffers_list);
340 struct xfs_mount *mp = sc->mp;
341 struct xfs_buf *bp;
342 xfs_fileoff_t flush_mask;
343 xfs_fileoff_t end = off + len;
344 loff_t pos = XFS_FSB_TO_B(mp, off);
345 int error = 0;
346
347 ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode));
348
349 /* Flush buffers to disk every 512K */
350 flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1;
351
352 for (; off < end; off++, pos += mp->m_sb.sb_blocksize) {
353 struct xfs_bmbt_irec map;
354 int nmaps = 1;
355
356 /* Read block mapping for this file block. */
357 error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0);
358 if (error)
359 goto out_err;
360 if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) {
361 error = -EFSCORRUPTED;
362 goto out_err;
363 }
364
365 /* Get the metadata buffer for this offset in the file. */
366 error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp,
367 XFS_FSB_TO_DADDR(mp, map.br_startblock),
368 mp->m_bsize, 0, &bp);
369 if (error)
370 goto out_err;
371
372 trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map);
373
374 /* Read in a block's worth of data from the xfile. */
375 error = prep_fn(sc, bp, data);
376 if (error) {
377 xfs_trans_brelse(sc->tp, bp);
378 goto out_err;
379 }
380
381 /* Queue buffer, and flush if we have too much dirty data. */
382 xfs_buf_delwri_queue_here(bp, &buffers_list);
383 xfs_trans_brelse(sc->tp, bp);
384
385 if (!(off & flush_mask)) {
386 error = xfs_buf_delwri_submit(&buffers_list);
387 if (error)
388 goto out_err;
389 }
390 }
391
392 /*
393 * Write the new blocks to disk. If the ordered list isn't empty after
394 * that, then something went wrong and we have to fail. This should
395 * never happen, but we'll check anyway.
396 */
397 error = xfs_buf_delwri_submit(&buffers_list);
398 if (error)
399 goto out_err;
400
401 if (!list_empty(&buffers_list)) {
402 ASSERT(list_empty(&buffers_list));
403 error = -EIO;
404 goto out_err;
405 }
406
407 return 0;
408
409out_err:
410 xfs_buf_delwri_cancel(&buffers_list);
411 return error;
412}
413
414/*
415 * Set the temporary file's size. Caller must join the tempfile to the scrub
416 * transaction and is responsible for adjusting block mappings as needed.
417 */
418int
419xrep_tempfile_set_isize(
420 struct xfs_scrub *sc,
421 unsigned long long isize)
422{
423 if (sc->tempip->i_disk_size == isize)
424 return 0;
425
426 sc->tempip->i_disk_size = isize;
427 i_size_write(VFS_I(sc->tempip), isize);
428 return xrep_tempfile_roll_trans(sc);
429}
430
431/*
432 * Roll a repair transaction involving the temporary file. Caller must join
433 * both the temporary file and the file being scrubbed to the transaction.
434 * This function return with both inodes joined to a new scrub transaction,
435 * or the usual negative errno.
436 */
437int
438xrep_tempfile_roll_trans(
439 struct xfs_scrub *sc)
440{
441 int error;
442
443 xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE);
444 error = xrep_roll_trans(sc);
445 if (error)
446 return error;
447
448 xfs_trans_ijoin(sc->tp, sc->tempip, 0);
449 return 0;
450}
56596d8b
DW
451
452/* Enable file content exchanges. */
453int
454xrep_tempexch_enable(
455 struct xfs_scrub *sc)
456{
457 if (sc->flags & XREP_FSGATES_EXCHANGE_RANGE)
458 return 0;
459
460 if (!xfs_has_exchange_range(sc->mp))
461 return -EOPNOTSUPP;
462
463 trace_xchk_fsgates_enable(sc, XREP_FSGATES_EXCHANGE_RANGE);
464
465 sc->flags |= XREP_FSGATES_EXCHANGE_RANGE;
466 return 0;
467}
468
469/*
470 * Fill out the mapping exchange request in preparation for atomically
471 * committing the contents of a metadata file that we've rebuilt in the temp
472 * file.
473 */
474STATIC int
475xrep_tempexch_prep_request(
476 struct xfs_scrub *sc,
477 int whichfork,
478 struct xrep_tempexch *tx)
479{
480 struct xfs_exchmaps_req *req = &tx->req;
481
482 memset(tx, 0, sizeof(struct xrep_tempexch));
483
484 /* COW forks don't exist on disk. */
485 if (whichfork == XFS_COW_FORK) {
486 ASSERT(0);
487 return -EINVAL;
488 }
489
490 /* Both files should have the relevant forks. */
491 if (!xfs_ifork_ptr(sc->ip, whichfork) ||
492 !xfs_ifork_ptr(sc->tempip, whichfork)) {
493 ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
494 ASSERT(xfs_ifork_ptr(sc->tempip, whichfork) != NULL);
495 return -EINVAL;
496 }
497
498 /* Exchange all mappings in both forks. */
499 req->ip1 = sc->tempip;
500 req->ip2 = sc->ip;
501 req->startoff1 = 0;
502 req->startoff2 = 0;
503 switch (whichfork) {
504 case XFS_ATTR_FORK:
505 req->flags |= XFS_EXCHMAPS_ATTR_FORK;
506 break;
507 case XFS_DATA_FORK:
508 /* Always exchange sizes when exchanging data fork mappings. */
509 req->flags |= XFS_EXCHMAPS_SET_SIZES;
510 break;
511 }
512 req->blockcount = XFS_MAX_FILEOFF;
513
514 return 0;
515}
516
/*
 * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
 * this if quota enforcement is disabled or if both inodes' dquots are the
 * same.
 */
STATIC int
xrep_tempexch_reserve_quota(
	struct xfs_scrub	*sc,
	const struct xrep_tempexch	*tx)
{
	struct xfs_trans	*tp = sc->tp;
	const struct xfs_exchmaps_req	*req = &tx->req;
	int64_t			ddelta, rdelta;
	int			error;

	/*
	 * Don't bother with a quota reservation if we're not enforcing them
	 * or the two inodes have the same dquots.
	 */
	if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
	    (req->ip1->i_udquot == req->ip2->i_udquot &&
	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
	     req->ip1->i_pdquot == req->ip2->i_pdquot))
		return 0;

	/*
	 * Quota reservation for each file comes from two sources.  First, we
	 * need to account for any net gain in mapped blocks during the
	 * exchange.  Second, we need reservation for the gross gain in mapped
	 * blocks so that we don't trip over any quota block reservation
	 * assertions.  We must reserve the gross gain because the quota code
	 * subtracts from bcount the number of blocks that we unmap; it does
	 * not add that quantity back to the quota block reservation.
	 */
	/* ip1 (the tempfile) gains ip2's mappings: reserve net + gross. */
	ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount);
	rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount);
	error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
			ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount,
			true);
	if (error)
		return error;

	/* ...and the mirror-image reservation for ip2. */
	ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount);
	rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount);
	return xfs_trans_reserve_quota_nblks(tp, req->ip2,
			ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount,
			true);
}
566
/*
 * Prepare an existing transaction for an atomic file contents exchange.
 *
 * This function fills out the mapping exchange request and resource estimation
 * structures in preparation for exchanging the contents of a metadata file
 * that has been rebuilt in the temp file.  Next, it reserves space and quota
 * for the transaction.
 *
 * The caller must hold ILOCK_EXCL of the scrub target file and the temporary
 * file.  The caller must join both inodes to the transaction with no unlock
 * flags, and is responsible for dropping both ILOCKs when appropriate.  Only
 * use this when those ILOCKs cannot be dropped.
 */
int
xrep_tempexch_trans_reserve(
	struct xfs_scrub	*sc,
	int			whichfork,
	struct xrep_tempexch	*tx)
{
	int			error;

	ASSERT(sc->tp != NULL);
	xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL);
	xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL);

	/* Fill out @tx with the exchange request for @whichfork. */
	error = xrep_tempexch_prep_request(sc, whichfork, tx);
	if (error)
		return error;

	/* Estimate the block and log reservations needed for the exchange. */
	error = xfs_exchmaps_estimate(&tx->req);
	if (error)
		return error;

	/* Add the estimated block reservation to the existing transaction. */
	error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0);
	if (error)
		return error;

	return xrep_tempexch_reserve_quota(sc, tx);
}
606
/*
 * Exchange file mappings (and hence file contents) between the file being
 * repaired and the temporary file.  Returns with both inodes locked and joined
 * to a clean scrub transaction.
 */
int
xrep_tempexch_contents(
	struct xfs_scrub	*sc,
	struct xrep_tempexch	*tx)
{
	int			error;

	ASSERT(sc->flags & XREP_FSGATES_EXCHANGE_RANGE);

	/* Exchange the mappings and finish all resulting deferred work. */
	xfs_exchange_mappings(sc->tp, &tx->req);
	error = xfs_defer_finish(&sc->tp);
	if (error)
		return error;

	/*
	 * If we exchanged the ondisk sizes of two metadata files, we must
	 * exchange the incore sizes as well.
	 */
	if (tx->req.flags & XFS_EXCHMAPS_SET_SIZES) {
		loff_t	temp;

		temp = i_size_read(VFS_I(sc->ip));
		i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip)));
		i_size_write(VFS_I(sc->tempip), temp);
	}

	return 0;
}