// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2022 Fujitsu. All Rights Reserved.
 */

#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_bit.h"
#include "xfs_btree.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_trans.h"
#include "xfs_ag.h"

#include <linux/mm.h>
#include <linux/dax.h>
#include <linux/fs.h>

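/*
 * State passed to the rmap query callback: the failed range within the
 * current AG, the memory failure flags, and whether the scan decided that
 * a filesystem shutdown is required.
 */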
struct xfs_failure_info {
	xfs_agblock_t		startblock;
	xfs_extlen_t		blockcount;
	int			mf_flags;
	bool			want_shutdown;
};

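/*
 * Translate the failed range into a page offset within the owning file's
 * mapping, accounting for a failure that starts partway into this rmap
 * record.
 */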
static pgoff_t
xfs_failure_pgoff(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rec,
	const struct xfs_failure_info	*notify)
{
	loff_t	pos = XFS_FSB_TO_B(mp, rec->rm_offset);

	if (notify->startblock > rec->rm_startblock)
		pos += XFS_FSB_TO_B(mp,
				notify->startblock - rec->rm_startblock);
	return pos >> PAGE_SHIFT;
}

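/*
 * Compute how many pages the failed range and this rmap record have in
 * common, i.e. the size of their intersection in units of pages.
 */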
static unsigned long
xfs_failure_pgcnt(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rec,
	const struct xfs_failure_info	*notify)
{
	xfs_agblock_t	end_rec;
	xfs_agblock_t	end_notify;
	xfs_agblock_t	start_cross;
	xfs_agblock_t	end_cross;

	start_cross = max(rec->rm_startblock, notify->startblock);

	end_rec = rec->rm_startblock + rec->rm_blockcount;
	end_notify = notify->startblock + notify->blockcount;
	end_cross = min(end_rec, end_notify);

	return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
}

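/*
 * rmap query callback. For each record overlapping the failed range, look up
 * the owning inode and kill the processes mapping the poisoned pages.
 * Metadata owners can't be handled this way, so they only flag that a
 * shutdown is wanted.
 */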
static int
xfs_dax_failure_fn(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_failure_info		*notify = data;
	struct address_space		*mapping;
	pgoff_t				pgoff;
	unsigned long			pgcnt;
	int				error = 0;

	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
		/* Continue the query because this isn't a failure. */
		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
			return 0;
		notify->want_shutdown = true;
		return 0;
	}

	/* Get files that are incore, filter out others that are not in use. */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
			 0, &ip);
	/* Continue the rmap query if the inode isn't incore. */
	if (error == -ENODATA)
		return 0;
	if (error) {
		notify->want_shutdown = true;
		return 0;
	}

	mapping = VFS_I(ip)->i_mapping;
	pgoff = xfs_failure_pgoff(mp, rec, notify);
	pgcnt = xfs_failure_pgcnt(mp, rec, notify);

	/* Continue the rmap query if the inode isn't a dax file. */
	if (dax_mapping(mapping))
		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
				notify->mf_flags);

	/* Invalidate the cache in dax pages. */
	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
		invalidate_inode_pages2_range(mapping, pgoff,
				pgoff + pgcnt - 1);

	xfs_irele(ip);
	return error;
}

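/*
 * Take a kernel-level freeze ahead of device removal so that no new mappings
 * can be created while the failure is being processed.
 */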
static int
xfs_dax_notify_failure_freeze(
	struct xfs_mount	*mp)
{
	struct super_block	*sb = mp->m_super;
	int			error;

	error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
	if (error)
		xfs_emerg(mp, "already frozen by kernel, err=%d", error);

	return error;
}

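/*
 * Undo the kernel-level freeze taken above, and drop any userspace-held
 * freeze too, since the device is going away regardless.
 */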
static void
xfs_dax_notify_failure_thaw(
	struct xfs_mount	*mp,
	bool			kernel_frozen)
{
	struct super_block	*sb = mp->m_super;
	int			error;

	if (kernel_frozen) {
		error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
		if (error)
			xfs_emerg(mp, "still frozen after notify failure, err=%d",
				error);
	}

	/*
	 * Also thaw the userspace-held freeze anyway, because the device is
	 * about to be removed immediately.
	 */
	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
}

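/*
 * Walk the reverse mappings of every AG covered by the failed data device
 * range and notify the owner of each affected extent. Shut the filesystem
 * down if anything unrecoverable turns up, or force an unmount in the
 * pre-remove case.
 */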
static int
xfs_dax_notify_ddev_failure(
	struct xfs_mount	*mp,
	xfs_daddr_t		daddr,
	xfs_daddr_t		bblen,
	int			mf_flags)
{
	struct xfs_failure_info	notify = { .mf_flags = mf_flags };
	struct xfs_trans	*tp = NULL;
	struct xfs_btree_cur	*cur = NULL;
	struct xfs_buf		*agf_bp = NULL;
	int			error = 0;
	bool			kernel_frozen = false;
	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, daddr);
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, fsbno);
	xfs_fsblock_t		end_fsbno = XFS_DADDR_TO_FSB(mp,
							daddr + bblen - 1);
	xfs_agnumber_t		end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);

	if (mf_flags & MF_MEM_PRE_REMOVE) {
		xfs_info(mp, "Device is about to be removed!");
		/*
		 * Freeze the fs to prevent new mappings from being created.
		 * - Keep going if someone else already holds the kernel
		 *   freeze.
		 * - Keep going on other errors too, because this device is
		 *   starting to fail.
		 * - If the kernel freeze is taken successfully here, thaw it
		 *   at the end as well.
		 */
		kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
	}

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		goto out;

	for (; agno <= end_agno; agno++) {
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;
		struct xfs_agf		*agf;
		struct xfs_perag	*pag;
		xfs_agblock_t		range_agend;

		pag = xfs_perag_get(mp, agno);
		error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
		if (error) {
			xfs_perag_put(pag);
			break;
		}

		cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * a [start, end] where we're looking for the files or
		 * metadata.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
		if (agno == end_agno)
			ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);

		agf = agf_bp->b_addr;
		range_agend = min(be32_to_cpu(agf->agf_length) - 1,
				ri_high.rm_startblock);
		notify.startblock = ri_low.rm_startblock;
		notify.blockcount = range_agend + 1 - ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_dax_failure_fn, &notify);
		xfs_btree_del_cursor(cur, error);
		xfs_trans_brelse(tp, agf_bp);
		xfs_perag_put(pag);
		if (error)
			break;

		fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
	}

	xfs_trans_cancel(tp);

	/*
	 * In the pre-remove case, shut the fs down via a forced unmount,
	 * which won't fail, so errors can be ignored. Otherwise, shut the
	 * filesystem down with the CORRUPT flag if an error occurred or
	 * notify.want_shutdown was set during the rmap query.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
	else if (error || notify.want_shutdown) {
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
		if (!error)
			error = -EFSCORRUPTED;
	}

out:
	/* Thaw the fs if it was frozen above. */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_dax_notify_failure_thaw(mp, kernel_frozen);

	return error;
}

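/*
 * ->notify_failure entry point of the dax_holder_operations below. Work out
 * which backing device the failure hit, clamp the byte range to the data
 * device, and hand it off to the rmap-based walk above.
 */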
static int
xfs_dax_notify_failure(
	struct dax_device	*dax_dev,
	u64			offset,
	u64			len,
	int			mf_flags)
{
	struct xfs_mount	*mp = dax_holder(dax_dev);
	u64			ddev_start;
	u64			ddev_end;

	if (!(mp->m_super->s_flags & SB_BORN)) {
		xfs_warn(mp, "filesystem is not ready for notify_failure()!");
		return -EIO;
	}

	if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
		xfs_debug(mp,
			 "notify_failure() not supported on realtime device!");
		return -EOPNOTSUPP;
	}

	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
	    mp->m_logdev_targp != mp->m_ddev_targp) {
		/*
		 * In the pre-remove case the failure notification is
		 * attempting to trigger a force unmount. The expectation is
		 * that the device is still present, but its removal is in
		 * progress and cannot be cancelled, so proceed with accessing
		 * the log device.
		 */
		if (mf_flags & MF_MEM_PRE_REMOVE)
			return 0;
		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
		return -EFSCORRUPTED;
	}

	if (!xfs_has_rmapbt(mp)) {
		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
		return -EOPNOTSUPP;
	}

	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;

	/* Notify failure on the whole device. */
	if (offset == 0 && len == U64_MAX) {
		offset = ddev_start;
		len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
	}

	/* Ignore ranges that lie entirely outside the filesystem area. */
	if (offset + len - 1 < ddev_start)
		return -ENXIO;
	if (offset > ddev_end)
		return -ENXIO;

	/* Translate the range to be relative to the data device and clamp it. */
	if (offset > ddev_start)
		offset -= ddev_start;
	else {
		len -= ddev_start - offset;
		offset = 0;
	}
	if (offset + len - 1 > ddev_end)
		len = ddev_end - offset + 1;

	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
			mf_flags);
}

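/* DAX holder operations; media failure notifications arrive through ->notify_failure. */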
const struct dax_holder_operations xfs_dax_holder_operations = {
	.notify_failure		= xfs_dax_notify_failure,
};