Commit | Line | Data |
---|---|---|
dfa0a449 DH |
1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* vnode and volume validity verification. | |
3 | * | |
4 | * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. | |
5 | * Written by David Howells (dhowells@redhat.com) | |
6 | */ | |
7 | ||
8 | #include <linux/kernel.h> | |
9 | #include <linux/module.h> | |
10 | #include <linux/sched.h> | |
11 | #include "internal.h" | |
12 | ||
453924de DH |
13 | /* |
14 | * Data validation is managed through a number of mechanisms from the server: | |
15 | * | |
16 | * (1) On first contact with a server (such as if it has just been rebooted), | |
17 | * the server sends us a CB.InitCallBackState* request. | |
18 | * | |
19 | * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC | |
20 | * calls, the server maintains a time-limited per-vnode promise that it | |
21 | * will send us a CB.CallBack request if a third party alters the vnodes | |
22 | * accessed. | |
23 | * | |
24 | * Note that vnode-level callbacks may also be sent for other reasons, | |
25 | * such as filelock release. | |
26 | * | |
27 | * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC | |
28 | * calls, each server maintains a time-limited per-volume promise that it | |
29 | * will send us a CB.CallBack request if the RO volume is updated to a | |
30 | * snapshot of the RW volume ("vos release"). This is an atomic event | |
31 | * that cuts over all instances of the RO volume across multiple servers | |
32 | * simultaneously. | |
33 | * | |
34 | * Note that volume-level callbacks may also be sent for other reasons, | |
35 | * such as the volumeserver taking over control of the volume from the | |
36 | * fileserver. | |
37 | * | |
38 | * Note also that each server maintains an independent time limit on an | |
39 | * independent callback. | |
40 | * | |
41 | * (4) Certain RPC calls include a volume information record "VolSync" in | |
42 | * their reply. This contains a creation date for the volume that should | |
43 | * remain unchanged for a RW volume (but will be changed if the volume is | |
44 | * restored from backup) or will be bumped to the time of snapshotting | |
45 | * when a RO volume is released. | |
46 | * | |
47 | * In order to track these events, the following are provided: | |
48 | * | |
49 | * ->cb_v_break. A counter of events that might mean that the contents of | |
50 | * a volume have been altered since we last checked a vnode. | |
51 | * | |
52 | * ->cb_v_check. A counter of the number of events that we've sent a | |
53 | * query to the server for. Everything's up to date if this equals | |
54 | * cb_v_break. | |
55 | * | |
56 | * ->cb_scrub. A counter of the number of regression events for which we | |
57 | * have to completely wipe the cache. | |
58 | * | |
59 | * ->cb_ro_snapshot. A counter of the number of times that we've | |
60 | * recognised that a RO volume has been updated. | |
61 | * | |
62 | * ->cb_break. A counter of events that might mean that the contents of a | |
63 | * vnode have been altered. | |
64 | * | |
65 | * ->cb_expires_at. The time at which the callback promise expires or | |
66 | * AFS_NO_CB_PROMISE if we have no promise. | |
67 | * | |
68 | * The way we manage things is: | |
69 | * | |
70 | * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on | |
71 | * the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the | |
72 | * volume and volume's server record. | |
73 | * | |
74 | * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level | |
75 | * callback break on all the volumes that have been using that volume | |
76 | * (ie. increment ->cb_v_break and reset ->cb_expires_at). | |
77 | * | |
78 | * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the | |
79 | * vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also | |
80 | * dispatch a work item to unmap all PTEs to the vnode's pagecache to | |
81 | * force reentry to the filesystem for revalidation. | |
82 | * | |
83 | * (4) When entering the filesystem, we call afs_validate() to check the | |
84 | * validity of a vnode. This first checks to see if ->cb_v_check and | |
85 | * ->cb_v_break match, and if they don't, we lock volume->cb_check_lock | |
86 | * exclusively and perform an FS.FetchStatus on the vnode. | |
87 | * | |
88 | * After checking the volume, we check the vnode. If there's a mismatch | |
89 | * between the volume counters and the vnode's mirrors of those counters, | |
90 | * we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode. | |
91 | * | |
92 | * (5) When the reply from FS.FetchStatus arrives, the VolSync record is | |
93 | * parsed: | |
94 | * | |
95 | * (A) If the Creation timestamp has changed on a RW volume or regressed | |
96 | * on a RO volume, we try to increment ->cb_scrub; if it advances on a | |
97 | * RO volume, we assume "vos release" happened and try to increment | |
98 | * ->cb_ro_snapshot. | |
99 | * | |
100 | * (B) If the Update timestamp has regressed, we try to increment | |
101 | * ->cb_scrub. | |
102 | * | |
103 | * Note that in both of these cases, we only do the increment if we can | |
104 | * cmpxchg the value of the timestamp from the value we noted before the | |
105 | * op. This tries to prevent parallel ops from fighting one another. | |
106 | * | |
107 | * volume->cb_v_check is then set to ->cb_v_break. | |
108 | * | |
109 | * (6) The AFSCallBack record included in the FS.FetchStatus reply is also | |
110 | * parsed and used to set the promise in ->cb_expires_at for the vnode, | |
111 | * the volume and the volume's server record. | |
112 | * | |
113 | * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for | |
114 | * the vnode. | |
115 | */ | |
116 | ||
117 | /* | |
118 | * Check the validity of a vnode/inode and its parent volume. | |
119 | */ | |
120 | bool afs_check_validity(const struct afs_vnode *vnode) | |
121 | { | |
122 | const struct afs_volume *volume = vnode->volume; | |
123 | time64_t deadline = ktime_get_real_seconds() + 10; | |
124 | ||
b74c02a3 DH |
125 | if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) |
126 | return true; | |
127 | ||
453924de DH |
128 | if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) || |
129 | atomic64_read(&vnode->cb_expires_at) <= deadline || | |
130 | volume->cb_expires_at <= deadline || | |
131 | vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) || | |
132 | vnode->cb_scrub != atomic_read(&volume->cb_scrub) || | |
133 | test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { | |
134 | _debug("inval"); | |
135 | return false; | |
136 | } | |
137 | ||
138 | return true; | |
139 | } | |
140 | ||
16069e13 DH |
141 | /* |
142 | * See if the server we've just talked to is currently excluded. | |
143 | */ | |
144 | static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) | |
145 | { | |
146 | const struct afs_server_entry *se; | |
147 | const struct afs_server_list *slist; | |
148 | bool is_excluded = true; | |
149 | int i; | |
150 | ||
151 | rcu_read_lock(); | |
152 | ||
153 | slist = rcu_dereference(volume->servers); | |
154 | for (i = 0; i < slist->nr_servers; i++) { | |
155 | se = &slist->servers[i]; | |
156 | if (op->server == se->server) { | |
157 | is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags); | |
158 | break; | |
159 | } | |
160 | } | |
161 | ||
162 | rcu_read_unlock(); | |
163 | return is_excluded; | |
164 | } | |
165 | ||
166 | /* | |
167 | * Update the volume's server list when the creation time changes and see if | |
168 | * the server we've just talked to is currently excluded. | |
169 | */ | |
170 | static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) | |
171 | { | |
172 | int ret; | |
173 | ||
174 | if (__afs_is_server_excluded(op, volume)) | |
175 | return 1; | |
176 | ||
177 | set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); | |
178 | ret = afs_check_volume_status(op->volume, op); | |
179 | if (ret < 0) | |
180 | return ret; | |
181 | ||
182 | return __afs_is_server_excluded(op, volume); | |
183 | } | |
184 | ||
185 | /* | |
186 | * Handle a change to the volume creation time in the VolSync record. | |
187 | */ | |
188 | static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume) | |
189 | { | |
190 | unsigned int snap; | |
191 | time64_t cur = volume->creation_time; | |
192 | time64_t old = op->pre_volsync.creation; | |
193 | time64_t new = op->volsync.creation; | |
194 | int ret; | |
195 | ||
196 | _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new); | |
197 | ||
198 | if (cur == TIME64_MIN) { | |
199 | volume->creation_time = new; | |
200 | return 0; | |
201 | } | |
202 | ||
203 | if (new == cur) | |
204 | return 0; | |
205 | ||
206 | /* Try to advance the creation timestamp from what we had before the | |
207 | * operation to what we got back from the server. This should | |
208 | * hopefully ensure that in a race between multiple operations only one | |
209 | * of them will do this. | |
210 | */ | |
211 | if (cur != old) | |
212 | return 0; | |
213 | ||
214 | /* If the creation time changes in an unexpected way, we need to scrub | |
215 | * our caches. For a RW vol, this will only change if the volume is | |
216 | * restored from a backup; for a RO/Backup vol, this will advance when | |
217 | * the volume is updated to a new snapshot (eg. "vos release"). | |
218 | */ | |
219 | if (volume->type == AFSVL_RWVOL) | |
220 | goto regressed; | |
221 | if (volume->type == AFSVL_BACKVOL) { | |
222 | if (new < old) | |
223 | goto regressed; | |
224 | goto advance; | |
225 | } | |
226 | ||
227 | /* We have an RO volume, we need to query the VL server and look at the | |
228 | * server flags to see if RW->RO replication is in progress. | |
229 | */ | |
230 | ret = afs_is_server_excluded(op, volume); | |
231 | if (ret < 0) | |
232 | return ret; | |
233 | if (ret > 0) { | |
234 | snap = atomic_read(&volume->cb_ro_snapshot); | |
235 | trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded); | |
236 | return ret; | |
237 | } | |
238 | ||
239 | advance: | |
240 | snap = atomic_inc_return(&volume->cb_ro_snapshot); | |
241 | trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release); | |
242 | volume->creation_time = new; | |
243 | return 0; | |
244 | ||
245 | regressed: | |
246 | atomic_inc(&volume->cb_scrub); | |
247 | trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress); | |
248 | volume->creation_time = new; | |
249 | return 0; | |
250 | } | |
251 | ||
252 | /* | |
253 | * Handle a change to the volume update time in the VolSync record. | |
254 | */ | |
255 | static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume) | |
256 | { | |
257 | enum afs_cb_break_reason reason = afs_cb_break_no_break; | |
258 | time64_t cur = volume->update_time; | |
259 | time64_t old = op->pre_volsync.update; | |
260 | time64_t new = op->volsync.update; | |
261 | ||
262 | _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new); | |
263 | ||
264 | if (cur == TIME64_MIN) { | |
265 | volume->update_time = new; | |
266 | return; | |
267 | } | |
268 | ||
269 | if (new == cur) | |
270 | return; | |
271 | ||
272 | /* If the volume update time changes in an unexpected way, we need to | |
273 | * scrub our caches. For a RW vol, this will advance on every | |
274 | * modification op; for a RO/Backup vol, this will advance when the | |
275 | * volume is updated to a new snapshot (eg. "vos release"). | |
276 | */ | |
277 | if (new < old) | |
278 | reason = afs_cb_break_for_update_regress; | |
279 | ||
280 | /* Try to advance the update timestamp from what we had before the | |
281 | * operation to what we got back from the server. This should | |
282 | * hopefully ensure that in a race between multiple operations only one | |
283 | * of them will do this. | |
284 | */ | |
285 | if (cur == old) { | |
286 | if (reason == afs_cb_break_for_update_regress) { | |
287 | atomic_inc(&volume->cb_scrub); | |
288 | trace_afs_cb_v_break(volume->vid, 0, reason); | |
289 | } | |
290 | volume->update_time = new; | |
291 | } | |
292 | } | |
293 | ||
294 | static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume) | |
295 | { | |
296 | int ret = 0; | |
297 | ||
298 | if (likely(op->volsync.creation == volume->creation_time && | |
299 | op->volsync.update == volume->update_time)) | |
300 | return 0; | |
301 | ||
302 | mutex_lock(&volume->volsync_lock); | |
303 | if (op->volsync.creation != volume->creation_time) { | |
304 | ret = afs_update_volume_creation_time(op, volume); | |
305 | if (ret < 0) | |
306 | goto out; | |
307 | } | |
308 | if (op->volsync.update != volume->update_time) | |
309 | afs_update_volume_update_time(op, volume); | |
310 | out: | |
311 | mutex_unlock(&volume->volsync_lock); | |
312 | return ret; | |
313 | } | |
314 | ||
315 | /* | |
453924de DH |
316 | * Update the state of a volume, including recording the expiration time of the |
317 | * callback promise. Returns 1 to redo the operation from the start. | |
16069e13 DH |
318 | */ |
319 | int afs_update_volume_state(struct afs_operation *op) | |
320 | { | |
453924de DH |
321 | struct afs_server_list *slist = op->server_list; |
322 | struct afs_server_entry *se = &slist->servers[op->server_index]; | |
323 | struct afs_callback *cb = &op->file[0].scb.callback; | |
16069e13 | 324 | struct afs_volume *volume = op->volume; |
453924de DH |
325 | unsigned int cb_v_break = atomic_read(&volume->cb_v_break); |
326 | unsigned int cb_v_check = atomic_read(&volume->cb_v_check); | |
16069e13 DH |
327 | int ret; |
328 | ||
329 | _enter("%llx", op->volume->vid); | |
330 | ||
331 | if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) { | |
332 | ret = afs_update_volume_times(op, volume); | |
333 | if (ret != 0) { | |
334 | _leave(" = %d", ret); | |
335 | return ret; | |
336 | } | |
337 | } | |
338 | ||
453924de DH |
339 | if (op->cb_v_break == cb_v_break && |
340 | (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) { | |
341 | time64_t expires_at = cb->expires_at; | |
342 | ||
343 | if (!op->file[0].scb.have_cb) | |
344 | expires_at = op->file[1].scb.callback.expires_at; | |
345 | ||
346 | se->cb_expires_at = expires_at; | |
347 | volume->cb_expires_at = expires_at; | |
348 | } | |
349 | if (cb_v_check < op->cb_v_break) | |
350 | atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break); | |
16069e13 DH |
351 | return 0; |
352 | } | |
353 | ||
dfa0a449 DH |
354 | /* |
355 | * mark the data attached to an inode as obsolete due to a write on the server | |
356 | * - might also want to ditch all the outstanding writes and dirty pages | |
357 | */ | |
358 | static void afs_zap_data(struct afs_vnode *vnode) | |
359 | { | |
360 | _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); | |
361 | ||
362 | afs_invalidate_cache(vnode, 0); | |
363 | ||
364 | /* nuke all the non-dirty pages that aren't locked, mapped or being | |
365 | * written back in a regular file and completely discard the pages in a | |
366 | * directory or symlink */ | |
367 | if (S_ISREG(vnode->netfs.inode.i_mode)) | |
d73065e6 | 368 | filemap_invalidate_inode(&vnode->netfs.inode, true, 0, LLONG_MAX); |
dfa0a449 | 369 | else |
d73065e6 | 370 | filemap_invalidate_inode(&vnode->netfs.inode, false, 0, LLONG_MAX); |
dfa0a449 DH |
371 | } |
372 | ||
dfa0a449 DH |
373 | /* |
374 | * validate a vnode/inode | |
375 | * - there are several things we need to check | |
376 | * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, | |
377 | * symlink) | |
378 | * - parent dir metadata changed (security changes) | |
379 | * - dentry data changed (write, truncate) | |
380 | * - dentry metadata changed (security changes) | |
381 | */ | |
382 | int afs_validate(struct afs_vnode *vnode, struct key *key) | |
383 | { | |
453924de DH |
384 | struct afs_volume *volume = vnode->volume; |
385 | unsigned int cb_ro_snapshot, cb_scrub; | |
386 | time64_t deadline = ktime_get_real_seconds() + 10; | |
387 | bool zap = false, locked_vol = false; | |
dfa0a449 DH |
388 | int ret; |
389 | ||
390 | _enter("{v={%llx:%llu} fl=%lx},%x", | |
391 | vnode->fid.vid, vnode->fid.vnode, vnode->flags, | |
392 | key_serial(key)); | |
393 | ||
453924de | 394 | if (afs_check_validity(vnode)) |
b74c02a3 | 395 | return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0; |
dfa0a449 | 396 | |
453924de DH |
397 | ret = down_write_killable(&vnode->validate_lock); |
398 | if (ret < 0) | |
399 | goto error; | |
400 | ||
b74c02a3 DH |
401 | if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { |
402 | ret = -ESTALE; | |
403 | goto error_unlock; | |
404 | } | |
405 | ||
453924de DH |
406 | /* Validate a volume after the v_break has changed or the volume |
407 | * callback expired. We only want to do this once per volume per | |
408 | * v_break change. The actual work will be done when parsing the | |
409 | * status fetch reply. | |
410 | */ | |
411 | if (volume->cb_expires_at <= deadline || | |
412 | atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) { | |
413 | ret = mutex_lock_interruptible(&volume->cb_check_lock); | |
414 | if (ret < 0) | |
415 | goto error_unlock; | |
416 | locked_vol = true; | |
417 | } | |
dfa0a449 | 418 | |
453924de DH |
419 | cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot); |
420 | cb_scrub = atomic_read(&volume->cb_scrub); | |
421 | if (vnode->cb_ro_snapshot != cb_ro_snapshot || | |
422 | vnode->cb_scrub != cb_scrub) | |
423 | unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false); | |
424 | ||
425 | if (vnode->cb_ro_snapshot != cb_ro_snapshot || | |
426 | vnode->cb_scrub != cb_scrub || | |
427 | volume->cb_expires_at <= deadline || | |
428 | atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) || | |
429 | atomic64_read(&vnode->cb_expires_at) <= deadline | |
430 | ) { | |
dfa0a449 DH |
431 | ret = afs_fetch_status(vnode, key, false, NULL); |
432 | if (ret < 0) { | |
433 | if (ret == -ENOENT) { | |
434 | set_bit(AFS_VNODE_DELETED, &vnode->flags); | |
435 | ret = -ESTALE; | |
436 | } | |
437 | goto error_unlock; | |
438 | } | |
453924de | 439 | |
dfa0a449 DH |
440 | _debug("new promise [fl=%lx]", vnode->flags); |
441 | } | |
442 | ||
453924de DH |
443 | /* We can drop the volume lock now as. */ |
444 | if (locked_vol) { | |
445 | mutex_unlock(&volume->cb_check_lock); | |
446 | locked_vol = false; | |
447 | } | |
448 | ||
449 | cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot); | |
450 | cb_scrub = atomic_read(&volume->cb_scrub); | |
451 | _debug("vnode inval %x==%x %x==%x", | |
452 | vnode->cb_ro_snapshot, cb_ro_snapshot, | |
453 | vnode->cb_scrub, cb_scrub); | |
454 | if (vnode->cb_scrub != cb_scrub) | |
455 | zap = true; | |
456 | vnode->cb_ro_snapshot = cb_ro_snapshot; | |
457 | vnode->cb_scrub = cb_scrub; | |
458 | ||
dfa0a449 DH |
459 | /* if the vnode's data version number changed then its contents are |
460 | * different */ | |
453924de DH |
461 | zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); |
462 | if (zap) | |
dfa0a449 DH |
463 | afs_zap_data(vnode); |
464 | up_write(&vnode->validate_lock); | |
dfa0a449 DH |
465 | _leave(" = 0"); |
466 | return 0; | |
467 | ||
468 | error_unlock: | |
453924de DH |
469 | if (locked_vol) |
470 | mutex_unlock(&volume->cb_check_lock); | |
dfa0a449 | 471 | up_write(&vnode->validate_lock); |
453924de | 472 | error: |
dfa0a449 DH |
473 | _leave(" = %d", ret); |
474 | return ret; | |
475 | } |