Commit | Line | Data |
---|---|---|
dfa0a449 DH |
1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* vnode and volume validity verification. | |
3 | * | |
4 | * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. | |
5 | * Written by David Howells (dhowells@redhat.com) | |
6 | */ | |
7 | ||
8 | #include <linux/kernel.h> | |
9 | #include <linux/module.h> | |
10 | #include <linux/sched.h> | |
11 | #include "internal.h" | |
12 | ||
453924de DH |
13 | /* |
14 | * Data validation is managed through a number of mechanisms from the server: | |
15 | * | |
16 | * (1) On first contact with a server (such as if it has just been rebooted), | |
17 | * the server sends us a CB.InitCallBackState* request. | |
18 | * | |
19 | * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC | |
20 | * calls, the server maintains a time-limited per-vnode promise that it | |
21 | * will send us a CB.CallBack request if a third party alters the vnodes | |
22 | * accessed. | |
23 | * | |
24 | * Note that vnode-level callbacks may also be sent for other reasons, | |
25 | * such as filelock release. | |
26 | * | |
27 | * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC | |
28 | * calls, each server maintains a time-limited per-volume promise that it | |
29 | * will send us a CB.CallBack request if the RO volume is updated to a | |
30 | * snapshot of the RW volume ("vos release"). This is an atomic event | |
31 | * that cuts over all instances of the RO volume across multiple servers | |
32 | * simultaneously. | |
33 | * | |
34 | * Note that volume-level callbacks may also be sent for other reasons, | |
35 | * such as the volumeserver taking over control of the volume from the | |
36 | * fileserver. | |
37 | * | |
38 | * Note also that each server maintains an independent time limit on an | |
39 | * independent callback. | |
40 | * | |
41 | * (4) Certain RPC calls include a volume information record "VolSync" in | |
42 | * their reply. This contains a creation date for the volume that should | |
43 | * remain unchanged for a RW volume (but will be changed if the volume is | |
44 | * restored from backup) or will be bumped to the time of snapshotting | |
45 | * when a RO volume is released. | |
46 | * | |
47 | * In order to track these events, the following are provided: | |
48 | * | |
49 | * ->cb_v_break. A counter of events that might mean that the contents of | |
50 | * a volume have been altered since we last checked a vnode. | |
51 | * | |
52 | * ->cb_v_check. A counter of the number of events that we've sent a | |
53 | * query to the server for. Everything's up to date if this equals | |
54 | * cb_v_break. | |
55 | * | |
56 | * ->cb_scrub. A counter of the number of regression events for which we | |
57 | * have to completely wipe the cache. | |
58 | * | |
59 | * ->cb_ro_snapshot. A counter of the number of times that we've | |
60 | * recognised that a RO volume has been updated. | |
61 | * | |
62 | * ->cb_break. A counter of events that might mean that the contents of a | |
63 | * vnode have been altered. | |
64 | * | |
65 | * ->cb_expires_at. The time at which the callback promise expires or | |
66 | * AFS_NO_CB_PROMISE if we have no promise. | |
67 | * | |
68 | * The way we manage things is: | |
69 | * | |
70 | * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on | |
71 | * the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the | |
72 | * volume and volume's server record. | |
73 | * | |
74 | * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level | |
75 | * callback break on all the volumes that have been using that volume | |
76 | * (ie. increment ->cb_v_break and reset ->cb_expires_at). | |
77 | * | |
78 | * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the | |
79 | * vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also | |
80 | * dispatch a work item to unmap all PTEs to the vnode's pagecache to | |
81 | * force reentry to the filesystem for revalidation. | |
82 | * | |
83 | * (4) When entering the filesystem, we call afs_validate() to check the | |
84 | * validity of a vnode. This first checks to see if ->cb_v_check and | |
85 | * ->cb_v_break match, and if they don't, we lock volume->cb_check_lock | |
86 | * exclusively and perform an FS.FetchStatus on the vnode. | |
87 | * | |
88 | * After checking the volume, we check the vnode. If there's a mismatch | |
89 | * between the volume counters and the vnode's mirrors of those counters, | |
90 | * we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode. | |
91 | * | |
92 | * (5) When the reply from FS.FetchStatus arrives, the VolSync record is | |
93 | * parsed: | |
94 | * | |
95 | * (A) If the Creation timestamp has changed on a RW volume or regressed | |
96 | * on a RO volume, we try to increment ->cb_scrub; if it advances on a | |
97 | * RO volume, we assume "vos release" happened and try to increment | |
98 | * ->cb_ro_snapshot. | |
99 | * | |
100 | * (B) If the Update timestamp has regressed, we try to increment | |
101 | * ->cb_scrub. | |
102 | * | |
103 | * Note that in both of these cases, we only do the increment if we can | |
104 | * cmpxchg the value of the timestamp from the value we noted before the | |
105 | * op. This tries to prevent parallel ops from fighting one another. | |
106 | * | |
107 | * volume->cb_v_check is then set to ->cb_v_break. | |
108 | * | |
109 | * (6) The AFSCallBack record included in the FS.FetchStatus reply is also | |
110 | * parsed and used to set the promise in ->cb_expires_at for the vnode, | |
111 | * the volume and the volume's server record. | |
112 | * | |
113 | * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for | |
114 | * the vnode. | |
115 | */ | |
116 | ||
117 | /* | |
118 | * Check the validity of a vnode/inode and its parent volume. | |
119 | */ | |
120 | bool afs_check_validity(const struct afs_vnode *vnode) | |
121 | { | |
122 | const struct afs_volume *volume = vnode->volume; | |
123 | time64_t deadline = ktime_get_real_seconds() + 10; | |
124 | ||
b74c02a3 DH |
125 | if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) |
126 | return true; | |
127 | ||
453924de DH |
128 | if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) || |
129 | atomic64_read(&vnode->cb_expires_at) <= deadline || | |
130 | volume->cb_expires_at <= deadline || | |
131 | vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) || | |
132 | vnode->cb_scrub != atomic_read(&volume->cb_scrub) || | |
133 | test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { | |
134 | _debug("inval"); | |
135 | return false; | |
136 | } | |
137 | ||
138 | return true; | |
139 | } | |
140 | ||
16069e13 DH |
141 | /* |
142 | * See if the server we've just talked to is currently excluded. | |
143 | */ | |
144 | static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) | |
145 | { | |
146 | const struct afs_server_entry *se; | |
147 | const struct afs_server_list *slist; | |
148 | bool is_excluded = true; | |
149 | int i; | |
150 | ||
151 | rcu_read_lock(); | |
152 | ||
153 | slist = rcu_dereference(volume->servers); | |
154 | for (i = 0; i < slist->nr_servers; i++) { | |
155 | se = &slist->servers[i]; | |
156 | if (op->server == se->server) { | |
157 | is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags); | |
158 | break; | |
159 | } | |
160 | } | |
161 | ||
162 | rcu_read_unlock(); | |
163 | return is_excluded; | |
164 | } | |
165 | ||
166 | /* | |
167 | * Update the volume's server list when the creation time changes and see if | |
168 | * the server we've just talked to is currently excluded. | |
169 | */ | |
170 | static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) | |
171 | { | |
172 | int ret; | |
173 | ||
174 | if (__afs_is_server_excluded(op, volume)) | |
175 | return 1; | |
176 | ||
177 | set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); | |
178 | ret = afs_check_volume_status(op->volume, op); | |
179 | if (ret < 0) | |
180 | return ret; | |
181 | ||
182 | return __afs_is_server_excluded(op, volume); | |
183 | } | |
184 | ||
185 | /* | |
186 | * Handle a change to the volume creation time in the VolSync record. | |
187 | */ | |
188 | static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume) | |
189 | { | |
190 | unsigned int snap; | |
191 | time64_t cur = volume->creation_time; | |
192 | time64_t old = op->pre_volsync.creation; | |
193 | time64_t new = op->volsync.creation; | |
194 | int ret; | |
195 | ||
196 | _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new); | |
197 | ||
198 | if (cur == TIME64_MIN) { | |
199 | volume->creation_time = new; | |
200 | return 0; | |
201 | } | |
202 | ||
203 | if (new == cur) | |
204 | return 0; | |
205 | ||
206 | /* Try to advance the creation timestamp from what we had before the | |
207 | * operation to what we got back from the server. This should | |
208 | * hopefully ensure that in a race between multiple operations only one | |
209 | * of them will do this. | |
210 | */ | |
211 | if (cur != old) | |
212 | return 0; | |
213 | ||
214 | /* If the creation time changes in an unexpected way, we need to scrub | |
215 | * our caches. For a RW vol, this will only change if the volume is | |
216 | * restored from a backup; for a RO/Backup vol, this will advance when | |
217 | * the volume is updated to a new snapshot (eg. "vos release"). | |
218 | */ | |
219 | if (volume->type == AFSVL_RWVOL) | |
220 | goto regressed; | |
221 | if (volume->type == AFSVL_BACKVOL) { | |
222 | if (new < old) | |
223 | goto regressed; | |
224 | goto advance; | |
225 | } | |
226 | ||
227 | /* We have an RO volume, we need to query the VL server and look at the | |
228 | * server flags to see if RW->RO replication is in progress. | |
229 | */ | |
230 | ret = afs_is_server_excluded(op, volume); | |
231 | if (ret < 0) | |
232 | return ret; | |
233 | if (ret > 0) { | |
234 | snap = atomic_read(&volume->cb_ro_snapshot); | |
235 | trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded); | |
236 | return ret; | |
237 | } | |
238 | ||
239 | advance: | |
240 | snap = atomic_inc_return(&volume->cb_ro_snapshot); | |
241 | trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release); | |
242 | volume->creation_time = new; | |
243 | return 0; | |
244 | ||
245 | regressed: | |
246 | atomic_inc(&volume->cb_scrub); | |
247 | trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress); | |
248 | volume->creation_time = new; | |
249 | return 0; | |
250 | } | |
251 | ||
252 | /* | |
253 | * Handle a change to the volume update time in the VolSync record. | |
254 | */ | |
255 | static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume) | |
256 | { | |
257 | enum afs_cb_break_reason reason = afs_cb_break_no_break; | |
258 | time64_t cur = volume->update_time; | |
259 | time64_t old = op->pre_volsync.update; | |
260 | time64_t new = op->volsync.update; | |
261 | ||
262 | _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new); | |
263 | ||
264 | if (cur == TIME64_MIN) { | |
265 | volume->update_time = new; | |
266 | return; | |
267 | } | |
268 | ||
269 | if (new == cur) | |
270 | return; | |
271 | ||
272 | /* If the volume update time changes in an unexpected way, we need to | |
273 | * scrub our caches. For a RW vol, this will advance on every | |
274 | * modification op; for a RO/Backup vol, this will advance when the | |
275 | * volume is updated to a new snapshot (eg. "vos release"). | |
276 | */ | |
277 | if (new < old) | |
278 | reason = afs_cb_break_for_update_regress; | |
279 | ||
280 | /* Try to advance the update timestamp from what we had before the | |
281 | * operation to what we got back from the server. This should | |
282 | * hopefully ensure that in a race between multiple operations only one | |
283 | * of them will do this. | |
284 | */ | |
285 | if (cur == old) { | |
286 | if (reason == afs_cb_break_for_update_regress) { | |
287 | atomic_inc(&volume->cb_scrub); | |
288 | trace_afs_cb_v_break(volume->vid, 0, reason); | |
289 | } | |
290 | volume->update_time = new; | |
291 | } | |
292 | } | |
293 | ||
294 | static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume) | |
295 | { | |
296 | int ret = 0; | |
297 | ||
298 | if (likely(op->volsync.creation == volume->creation_time && | |
299 | op->volsync.update == volume->update_time)) | |
300 | return 0; | |
301 | ||
302 | mutex_lock(&volume->volsync_lock); | |
303 | if (op->volsync.creation != volume->creation_time) { | |
304 | ret = afs_update_volume_creation_time(op, volume); | |
305 | if (ret < 0) | |
306 | goto out; | |
307 | } | |
308 | if (op->volsync.update != volume->update_time) | |
309 | afs_update_volume_update_time(op, volume); | |
310 | out: | |
311 | mutex_unlock(&volume->volsync_lock); | |
312 | return ret; | |
313 | } | |
314 | ||
315 | /* | |
453924de DH |
316 | * Update the state of a volume, including recording the expiration time of the |
317 | * callback promise. Returns 1 to redo the operation from the start. | |
16069e13 DH |
318 | */ |
319 | int afs_update_volume_state(struct afs_operation *op) | |
320 | { | |
453924de DH |
321 | struct afs_server_list *slist = op->server_list; |
322 | struct afs_server_entry *se = &slist->servers[op->server_index]; | |
323 | struct afs_callback *cb = &op->file[0].scb.callback; | |
16069e13 | 324 | struct afs_volume *volume = op->volume; |
453924de DH |
325 | unsigned int cb_v_break = atomic_read(&volume->cb_v_break); |
326 | unsigned int cb_v_check = atomic_read(&volume->cb_v_check); | |
16069e13 DH |
327 | int ret; |
328 | ||
329 | _enter("%llx", op->volume->vid); | |
330 | ||
331 | if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) { | |
332 | ret = afs_update_volume_times(op, volume); | |
333 | if (ret != 0) { | |
334 | _leave(" = %d", ret); | |
335 | return ret; | |
336 | } | |
337 | } | |
338 | ||
453924de DH |
339 | if (op->cb_v_break == cb_v_break && |
340 | (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) { | |
341 | time64_t expires_at = cb->expires_at; | |
342 | ||
343 | if (!op->file[0].scb.have_cb) | |
344 | expires_at = op->file[1].scb.callback.expires_at; | |
345 | ||
346 | se->cb_expires_at = expires_at; | |
347 | volume->cb_expires_at = expires_at; | |
348 | } | |
349 | if (cb_v_check < op->cb_v_break) | |
350 | atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break); | |
16069e13 DH |
351 | return 0; |
352 | } | |
353 | ||
dfa0a449 DH |
354 | /* |
355 | * mark the data attached to an inode as obsolete due to a write on the server | |
356 | * - might also want to ditch all the outstanding writes and dirty pages | |
357 | */ | |
358 | static void afs_zap_data(struct afs_vnode *vnode) | |
359 | { | |
360 | _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); | |
361 | ||
362 | afs_invalidate_cache(vnode, 0); | |
363 | ||
364 | /* nuke all the non-dirty pages that aren't locked, mapped or being | |
365 | * written back in a regular file and completely discard the pages in a | |
366 | * directory or symlink */ | |
367 | if (S_ISREG(vnode->netfs.inode.i_mode)) | |
d73065e6 | 368 | filemap_invalidate_inode(&vnode->netfs.inode, true, 0, LLONG_MAX); |
dfa0a449 | 369 | else |
d73065e6 | 370 | filemap_invalidate_inode(&vnode->netfs.inode, false, 0, LLONG_MAX); |
dfa0a449 DH |
371 | } |
372 | ||
dfa0a449 DH |
373 | /* |
374 | * validate a vnode/inode | |
375 | * - there are several things we need to check | |
376 | * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, | |
377 | * symlink) | |
378 | * - parent dir metadata changed (security changes) | |
379 | * - dentry data changed (write, truncate) | |
380 | * - dentry metadata changed (security changes) | |
381 | */ | |
382 | int afs_validate(struct afs_vnode *vnode, struct key *key) | |
383 | { | |
453924de DH |
384 | struct afs_volume *volume = vnode->volume; |
385 | unsigned int cb_ro_snapshot, cb_scrub; | |
386 | time64_t deadline = ktime_get_real_seconds() + 10; | |
387 | bool zap = false, locked_vol = false; | |
dfa0a449 DH |
388 | int ret; |
389 | ||
390 | _enter("{v={%llx:%llu} fl=%lx},%x", | |
391 | vnode->fid.vid, vnode->fid.vnode, vnode->flags, | |
392 | key_serial(key)); | |
393 | ||
453924de | 394 | if (afs_check_validity(vnode)) |
b74c02a3 | 395 | return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0; |
dfa0a449 | 396 | |
453924de DH |
397 | ret = down_write_killable(&vnode->validate_lock); |
398 | if (ret < 0) | |
399 | goto error; | |
400 | ||
b74c02a3 DH |
401 | if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { |
402 | ret = -ESTALE; | |
403 | goto error_unlock; | |
404 | } | |
405 | ||
453924de DH |
406 | /* Validate a volume after the v_break has changed or the volume |
407 | * callback expired. We only want to do this once per volume per | |
408 | * v_break change. The actual work will be done when parsing the | |
409 | * status fetch reply. | |
410 | */ | |
411 | if (volume->cb_expires_at <= deadline || | |
412 | atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) { | |
413 | ret = mutex_lock_interruptible(&volume->cb_check_lock); | |
414 | if (ret < 0) | |
415 | goto error_unlock; | |
416 | locked_vol = true; | |
417 | } | |
dfa0a449 | 418 | |
453924de DH |
419 | cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot); |
420 | cb_scrub = atomic_read(&volume->cb_scrub); | |
421 | if (vnode->cb_ro_snapshot != cb_ro_snapshot || | |
422 | vnode->cb_scrub != cb_scrub) | |
423 | unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false); | |
424 | ||
425 | if (vnode->cb_ro_snapshot != cb_ro_snapshot || | |
426 | vnode->cb_scrub != cb_scrub || | |
427 | volume->cb_expires_at <= deadline || | |
428 | atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) || | |
429 | atomic64_read(&vnode->cb_expires_at) <= deadline | |
430 | ) { | |
dfa0a449 DH |
431 | ret = afs_fetch_status(vnode, key, false, NULL); |
432 | if (ret < 0) { | |
433 | if (ret == -ENOENT) { | |
434 | set_bit(AFS_VNODE_DELETED, &vnode->flags); | |
435 | ret = -ESTALE; | |
436 | } | |
437 | goto error_unlock; | |
438 | } | |
453924de | 439 | |
dfa0a449 DH |
440 | _debug("new promise [fl=%lx]", vnode->flags); |
441 | } | |
442 | ||
453924de DH |
443 | /* We can drop the volume lock now as. */ |
444 | if (locked_vol) { | |
445 | mutex_unlock(&volume->cb_check_lock); | |
446 | locked_vol = false; | |
447 | } | |
448 | ||
449 | cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot); | |
450 | cb_scrub = atomic_read(&volume->cb_scrub); | |
451 | _debug("vnode inval %x==%x %x==%x", | |
452 | vnode->cb_ro_snapshot, cb_ro_snapshot, | |
453 | vnode->cb_scrub, cb_scrub); | |
454 | if (vnode->cb_scrub != cb_scrub) | |
455 | zap = true; | |
456 | vnode->cb_ro_snapshot = cb_ro_snapshot; | |
457 | vnode->cb_scrub = cb_scrub; | |
458 | ||
dfa0a449 DH |
459 | /* if the vnode's data version number changed then its contents are |
460 | * different */ | |
453924de DH |
461 | zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); |
462 | if (zap) | |
dfa0a449 DH |
463 | afs_zap_data(vnode); |
464 | up_write(&vnode->validate_lock); | |
dfa0a449 DH |
465 | _leave(" = 0"); |
466 | return 0; | |
467 | ||
468 | error_unlock: | |
453924de DH |
469 | if (locked_vol) |
470 | mutex_unlock(&volume->cb_check_lock); | |
dfa0a449 | 471 | up_write(&vnode->validate_lock); |
453924de | 472 | error: |
dfa0a449 DH |
473 | _leave(" = %d", ret); |
474 | return ret; | |
475 | } |