afs: Overhaul invalidation handling to better support RO volumes

author David Howells <dhowells@redhat.com>

Wed, 8 Nov 2023 13:57:42 +0000 (13:57 +0000)

committer David Howells <dhowells@redhat.com>

Mon, 1 Jan 2024 16:37:27 +0000 (16:37 +0000)
author David Howells <dhowells@redhat.com>
Wed, 8 Nov 2023 13:57:42 +0000 (13:57 +0000)
committer David Howells <dhowells@redhat.com>
Mon, 1 Jan 2024 16:37:27 +0000 (16:37 +0000)
diff --git a/fs/afs/callback.c b/fs/afs/callback.c

index 8ddc99c9c16b587f6dd8b3392eabfa1e9f186521..99b2c8172021a2f1ac8bc1d8b4e36d947b971f48 100644 (file)
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -33,21 +33,20 @@ void afs_invalidate_mmap_work(struct work_struct *work)
         unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
  }
  
-static void afs_server_init_callback(struct afs_server *server)
+static void afs_volume_init_callback(struct afs_volume *volume)
  {
         struct afs_vnode *vnode;
-       struct afs_cell *cell = server->cell;
  
-       down_read(&cell->fs_open_mmaps_lock);
+       down_read(&volume->open_mmaps_lock);
  
-       list_for_each_entry(vnode, &cell->fs_open_mmaps, cb_mmap_link) {
-               if (vnode->cb_server == server) {
-                       clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+       list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) {
+               if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) {
+                       atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
                         queue_work(system_unbound_wq, &vnode->cb_work);
                 }
         }
  
-       up_read(&cell->fs_open_mmaps_lock);
+       up_read(&volume->open_mmaps_lock);
  }
  
  /*
@@ -56,19 +55,20 @@ static void afs_server_init_callback(struct afs_server *server)
   */
  void afs_init_callback_state(struct afs_server *server)
  {
-       struct afs_cell *cell = server->cell;
+       struct afs_server_entry *se;
  
-       down_read(&cell->vs_lock);
+       down_read(&server->cell->vs_lock);
  
-       do {
-               server->cb_s_break++;
-               atomic_inc(&server->cell->fs_s_break);
-               if (!list_empty(&server->cell->fs_open_mmaps))
-                       afs_server_init_callback(server);
-
-       } while ((server = rcu_dereference(server->uuid_next)));
+       list_for_each_entry(se, &server->volumes, slink) {
+               se->cb_expires_at = AFS_NO_CB_PROMISE;
+               se->volume->cb_expires_at = AFS_NO_CB_PROMISE;
+               trace_afs_cb_v_break(se->volume->vid, atomic_read(&se->volume->cb_v_break),
+                                    afs_cb_break_for_s_reinit);
+               if (!list_empty(&se->volume->open_mmaps))
+                       afs_volume_init_callback(se->volume);
+       }
  
-       up_read(&cell->vs_lock);
+       up_read(&server->cell->vs_lock);
  }
  
  /*
@@ -79,9 +79,9 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
         _enter("");
  
         clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
-       if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+       if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE) {
                 vnode->cb_break++;
-               vnode->cb_v_break = atomic_read(&vnode->volume->cb_v_break);
+               vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break);
                 afs_clear_permits(vnode);
  
                 if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB)
@@ -147,29 +147,51 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell,
         return volume;
  }
  
+/*
+ * Allow the fileserver to break callbacks at the volume-level.  This is
+ * typically done when, for example, a R/W volume is snapshotted to a R/O
+ * volume (the only way to change an R/O volume).  It may also, however, happen
+ * when a volserver takes control of a volume (offlining it, moving it, etc.).
+ *
+ * Every file in that volume will need to be reevaluated.
+ */
+static void afs_break_volume_callback(struct afs_server *server,
+                                     struct afs_volume *volume)
+       __releases(RCU)
+{
+       struct afs_server_list *slist = rcu_dereference(volume->servers);
+       unsigned int i, cb_v_break;
+
+       write_lock(&volume->cb_v_break_lock);
+
+       for (i = 0; i < slist->nr_servers; i++)
+               if (slist->servers[i].server == server)
+                       slist->servers[i].cb_expires_at = AFS_NO_CB_PROMISE;
+       volume->cb_expires_at = AFS_NO_CB_PROMISE;
+
+       cb_v_break = atomic_inc_return_release(&volume->cb_v_break);
+       trace_afs_cb_v_break(volume->vid, cb_v_break, afs_cb_break_for_volume_callback);
+
+       write_unlock(&volume->cb_v_break_lock);
+       rcu_read_unlock();
+
+       if (!list_empty(&volume->open_mmaps))
+               afs_volume_init_callback(volume);
+}
+
  /*
   * allow the fileserver to explicitly break one callback
   * - happens when
   *   - the backing file is changed
   *   - a lock is released
   */
-static void afs_break_one_callback(struct afs_volume *volume,
+static void afs_break_one_callback(struct afs_server *server,
+                                  struct afs_volume *volume,
                                    struct afs_fid *fid)
  {
         struct super_block *sb;
         struct afs_vnode *vnode;
         struct inode *inode;
-       unsigned int cb_v_break;
-
-       if (fid->vnode == 0 && fid->unique == 0) {
-               /* The callback break applies to an entire volume. */
-               write_lock(&volume->cb_v_break_lock);
-               cb_v_break = atomic_inc_return(&volume->cb_v_break);
-               trace_afs_cb_break(fid, cb_v_break,
-                                  afs_cb_break_for_volume_callback, false);
-               write_unlock(&volume->cb_v_break_lock);
-               return;
-       }
  
         /* See if we can find a matching inode - even an I_NEW inode needs to
          * be marked as it can have its callback broken before we finish
@@ -199,24 +221,32 @@ static void afs_break_some_callbacks(struct afs_server *server,
  
         rcu_read_lock();
         volume = afs_lookup_volume_rcu(server->cell, vid);
-       /* TODO: Find all matching volumes if we couldn't match the server and
-        * break them anyway.
-        */
-       for (i = *_count; i > 0; cbb++, i--) {
-               if (cbb->fid.vid == vid) {
-                       _debug("- Fid { vl=%08llx n=%llu u=%u }",
-                              cbb->fid.vid,
-                              cbb->fid.vnode,
-                              cbb->fid.unique);
-                       --*_count;
-                       if (volume)
-                               afs_break_one_callback(volume, &cbb->fid);
-               } else {
-                       *residue++ = *cbb;
+       if (cbb->fid.vnode == 0 && cbb->fid.unique == 0) {
+               afs_break_volume_callback(server, volume);
+               *_count -= 1;
+               if (*_count)
+                       memmove(cbb, cbb + 1, sizeof(*cbb) * *_count);
+       } else {
+               /* TODO: Find all matching volumes if we couldn't match the server and
+                * break them anyway.
+                */
+
+               for (i = *_count; i > 0; cbb++, i--) {
+                       if (cbb->fid.vid == vid) {
+                               _debug("- Fid { vl=%08llx n=%llu u=%u }",
+                                      cbb->fid.vid,
+                                      cbb->fid.vnode,
+                                      cbb->fid.unique);
+                               --*_count;
+                               if (volume)
+                                       afs_break_one_callback(server, volume, &cbb->fid);
+                       } else {
+                               *residue++ = *cbb;
+                       }
                 }
+               rcu_read_unlock();
         }
  
-       rcu_read_unlock();
         afs_put_volume(volume, afs_volume_trace_put_callback);
  }
  
diff --git a/fs/afs/cell.c b/fs/afs/cell.c

index e4b6a80763d7be5fc1b3848aeeaf7c12d9ad6b50..caa09875f5209950a0e54962d82a862fa2e8916e 100644 (file)
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -167,8 +167,6 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
         seqlock_init(&cell->volume_lock);
         cell->fs_servers = RB_ROOT;
         seqlock_init(&cell->fs_lock);
-       INIT_LIST_HEAD(&cell->fs_open_mmaps);
-       init_rwsem(&cell->fs_open_mmaps_lock);
         rwlock_init(&cell->vl_servers_lock);
         cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS);
  
diff --git a/fs/afs/dir.c b/fs/afs/dir.c

index e232f713ece1b8d112b32b92e58b28cc16f5d71d..c14533ef108f191a7209f4f035e084fb8a41b57a 100644 (file)
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1118,7 +1118,12 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
         dir = AFS_FS_I(d_inode(parent));
  
         /* validate the parent directory */
-       afs_validate(dir, key);
+       ret = afs_validate(dir, key);
+       if (ret == -ERESTARTSYS) {
+               dput(parent);
+               key_put(key);
+               return ret;
+       }
  
         if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
                 _debug("%pd: parent dir deleted", dentry);
@@ -1260,6 +1265,7 @@ void afs_check_for_remote_deletion(struct afs_operation *op)
         switch (afs_op_abort_code(op)) {
         case VNOVNODE:
                 set_bit(AFS_VNODE_DELETED, &vnode->flags);
+               clear_nlink(&vnode->netfs.inode);
                 afs_break_callback(vnode, afs_cb_break_for_deleted);
         }
  }
@@ -1375,7 +1381,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry)
  
                 clear_nlink(&vnode->netfs.inode);
                 set_bit(AFS_VNODE_DELETED, &vnode->flags);
-               clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+               atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
                 clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
         }
  }
diff --git a/fs/afs/file.c b/fs/afs/file.c

index 8f9b424275698630fb6dc8228243e2d7a8368fd5..30914e0d9cb29903cd42aacc9da6c3088f52f78a 100644 (file)
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -514,13 +514,12 @@ static bool afs_release_folio(struct folio *folio, gfp_t gfp)
  static void afs_add_open_mmap(struct afs_vnode *vnode)
  {
         if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) {
-               down_write(&vnode->volume->cell->fs_open_mmaps_lock);
+               down_write(&vnode->volume->open_mmaps_lock);
  
                 if (list_empty(&vnode->cb_mmap_link))
-                       list_add_tail(&vnode->cb_mmap_link,
-                                     &vnode->volume->cell->fs_open_mmaps);
+                       list_add_tail(&vnode->cb_mmap_link, &vnode->volume->open_mmaps);
  
-               up_write(&vnode->volume->cell->fs_open_mmaps_lock);
+               up_write(&vnode->volume->open_mmaps_lock);
         }
  }
  
@@ -529,12 +528,12 @@ static void afs_drop_open_mmap(struct afs_vnode *vnode)
         if (!atomic_dec_and_test(&vnode->cb_nr_mmap))
                 return;
  
-       down_write(&vnode->volume->cell->fs_open_mmaps_lock);
+       down_write(&vnode->volume->open_mmaps_lock);
  
         if (atomic_read(&vnode->cb_nr_mmap) == 0)
                 list_del_init(&vnode->cb_mmap_link);
  
-       up_write(&vnode->volume->cell->fs_open_mmaps_lock);
+       up_write(&vnode->volume->open_mmaps_lock);
         flush_work(&vnode->cb_work);
  }
  
@@ -570,7 +569,7 @@ static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pg
  {
         struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file));
  
-       if (afs_pagecache_valid(vnode))
+       if (afs_check_validity(vnode))
                 return filemap_map_pages(vmf, start_pgoff, end_pgoff);
         return 0;
  }
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c

index 99d1e649e92990c581eae4d7bec93600a10da562..cecc44af6a5f5ff190ed44a71a99c70d226f5ce9 100644 (file)
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -42,7 +42,7 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo
         op->pre_volsync.creation = volume->creation_time;
         op->pre_volsync.update  = volume->update_time;
         op->debug_id            = atomic_inc_return(&afs_operation_debug_counter);
-       op->nr_iterations = -1;
+       op->nr_iterations       = -1;
         afs_op_set_error(op, -EDESTADDRREQ);
  
         _leave(" = [op=%08x]", op->debug_id);
@@ -184,7 +184,6 @@ void afs_wait_for_operation(struct afs_operation *op)
                 op->call_responded = false;
                 op->call_error = 0;
                 op->call_abort_code = 0;
-               op->cb_s_break = op->server->cb_s_break;
                 if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) &&
                     op->ops->issue_yfs_rpc)
                         op->ops->issue_yfs_rpc(op);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c

index df3d37577b5b5eca796d2032030397b6ebb5dbbe..4f04f6f33f46b940ffee52b25f527b17d729f1b9 100644 (file)
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -85,8 +85,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
  
         write_seqlock(&vnode->cb_lock);
  
-       vnode->cb_v_break = op->cb_v_break;
-       vnode->cb_s_break = op->cb_s_break;
+       vnode->cb_v_check = op->cb_v_break;
         vnode->status = *status;
  
         t = status->mtime_client;
@@ -146,11 +145,10 @@ static int afs_inode_init_from_status(struct afs_operation *op,
         if (!vp->scb.have_cb) {
                 /* it's a symlink we just created (the fileserver
                  * didn't give us a callback) */
-               vnode->cb_expires_at = ktime_get_real_seconds();
+               atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
         } else {
-               vnode->cb_expires_at = vp->scb.callback.expires_at;
                 vnode->cb_server = op->server;
-               set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+               atomic64_set(&vnode->cb_expires_at, vp->scb.callback.expires_at);
         }
  
         write_sequnlock(&vnode->cb_lock);
@@ -214,7 +212,8 @@ static void afs_apply_status(struct afs_operation *op,
         vnode->status = *status;
  
         if (vp->dv_before + vp->dv_delta != status->data_version) {
-               if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
+               if (vnode->cb_ro_snapshot == atomic_read(&vnode->volume->cb_ro_snapshot) &&
+                   atomic64_read(&vnode->cb_expires_at) != AFS_NO_CB_PROMISE)
                         pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n",
                                 vnode->fid.vid, vnode->fid.vnode,
                                 (unsigned long long)vp->dv_before + vp->dv_delta,
@@ -268,9 +267,9 @@ static void afs_apply_callback(struct afs_operation *op,
         struct afs_vnode *vnode = vp->vnode;
  
         if (!afs_cb_is_broken(vp->cb_break_before, vnode)) {
-               vnode->cb_expires_at    = cb->expires_at;
-               vnode->cb_server        = op->server;
-               set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+               if (op->volume->type == AFSVL_RWVOL)
+                       vnode->cb_server = op->server;
+               atomic64_set(&vnode->cb_expires_at, cb->expires_at);
         }
  }
  
@@ -542,7 +541,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key)
         BUG_ON(!(inode->i_state & I_NEW));
  
         vnode = AFS_FS_I(inode);
-       vnode->cb_v_break = atomic_read(&as->volume->cb_v_break),
+       vnode->cb_v_check = atomic_read(&as->volume->cb_v_break),
         afs_set_netfs_context(vnode);
  
         op = afs_alloc_operation(key, as->volume);
@@ -587,7 +586,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
  
         if (vnode->volume &&
             !(query_flags & AT_STATX_DONT_SYNC) &&
-           !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+           atomic64_read(&vnode->cb_expires_at) == AFS_NO_CB_PROMISE) {
                 key = afs_request_key(vnode->volume->cell);
                 if (IS_ERR(key))
                         return PTR_ERR(key);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h

index 4b730cbcf63e6ed3da427543bab3425409626af7..6d0cd886b5487c3b7a6709223e3b68818cb088bb 100644 (file)
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -422,9 +422,6 @@ struct afs_cell {
         /* Active fileserver interaction state. */
         struct rb_root          fs_servers;     /* afs_server (by server UUID) */
         seqlock_t               fs_lock;        /* For fs_servers  */
-       struct rw_semaphore     fs_open_mmaps_lock;
-       struct list_head        fs_open_mmaps;  /* List of vnodes that are mmapped */
-       atomic_t                fs_s_break;     /* Counter of CB.InitCallBackState messages */
  
         /* VL server list. */
         rwlock_t                vl_servers_lock; /* Lock on vl_servers */
@@ -591,9 +588,6 @@ struct afs_server {
         /* file service access */
         rwlock_t                fs_lock;        /* access lock */
  
-       /* callback promise management */
-       unsigned                cb_s_break;     /* Break-everything counter. */
-
         /* Probe state */
         struct afs_endpoint_state __rcu *endpoint_state; /* Latest endpoint/probe state */
         unsigned long           probed_at;      /* Time last probe was dispatched (jiffies) */
@@ -615,6 +609,7 @@ struct afs_server_entry {
         struct afs_server       *server;
         struct afs_volume       *volume;
         struct list_head        slink;          /* Link in server->volumes */
+       time64_t                cb_expires_at;  /* Time at which volume-level callback expires */
         unsigned long           flags;
  #define AFS_SE_EXCLUDED                0               /* Set if server is to be excluded in rotation */
  };
@@ -668,10 +663,15 @@ struct afs_volume {
         time64_t                update_time;    /* Volume update time (or TIME64_MIN) */
  
         /* Callback management */
+       struct mutex            cb_check_lock;  /* Lock to control race to check after v_break */
+       time64_t                cb_expires_at;  /* Earliest volume callback expiry time */
         atomic_t                cb_ro_snapshot; /* RO volume update-from-snapshot counter */
         atomic_t                cb_v_break;     /* Volume-break event counter. */
+       atomic_t                cb_v_check;     /* Volume-break has-been-checked counter. */
         atomic_t                cb_scrub;       /* Scrub-all-data event counter. */
         rwlock_t                cb_v_break_lock;
+       struct rw_semaphore     open_mmaps_lock;
+       struct list_head        open_mmaps;     /* List of vnodes that are mmapped */
  
         afs_voltype_t           type;           /* type of volume */
         char                    type_force;     /* force volume type (suppress R/O -> R/W) */
@@ -710,7 +710,6 @@ struct afs_vnode {
         spinlock_t              wb_lock;        /* lock for wb_keys */
         spinlock_t              lock;           /* waitqueue/flags lock */
         unsigned long           flags;
-#define AFS_VNODE_CB_PROMISED  0               /* Set if vnode has a callback promise */
  #define AFS_VNODE_UNSET                1               /* set if vnode attributes not yet set */
  #define AFS_VNODE_DIR_VALID    2               /* Set if dir contents are valid */
  #define AFS_VNODE_ZAP_DATA     3               /* set if vnode's data should be invalidated */
@@ -736,13 +735,14 @@ struct afs_vnode {
         struct list_head        cb_mmap_link;   /* Link in cell->fs_open_mmaps */
         void                    *cb_server;     /* Server with callback/filelock */
         atomic_t                cb_nr_mmap;     /* Number of mmaps */
-       unsigned int            cb_fs_s_break;  /* Mass server break counter (cell->fs_s_break) */
-       unsigned int            cb_s_break;     /* Mass break counter on ->server */
-       unsigned int            cb_v_break;     /* Mass break counter on ->volume */
+       unsigned int            cb_ro_snapshot; /* RO volume release counter on ->volume */
+       unsigned int            cb_scrub;       /* Scrub counter on ->volume */
         unsigned int            cb_break;       /* Break counter on vnode */
+       unsigned int            cb_v_check;     /* Break check counter on ->volume */
         seqlock_t               cb_lock;        /* Lock for ->cb_server, ->status, ->cb_*break */
  
-       time64_t                cb_expires_at;  /* time at which callback expires */
+       atomic64_t              cb_expires_at;  /* time at which callback expires */
+#define AFS_NO_CB_PROMISE TIME64_MIN
  };
  
  static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode)
@@ -839,7 +839,7 @@ struct afs_vnode_param {
         struct afs_fid          fid;            /* Fid to access */
         struct afs_status_cb    scb;            /* Returned status and callback promise */
         afs_dataversion_t       dv_before;      /* Data version before the call */
-       unsigned int            cb_break_before; /* cb_break + cb_s_break before the call */
+       unsigned int            cb_break_before; /* cb_break before the call */
         u8                      dv_delta;       /* Expected change in data version */
         bool                    put_vnode:1;    /* T if we have a ref on the vnode */
         bool                    need_io_lock:1; /* T if we need the I/O lock on this */
@@ -875,7 +875,6 @@ struct afs_operation {
         unsigned int            debug_id;
  
         unsigned int            cb_v_break;     /* Volume break counter before op */
-       unsigned int            cb_s_break;     /* Server break counter before op */
  
         union {
                 struct {
@@ -1066,13 +1065,15 @@ extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback
  
  static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode)
  {
-       return vnode->cb_break + vnode->cb_v_break;
+       return vnode->cb_break + vnode->cb_ro_snapshot + vnode->cb_scrub;
  }
  
  static inline bool afs_cb_is_broken(unsigned int cb_break,
                                     const struct afs_vnode *vnode)
  {
-       return cb_break != (vnode->cb_break + atomic_read(&vnode->volume->cb_v_break));
+       return cb_break != (vnode->cb_break +
+                           atomic_read(&vnode->volume->cb_ro_snapshot) +
+                           atomic_read(&vnode->volume->cb_scrub));
  }
  
  /*
@@ -1564,9 +1565,8 @@ extern void afs_fs_exit(void);
  /*
   * validation.c
   */
+bool afs_check_validity(const struct afs_vnode *vnode);
  int afs_update_volume_state(struct afs_operation *op);
-bool afs_check_validity(struct afs_vnode *vnode);
-bool afs_pagecache_valid(struct afs_vnode *vnode);
  int afs_validate(struct afs_vnode *vnode, struct key *key);
  
  /*
diff --git a/fs/afs/proc.c b/fs/afs/proc.c

index a138022d8e0d890e9ba583bbcba538897caf6b41..3bd02571f30debca6159756b5abe30e3dd905583 100644 (file)
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -443,8 +443,8 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
                    refcount_read(&server->ref),
                    atomic_read(&server->active),
                    server->cell->name);
-       seq_printf(m, "  - info: fl=%lx rtt=%u brk=%x\n",
-                  server->flags, server->rtt, server->cb_s_break);
+       seq_printf(m, "  - info: fl=%lx rtt=%u\n",
+                  server->flags, server->rtt);
         seq_printf(m, "  - probe: last=%d\n",
                    (int)(jiffies - server->probed_at) / HZ);
         failed = estate->failed_set;
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c

index 5c50c9aa1f8733cfd5365d52f5419b3453197e30..a8554b4d91b8aa5bf63f1185b6c5cd5a1b213efd 100644 (file)
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -59,7 +59,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
                 write_seqlock(&vnode->cb_lock);
                 ASSERTCMP(cb_server, ==, vnode->cb_server);
                 vnode->cb_server = NULL;
-               if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
+               if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE)
                         vnode->cb_break++;
                 write_sequnlock(&vnode->cb_lock);
         }
@@ -140,6 +140,22 @@ bool afs_select_fileserver(struct afs_operation *op)
         switch (op->call_error) {
         case 0:
                 op->cumul_error.responded = true;
+
+               /* We succeeded, but we may need to redo the op from another
+                * server if we're looking at a set of RO volumes where some of
+                * the servers have not yet been brought up to date lest we
+                * regress the data.  We only switch to the new version once
+                * >=50% of the servers are updated.
+                */
+               error = afs_update_volume_state(op);
+               if (error != 0) {
+                       if (error == 1) {
+                               afs_sleep_and_retry(op);
+                               goto restart_from_beginning;
+                       }
+                       afs_op_set_error(op, error);
+                       goto failed;
+               }
                 fallthrough;
         default:
                 /* Success or local failure.  Stop. */
@@ -484,10 +500,8 @@ selected_server:
         op->server = server;
         if (vnode->cb_server != server) {
                 vnode->cb_server = server;
-               vnode->cb_s_break = server->cb_s_break;
-               vnode->cb_fs_s_break = atomic_read(&server->cell->fs_s_break);
-               vnode->cb_v_break = atomic_read(&vnode->volume->cb_v_break);
-               clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+               vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break);
+               atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
         }
  
         read_lock(&server->fs_lock);
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c

index fb0f4afcb3040811bfff9bcac973a44954216eac..ac4a7afff45ecb4d5c251e68c3c9a6a904eef96d 100644 (file)
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -110,6 +110,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
                 slist->servers[j].server = server;
                 slist->servers[j].volume = volume;
                 slist->servers[j].flags = se_flags;
+               slist->servers[j].cb_expires_at = AFS_NO_CB_PROMISE;
                 slist->nr_servers++;
         }
  
@@ -210,6 +211,7 @@ void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server
                 int diff;
  
                 if (pn && po && pn->server == po->server) {
+                       pn->cb_expires_at = po->cb_expires_at;
                         list_replace(&po->slink, &pn->slink);
                         n++;
                         o++;
diff --git a/fs/afs/validation.c b/fs/afs/validation.c

index 6aadd5e075e4931e4a08aace3c92fb26328cd279..46b37f2cce7d907562ef5014762b5cd37e99775a 100644 (file)
--- a/fs/afs/validation.c
+++ b/fs/afs/validation.c
@@ -10,6 +10,131 @@
  #include <linux/sched.h>
  #include "internal.h"
  
+/*
+ * Data validation is managed through a number of mechanisms from the server:
+ *
+ *  (1) On first contact with a server (such as if it has just been rebooted),
+ *      the server sends us a CB.InitCallBackState* request.
+ *
+ *  (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
+ *      calls, the server maintains a time-limited per-vnode promise that it
+ *      will send us a CB.CallBack request if a third party alters the vnodes
+ *      accessed.
+ *
+ *      Note that vnode-level callbacks may also be sent for other reasons,
+ *      such as filelock release.
+ *
+ *  (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
+ *      calls, each server maintains a time-limited per-volume promise that it
+ *      will send us a CB.CallBack request if the RO volume is updated to a
+ *      snapshot of the RW volume ("vos release").  This is an atomic event
+ *      that cuts over all instances of the RO volume across multiple servers
+ *      simultaneously.
+ *
+ *     Note that volume-level callbacks may also be sent for other reasons,
+ *     such as the volumeserver taking over control of the volume from the
+ *     fileserver.
+ *
+ *     Note also that each server maintains an independent time limit on an
+ *     independent callback.
+ *
+ *  (4) Certain RPC calls include a volume information record "VolSync" in
+ *      their reply.  This contains a creation date for the volume that should
+ *      remain unchanged for a RW volume (but will be changed if the volume is
+ *      restored from backup) or will be bumped to the time of snapshotting
+ *      when a RO volume is released.
+ *
+ * In order to track these events, the following are provided:
+ *
+ *     ->cb_v_break.  A counter of events that might mean that the contents of
+ *     a volume have been altered since we last checked a vnode.
+ *
+ *     ->cb_v_check.  A counter of the number of events that we've sent a
+ *     query to the server for.  Everything's up to date if this equals
+ *     cb_v_break.
+ *
+ *     ->cb_scrub.  A counter of the number of regression events for which we
+ *     have to completely wipe the cache.
+ *
+ *     ->cb_ro_snapshot.  A counter of the number of times that we've
+ *      recognised that a RO volume has been updated.
+ *
+ *     ->cb_break.  A counter of events that might mean that the contents of a
+ *      vnode have been altered.
+ *
+ *     ->cb_expires_at.  The time at which the callback promise expires or
+ *      AFS_NO_CB_PROMISE if we have no promise.
+ *
+ * The way we manage things is:
+ *
+ *  (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
+ *      the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
+ *      volume and volume's server record.
+ *
+ *  (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
+ *     callback break on all the volumes that have been using that server
+ *     (ie. increment ->cb_v_break and reset ->cb_expires_at).
+ *
+ *  (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
+ *     vnode and reset its ->cb_expires_at.  If the vnode is mmapped, we also
+ *     dispatch a work item to unmap all PTEs to the vnode's pagecache to
+ *     force reentry to the filesystem for revalidation.
+ *
+ *  (4) When entering the filesystem, we call afs_validate() to check the
+ *     validity of a vnode.  This first checks to see if ->cb_v_check and
+ *     ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
+ *     exclusively and perform an FS.FetchStatus on the vnode.
+ *
+ *     After checking the volume, we check the vnode.  If there's a mismatch
+ *     between the volume counters and the vnode's mirrors of those counters,
+ *     we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
+ *
+ *  (5) When the reply from FS.FetchStatus arrives, the VolSync record is
+ *      parsed:
+ *
+ *     (A) If the Creation timestamp has changed on a RW volume or regressed
+ *         on a RO volume, we try to increment ->cb_scrub; if it advances on a
+ *         RO volume, we assume "vos release" happened and try to increment
+ *         ->cb_ro_snapshot.
+ *
+ *      (B) If the Update timestamp has regressed, we try to increment
+ *         ->cb_scrub.
+ *
+ *      Note that in both of these cases, we only do the increment if we can
+ *      cmpxchg the value of the timestamp from the value we noted before the
+ *      op.  This tries to prevent parallel ops from fighting one another.
+ *
+ *     volume->cb_v_check is then set to ->cb_v_break.
+ *
+ *  (6) The AFSCallBack record included in the FS.FetchStatus reply is also
+ *     parsed and used to set the promise in ->cb_expires_at for the vnode,
+ *     the volume and the volume's server record.
+ *
+ *  (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
+ *      the vnode.
+ */
+
+/*
+ * Check the validity of a vnode/inode and its parent volume.
+ */
+bool afs_check_validity(const struct afs_vnode *vnode)
+{
+       const struct afs_volume *volume = vnode->volume;
+       time64_t deadline = ktime_get_real_seconds() + 10;
+
+       if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
+           atomic64_read(&vnode->cb_expires_at)  <= deadline ||
+           volume->cb_expires_at <= deadline ||
+           vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
+           vnode->cb_scrub       != atomic_read(&volume->cb_scrub) ||
+           test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+               _debug("inval");
+               return false;
+       }
+
+       return true;
+}
+
  /*
   * See if the server we've just talked to is currently excluded.
   */
@@ -185,11 +310,17 @@ out:
  }
  
  /*
- * Update the state of a volume.  Returns 1 to redo the operation from the start.
+ * Update the state of a volume, including recording the expiration time of the
+ * callback promise.  Returns 1 to redo the operation from the start.
   */
  int afs_update_volume_state(struct afs_operation *op)
  {
+       struct afs_server_list *slist = op->server_list;
+       struct afs_server_entry *se = &slist->servers[op->server_index];
+       struct afs_callback *cb = &op->file[0].scb.callback;
         struct afs_volume *volume = op->volume;
+       unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
+       unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
         int ret;
  
         _enter("%llx", op->volume->vid);
@@ -202,6 +333,18 @@ int afs_update_volume_state(struct afs_operation *op)
                 }
         }
  
+       if (op->cb_v_break == cb_v_break &&
+           (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
+               time64_t expires_at = cb->expires_at;
+
+               if (!op->file[0].scb.have_cb)
+                       expires_at = op->file[1].scb.callback.expires_at;
+
+               se->cb_expires_at = expires_at;
+               volume->cb_expires_at = expires_at;
+       }
+       if (cb_v_check < op->cb_v_break)
+               atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
         return 0;
  }
  
@@ -224,99 +367,6 @@ static void afs_zap_data(struct afs_vnode *vnode)
                 invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
  }
  
-/*
- * Check to see if we have a server currently serving this volume and that it
- * hasn't been reinitialised or dropped from the list.
- */
-static bool afs_check_server_good(struct afs_vnode *vnode)
-{
-       struct afs_server_list *slist;
-       struct afs_server *server;
-       bool good;
-       int i;
-
-       if (vnode->cb_fs_s_break == atomic_read(&vnode->volume->cell->fs_s_break))
-               return true;
-
-       rcu_read_lock();
-
-       slist = rcu_dereference(vnode->volume->servers);
-       for (i = 0; i < slist->nr_servers; i++) {
-               server = slist->servers[i].server;
-               if (server == vnode->cb_server) {
-                       good = (vnode->cb_s_break == server->cb_s_break);
-                       rcu_read_unlock();
-                       return good;
-               }
-       }
-
-       rcu_read_unlock();
-       return false;
-}
-
-/*
- * Check the validity of a vnode/inode.
- */
-bool afs_check_validity(struct afs_vnode *vnode)
-{
-       enum afs_cb_break_reason need_clear = afs_cb_break_no_break;
-       time64_t now = ktime_get_real_seconds();
-       unsigned int cb_break;
-       int seq;
-
-       do {
-               seq = read_seqbegin(&vnode->cb_lock);
-               cb_break = vnode->cb_break;
-
-               if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
-                       if (vnode->cb_v_break != atomic_read(&vnode->volume->cb_v_break))
-                               need_clear = afs_cb_break_for_v_break;
-                       else if (!afs_check_server_good(vnode))
-                               need_clear = afs_cb_break_for_s_reinit;
-                       else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
-                               need_clear = afs_cb_break_for_zap;
-                       else if (vnode->cb_expires_at - 10 <= now)
-                               need_clear = afs_cb_break_for_lapsed;
-               } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
-                       ;
-               } else {
-                       need_clear = afs_cb_break_no_promise;
-               }
-
-       } while (read_seqretry(&vnode->cb_lock, seq));
-
-       if (need_clear == afs_cb_break_no_break)
-               return true;
-
-       write_seqlock(&vnode->cb_lock);
-       if (need_clear == afs_cb_break_no_promise)
-               vnode->cb_v_break = atomic_read(&vnode->volume->cb_v_break);
-       else if (cb_break == vnode->cb_break)
-               __afs_break_callback(vnode, need_clear);
-       else
-               trace_afs_cb_miss(&vnode->fid, need_clear);
-       write_sequnlock(&vnode->cb_lock);
-       return false;
-}
-
-/*
- * Returns true if the pagecache is still valid.  Does not sleep.
- */
-bool afs_pagecache_valid(struct afs_vnode *vnode)
-{
-       if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
-               if (vnode->netfs.inode.i_nlink)
-                       clear_nlink(&vnode->netfs.inode);
-               return true;
-       }
-
-       if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
-           afs_check_validity(vnode))
-               return true;
-
-       return false;
-}
-
  /*
   * validate a vnode/inode
   * - there are several things we need to check
@@ -328,23 +378,48 @@ bool afs_pagecache_valid(struct afs_vnode *vnode)
   */
  int afs_validate(struct afs_vnode *vnode, struct key *key)
  {
+       struct afs_volume *volume = vnode->volume;
+       unsigned int cb_ro_snapshot, cb_scrub;
+       time64_t deadline = ktime_get_real_seconds() + 10;
+       bool zap = false, locked_vol = false;
         int ret;
  
         _enter("{v={%llx:%llu} fl=%lx},%x",
                vnode->fid.vid, vnode->fid.vnode, vnode->flags,
                key_serial(key));
  
-       if (afs_pagecache_valid(vnode))
-               goto valid;
+       if (afs_check_validity(vnode))
+               return 0;
  
-       down_write(&vnode->validate_lock);
+       ret = down_write_killable(&vnode->validate_lock);
+       if (ret < 0)
+               goto error;
+
+       /* Validate a volume after the v_break has changed or the volume
+        * callback expired.  We only want to do this once per volume per
+        * v_break change.  The actual work will be done when parsing the
+        * status fetch reply.
+        */
+       if (volume->cb_expires_at <= deadline ||
+           atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
+               ret = mutex_lock_interruptible(&volume->cb_check_lock);
+               if (ret < 0)
+                       goto error_unlock;
+               locked_vol = true;
+       }
  
-       /* if the promise has expired, we need to check the server again to get
-        * a new promise - note that if the (parent) directory's metadata was
-        * changed then the security may be different and we may no longer have
-        * access */
-       if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
-               _debug("not promised");
+       cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
+       cb_scrub = atomic_read(&volume->cb_scrub);
+       if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
+           vnode->cb_scrub       != cb_scrub)
+               unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
+
+       if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
+           vnode->cb_scrub       != cb_scrub ||
+           volume->cb_expires_at <= deadline ||
+           atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
+           atomic64_read(&vnode->cb_expires_at) <= deadline
+           ) {
                 ret = afs_fetch_status(vnode, key, false, NULL);
                 if (ret < 0) {
                         if (ret == -ENOENT) {
@@ -353,9 +428,26 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
                         }
                         goto error_unlock;
                 }
+
                 _debug("new promise [fl=%lx]", vnode->flags);
         }
  
+       /* We can drop the volume lock now as any status fetch is complete. */
+       if (locked_vol) {
+               mutex_unlock(&volume->cb_check_lock);
+               locked_vol = false;
+       }
+
+       cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
+       cb_scrub = atomic_read(&volume->cb_scrub);
+       _debug("vnode inval %x==%x %x==%x",
+              vnode->cb_ro_snapshot, cb_ro_snapshot,
+              vnode->cb_scrub, cb_scrub);
+       if (vnode->cb_scrub != cb_scrub)
+               zap = true;
+       vnode->cb_ro_snapshot = cb_ro_snapshot;
+       vnode->cb_scrub = cb_scrub;
+
         if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
                 _debug("file already deleted");
                 ret = -ESTALE;
@@ -364,15 +456,18 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
  
         /* if the vnode's data version number changed then its contents are
          * different */
-       if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
+       zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
+       if (zap)
                 afs_zap_data(vnode);
         up_write(&vnode->validate_lock);
-valid:
         _leave(" = 0");
         return 0;
  
  error_unlock:
+       if (locked_vol)
+               mutex_unlock(&volume->cb_check_lock);
         up_write(&vnode->validate_lock);
+error:
         _leave(" = %d", ret);
         return ret;
  }
diff --git a/fs/afs/volume.c b/fs/afs/volume.c

index 41ab1d3ff3ea28b83a295c5ad9ce48b51272aa95..cc207dca1b21645ace6df87841606c3360ec7281 100644 (file)
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -98,7 +98,10 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
         INIT_WORK(&volume->destructor, afs_destroy_volume);
         rwlock_init(&volume->servers_lock);
         mutex_init(&volume->volsync_lock);
+       mutex_init(&volume->cb_check_lock);
         rwlock_init(&volume->cb_v_break_lock);
+       INIT_LIST_HEAD(&volume->open_mmaps);
+       init_rwsem(&volume->open_mmaps_lock);
         memcpy(volume->name, vldb->name, vldb->name_len + 1);
  
         for (i = 0; i < AFS_MAXTYPES; i++)
diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h

index bbe8dcab4b329c595d3ce8c384f0c386b2df9437..2df7d0fd3f2140f6d0de5695ec2c0b7e828053f1 100644 (file)
--- a/include/trace/events/afs.h
+++ b/include/trace/events/afs.h
@@ -438,18 +438,14 @@ enum yfs_cm_operation {
  
  #define afs_cb_break_reasons                                           \
         EM(afs_cb_break_no_break,               "no-break")             \
-       EM(afs_cb_break_no_promise,             "no-promise")           \
         EM(afs_cb_break_for_callback,           "break-cb")             \
         EM(afs_cb_break_for_creation_regress,   "creation-regress")     \
         EM(afs_cb_break_for_deleted,            "break-del")            \
-       EM(afs_cb_break_for_lapsed,             "break-lapsed")         \
         EM(afs_cb_break_for_s_reinit,           "s-reinit")             \
         EM(afs_cb_break_for_unlink,             "break-unlink")         \
         EM(afs_cb_break_for_update_regress,     "update-regress")       \
-       EM(afs_cb_break_for_v_break,            "break-v")              \
         EM(afs_cb_break_for_volume_callback,    "break-v-cb")           \
         EM(afs_cb_break_for_vos_release,        "break-vos-release")    \
-       EM(afs_cb_break_for_zap,                "break-zap")            \
         E_(afs_cb_break_volume_excluded,        "vol-excluded")
  
  /*
author	David Howells <dhowells@redhat.com>
	Wed, 8 Nov 2023 13:57:42 +0000 (13:57 +0000)
committer	David Howells <dhowells@redhat.com>
	Mon, 1 Jan 2024 16:37:27 +0000 (16:37 +0000)
fs/afs/callback.c		patch \| blob \| blame \| history
fs/afs/cell.c		patch \| blob \| blame \| history
fs/afs/dir.c		patch \| blob \| blame \| history
fs/afs/file.c		patch \| blob \| blame \| history
fs/afs/fs_operation.c		patch \| blob \| blame \| history
fs/afs/inode.c		patch \| blob \| blame \| history
fs/afs/internal.h		patch \| blob \| blame \| history
fs/afs/proc.c		patch \| blob \| blame \| history
fs/afs/rotate.c		patch \| blob \| blame \| history
fs/afs/server_list.c		patch \| blob \| blame \| history
fs/afs/validation.c		patch \| blob \| blame \| history
fs/afs/volume.c		patch \| blob \| blame \| history
include/trace/events/afs.h		patch \| blob \| blame \| history