Merge tag 'nfs-for-4.20-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
author    Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 26 Oct 2018 20:05:26 +0000 (13:05 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 26 Oct 2018 20:05:26 +0000 (13:05 -0700)
Pull NFS client updates from Trond Myklebust:
 "Highlights include:

  Stable fixes:
   - Fix the NFSv4.1 r/wsize sanity checking
   - Reset the RPC/RDMA credit grant properly after a disconnect
   - Fix a missed page unlock after pg_doio()

  Features and optimisations:
   - Overhaul of the RPC client socket code to eliminate a locking
     bottleneck and reduce the latency when transmitting lots of
     requests in parallel.
   - Allow parallelisation of the RPCSEC_GSS encoding of an RPC request.
   - Convert the RPC client socket receive code to use iov_iter() for
     improved efficiency.
   - Convert several NFS and RPC lookup operations to use RCU instead of
     taking global locks (the recurring pattern is sketched just below).
   - Avoid the need for BH-safe locks in the RPC/RDMA back channel.

  Bugfixes and cleanups:
   - Fix lock recovery during NFSv4 delegation recalls
   - Fix the NFSv4 + NFSv4.1 "lookup revalidate + open file" case
   - Fixes for the RPC connection metrics
   - Various RPC client layer cleanups to consolidate stream-based
     sockets
   - RPC/RDMA connection cleanups
   - Simplify the RPC/RDMA cleanup after memory operation failures
   - Cleanups for NFSv4.2 copy completion and NFSv4 open state
     reclaim"

* tag 'nfs-for-4.20-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (97 commits)
  SUNRPC: Convert the auth cred cache to use refcount_t
  SUNRPC: Convert auth creds to use refcount_t
  SUNRPC: Simplify lookup code
  SUNRPC: Clean up the AUTH cache code
  NFS: change sign of nfs_fh length
  sunrpc: safely reallow resvport min/max inversion
  nfs: remove redundant call to nfs_context_set_write_error()
  nfs: Fix a missed page unlock after pg_doio()
  SUNRPC: Fix a compile warning for cmpxchg64()
  NFSv4.x: fix lock recovery during delegation recall
  SUNRPC: use cmpxchg64() in gss_seq_send64_fetch_and_inc()
  xprtrdma: Squelch a sparse warning
  xprtrdma: Clean up xprt_rdma_disconnect_inject
  xprtrdma: Add documenting comments
  xprtrdma: Report when there were zero posted Receives
  xprtrdma: Move rb_flags initialization
  xprtrdma: Don't disable BH's in backchannel server
  xprtrdma: Remove memory address of "ep" from an error message
  xprtrdma: Rename rpcrdma_qp_async_error_upcall
  xprtrdma: Simplify RPC wake-ups on connect
  ...

57 files changed:
fs/nfs/delegation.c
fs/nfs/dir.c
fs/nfs/filelayout/filelayout.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/flexfilelayout/flexfilelayoutdev.c
fs/nfs/inode.c
fs/nfs/nfs3proc.c
fs/nfs/nfs3xdr.c
fs/nfs/nfs4_fs.h
fs/nfs/nfs4client.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4state.c
fs/nfs/nfs4xdr.c
fs/nfs/pagelist.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/read.c
include/linux/nfs_fs.h
include/linux/nfs_fs_sb.h
include/linux/nfs_xdr.h
include/linux/sunrpc/auth.h
include/linux/sunrpc/auth_gss.h
include/linux/sunrpc/bc_xprt.h
include/linux/sunrpc/gss_krb5.h
include/linux/sunrpc/sched.h
include/linux/sunrpc/svc_xprt.h
include/linux/sunrpc/xdr.h
include/linux/sunrpc/xprt.h
include/linux/sunrpc/xprtsock.h
include/trace/events/rpcrdma.h
include/trace/events/sunrpc.h
net/sunrpc/auth.c
net/sunrpc/auth_generic.c
net/sunrpc/auth_gss/auth_gss.c
net/sunrpc/auth_gss/gss_krb5_seal.c
net/sunrpc/auth_gss/gss_krb5_wrap.c
net/sunrpc/auth_gss/gss_mech_switch.c
net/sunrpc/auth_gss/gss_rpc_xdr.c
net/sunrpc/auth_null.c
net/sunrpc/auth_unix.c
net/sunrpc/backchannel_rqst.c
net/sunrpc/clnt.c
net/sunrpc/sched.c
net/sunrpc/socklib.c
net/sunrpc/svc_xprt.c
net/sunrpc/svcsock.c
net/sunrpc/xdr.c
net/sunrpc/xprt.c
net/sunrpc/xprtrdma/backchannel.c
net/sunrpc/xprtrdma/fmr_ops.c
net/sunrpc/xprtrdma/frwr_ops.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/svc_rdma_backchannel.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h
net/sunrpc/xprtsock.c

index f033f3a69a3bcf7259192a9e062d7af295f90639..07b83956057627913ac64b44307d19a5765e03e4 100644 (file)
@@ -93,7 +93,7 @@ int nfs4_check_delegation(struct inode *inode, fmode_t flags)
        return nfs4_do_check_delegation(inode, flags, false);
 }
 
-static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
+static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_stateid *stateid)
 {
        struct inode *inode = state->inode;
        struct file_lock *fl;
@@ -108,7 +108,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
        spin_lock(&flctx->flc_lock);
 restart:
        list_for_each_entry(fl, list, fl_list) {
-               if (nfs_file_open_context(fl->fl_file) != ctx)
+               if (nfs_file_open_context(fl->fl_file)->state != state)
                        continue;
                spin_unlock(&flctx->flc_lock);
                status = nfs4_lock_delegation_recall(fl, state, stateid);
@@ -136,8 +136,8 @@ static int nfs_delegation_claim_opens(struct inode *inode,
        int err;
 
 again:
-       spin_lock(&inode->i_lock);
-       list_for_each_entry(ctx, &nfsi->open_files, list) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
                state = ctx->state;
                if (state == NULL)
                        continue;
@@ -147,15 +147,16 @@ again:
                        continue;
                if (!nfs4_stateid_match(&state->stateid, stateid))
                        continue;
-               get_nfs_open_context(ctx);
-               spin_unlock(&inode->i_lock);
+               if (!get_nfs_open_context(ctx))
+                       continue;
+               rcu_read_unlock();
                sp = state->owner;
                /* Block nfs4_proc_unlck */
                mutex_lock(&sp->so_delegreturn_mutex);
                seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
                err = nfs4_open_delegation_recall(ctx, state, stateid, type);
                if (!err)
-                       err = nfs_delegation_claim_locks(ctx, state, stateid);
+                       err = nfs_delegation_claim_locks(state, stateid);
                if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
                        err = -EAGAIN;
                mutex_unlock(&sp->so_delegreturn_mutex);
@@ -164,7 +165,7 @@ again:
                        return err;
                goto again;
        }
-       spin_unlock(&inode->i_lock);
+       rcu_read_unlock();
        return 0;
 }
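
nfs_delegation_claim_opens() above also shows the companion rule: nothing may
block inside rcu_read_lock(), so the walk pins the current entry, leaves the
RCU section for the mutex-taking recall work, and then restarts the walk from
the top because the list may have changed in the meantime. Schematically,
reusing the struct entry sketch above (get_ref(), put_ref() and
blocking_work() are stand-ins):

    bool get_ref(struct entry *e);          /* refcount_inc_not_zero() */
    void put_ref(struct entry *e);          /* drop the pinned reference */
    int blocking_work(struct entry *e);     /* may sleep, e.g. takes a mutex */

    static int claim_all(struct list_head *head)
    {
            struct entry *e;
            int err;
    again:
            rcu_read_lock();
            list_for_each_entry_rcu(e, head, list) {
                    if (!get_ref(e))            /* entry already dying */
                            continue;
                    rcu_read_unlock();          /* about to sleep */
                    err = blocking_work(e);
                    put_ref(e);
                    if (err)
                            return err;
                    goto again;                 /* list may have mutated */
            }
            rcu_read_unlock();
            return 0;
    }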
 
index 8bfaa658b2c190ddfa61f8a52acb4895b9f63b1d..71b2e390becf23fca2860250c2e85cb85ed7d733 100644 (file)
@@ -1072,6 +1072,100 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
        return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
 }
 
+static int
+nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
+                          struct inode *inode, int error)
+{
+       switch (error) {
+       case 1:
+               dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
+                       __func__, dentry);
+               return 1;
+       case 0:
+               nfs_mark_for_revalidate(dir);
+               if (inode && S_ISDIR(inode->i_mode)) {
+                       /* Purge readdir caches. */
+                       nfs_zap_caches(inode);
+                       /*
+                        * We can't d_drop the root of a disconnected tree:
+                        * its d_hash is on the s_anon list and d_drop() would hide
+                        * it from shrink_dcache_for_unmount(), leading to busy
+                        * inodes on unmount and further oopses.
+                        */
+                       if (IS_ROOT(dentry))
+                               return 1;
+               }
+               dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
+                               __func__, dentry);
+               return 0;
+       }
+       dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n",
+                               __func__, dentry, error);
+       return error;
+}
+
+static int
+nfs_lookup_revalidate_negative(struct inode *dir, struct dentry *dentry,
+                              unsigned int flags)
+{
+       int ret = 1;
+       if (nfs_neg_need_reval(dir, dentry, flags)) {
+               if (flags & LOOKUP_RCU)
+                       return -ECHILD;
+               ret = 0;
+       }
+       return nfs_lookup_revalidate_done(dir, dentry, NULL, ret);
+}
+
+static int
+nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry,
+                               struct inode *inode)
+{
+       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+       return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
+}
+
+static int
+nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
+                            struct inode *inode)
+{
+       struct nfs_fh *fhandle;
+       struct nfs_fattr *fattr;
+       struct nfs4_label *label;
+       int ret;
+
+       ret = -ENOMEM;
+       fhandle = nfs_alloc_fhandle();
+       fattr = nfs_alloc_fattr();
+       label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+       if (fhandle == NULL || fattr == NULL || IS_ERR(label))
+               goto out;
+
+       ret = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
+       if (ret < 0) {
+               if (ret == -ESTALE || ret == -ENOENT)
+                       ret = 0;
+               goto out;
+       }
+       ret = 0;
+       if (nfs_compare_fh(NFS_FH(inode), fhandle))
+               goto out;
+       if (nfs_refresh_inode(inode, fattr) < 0)
+               goto out;
+
+       nfs_setsecurity(inode, fattr, label);
+       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+
+       /* set a readdirplus hint that we had a cache miss */
+       nfs_force_use_readdirplus(dir);
+       ret = 1;
+out:
+       nfs_free_fattr(fattr);
+       nfs_free_fhandle(fhandle);
+       nfs4_label_free(label);
+       return nfs_lookup_revalidate_done(dir, dentry, inode, ret);
+}
+
 /*
  * This is called every time the dcache has a lookup hit,
  * and we should check whether we can really trust that
@@ -1083,58 +1177,36 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
  * If the parent directory is seen to have changed, we throw out the
  * cached dentry and do a new lookup.
  */
-static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+static int
+nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
+                        unsigned int flags)
 {
-       struct inode *dir;
        struct inode *inode;
-       struct dentry *parent;
-       struct nfs_fh *fhandle = NULL;
-       struct nfs_fattr *fattr = NULL;
-       struct nfs4_label *label = NULL;
        int error;
 
-       if (flags & LOOKUP_RCU) {
-               parent = READ_ONCE(dentry->d_parent);
-               dir = d_inode_rcu(parent);
-               if (!dir)
-                       return -ECHILD;
-       } else {
-               parent = dget_parent(dentry);
-               dir = d_inode(parent);
-       }
        nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
        inode = d_inode(dentry);
 
-       if (!inode) {
-               if (nfs_neg_need_reval(dir, dentry, flags)) {
-                       if (flags & LOOKUP_RCU)
-                               return -ECHILD;
-                       goto out_bad;
-               }
-               goto out_valid;
-       }
+       if (!inode)
+               return nfs_lookup_revalidate_negative(dir, dentry, flags);
 
        if (is_bad_inode(inode)) {
-               if (flags & LOOKUP_RCU)
-                       return -ECHILD;
                dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
                                __func__, dentry);
                goto out_bad;
        }
 
        if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
-               goto out_set_verifier;
+               return nfs_lookup_revalidate_delegated(dir, dentry, inode);
 
        /* Force a full look up iff the parent directory has changed */
        if (!(flags & (LOOKUP_EXCL | LOOKUP_REVAL)) &&
            nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
                error = nfs_lookup_verify_inode(inode, flags);
                if (error) {
-                       if (flags & LOOKUP_RCU)
-                               return -ECHILD;
                        if (error == -ESTALE)
-                               goto out_zap_parent;
-                       goto out_error;
+                               nfs_zap_caches(dir);
+                       goto out_bad;
                }
                nfs_advise_use_readdirplus(dir);
                goto out_valid;
@@ -1146,81 +1218,45 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
        if (NFS_STALE(inode))
                goto out_bad;
 
-       error = -ENOMEM;
-       fhandle = nfs_alloc_fhandle();
-       fattr = nfs_alloc_fattr();
-       if (fhandle == NULL || fattr == NULL)
-               goto out_error;
-
-       label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
-       if (IS_ERR(label))
-               goto out_error;
-
        trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
-       error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
+       error = nfs_lookup_revalidate_dentry(dir, dentry, inode);
        trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error);
-       if (error == -ESTALE || error == -ENOENT)
-               goto out_bad;
-       if (error)
-               goto out_error;
-       if (nfs_compare_fh(NFS_FH(inode), fhandle))
-               goto out_bad;
-       if ((error = nfs_refresh_inode(inode, fattr)) != 0)
-               goto out_bad;
-
-       nfs_setsecurity(inode, fattr, label);
-
-       nfs_free_fattr(fattr);
-       nfs_free_fhandle(fhandle);
-       nfs4_label_free(label);
+       return error;
+out_valid:
+       return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
+out_bad:
+       if (flags & LOOKUP_RCU)
+               return -ECHILD;
+       return nfs_lookup_revalidate_done(dir, dentry, inode, 0);
+}
 
-       /* set a readdirplus hint that we had a cache miss */
-       nfs_force_use_readdirplus(dir);
+static int
+__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
+                       int (*reval)(struct inode *, struct dentry *, unsigned int))
+{
+       struct dentry *parent;
+       struct inode *dir;
+       int ret;
 
-out_set_verifier:
-       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
- out_valid:
        if (flags & LOOKUP_RCU) {
+               parent = READ_ONCE(dentry->d_parent);
+               dir = d_inode_rcu(parent);
+               if (!dir)
+                       return -ECHILD;
+               ret = reval(dir, dentry, flags);
                if (parent != READ_ONCE(dentry->d_parent))
                        return -ECHILD;
-       } else
+       } else {
+               parent = dget_parent(dentry);
+               ret = reval(d_inode(parent), dentry, flags);
                dput(parent);
-       dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
-                       __func__, dentry);
-       return 1;
-out_zap_parent:
-       nfs_zap_caches(dir);
- out_bad:
-       WARN_ON(flags & LOOKUP_RCU);
-       nfs_free_fattr(fattr);
-       nfs_free_fhandle(fhandle);
-       nfs4_label_free(label);
-       nfs_mark_for_revalidate(dir);
-       if (inode && S_ISDIR(inode->i_mode)) {
-               /* Purge readdir caches. */
-               nfs_zap_caches(inode);
-               /*
-                * We can't d_drop the root of a disconnected tree:
-                * its d_hash is on the s_anon list and d_drop() would hide
-                * it from shrink_dcache_for_unmount(), leading to busy
-                * inodes on unmount and further oopses.
-                */
-               if (IS_ROOT(dentry))
-                       goto out_valid;
        }
-       dput(parent);
-       dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
-                       __func__, dentry);
-       return 0;
-out_error:
-       WARN_ON(flags & LOOKUP_RCU);
-       nfs_free_fattr(fattr);
-       nfs_free_fhandle(fhandle);
-       nfs4_label_free(label);
-       dput(parent);
-       dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n",
-                       __func__, dentry, error);
-       return error;
+       return ret;
+}
+
+static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+       return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate);
 }
 
 /*
@@ -1579,62 +1615,55 @@ no_open:
 }
 EXPORT_SYMBOL_GPL(nfs_atomic_open);
 
-static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+static int
+nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
+                         unsigned int flags)
 {
        struct inode *inode;
-       int ret = 0;
 
        if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
-               goto no_open;
+               goto full_reval;
        if (d_mountpoint(dentry))
-               goto no_open;
-       if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1)
-               goto no_open;
+               goto full_reval;
 
        inode = d_inode(dentry);
 
        /* We can't create new files in nfs_open_revalidate(), so we
         * optimize away revalidation of negative dentries.
         */
-       if (inode == NULL) {
-               struct dentry *parent;
-               struct inode *dir;
-
-               if (flags & LOOKUP_RCU) {
-                       parent = READ_ONCE(dentry->d_parent);
-                       dir = d_inode_rcu(parent);
-                       if (!dir)
-                               return -ECHILD;
-               } else {
-                       parent = dget_parent(dentry);
-                       dir = d_inode(parent);
-               }
-               if (!nfs_neg_need_reval(dir, dentry, flags))
-                       ret = 1;
-               else if (flags & LOOKUP_RCU)
-                       ret = -ECHILD;
-               if (!(flags & LOOKUP_RCU))
-                       dput(parent);
-               else if (parent != READ_ONCE(dentry->d_parent))
-                       return -ECHILD;
-               goto out;
-       }
+       if (inode == NULL)
+               goto full_reval;
+
+       if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
+               return nfs_lookup_revalidate_delegated(dir, dentry, inode);
 
        /* NFS only supports OPEN on regular files */
        if (!S_ISREG(inode->i_mode))
-               goto no_open;
+               goto full_reval;
+
        /* We cannot do exclusive creation on a positive dentry */
-       if (flags & LOOKUP_EXCL)
-               goto no_open;
+       if (flags & (LOOKUP_EXCL | LOOKUP_REVAL))
+               goto reval_dentry;
+
+       /* Check if the directory changed */
+       if (!nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU))
+               goto reval_dentry;
 
        /* Let f_op->open() actually open (and revalidate) the file */
-       ret = 1;
+       return 1;
+reval_dentry:
+       if (flags & LOOKUP_RCU)
+               return -ECHILD;
+       return nfs_lookup_revalidate_dentry(dir, dentry, inode);
 
-out:
-       return ret;
+full_reval:
+       return nfs_do_lookup_revalidate(dir, dentry, flags);
+}
 
-no_open:
-       return nfs_lookup_revalidate(dentry, flags);
+static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+       return __nfs_lookup_revalidate(dentry, flags,
+                       nfs4_do_lookup_revalidate);
 }
 
 #endif /* CONFIG_NFSV4 */
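
The new __nfs_lookup_revalidate() wrapper above factors the pathwalk
boilerplate out of both d_revalidate implementations. Its two branches encode
the VFS contract; the same logic, annotated here for exposition:

    if (flags & LOOKUP_RCU) {
            /* RCU-walk: may not take references or sleep. Sample the
             * parent racily... */
            parent = READ_ONCE(dentry->d_parent);
            dir = d_inode_rcu(parent);
            if (!dir)
                    return -ECHILD;     /* parent vanishing: retry in ref-walk */
            ret = reval(dir, dentry, flags);
            /* ...and detect a concurrent rename after the fact. */
            if (parent != READ_ONCE(dentry->d_parent))
                    return -ECHILD;
    } else {
            /* Ref-walk: pin the parent so it cannot change under us. */
            parent = dget_parent(dentry);
            ret = reval(d_inode(parent), dentry, flags);
            dput(parent);
    }

Any helper deeper in the chain that would need to block likewise returns
-ECHILD under LOOKUP_RCU, as nfs_lookup_revalidate_negative() and the out_bad:
path above do, so the VFS can fall back to ref-walk and call again.
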
index d175724ff566bf9a5525239a4aedb5308353f6b3..61f46facb39c379377b22566b00e47d8f0966645 100644 (file)
@@ -1164,6 +1164,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
        .id                     = LAYOUT_NFSV4_1_FILES,
        .name                   = "LAYOUT_NFSV4_1_FILES",
        .owner                  = THIS_MODULE,
+       .max_layoutget_response = 4096, /* 1 page or so... */
        .alloc_layout_hdr       = filelayout_alloc_layout_hdr,
        .free_layout_hdr        = filelayout_free_layout_hdr,
        .alloc_lseg             = filelayout_alloc_lseg,
index cae43333ef16035e11c78c4ab30ba08832396c1f..86bcba40ca61b27ee6228dc900783439facdf680 100644 (file)
@@ -2356,6 +2356,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
        .name                   = "LAYOUT_FLEX_FILES",
        .owner                  = THIS_MODULE,
        .flags                  = PNFS_LAYOUTGET_ON_OPEN,
+       .max_layoutget_response = 4096, /* 1 page or so... */
        .set_layoutdriver       = ff_layout_set_layoutdriver,
        .alloc_layout_hdr       = ff_layout_alloc_layout_hdr,
        .free_layout_hdr        = ff_layout_free_layout_hdr,
index 59aa04976331be3c7459785cae239fcd22bfd2b7..74d8d53524382abbaf3b961d81f37ac7fa249d53 100644 (file)
@@ -453,7 +453,7 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
        struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
        struct rpc_cred *cred;
 
-       if (mirror) {
+       if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) {
                cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode);
                if (!cred)
                        cred = get_rpccred(mdscred);
index b65aee481d131d00734c057cd16f4532f2898211..5b1eee4952b7309b2c80b0777d8c0390ab8e599c 100644 (file)
@@ -857,15 +857,14 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
 
 static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
 {
-       struct nfs_lock_context *head = &ctx->lock_context;
-       struct nfs_lock_context *pos = head;
+       struct nfs_lock_context *pos;
 
-       do {
+       list_for_each_entry_rcu(pos, &ctx->lock_context.list, list) {
                if (pos->lockowner != current->files)
                        continue;
-               refcount_inc(&pos->count);
-               return pos;
-       } while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head);
+               if (refcount_inc_not_zero(&pos->count))
+                       return pos;
+       }
        return NULL;
 }
 
@@ -874,10 +873,10 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
        struct nfs_lock_context *res, *new = NULL;
        struct inode *inode = d_inode(ctx->dentry);
 
-       spin_lock(&inode->i_lock);
+       rcu_read_lock();
        res = __nfs_find_lock_context(ctx);
+       rcu_read_unlock();
        if (res == NULL) {
-               spin_unlock(&inode->i_lock);
                new = kmalloc(sizeof(*new), GFP_KERNEL);
                if (new == NULL)
                        return ERR_PTR(-ENOMEM);
@@ -885,14 +884,14 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
                spin_lock(&inode->i_lock);
                res = __nfs_find_lock_context(ctx);
                if (res == NULL) {
-                       list_add_tail(&new->list, &ctx->lock_context.list);
+                       list_add_tail_rcu(&new->list, &ctx->lock_context.list);
                        new->open_context = ctx;
                        res = new;
                        new = NULL;
                }
+               spin_unlock(&inode->i_lock);
+               kfree(new);
        }
-       spin_unlock(&inode->i_lock);
-       kfree(new);
        return res;
 }
 EXPORT_SYMBOL_GPL(nfs_get_lock_context);
@@ -904,9 +903,9 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
 
        if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock))
                return;
-       list_del(&l_ctx->list);
+       list_del_rcu(&l_ctx->list);
        spin_unlock(&inode->i_lock);
-       kfree(l_ctx);
+       kfree_rcu(l_ctx, rcu_head);
 }
 EXPORT_SYMBOL_GPL(nfs_put_lock_context);
 
@@ -978,9 +977,9 @@ EXPORT_SYMBOL_GPL(alloc_nfs_open_context);
 
 struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
 {
-       if (ctx != NULL)
-               refcount_inc(&ctx->lock_context.count);
-       return ctx;
+       if (ctx != NULL && refcount_inc_not_zero(&ctx->lock_context.count))
+               return ctx;
+       return NULL;
 }
 EXPORT_SYMBOL_GPL(get_nfs_open_context);
 
@@ -989,13 +988,13 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
        struct inode *inode = d_inode(ctx->dentry);
        struct super_block *sb = ctx->dentry->d_sb;
 
+       if (!refcount_dec_and_test(&ctx->lock_context.count))
+               return;
        if (!list_empty(&ctx->list)) {
-               if (!refcount_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
-                       return;
-               list_del(&ctx->list);
+               spin_lock(&inode->i_lock);
+               list_del_rcu(&ctx->list);
                spin_unlock(&inode->i_lock);
-       } else if (!refcount_dec_and_test(&ctx->lock_context.count))
-               return;
+       }
        if (inode != NULL)
                NFS_PROTO(inode)->close_context(ctx, is_sync);
        if (ctx->cred != NULL)
@@ -1003,7 +1002,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
        dput(ctx->dentry);
        nfs_sb_deactive(sb);
        kfree(ctx->mdsthreshold);
-       kfree(ctx);
+       kfree_rcu(ctx, rcu_head);
 }
 
 void put_nfs_open_context(struct nfs_open_context *ctx)
@@ -1027,10 +1026,7 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
        struct nfs_inode *nfsi = NFS_I(inode);
 
        spin_lock(&inode->i_lock);
-       if (ctx->mode & FMODE_WRITE)
-               list_add(&ctx->list, &nfsi->open_files);
-       else
-               list_add_tail(&ctx->list, &nfsi->open_files);
+       list_add_tail_rcu(&ctx->list, &nfsi->open_files);
        spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
@@ -1051,16 +1047,17 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_open_context *pos, *ctx = NULL;
 
-       spin_lock(&inode->i_lock);
-       list_for_each_entry(pos, &nfsi->open_files, list) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(pos, &nfsi->open_files, list) {
                if (cred != NULL && pos->cred != cred)
                        continue;
                if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
                        continue;
                ctx = get_nfs_open_context(pos);
-               break;
+               if (ctx)
+                       break;
        }
-       spin_unlock(&inode->i_lock);
+       rcu_read_unlock();
        return ctx;
 }
 
@@ -1078,9 +1075,6 @@ void nfs_file_clear_open_context(struct file *filp)
                if (ctx->error < 0)
                        invalidate_inode_pages2(inode->i_mapping);
                filp->private_data = NULL;
-               spin_lock(&inode->i_lock);
-               list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
-               spin_unlock(&inode->i_lock);
                put_nfs_open_context_sync(ctx);
        }
 }
@@ -1329,19 +1323,11 @@ static bool nfs_file_has_writers(struct nfs_inode *nfsi)
 {
        struct inode *inode = &nfsi->vfs_inode;
 
-       assert_spin_locked(&inode->i_lock);
-
        if (!S_ISREG(inode->i_mode))
                return false;
        if (list_empty(&nfsi->open_files))
                return false;
-       /* Note: This relies on nfsi->open_files being ordered with writers
-        *       being placed at the head of the list.
-        *       See nfs_inode_attach_open_context()
-        */
-       return (list_first_entry(&nfsi->open_files,
-                       struct nfs_open_context,
-                       list)->mode & FMODE_WRITE) == FMODE_WRITE;
+       return inode_is_open_for_write(inode);
 }
 
 static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi)
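
nfs_get_lock_context() above is the standard optimistic-allocation shape for
these RCU lists: look up without the lock, allocate outside it, re-check under
the lock before inserting, and free the unused allocation if another thread
won the race. Condensed, reusing the hypothetical entry helpers sketched
earlier:

    static struct entry *entry_find_or_create(struct list_head *head,
                                              spinlock_t *lock)
    {
            struct entry *res, *new;

            res = entry_find(head);             /* lockless fast path */
            if (res)
                    return res;
            new = kmalloc(sizeof(*new), GFP_KERNEL);
            if (!new)
                    return NULL;
            refcount_set(&new->count, 1);
            spin_lock(lock);
            res = entry_find(head);             /* lost the race? */
            if (!res) {
                    list_add_tail_rcu(&new->list, head);
                    res = new;
                    new = NULL;
            }
            spin_unlock(lock);
            kfree(new);                         /* NULL if ours was inserted */
            return res;
    }
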
index ec8a9efa268fec13f78daaf347b37a6b707d6f1e..71bc16225b9817ece6719a687c2c7c18d4ca81ef 100644 (file)
@@ -786,6 +786,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
        struct inode *inode = hdr->inode;
+       struct nfs_server *server = NFS_SERVER(inode);
 
        if (hdr->pgio_done_cb != NULL)
                return hdr->pgio_done_cb(task, hdr);
@@ -793,6 +794,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
        if (nfs3_async_handle_jukebox(task, inode))
                return -EAGAIN;
 
+       if (task->tk_status >= 0 && !server->read_hdrsize)
+               cmpxchg(&server->read_hdrsize, 0, hdr->res.replen);
+
        nfs_invalidate_atime(inode);
        nfs_refresh_inode(inode, &hdr->fattr);
        return 0;
@@ -802,6 +806,7 @@ static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr,
                                 struct rpc_message *msg)
 {
        msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
+       hdr->args.replen = NFS_SERVER(hdr->inode)->read_hdrsize;
 }
 
 static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task,
index 64e4fa33d89f0e347155db74507862ec6f09fcc8..78df4eb60f85b50561710e4c7285958c0674f947 100644 (file)
@@ -983,10 +983,11 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
                                   const void *data)
 {
        const struct nfs_pgio_args *args = data;
+       unsigned int replen = args->replen ? args->replen : NFS3_readres_sz;
 
        encode_read3args(xdr, args);
        prepare_reply_buffer(req, args->pages, args->pgbase,
-                                       args->count, NFS3_readres_sz);
+                                       args->count, replen);
        req->rq_rcv_buf.flags |= XDRBUF_READ;
 }
 
@@ -1364,10 +1365,12 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
 
        encode_nfs_fh3(xdr, args->fh);
        encode_uint32(xdr, args->mask);
-       if (args->mask & (NFS_ACL | NFS_DFACL))
+       if (args->mask & (NFS_ACL | NFS_DFACL)) {
                prepare_reply_buffer(req, args->pages, 0,
                                        NFSACL_MAXPAGES << PAGE_SHIFT,
                                        ACL3_getaclres_sz);
+               req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
+       }
 }
 
 static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
@@ -1673,9 +1676,11 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
                                 void *data)
 {
        struct nfs_pgio_res *result = data;
+       unsigned int pos;
        enum nfs_stat status;
        int error;
 
+       pos = xdr_stream_pos(xdr);
        error = decode_nfsstat3(xdr, &status);
        if (unlikely(error))
                goto out;
@@ -1685,6 +1690,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
        result->op_status = status;
        if (status != NFS3_OK)
                goto out_status;
+       result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2);
        error = decode_read3resok(xdr, result);
 out:
        return error;
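
The read_hdrsize plumbing above sizes NFSv3 READ reply buffers from a measured
value instead of the worst case: the first successful READ records how long
the reply header actually was (result->replen), later calls use it when
laying out the receive buffer, and NFS3_readres_sz remains the fallback until
a value has been measured. The capture is lock-free; cmpxchg() only stores
while the field is still zero, so one completion wins and a racing one cannot
overwrite the established value. The idiom in miniature (a sketch, not the
kernel's exact code):

    /* 0 means "not measured yet"; the first writer wins, later calls
     * see a nonzero value and become no-ops. */
    static unsigned int read_hdrsize;

    static void record_read_hdrsize(unsigned int measured)
    {
            if (!read_hdrsize)
                    cmpxchg(&read_hdrsize, 0, measured);
    }
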
index 3a6904173214c6ce4399de30f007d479b7983174..8d59c9655ec4800c95cde103c578aa1396826386 100644 (file)
@@ -188,9 +188,10 @@ struct nfs4_state {
        unsigned int n_wronly;          /* Number of write-only references */
        unsigned int n_rdwr;            /* Number of read/write references */
        fmode_t state;                  /* State on the server (R,W, or RW) */
-       atomic_t count;
+       refcount_t count;
 
        wait_queue_head_t waitq;
+       struct rcu_head rcu_head;
 };
 
 
index 146e3086223478d79501564a185e9dd8aa488d03..8f53455c476530998396023edabf72bf3692d5ce 100644 (file)
@@ -950,10 +950,10 @@ EXPORT_SYMBOL_GPL(nfs4_set_ds_client);
 
 /*
  * Session has been established, and the client marked ready.
- * Set the mount rsize and wsize with negotiated fore channel
- * attributes which will be bound checked in nfs_server_set_fsinfo.
+ * Limit the mount rsize, wsize and dtsize using negotiated fore
+ * channel attributes.
  */
-static void nfs4_session_set_rwsize(struct nfs_server *server)
+static void nfs4_session_limit_rwsize(struct nfs_server *server)
 {
 #ifdef CONFIG_NFS_V4_1
        struct nfs4_session *sess;
@@ -966,9 +966,11 @@ static void nfs4_session_set_rwsize(struct nfs_server *server)
        server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead;
        server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead;
 
-       if (!server->rsize || server->rsize > server_resp_sz)
+       if (server->dtsize > server_resp_sz)
+               server->dtsize = server_resp_sz;
+       if (server->rsize > server_resp_sz)
                server->rsize = server_resp_sz;
-       if (!server->wsize || server->wsize > server_rqst_sz)
+       if (server->wsize > server_rqst_sz)
                server->wsize = server_rqst_sz;
 #endif /* CONFIG_NFS_V4_1 */
 }
@@ -1015,12 +1017,12 @@ static int nfs4_server_common_setup(struct nfs_server *server,
                        (unsigned long long) server->fsid.minor);
        nfs_display_fhandle(mntfh, "Pseudo-fs root FH");
 
-       nfs4_session_set_rwsize(server);
-
        error = nfs_probe_fsinfo(server, mntfh, fattr);
        if (error < 0)
                goto out;
 
+       nfs4_session_limit_rwsize(server);
+
        if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
                server->namelen = NFS4_MAXNAMLEN;
 
index 8220a168282e054164cb5b3bb8534f76be89faa7..db84b4adbc491d7cd62e782ac7440a71c3a6c764 100644 (file)
@@ -1349,12 +1349,20 @@ static bool nfs4_mode_match_open_stateid(struct nfs4_state *state,
        return false;
 }
 
-static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode)
+static int can_open_cached(struct nfs4_state *state, fmode_t mode,
+               int open_mode, enum open_claim_type4 claim)
 {
        int ret = 0;
 
        if (open_mode & (O_EXCL|O_TRUNC))
                goto out;
+       switch (claim) {
+       case NFS4_OPEN_CLAIM_NULL:
+       case NFS4_OPEN_CLAIM_FH:
+               goto out;
+       default:
+               break;
+       }
        switch (mode & (FMODE_READ|FMODE_WRITE)) {
                case FMODE_READ:
                        ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0
@@ -1747,7 +1755,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 
        for (;;) {
                spin_lock(&state->owner->so_lock);
-               if (can_open_cached(state, fmode, open_mode)) {
+               if (can_open_cached(state, fmode, open_mode, claim)) {
                        update_open_stateflags(state, fmode);
                        spin_unlock(&state->owner->so_lock);
                        goto out_return_state;
@@ -1777,7 +1785,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 out:
        return ERR_PTR(ret);
 out_return_state:
-       atomic_inc(&state->count);
+       refcount_inc(&state->count);
        return state;
 }
 
@@ -1849,7 +1857,7 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 update:
        update_open_stateid(state, &data->o_res.stateid, NULL,
                            data->o_arg.fmode);
-       atomic_inc(&state->count);
+       refcount_inc(&state->count);
 
        return state;
 }
@@ -1887,7 +1895,7 @@ nfs4_opendata_find_nfs4_state(struct nfs4_opendata *data)
                return ERR_CAST(inode);
        if (data->state != NULL && data->state->inode == inode) {
                state = data->state;
-               atomic_inc(&state->count);
+               refcount_inc(&state->count);
        } else
                state = nfs4_get_open_state(inode, data->owner);
        iput(inode);
@@ -1933,23 +1941,41 @@ nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
        return ret;
 }
 
-static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state)
+static struct nfs_open_context *
+nfs4_state_find_open_context_mode(struct nfs4_state *state, fmode_t mode)
 {
        struct nfs_inode *nfsi = NFS_I(state->inode);
        struct nfs_open_context *ctx;
 
-       spin_lock(&state->inode->i_lock);
-       list_for_each_entry(ctx, &nfsi->open_files, list) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
                if (ctx->state != state)
                        continue;
-               get_nfs_open_context(ctx);
-               spin_unlock(&state->inode->i_lock);
+               if ((ctx->mode & mode) != mode)
+                       continue;
+               if (!get_nfs_open_context(ctx))
+                       continue;
+               rcu_read_unlock();
                return ctx;
        }
-       spin_unlock(&state->inode->i_lock);
+       rcu_read_unlock();
        return ERR_PTR(-ENOENT);
 }
 
+static struct nfs_open_context *
+nfs4_state_find_open_context(struct nfs4_state *state)
+{
+       struct nfs_open_context *ctx;
+
+       ctx = nfs4_state_find_open_context_mode(state, FMODE_READ|FMODE_WRITE);
+       if (!IS_ERR(ctx))
+               return ctx;
+       ctx = nfs4_state_find_open_context_mode(state, FMODE_WRITE);
+       if (!IS_ERR(ctx))
+               return ctx;
+       return nfs4_state_find_open_context_mode(state, FMODE_READ);
+}
+
 static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx,
                struct nfs4_state *state, enum open_claim_type4 claim)
 {
@@ -1960,7 +1986,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
        if (opendata == NULL)
                return ERR_PTR(-ENOMEM);
        opendata->state = state;
-       atomic_inc(&state->count);
+       refcount_inc(&state->count);
        return opendata;
 }
 
@@ -2276,7 +2302,8 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
        if (data->state != NULL) {
                struct nfs_delegation *delegation;
 
-               if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags))
+               if (can_open_cached(data->state, data->o_arg.fmode,
+                                       data->o_arg.open_flags, claim))
                        goto out_no_action;
                rcu_read_lock();
                delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
index 40a08cd483f051072508b53eee362167ce26904f..62ae0fd345ad6751d5dbbf1ab8aefca85eff9e68 100644 (file)
@@ -655,7 +655,7 @@ nfs4_alloc_open_state(void)
        state = kzalloc(sizeof(*state), GFP_NOFS);
        if (!state)
                return NULL;
-       atomic_set(&state->count, 1);
+       refcount_set(&state->count, 1);
        INIT_LIST_HEAD(&state->lock_states);
        spin_lock_init(&state->state_lock);
        seqlock_init(&state->seqlock);
@@ -684,12 +684,12 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner)
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs4_state *state;
 
-       list_for_each_entry(state, &nfsi->open_states, inode_states) {
+       list_for_each_entry_rcu(state, &nfsi->open_states, inode_states) {
                if (state->owner != owner)
                        continue;
                if (!nfs4_valid_open_stateid(state))
                        continue;
-               if (atomic_inc_not_zero(&state->count))
+               if (refcount_inc_not_zero(&state->count))
                        return state;
        }
        return NULL;
@@ -698,7 +698,7 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner)
 static void
 nfs4_free_open_state(struct nfs4_state *state)
 {
-       kfree(state);
+       kfree_rcu(state, rcu_head);
 }
 
 struct nfs4_state *
@@ -707,9 +707,9 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner)
        struct nfs4_state *state, *new;
        struct nfs_inode *nfsi = NFS_I(inode);
 
-       spin_lock(&inode->i_lock);
+       rcu_read_lock();
        state = __nfs4_find_state_byowner(inode, owner);
-       spin_unlock(&inode->i_lock);
+       rcu_read_unlock();
        if (state)
                goto out;
        new = nfs4_alloc_open_state();
@@ -720,7 +720,7 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner)
                state = new;
                state->owner = owner;
                atomic_inc(&owner->so_count);
-               list_add(&state->inode_states, &nfsi->open_states);
+               list_add_rcu(&state->inode_states, &nfsi->open_states);
                ihold(inode);
                state->inode = inode;
                spin_unlock(&inode->i_lock);
@@ -743,10 +743,10 @@ void nfs4_put_open_state(struct nfs4_state *state)
        struct inode *inode = state->inode;
        struct nfs4_state_owner *owner = state->owner;
 
-       if (!atomic_dec_and_lock(&state->count, &owner->so_lock))
+       if (!refcount_dec_and_lock(&state->count, &owner->so_lock))
                return;
        spin_lock(&inode->i_lock);
-       list_del(&state->inode_states);
+       list_del_rcu(&state->inode_states);
        list_del(&state->open_states);
        spin_unlock(&inode->i_lock);
        spin_unlock(&owner->so_lock);
@@ -1437,8 +1437,8 @@ void nfs_inode_find_state_and_recover(struct inode *inode,
        struct nfs4_state *state;
        bool found = false;
 
-       spin_lock(&inode->i_lock);
-       list_for_each_entry(ctx, &nfsi->open_files, list) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
                state = ctx->state;
                if (state == NULL)
                        continue;
@@ -1456,7 +1456,7 @@ void nfs_inode_find_state_and_recover(struct inode *inode,
                    nfs4_state_mark_reclaim_nograce(clp, state))
                        found = true;
        }
-       spin_unlock(&inode->i_lock);
+       rcu_read_unlock();
 
        nfs_inode_find_delegation_state_and_recover(inode, stateid);
        if (found)
@@ -1469,13 +1469,13 @@ static void nfs4_state_mark_open_context_bad(struct nfs4_state *state)
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_open_context *ctx;
 
-       spin_lock(&inode->i_lock);
-       list_for_each_entry(ctx, &nfsi->open_files, list) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
                if (ctx->state != state)
                        continue;
                set_bit(NFS_CONTEXT_BAD, &ctx->flags);
        }
-       spin_unlock(&inode->i_lock);
+       rcu_read_unlock();
 }
 
 static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error)
@@ -1549,10 +1549,62 @@ out:
        return status;
 }
 
+#ifdef CONFIG_NFS_V4_2
+static void nfs42_complete_copies(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+       struct nfs4_copy_state *copy;
+
+       if (!test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags))
+               return;
+
+       spin_lock(&sp->so_server->nfs_client->cl_lock);
+       list_for_each_entry(copy, &sp->so_server->ss_copies, copies) {
+               if (!nfs4_stateid_match_other(&state->stateid, &copy->parent_state->stateid))
+                       continue;
+               copy->flags = 1;
+               complete(&copy->completion);
+               break;
+       }
+       spin_unlock(&sp->so_server->nfs_client->cl_lock);
+}
+#else /* !CONFIG_NFS_V4_2 */
+static inline void nfs42_complete_copies(struct nfs4_state_owner *sp,
+                                        struct nfs4_state *state)
+{
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_state *state,
+                                    const struct nfs4_state_recovery_ops *ops)
+{
+       struct nfs4_lock_state *lock;
+       int status;
+
+       status = ops->recover_open(sp, state);
+       if (status < 0)
+               return status;
+
+       status = nfs4_reclaim_locks(state, ops);
+       if (status < 0)
+               return status;
+
+       if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) {
+               spin_lock(&state->state_lock);
+               list_for_each_entry(lock, &state->lock_states, ls_locks) {
+                       if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
+                               pr_warn_ratelimited("NFS: %s: Lock reclaim failed!\n", __func__);
+               }
+               spin_unlock(&state->state_lock);
+       }
+
+       nfs42_complete_copies(sp, state);
+       clear_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
+       return status;
+}
+
 static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops)
 {
        struct nfs4_state *state;
-       struct nfs4_lock_state *lock;
        int status = 0;
 
        /* Note: we rely on the sp->so_states list being ordered 
@@ -1573,79 +1625,45 @@ restart:
                        continue;
                if (state->state == 0)
                        continue;
-               atomic_inc(&state->count);
+               refcount_inc(&state->count);
                spin_unlock(&sp->so_lock);
-               status = ops->recover_open(sp, state);
-               if (status >= 0) {
-                       status = nfs4_reclaim_locks(state, ops);
-                       if (status >= 0) {
-                               if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) {
-                                       spin_lock(&state->state_lock);
-                                       list_for_each_entry(lock, &state->lock_states, ls_locks) {
-                                               if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
-                                                       pr_warn_ratelimited("NFS: "
-                                                                           "%s: Lock reclaim "
-                                                                           "failed!\n", __func__);
-                                       }
-                                       spin_unlock(&state->state_lock);
-                               }
-                               clear_bit(NFS_STATE_RECLAIM_NOGRACE,
-                                       &state->flags);
-#ifdef CONFIG_NFS_V4_2
-                               if (test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags)) {
-                                       struct nfs4_copy_state *copy;
-
-                                       spin_lock(&sp->so_server->nfs_client->cl_lock);
-                                       list_for_each_entry(copy, &sp->so_server->ss_copies, copies) {
-                                               if (memcmp(&state->stateid.other, &copy->parent_state->stateid.other, NFS4_STATEID_SIZE))
-                                                       continue;
-                                               copy->flags = 1;
-                                               complete(&copy->completion);
-                                               printk("AGLO: server rebooted waking up the copy\n");
-                                               break;
-                                       }
-                                       spin_unlock(&sp->so_server->nfs_client->cl_lock);
-                               }
-#endif /* CONFIG_NFS_V4_2 */
-                               nfs4_put_open_state(state);
-                               spin_lock(&sp->so_lock);
-                               goto restart;
-                       }
-               }
+               status = __nfs4_reclaim_open_state(sp, state, ops);
+
                switch (status) {
-                       default:
-                               printk(KERN_ERR "NFS: %s: unhandled error %d\n",
-                                       __func__, status);
-                               /* Fall through */
-                       case -ENOENT:
-                       case -ENOMEM:
-                       case -EACCES:
-                       case -EROFS:
-                       case -EIO:
-                       case -ESTALE:
-                               /* Open state on this file cannot be recovered */
-                               nfs4_state_mark_recovery_failed(state, status);
-                               break;
-                       case -EAGAIN:
-                               ssleep(1);
-                               /* Fall through */
-                       case -NFS4ERR_ADMIN_REVOKED:
-                       case -NFS4ERR_STALE_STATEID:
-                       case -NFS4ERR_OLD_STATEID:
-                       case -NFS4ERR_BAD_STATEID:
-                       case -NFS4ERR_RECLAIM_BAD:
-                       case -NFS4ERR_RECLAIM_CONFLICT:
-                               nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+               default:
+                       if (status >= 0)
                                break;
-                       case -NFS4ERR_EXPIRED:
-                       case -NFS4ERR_NO_GRACE:
-                               nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
-                       case -NFS4ERR_STALE_CLIENTID:
-                       case -NFS4ERR_BADSESSION:
-                       case -NFS4ERR_BADSLOT:
-                       case -NFS4ERR_BAD_HIGH_SLOT:
-                       case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
-                               goto out_err;
+                       printk(KERN_ERR "NFS: %s: unhandled error %d\n", __func__, status);
+                       /* Fall through */
+               case -ENOENT:
+               case -ENOMEM:
+               case -EACCES:
+               case -EROFS:
+               case -EIO:
+               case -ESTALE:
+                       /* Open state on this file cannot be recovered */
+                       nfs4_state_mark_recovery_failed(state, status);
+                       break;
+               case -EAGAIN:
+                       ssleep(1);
+                       /* Fall through */
+               case -NFS4ERR_ADMIN_REVOKED:
+               case -NFS4ERR_STALE_STATEID:
+               case -NFS4ERR_OLD_STATEID:
+               case -NFS4ERR_BAD_STATEID:
+               case -NFS4ERR_RECLAIM_BAD:
+               case -NFS4ERR_RECLAIM_CONFLICT:
+                       nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+                       break;
+               case -NFS4ERR_EXPIRED:
+               case -NFS4ERR_NO_GRACE:
+                       nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+               case -NFS4ERR_STALE_CLIENTID:
+               case -NFS4ERR_BADSESSION:
+               case -NFS4ERR_BADSLOT:
+               case -NFS4ERR_BAD_HIGH_SLOT:
+               case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+                       goto out_err;
                }
                nfs4_put_open_state(state);
                spin_lock(&sp->so_lock);
@@ -1795,38 +1813,38 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
 static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 {
        switch (error) {
-               case 0:
-                       break;
-               case -NFS4ERR_CB_PATH_DOWN:
-                       nfs40_handle_cb_pathdown(clp);
-                       break;
-               case -NFS4ERR_NO_GRACE:
-                       nfs4_state_end_reclaim_reboot(clp);
-                       break;
-               case -NFS4ERR_STALE_CLIENTID:
-                       set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-                       nfs4_state_start_reclaim_reboot(clp);
-                       break;
-               case -NFS4ERR_EXPIRED:
-                       set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-                       nfs4_state_start_reclaim_nograce(clp);
-                       break;
-               case -NFS4ERR_BADSESSION:
-               case -NFS4ERR_BADSLOT:
-               case -NFS4ERR_BAD_HIGH_SLOT:
-               case -NFS4ERR_DEADSESSION:
-               case -NFS4ERR_SEQ_FALSE_RETRY:
-               case -NFS4ERR_SEQ_MISORDERED:
-                       set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
-                       /* Zero session reset errors */
-                       break;
-               case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
-                       set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
-                       break;
-               default:
-                       dprintk("%s: failed to handle error %d for server %s\n",
-                                       __func__, error, clp->cl_hostname);
-                       return error;
+       case 0:
+               break;
+       case -NFS4ERR_CB_PATH_DOWN:
+               nfs40_handle_cb_pathdown(clp);
+               break;
+       case -NFS4ERR_NO_GRACE:
+               nfs4_state_end_reclaim_reboot(clp);
+               break;
+       case -NFS4ERR_STALE_CLIENTID:
+               set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+               nfs4_state_start_reclaim_reboot(clp);
+               break;
+       case -NFS4ERR_EXPIRED:
+               set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+               nfs4_state_start_reclaim_nograce(clp);
+               break;
+       case -NFS4ERR_BADSESSION:
+       case -NFS4ERR_BADSLOT:
+       case -NFS4ERR_BAD_HIGH_SLOT:
+       case -NFS4ERR_DEADSESSION:
+       case -NFS4ERR_SEQ_FALSE_RETRY:
+       case -NFS4ERR_SEQ_MISORDERED:
+               set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+               /* Zero session reset errors */
+               break;
+       case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+               set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+               break;
+       default:
+               dprintk("%s: failed to handle error %d for server %s\n",
+                               __func__, error, clp->cl_hostname);
+               return error;
        }
        dprintk("%s: handled error %d for server %s\n", __func__, error,
                        clp->cl_hostname);
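
The atomic_t to refcount_t conversion threaded through nfs4_fs.h, nfs4proc.c
and nfs4state.c above is not just a rename: refcount_t saturates rather than
wrapping and WARNs on increment-from-zero or underflow, turning use-after-free
and overflow bugs into loud warnings instead of silent corruption. A sketch of
the mapping used here (hypothetical object, real kernel refcount API):

    #include <linux/refcount.h>
    #include <linux/spinlock.h>

    static refcount_t count = REFCOUNT_INIT(1);     /* was ATOMIC_INIT(1) */
    static DEFINE_SPINLOCK(lock);

    static bool obj_tryget(void)
    {
            /* was atomic_inc_not_zero(): refuses an object already at zero */
            return refcount_inc_not_zero(&count);
    }

    static bool obj_put_is_last(void)
    {
            /* was atomic_dec_and_lock(): returns with the lock held
             * only when this was the final reference */
            if (!refcount_dec_and_lock(&count, &lock))
                    return false;
            /* unlink under the lock here, then... */
            spin_unlock(&lock);
            return true;    /* ...caller frees, e.g. via kfree_rcu() */
    }
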
index b7bde12d8cd518ce7b8da11d7ae3b6239ca3292f..2fc8f6fa25e4b400553af4506f243ae1ada058e2 100644 (file)
@@ -3516,7 +3516,7 @@ static int decode_attr_exclcreat_supported(struct xdr_stream *xdr,
 static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
 {
        __be32 *p;
-       int len;
+       u32 len;
 
        if (fh != NULL)
                memset(fh, 0, sizeof(*fh));
index bb5476a6d264278aae9d549835b6f9b5bc2dbfc7..5c4568a0804b657a06f63b19185ac0d2d6a702cb 100644 (file)
@@ -63,14 +63,14 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init);
 
 void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
 {
-       spin_lock(&hdr->lock);
-       if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags)
-           || pos < hdr->io_start + hdr->good_bytes) {
+       unsigned int new = pos - hdr->io_start;
+
+       if (hdr->good_bytes > new) {
+               hdr->good_bytes = new;
                clear_bit(NFS_IOHDR_EOF, &hdr->flags);
-               hdr->good_bytes = pos - hdr->io_start;
-               hdr->error = error;
+               if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags))
+                       hdr->error = error;
        }
-       spin_unlock(&hdr->lock);
 }
 
 static inline struct nfs_page *
@@ -494,7 +494,6 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops)
 
        if (hdr) {
                INIT_LIST_HEAD(&hdr->pages);
-               spin_lock_init(&hdr->lock);
                hdr->rw_ops = ops;
        }
        return hdr;
@@ -1111,6 +1110,20 @@ static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc,
        return ret;
 }
 
+static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc)
+{
+       u32 midx;
+       struct nfs_pgio_mirror *mirror;
+
+       if (!desc->pg_error)
+               return;
+
+       for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+               mirror = &desc->pg_mirrors[midx];
+               desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
+       }
+}
+
 int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
                           struct nfs_page *req)
 {
@@ -1161,25 +1174,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
        return 1;
 
 out_failed:
-       /*
-        * We might have failed before sending any reqs over wire.
-        * Clean up rest of the reqs in mirror pg_list.
-        */
-       if (desc->pg_error) {
-               struct nfs_pgio_mirror *mirror;
-               void (*func)(struct list_head *);
-
-               /* remember fatal errors */
-               if (nfs_error_is_fatal(desc->pg_error))
-                       nfs_context_set_write_error(req->wb_context,
-                                                   desc->pg_error);
-
-               func = desc->pg_completion_ops->error_cleanup;
-               for (midx = 0; midx < desc->pg_mirror_count; midx++) {
-                       mirror = &desc->pg_mirrors[midx];
-                       func(&mirror->pg_list);
-               }
-       }
+       nfs_pageio_error_cleanup(desc);
        return 0;
 }
 
@@ -1251,6 +1246,8 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
        for (midx = 0; midx < desc->pg_mirror_count; midx++)
                nfs_pageio_complete_mirror(desc, midx);
 
+       if (desc->pg_error < 0)
+               nfs_pageio_error_cleanup(desc);
        if (desc->pg_ops->pg_cleanup)
                desc->pg_ops->pg_cleanup(desc);
        nfs_pageio_cleanup_mirroring(desc);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 7d9a51e6b847c65df159d6632a98ac891370f80f..06cb90e9bc6eccf2ef6628b2cff6cb4f51792900 100644 (file)
@@ -965,7 +965,7 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)
        struct page **pages;
        int i;
 
-       pages = kcalloc(size, sizeof(struct page *), gfp_flags);
+       pages = kmalloc_array(size, sizeof(struct page *), gfp_flags);
        if (!pages) {
                dprintk("%s: can't alloc array of %zu pages\n", __func__, size);
                return NULL;
@@ -975,7 +975,7 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)
                pages[i] = alloc_page(gfp_flags);
                if (!pages[i]) {
                        dprintk("%s: failed to allocate page\n", __func__);
-                       nfs4_free_pages(pages, size);
+                       nfs4_free_pages(pages, i);
                        return NULL;
                }
        }
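
Two changes combine here: kmalloc_array() no longer zeroes the array, so the failure path must free only the i pages actually allocated. A sketch of the failure mode being avoided (hypothetical values):

        /* size = 4, alloc_page() fails at i = 2: pages[2] and pages[3] are
         * uninitialized garbage after kmalloc_array(), so the old
         * nfs4_free_pages(pages, 4) would hand wild pointers to __free_page();
         * nfs4_free_pages(pages, 2) frees exactly the pages that exist.
         */
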
@@ -991,6 +991,7 @@ pnfs_alloc_init_layoutget_args(struct inode *ino,
           gfp_t gfp_flags)
 {
        struct nfs_server *server = pnfs_find_server(ino, ctx);
+       size_t max_reply_sz = server->pnfs_curr_ld->max_layoutget_response;
        size_t max_pages = max_response_pages(server);
        struct nfs4_layoutget *lgp;
 
@@ -1000,6 +1001,12 @@ pnfs_alloc_init_layoutget_args(struct inode *ino,
        if (lgp == NULL)
                return NULL;
 
+       if (max_reply_sz) {
+               size_t npages = (max_reply_sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
+               if (npages < max_pages)
+                       max_pages = npages;
+       }
+
        lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
        if (!lgp->args.layout.pages) {
                kfree(lgp);
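
The clamp converts the layout driver's byte limit into a page count, rounding up. For example (hypothetical limit, 4 KiB pages):

        /* max_reply_sz = 17000 bytes:
         *   npages = (17000 + 4096 - 1) >> PAGE_SHIFT = 5 pages,
         * so a driver advertising a small max_layoutget_response caps the
         * allocation below max_response_pages(server).
         */
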
@@ -1332,6 +1339,7 @@ bool pnfs_roc(struct inode *ino,
        if (!nfs_have_layout(ino))
                return false;
 retry:
+       rcu_read_lock();
        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (!lo || !pnfs_layout_is_valid(lo) ||
@@ -1342,6 +1350,7 @@ retry:
        pnfs_get_layout_hdr(lo);
        if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
                spin_unlock(&ino->i_lock);
+               rcu_read_unlock();
                wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
                                TASK_UNINTERRUPTIBLE);
                pnfs_put_layout_hdr(lo);
@@ -1355,7 +1364,7 @@ retry:
                skip_read = true;
        }
 
-       list_for_each_entry(ctx, &nfsi->open_files, list) {
+       list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
                state = ctx->state;
                if (state == NULL)
                        continue;
@@ -1403,6 +1412,7 @@ retry:
 
 out_noroc:
        spin_unlock(&ino->i_lock);
+       rcu_read_unlock();
        pnfs_layoutcommit_inode(ino, true);
        if (roc) {
                struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index ece367ebde6928204418e51321401e06bc4fe9aa..e2e9fcd5341d22b80b56b6c19ba0a104ef298227 100644 (file)
@@ -125,6 +125,7 @@ struct pnfs_layoutdriver_type {
        struct module *owner;
        unsigned flags;
        unsigned max_deviceinfo_size;
+       unsigned max_layoutget_response;
 
        int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
        int (*clear_layoutdriver) (struct nfs_server *);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 48d7277c60a9793b3684e9587a69d26af410a8fb..f9f19784db8279e2cdd6f17cbc6c7bd39d5ad332 100644 (file)
@@ -276,16 +276,14 @@ static void nfs_readpage_result(struct rpc_task *task,
                                struct nfs_pgio_header *hdr)
 {
        if (hdr->res.eof) {
-               loff_t bound;
+               loff_t pos = hdr->args.offset + hdr->res.count;
+               unsigned int new = pos - hdr->io_start;
 
-               bound = hdr->args.offset + hdr->res.count;
-               spin_lock(&hdr->lock);
-               if (bound < hdr->io_start + hdr->good_bytes) {
+               if (hdr->good_bytes > new) {
+                       hdr->good_bytes = new;
                        set_bit(NFS_IOHDR_EOF, &hdr->flags);
                        clear_bit(NFS_IOHDR_ERROR, &hdr->flags);
-                       hdr->good_bytes = bound - hdr->io_start;
                }
-               spin_unlock(&hdr->lock);
        } else if (hdr->res.count < hdr->args.count)
                nfs_readpage_retry(task, hdr);
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index a0831e9d19c9df9f26c6d0777e198c32241550b4..6e0417c022799154bc3809ec5ffd65344dac75a3 100644 (file)
@@ -62,6 +62,7 @@ struct nfs_lock_context {
        struct nfs_open_context *open_context;
        fl_owner_t lockowner;
        atomic_t io_count;
+       struct rcu_head rcu_head;
 };
 
 struct nfs4_state;
@@ -82,6 +83,7 @@ struct nfs_open_context {
 
        struct list_head list;
        struct nfs4_threshold   *mdsthreshold;
+       struct rcu_head rcu_head;
 };
 
 struct nfs_open_dir_context {
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index bf39d9c92201f811c76df9db76ab44a60b29eb41..0fc0b9135d461bc81e1aa06b2fc8040583db3dc9 100644 (file)
@@ -228,6 +228,9 @@ struct nfs_server {
        unsigned short          mountd_port;
        unsigned short          mountd_protocol;
        struct rpc_wait_queue   uoc_rpcwaitq;
+
+       /* XDR related information */
+       unsigned int            read_hdrsize;
 };
 
 /* Server capabilities */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index bd1c889a9ed956c14543117a8835eb4ae4474a9a..0e016252cfc698c1e140f4f1990b2ba6af541d2a 100644 (file)
@@ -608,8 +608,13 @@ struct nfs_pgio_args {
        __u32                   count;
        unsigned int            pgbase;
        struct page **          pages;
-       const u32 *             bitmask;        /* used by write */
-       enum nfs3_stable_how    stable;         /* used by write */
+       union {
+               unsigned int            replen;                 /* used by read */
+               struct {
+                       const u32 *             bitmask;        /* used by write */
+                       enum nfs3_stable_how    stable;         /* used by write */
+               };
+       };
 };
 
 struct nfs_pgio_res {
@@ -617,10 +622,16 @@ struct nfs_pgio_res {
        struct nfs_fattr *      fattr;
        __u32                   count;
        __u32                   op_status;
-       int                     eof;            /* used by read */
-       struct nfs_writeverf *  verf;           /* used by write */
-       const struct nfs_server *server;        /* used by write */
-
+       union {
+               struct {
+                       unsigned int            replen;         /* used by read */
+                       int                     eof;            /* used by read */
+               };
+               struct {
+                       struct nfs_writeverf *  verf;           /* used by write */
+                       const struct nfs_server *server;        /* used by write */
+               };
+       };
 };
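
A READ never touches the write-only members and vice versa, so overlaying them in anonymous unions shrinks both structures without changing behaviour; the only new rule is that callers must stay on their own arm. Sketch of the contract (hypothetical call sites):

        /* READ completion:  hdr->res.eof and hdr->res.replen are valid;
         *                   hdr->res.verf / hdr->res.server are not.
         * WRITE completion: the reverse holds -- the arms must never mix.
         */
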
 
 /*
@@ -1471,11 +1482,10 @@ struct nfs_pgio_header {
        const struct nfs_rw_ops *rw_ops;
        struct nfs_io_completion *io_completion;
        struct nfs_direct_req   *dreq;
-       spinlock_t              lock;
-       /* fields protected by lock */
+
        int                     pnfs_error;
        int                     error;          /* merge with pnfs_error */
-       unsigned long           good_bytes;     /* boundary of good data */
+       unsigned int            good_bytes;     /* boundary of good data */
        unsigned long           flags;
 
        /*
diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 58a6765c1c5e880f5efa2ef1948868e9d3ef2732..c4db9424b63bc743bcf34c0ee92c46cba903d2ff 100644 (file)
@@ -67,7 +67,7 @@ struct rpc_cred {
        const struct rpc_credops *cr_ops;
        unsigned long           cr_expire;      /* when to gc */
        unsigned long           cr_flags;       /* various flags */
-       atomic_t                cr_count;       /* ref count */
+       refcount_t              cr_count;       /* ref count */
 
        kuid_t                  cr_uid;
 
@@ -100,7 +100,7 @@ struct rpc_auth {
                                                 * differ from the flavor in
                                                 * au_ops->au_flavor in gss
                                                 * case) */
-       atomic_t                au_count;       /* Reference counter */
+       refcount_t              au_count;       /* Reference counter */
 
        struct rpc_cred_cache * au_credcache;
        /* per-flavor data */
@@ -157,6 +157,7 @@ struct rpc_credops {
        int                     (*crkey_timeout)(struct rpc_cred *);
        bool                    (*crkey_to_expire)(struct rpc_cred *);
        char *                  (*crstringify_acceptor)(struct rpc_cred *);
+       bool                    (*crneed_reencode)(struct rpc_task *);
 };
 
 extern const struct rpc_authops        authunix_ops;
@@ -192,6 +193,7 @@ __be32 *            rpcauth_marshcred(struct rpc_task *, __be32 *);
 __be32 *               rpcauth_checkverf(struct rpc_task *, __be32 *);
 int                    rpcauth_wrap_req(struct rpc_task *task, kxdreproc_t encode, void *rqstp, __be32 *data, void *obj);
 int                    rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp, __be32 *data, void *obj);
+bool                   rpcauth_xmit_need_reencode(struct rpc_task *task);
 int                    rpcauth_refreshcred(struct rpc_task *);
 void                   rpcauth_invalcred(struct rpc_task *);
 int                    rpcauth_uptodatecred(struct rpc_task *);
@@ -204,11 +206,11 @@ bool                      rpcauth_cred_key_to_expire(struct rpc_auth *, struct rpc_cred *);
 char *                 rpcauth_stringify_acceptor(struct rpc_cred *);
 
 static inline
-struct rpc_cred *      get_rpccred(struct rpc_cred *cred)
+struct rpc_cred *get_rpccred(struct rpc_cred *cred)
 {
-       if (cred != NULL)
-               atomic_inc(&cred->cr_count);
-       return cred;
+       if (cred != NULL && refcount_inc_not_zero(&cred->cr_count))
+               return cred;
+       return NULL;
 }
 
 /**
@@ -224,9 +226,7 @@ struct rpc_cred *   get_rpccred(struct rpc_cred *cred)
 static inline struct rpc_cred *
 get_rpccred_rcu(struct rpc_cred *cred)
 {
-       if (atomic_inc_not_zero(&cred->cr_count))
-               return cred;
-       return NULL;
+       return get_rpccred(cred);
 }
 
 #endif /* __KERNEL__ */
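
get_rpccred() can now fail: refcount_inc_not_zero() refuses to take a reference once the count has hit zero, which is exactly what makes RCU-walked credential lookups safe, and why get_rpccred_rcu() collapses into a plain call. Callers must check for NULL; a sketch of the expected pattern (hypothetical caller and label):

        rcu_read_lock();
        cred = get_rpccred(entry);      /* NULL if entry is being destroyed */
        rcu_read_unlock();
        if (cred == NULL)
                goto retry_lookup;      /* hypothetical retry label */
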
diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h
index 0c9eac351aabab4364be9f3dcc3f6a0bd57fc3d0..30427b729070b76cc528b9c5929e2a66b9d60cc4 100644 (file)
@@ -70,6 +70,7 @@ struct gss_cl_ctx {
        refcount_t              count;
        enum rpc_gss_proc       gc_proc;
        u32                     gc_seq;
+       u32                     gc_seq_xmit;
        spinlock_t              gc_seq_lock;
        struct gss_ctx          *gc_gss_ctx;
        struct xdr_netobj       gc_wire_ctx;
diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h
index 4397a4824c819f574f38800efe38d341bd212a15..28721cf73ec3cfe247c54e8f802eac18a551aa6f 100644 (file)
@@ -34,6 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef CONFIG_SUNRPC_BACKCHANNEL
 struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid);
 void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied);
+void xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task);
 void xprt_free_bc_request(struct rpc_rqst *req);
 int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs);
 void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs);
diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h
index f6e8ceafafd8f1aa9b39fac833077de45d5afdca..131424cefc6a92381036c099acb3a9833c846506 100644 (file)
@@ -118,7 +118,8 @@ struct krb5_ctx {
        u8                      acceptor_integ[GSS_KRB5_MAX_KEYLEN];
 };
 
-extern spinlock_t krb5_seq_lock;
+extern u32 gss_seq_send_fetch_and_inc(struct krb5_ctx *ctx);
+extern u64 gss_seq_send64_fetch_and_inc(struct krb5_ctx *ctx);
 
 /* The length of the Kerberos GSS token header */
 #define GSS_KRB5_TOK_HDR_LEN   (16)
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 592653becd914a4e9dc3f8fdbea47c2939c706f7..7b540c06659440738d036f2af6275123792ce520 100644 (file)
@@ -140,8 +140,9 @@ struct rpc_task_setup {
 #define RPC_TASK_RUNNING       0
 #define RPC_TASK_QUEUED                1
 #define RPC_TASK_ACTIVE                2
-#define RPC_TASK_MSG_RECV      3
-#define RPC_TASK_MSG_RECV_WAIT 4
+#define RPC_TASK_NEED_XMIT     3
+#define RPC_TASK_NEED_RECV     4
+#define RPC_TASK_MSG_PIN_WAIT  5
 
 #define RPC_IS_RUNNING(t)      test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
 #define rpc_set_running(t)     set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
@@ -188,7 +189,6 @@ struct rpc_timer {
 struct rpc_wait_queue {
        spinlock_t              lock;
        struct list_head        tasks[RPC_NR_PRIORITY]; /* task queue for each priority level */
-       pid_t                   owner;                  /* process id of last task serviced */
        unsigned char           maxpriority;            /* maximum priority (0 if queue is not a priority queue) */
        unsigned char           priority;               /* current priority */
        unsigned char           nr;                     /* # tasks remaining for cookie */
@@ -204,7 +204,6 @@ struct rpc_wait_queue {
  * from a single cookie.  The aim is to improve
  * performance of NFS operations such as read/write.
  */
-#define RPC_BATCH_COUNT                        16
 #define RPC_IS_PRIORITY(q)             ((q)->maxpriority > 0)
 
 /*
@@ -234,6 +233,9 @@ void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq,
                struct rpc_task *task);
 void           rpc_wake_up_queued_task(struct rpc_wait_queue *,
                                        struct rpc_task *);
+void           rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *,
+                                                  struct rpc_task *,
+                                                  int);
 void           rpc_wake_up(struct rpc_wait_queue *);
 struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *);
 struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index c3d72066d4b1f5426b09f3fa64194f6acfee6e15..6b7a86c4d6e6b3bb96ded2b5d270b7b94f980126 100644 (file)
@@ -84,7 +84,6 @@ struct svc_xprt {
        struct sockaddr_storage xpt_remote;     /* remote peer's address */
        size_t                  xpt_remotelen;  /* length of address */
        char                    xpt_remotebuf[INET6_ADDRSTRLEN + 10];
-       struct rpc_wait_queue   xpt_bc_pending; /* backchannel wait queue */
        struct list_head        xpt_users;      /* callbacks on free */
 
        struct net              *xpt_net;
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 2bd68177a442e5402f7b015d16ac6047c14c5b0e..43106ffa6788a40101840008d0c702f4c3586c45 100644 (file)
@@ -18,6 +18,7 @@
 #include <asm/unaligned.h>
 #include <linux/scatterlist.h>
 
+struct bio_vec;
 struct rpc_rqst;
 
 /*
@@ -52,12 +53,14 @@ struct xdr_buf {
        struct kvec     head[1],        /* RPC header + non-page data */
                        tail[1];        /* Appended after page data */
 
+       struct bio_vec  *bvec;
        struct page **  pages;          /* Array of pages */
        unsigned int    page_base,      /* Start of page data */
                        page_len,       /* Length of page data */
                        flags;          /* Flags for data disposition */
 #define XDRBUF_READ            0x01            /* target of file read */
 #define XDRBUF_WRITE           0x02            /* source of file write */
+#define XDRBUF_SPARSE_PAGES    0x04            /* Page array is sparse */
 
        unsigned int    buflen,         /* Total length of storage buffer */
                        len;            /* Length of XDR encoded message */
@@ -69,6 +72,8 @@ xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
        buf->head[0].iov_base = start;
        buf->head[0].iov_len = len;
        buf->tail[0].iov_len = 0;
+       buf->bvec = NULL;
+       buf->pages = NULL;
        buf->page_len = 0;
        buf->flags = 0;
        buf->len = 0;
@@ -115,6 +120,9 @@ __be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *);
 void   xdr_inline_pages(struct xdr_buf *, unsigned int,
                         struct page **, unsigned int, unsigned int);
 void   xdr_terminate_string(struct xdr_buf *, const u32);
+size_t xdr_buf_pagecount(struct xdr_buf *buf);
+int    xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp);
+void   xdr_free_bvec(struct xdr_buf *buf);
 
 static inline __be32 *xdr_encode_array(__be32 *p, const void *s, unsigned int len)
 {
@@ -177,10 +185,7 @@ struct xdr_skb_reader {
 
 typedef size_t (*xdr_skb_read_actor)(struct xdr_skb_reader *desc, void *to, size_t len);
 
-size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len);
 extern int csum_partial_copy_to_xdr(struct xdr_buf *, struct sk_buff *);
-extern ssize_t xdr_partial_copy_from_skb(struct xdr_buf *, unsigned int,
-               struct xdr_skb_reader *, xdr_skb_read_actor);
 
 extern int xdr_encode_word(struct xdr_buf *, unsigned int, u32);
 extern int xdr_decode_word(struct xdr_buf *, unsigned int, u32 *);
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 336fd1a19cca10deaeb811cb5330bf5d6520881f..a4ab4f8d914043a63a5b31306990c3b8dce64f04 100644 (file)
@@ -82,7 +82,14 @@ struct rpc_rqst {
        struct page             **rq_enc_pages; /* scratch pages for use by
                                                   gss privacy code */
        void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */
-       struct list_head        rq_list;
+
+       union {
+               struct list_head        rq_list;        /* Slot allocation list */
+               struct rb_node          rq_recv;        /* Receive queue */
+       };
+
+       struct list_head        rq_xmit;        /* Send queue */
+       struct list_head        rq_xmit2;       /* Send queue */
 
        void                    *rq_buffer;     /* Call XDR encode buffer */
        size_t                  rq_callsize;
@@ -103,6 +110,7 @@ struct rpc_rqst {
                                                /* A cookie used to track the
                                                   state of the transport
                                                   connection */
+       atomic_t                rq_pin;
        
        /*
         * Partial send handling
@@ -133,7 +141,8 @@ struct rpc_xprt_ops {
        void            (*connect)(struct rpc_xprt *xprt, struct rpc_task *task);
        int             (*buf_alloc)(struct rpc_task *task);
        void            (*buf_free)(struct rpc_task *task);
-       int             (*send_request)(struct rpc_task *task);
+       void            (*prepare_request)(struct rpc_rqst *req);
+       int             (*send_request)(struct rpc_rqst *req);
        void            (*set_retrans_timeout)(struct rpc_task *task);
        void            (*timer)(struct rpc_xprt *xprt, struct rpc_task *task);
        void            (*release_request)(struct rpc_task *task);
@@ -234,9 +243,12 @@ struct rpc_xprt {
         */
        spinlock_t              transport_lock; /* lock transport info */
        spinlock_t              reserve_lock;   /* lock slot table */
-       spinlock_t              recv_lock;      /* lock receive list */
+       spinlock_t              queue_lock;     /* send/receive queue lock */
        u32                     xid;            /* Next XID value to use */
        struct rpc_task *       snd_task;       /* Task blocked in send */
+
+       struct list_head        xmit_queue;     /* Send queue */
+
        struct svc_xprt         *bc_xprt;       /* NFSv4.1 backchannel */
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
        struct svc_serv         *bc_serv;       /* The RPC service which will */
@@ -248,7 +260,8 @@ struct rpc_xprt {
        struct list_head        bc_pa_list;     /* List of preallocated
                                                 * backchannel rpc_rqst's */
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
-       struct list_head        recv;
+
+       struct rb_root          recv_queue;     /* Receive queue */
 
        struct {
                unsigned long           bind_count,     /* total number of binds */
@@ -325,15 +338,18 @@ struct xprt_class {
 struct rpc_xprt                *xprt_create_transport(struct xprt_create *args);
 void                   xprt_connect(struct rpc_task *task);
 void                   xprt_reserve(struct rpc_task *task);
-void                   xprt_request_init(struct rpc_task *task);
 void                   xprt_retry_reserve(struct rpc_task *task);
 int                    xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task);
 int                    xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task);
 void                   xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task);
 void                   xprt_free_slot(struct rpc_xprt *xprt,
                                       struct rpc_rqst *req);
-void                   xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task);
+void                   xprt_request_prepare(struct rpc_rqst *req);
 bool                   xprt_prepare_transmit(struct rpc_task *task);
+void                   xprt_request_enqueue_transmit(struct rpc_task *task);
+void                   xprt_request_enqueue_receive(struct rpc_task *task);
+void                   xprt_request_wait_receive(struct rpc_task *task);
+bool                   xprt_request_need_retransmit(struct rpc_task *task);
 void                   xprt_transmit(struct rpc_task *task);
 void                   xprt_end_transmit(struct rpc_task *task);
 int                    xprt_adjust_timeout(struct rpc_rqst *req);
@@ -373,8 +389,8 @@ int                 xprt_load_transport(const char *);
 void                   xprt_set_retrans_timeout_def(struct rpc_task *task);
 void                   xprt_set_retrans_timeout_rtt(struct rpc_task *task);
 void                   xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status);
-void                   xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action);
-void                   xprt_write_space(struct rpc_xprt *xprt);
+void                   xprt_wait_for_buffer_space(struct rpc_xprt *xprt);
+bool                   xprt_write_space(struct rpc_xprt *xprt);
 void                   xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result);
 struct rpc_rqst *      xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid);
 void                   xprt_update_rtt(struct rpc_task *task);
@@ -382,6 +398,7 @@ void                        xprt_complete_rqst(struct rpc_task *task, int copied);
 void                   xprt_pin_rqst(struct rpc_rqst *req);
 void                   xprt_unpin_rqst(struct rpc_rqst *req);
 void                   xprt_release_rqst_cong(struct rpc_task *task);
+bool                   xprt_request_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req);
 void                   xprt_disconnect_done(struct rpc_xprt *xprt);
 void                   xprt_force_disconnect(struct rpc_xprt *xprt);
 void                   xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
@@ -400,6 +417,8 @@ void                        xprt_unlock_connect(struct rpc_xprt *, void *);
 #define XPRT_BINDING           (5)
 #define XPRT_CLOSING           (6)
 #define XPRT_CONGESTED         (9)
+#define XPRT_CWND_WAIT         (10)
+#define XPRT_WRITE_SPACE       (11)
 
 static inline void xprt_set_connected(struct rpc_xprt *xprt)
 {
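
Moving the receive queue from a list to an rb_root makes reply matching by XID O(log n) instead of O(n) when many requests are in flight. A sketch of the lookup shape (not the exact kernel helper; field names as declared above):

        static struct rpc_rqst *recv_queue_find(struct rb_root *root, __be32 xid)
        {
                struct rb_node *n = root->rb_node;

                while (n != NULL) {
                        struct rpc_rqst *req = rb_entry(n, struct rpc_rqst, rq_recv);

                        if ((__force u32)xid < (__force u32)req->rq_xid)
                                n = n->rb_left;
                        else if ((__force u32)xid > (__force u32)req->rq_xid)
                                n = n->rb_right;
                        else
                                return req;
                }
                return NULL;
        }
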
diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
index ae0f99b9b965cca7b2012e527bd7f0413a5e750c..458bfe0137f5ec818dca7f1da7deb264940652d8 100644 (file)
@@ -30,15 +30,25 @@ struct sock_xprt {
        /*
         * State of TCP reply receive
         */
-       __be32                  tcp_fraghdr,
-                               tcp_xid,
-                               tcp_calldir;
+       struct {
+               struct {
+                       __be32  fraghdr,
+                               xid,
+                               calldir;
+               } __attribute__((packed));
 
-       u32                     tcp_offset,
-                               tcp_reclen;
+               u32             offset,
+                               len;
 
-       unsigned long           tcp_copied,
-                               tcp_flags;
+               unsigned long   copied;
+       } recv;
+
+       /*
+        * State of TCP transmit queue
+        */
+       struct {
+               u32             offset;
+       } xmit;
 
        /*
         * Connection of transports
@@ -67,21 +77,9 @@ struct sock_xprt {
        void                    (*old_error_report)(struct sock *);
 };
 
-/*
- * TCP receive state flags
- */
-#define TCP_RCV_LAST_FRAG      (1UL << 0)
-#define TCP_RCV_COPY_FRAGHDR   (1UL << 1)
-#define TCP_RCV_COPY_XID       (1UL << 2)
-#define TCP_RCV_COPY_DATA      (1UL << 3)
-#define TCP_RCV_READ_CALLDIR   (1UL << 4)
-#define TCP_RCV_COPY_CALLDIR   (1UL << 5)
-
 /*
  * TCP RPC flags
  */
-#define TCP_RPC_REPLY          (1UL << 6)
-
 #define XPRT_SOCK_CONNECTING   1U
 #define XPRT_SOCK_DATA_READY   (2)
 #define XPRT_SOCK_UPD_TIMEOUT  (3)
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 53df203b8057afd417ebbcb93a298f51af791a49..b093058f78aacf446846277f2c10e5cd32fc171b 100644 (file)
@@ -263,7 +263,7 @@ DECLARE_EVENT_CLASS(xprtrdma_mr,
 );
 
 #define DEFINE_MR_EVENT(name) \
-               DEFINE_EVENT(xprtrdma_mr, name, \
+               DEFINE_EVENT(xprtrdma_mr, xprtrdma_mr_##name, \
                                TP_PROTO( \
                                        const struct rpcrdma_mr *mr \
                                ), \
@@ -306,7 +306,7 @@ DECLARE_EVENT_CLASS(xprtrdma_cb_event,
  ** Connection events
  **/
 
-TRACE_EVENT(xprtrdma_conn_upcall,
+TRACE_EVENT(xprtrdma_cm_event,
        TP_PROTO(
                const struct rpcrdma_xprt *r_xprt,
                struct rdma_cm_event *event
@@ -377,7 +377,7 @@ DEFINE_RXPRT_EVENT(xprtrdma_reinsert);
 DEFINE_RXPRT_EVENT(xprtrdma_reconnect);
 DEFINE_RXPRT_EVENT(xprtrdma_inject_dsc);
 
-TRACE_EVENT(xprtrdma_qp_error,
+TRACE_EVENT(xprtrdma_qp_event,
        TP_PROTO(
                const struct rpcrdma_xprt *r_xprt,
                const struct ib_event *event
@@ -509,7 +509,7 @@ TRACE_EVENT(xprtrdma_post_send,
        TP_STRUCT__entry(
                __field(const void *, req)
                __field(int, num_sge)
-               __field(bool, signaled)
+               __field(int, signaled)
                __field(int, status)
        ),
 
@@ -651,11 +651,11 @@ DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg);
 DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li);
 DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_wake);
 
-DEFINE_MR_EVENT(xprtrdma_localinv);
-DEFINE_MR_EVENT(xprtrdma_dma_map);
-DEFINE_MR_EVENT(xprtrdma_dma_unmap);
-DEFINE_MR_EVENT(xprtrdma_remoteinv);
-DEFINE_MR_EVENT(xprtrdma_recover_mr);
+DEFINE_MR_EVENT(localinv);
+DEFINE_MR_EVENT(map);
+DEFINE_MR_EVENT(unmap);
+DEFINE_MR_EVENT(remoteinv);
+DEFINE_MR_EVENT(recycle);
 
 /**
  ** Reply events
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index bbb08a3ef5ccce100d2551580b35f6b42fa1fa9e..28e384186c3508f452113e04460843967eb37049 100644 (file)
@@ -470,14 +470,14 @@ TRACE_EVENT(xprt_ping,
                        __get_str(addr), __get_str(port), __entry->status)
 );
 
-TRACE_EVENT(xs_tcp_data_ready,
-       TP_PROTO(struct rpc_xprt *xprt, int err, unsigned int total),
+TRACE_EVENT(xs_stream_read_data,
+       TP_PROTO(struct rpc_xprt *xprt, ssize_t err, size_t total),
 
        TP_ARGS(xprt, err, total),
 
        TP_STRUCT__entry(
-               __field(int, err)
-               __field(unsigned int, total)
+               __field(ssize_t, err)
+               __field(size_t, total)
                __string(addr, xprt ? xprt->address_strings[RPC_DISPLAY_ADDR] :
                                "(null)")
                __string(port, xprt ? xprt->address_strings[RPC_DISPLAY_PORT] :
@@ -493,21 +493,11 @@ TRACE_EVENT(xs_tcp_data_ready,
                        xprt->address_strings[RPC_DISPLAY_PORT] : "(null)");
        ),
 
-       TP_printk("peer=[%s]:%s err=%d total=%u", __get_str(addr),
+       TP_printk("peer=[%s]:%s err=%zd total=%zu", __get_str(addr),
                        __get_str(port), __entry->err, __entry->total)
 );
 
-#define rpc_show_sock_xprt_flags(flags) \
-       __print_flags(flags, "|", \
-               { TCP_RCV_LAST_FRAG, "TCP_RCV_LAST_FRAG" }, \
-               { TCP_RCV_COPY_FRAGHDR, "TCP_RCV_COPY_FRAGHDR" }, \
-               { TCP_RCV_COPY_XID, "TCP_RCV_COPY_XID" }, \
-               { TCP_RCV_COPY_DATA, "TCP_RCV_COPY_DATA" }, \
-               { TCP_RCV_READ_CALLDIR, "TCP_RCV_READ_CALLDIR" }, \
-               { TCP_RCV_COPY_CALLDIR, "TCP_RCV_COPY_CALLDIR" }, \
-               { TCP_RPC_REPLY, "TCP_RPC_REPLY" })
-
-TRACE_EVENT(xs_tcp_data_recv,
+TRACE_EVENT(xs_stream_read_request,
        TP_PROTO(struct sock_xprt *xs),
 
        TP_ARGS(xs),
@@ -516,25 +506,22 @@ TRACE_EVENT(xs_tcp_data_recv,
                __string(addr, xs->xprt.address_strings[RPC_DISPLAY_ADDR])
                __string(port, xs->xprt.address_strings[RPC_DISPLAY_PORT])
                __field(u32, xid)
-               __field(unsigned long, flags)
                __field(unsigned long, copied)
                __field(unsigned int, reclen)
-               __field(unsigned long, offset)
+               __field(unsigned int, offset)
        ),
 
        TP_fast_assign(
                __assign_str(addr, xs->xprt.address_strings[RPC_DISPLAY_ADDR]);
                __assign_str(port, xs->xprt.address_strings[RPC_DISPLAY_PORT]);
-               __entry->xid = be32_to_cpu(xs->tcp_xid);
-               __entry->flags = xs->tcp_flags;
-               __entry->copied = xs->tcp_copied;
-               __entry->reclen = xs->tcp_reclen;
-               __entry->offset = xs->tcp_offset;
+               __entry->xid = be32_to_cpu(xs->recv.xid);
+               __entry->copied = xs->recv.copied;
+               __entry->reclen = xs->recv.len;
+               __entry->offset = xs->recv.offset;
        ),
 
-       TP_printk("peer=[%s]:%s xid=0x%08x flags=%s copied=%lu reclen=%u offset=%lu",
+       TP_printk("peer=[%s]:%s xid=0x%08x copied=%lu reclen=%u offset=%u",
                        __get_str(addr), __get_str(port), __entry->xid,
-                       rpc_show_sock_xprt_flags(__entry->flags),
                        __entry->copied, __entry->reclen, __entry->offset)
 );
 
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 305ecea9217071208147ed84755c745b6299606e..ad8ead73898115d18c905597b1c18ebb0d4b58c8 100644 (file)
@@ -30,10 +30,9 @@ struct rpc_cred_cache {
 
 static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS;
 
-static DEFINE_SPINLOCK(rpc_authflavor_lock);
-static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
-       &authnull_ops,          /* AUTH_NULL */
-       &authunix_ops,          /* AUTH_UNIX */
+static const struct rpc_authops __rcu *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
+       [RPC_AUTH_NULL] = (const struct rpc_authops __force __rcu *)&authnull_ops,
+       [RPC_AUTH_UNIX] = (const struct rpc_authops __force __rcu *)&authunix_ops,
        NULL,                   /* others can be loadable modules */
 };
 
@@ -93,39 +92,65 @@ pseudoflavor_to_flavor(u32 flavor) {
 int
 rpcauth_register(const struct rpc_authops *ops)
 {
+       const struct rpc_authops *old;
        rpc_authflavor_t flavor;
-       int ret = -EPERM;
 
        if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR)
                return -EINVAL;
-       spin_lock(&rpc_authflavor_lock);
-       if (auth_flavors[flavor] == NULL) {
-               auth_flavors[flavor] = ops;
-               ret = 0;
-       }
-       spin_unlock(&rpc_authflavor_lock);
-       return ret;
+       old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], NULL, ops);
+       if (old == NULL || old == ops)
+               return 0;
+       return -EPERM;
 }
 EXPORT_SYMBOL_GPL(rpcauth_register);
 
 int
 rpcauth_unregister(const struct rpc_authops *ops)
 {
+       const struct rpc_authops *old;
        rpc_authflavor_t flavor;
-       int ret = -EPERM;
 
        if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR)
                return -EINVAL;
-       spin_lock(&rpc_authflavor_lock);
-       if (auth_flavors[flavor] == ops) {
-               auth_flavors[flavor] = NULL;
-               ret = 0;
-       }
-       spin_unlock(&rpc_authflavor_lock);
-       return ret;
+
+       old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], ops, NULL);
+       if (old == ops || old == NULL)
+               return 0;
+       return -EPERM;
 }
 EXPORT_SYMBOL_GPL(rpcauth_unregister);
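
cmpxchg() gives lock-free, idempotent registration. With hypothetical modules A and B racing to claim flavor slot 7:

        /* A: cmpxchg(&auth_flavors[7], NULL, &a_ops) returns NULL   -> A wins, 0
         * B: cmpxchg(&auth_flavors[7], NULL, &b_ops) returns &a_ops -> -EPERM
         * A again (double register): old == ops                     -> 0, idempotent
         */
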
 
+static const struct rpc_authops *
+rpcauth_get_authops(rpc_authflavor_t flavor)
+{
+       const struct rpc_authops *ops;
+
+       if (flavor >= RPC_AUTH_MAXFLAVOR)
+               return NULL;
+
+       rcu_read_lock();
+       ops = rcu_dereference(auth_flavors[flavor]);
+       if (ops == NULL) {
+               rcu_read_unlock();
+               request_module("rpc-auth-%u", flavor);
+               rcu_read_lock();
+               ops = rcu_dereference(auth_flavors[flavor]);
+               if (ops == NULL)
+                       goto out;
+       }
+       if (!try_module_get(ops->owner))
+               ops = NULL;
+out:
+       rcu_read_unlock();
+       return ops;
+}
+
+static void
+rpcauth_put_authops(const struct rpc_authops *ops)
+{
+       module_put(ops->owner);
+}
+
 /**
  * rpcauth_get_pseudoflavor - check if security flavor is supported
  * @flavor: a security flavor
@@ -138,25 +163,16 @@ EXPORT_SYMBOL_GPL(rpcauth_unregister);
 rpc_authflavor_t
 rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info)
 {
-       const struct rpc_authops *ops;
+       const struct rpc_authops *ops = rpcauth_get_authops(flavor);
        rpc_authflavor_t pseudoflavor;
 
-       ops = auth_flavors[flavor];
-       if (ops == NULL)
-               request_module("rpc-auth-%u", flavor);
-       spin_lock(&rpc_authflavor_lock);
-       ops = auth_flavors[flavor];
-       if (ops == NULL || !try_module_get(ops->owner)) {
-               spin_unlock(&rpc_authflavor_lock);
+       if (!ops)
                return RPC_AUTH_MAXFLAVOR;
-       }
-       spin_unlock(&rpc_authflavor_lock);
-
        pseudoflavor = flavor;
        if (ops->info2flavor != NULL)
                pseudoflavor = ops->info2flavor(info);
 
-       module_put(ops->owner);
+       rpcauth_put_authops(ops);
        return pseudoflavor;
 }
 EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor);
@@ -176,25 +192,15 @@ rpcauth_get_gssinfo(rpc_authflavor_t pseudoflavor, struct rpcsec_gss_info *info)
        const struct rpc_authops *ops;
        int result;
 
-       if (flavor >= RPC_AUTH_MAXFLAVOR)
-               return -EINVAL;
-
-       ops = auth_flavors[flavor];
+       ops = rpcauth_get_authops(flavor);
        if (ops == NULL)
-               request_module("rpc-auth-%u", flavor);
-       spin_lock(&rpc_authflavor_lock);
-       ops = auth_flavors[flavor];
-       if (ops == NULL || !try_module_get(ops->owner)) {
-               spin_unlock(&rpc_authflavor_lock);
                return -ENOENT;
-       }
-       spin_unlock(&rpc_authflavor_lock);
 
        result = -ENOENT;
        if (ops->flavor2info != NULL)
                result = ops->flavor2info(pseudoflavor, info);
 
-       module_put(ops->owner);
+       rpcauth_put_authops(ops);
        return result;
 }
 EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo);
@@ -212,15 +218,13 @@ EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo);
 int
 rpcauth_list_flavors(rpc_authflavor_t *array, int size)
 {
-       rpc_authflavor_t flavor;
-       int result = 0;
+       const struct rpc_authops *ops;
+       rpc_authflavor_t flavor, pseudos[4];
+       int i, len, result = 0;
 
-       spin_lock(&rpc_authflavor_lock);
+       rcu_read_lock();
        for (flavor = 0; flavor < RPC_AUTH_MAXFLAVOR; flavor++) {
-               const struct rpc_authops *ops = auth_flavors[flavor];
-               rpc_authflavor_t pseudos[4];
-               int i, len;
-
+               ops = rcu_dereference(auth_flavors[flavor]);
                if (result >= size) {
                        result = -ENOMEM;
                        break;
@@ -245,7 +249,7 @@ rpcauth_list_flavors(rpc_authflavor_t *array, int size)
                        array[result++] = pseudos[i];
                }
        }
-       spin_unlock(&rpc_authflavor_lock);
+       rcu_read_unlock();
 
        dprintk("RPC:       %s returns %d\n", __func__, result);
        return result;
@@ -255,25 +259,17 @@ EXPORT_SYMBOL_GPL(rpcauth_list_flavors);
 struct rpc_auth *
 rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 {
-       struct rpc_auth         *auth;
+       struct rpc_auth *auth = ERR_PTR(-EINVAL);
        const struct rpc_authops *ops;
-       u32                     flavor = pseudoflavor_to_flavor(args->pseudoflavor);
+       u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor);
 
-       auth = ERR_PTR(-EINVAL);
-       if (flavor >= RPC_AUTH_MAXFLAVOR)
+       ops = rpcauth_get_authops(flavor);
+       if (ops == NULL)
                goto out;
 
-       if ((ops = auth_flavors[flavor]) == NULL)
-               request_module("rpc-auth-%u", flavor);
-       spin_lock(&rpc_authflavor_lock);
-       ops = auth_flavors[flavor];
-       if (ops == NULL || !try_module_get(ops->owner)) {
-               spin_unlock(&rpc_authflavor_lock);
-               goto out;
-       }
-       spin_unlock(&rpc_authflavor_lock);
        auth = ops->create(args, clnt);
-       module_put(ops->owner);
+
+       rpcauth_put_authops(ops);
        if (IS_ERR(auth))
                return auth;
        if (clnt->cl_auth)
@@ -288,32 +284,37 @@ EXPORT_SYMBOL_GPL(rpcauth_create);
 void
 rpcauth_release(struct rpc_auth *auth)
 {
-       if (!atomic_dec_and_test(&auth->au_count))
+       if (!refcount_dec_and_test(&auth->au_count))
                return;
        auth->au_ops->destroy(auth);
 }
 
 static DEFINE_SPINLOCK(rpc_credcache_lock);
 
-static void
+/*
+ * On success, the caller is responsible for freeing the reference
+ * held by the hashtable
+ */
+static bool
 rpcauth_unhash_cred_locked(struct rpc_cred *cred)
 {
+       if (!test_and_clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))
+               return false;
        hlist_del_rcu(&cred->cr_hash);
-       smp_mb__before_atomic();
-       clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags);
+       return true;
 }
 
-static int
+static bool
 rpcauth_unhash_cred(struct rpc_cred *cred)
 {
        spinlock_t *cache_lock;
-       int ret;
+       bool ret;
 
+       if (!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))
+               return false;
        cache_lock = &cred->cr_auth->au_credcache->lock;
        spin_lock(cache_lock);
-       ret = atomic_read(&cred->cr_count) == 0;
-       if (ret)
-               rpcauth_unhash_cred_locked(cred);
+       ret = rpcauth_unhash_cred_locked(cred);
        spin_unlock(cache_lock);
        return ret;
 }
@@ -392,6 +393,44 @@ void rpcauth_destroy_credlist(struct list_head *head)
        }
 }
 
+static void
+rpcauth_lru_add_locked(struct rpc_cred *cred)
+{
+       if (!list_empty(&cred->cr_lru))
+               return;
+       number_cred_unused++;
+       list_add_tail(&cred->cr_lru, &cred_unused);
+}
+
+static void
+rpcauth_lru_add(struct rpc_cred *cred)
+{
+       if (!list_empty(&cred->cr_lru))
+               return;
+       spin_lock(&rpc_credcache_lock);
+       rpcauth_lru_add_locked(cred);
+       spin_unlock(&rpc_credcache_lock);
+}
+
+static void
+rpcauth_lru_remove_locked(struct rpc_cred *cred)
+{
+       if (list_empty(&cred->cr_lru))
+               return;
+       number_cred_unused--;
+       list_del_init(&cred->cr_lru);
+}
+
+static void
+rpcauth_lru_remove(struct rpc_cred *cred)
+{
+       if (list_empty(&cred->cr_lru))
+               return;
+       spin_lock(&rpc_credcache_lock);
+       rpcauth_lru_remove_locked(cred);
+       spin_unlock(&rpc_credcache_lock);
+}
+
 /*
  * Clear the RPC credential cache, and delete those credentials
  * that are not referenced.
@@ -411,13 +450,10 @@ rpcauth_clear_credcache(struct rpc_cred_cache *cache)
                head = &cache->hashtable[i];
                while (!hlist_empty(head)) {
                        cred = hlist_entry(head->first, struct rpc_cred, cr_hash);
-                       get_rpccred(cred);
-                       if (!list_empty(&cred->cr_lru)) {
-                               list_del(&cred->cr_lru);
-                               number_cred_unused--;
-                       }
-                       list_add_tail(&cred->cr_lru, &free);
                        rpcauth_unhash_cred_locked(cred);
+                       /* Note: We now hold a reference to cred */
+                       rpcauth_lru_remove_locked(cred);
+                       list_add_tail(&cred->cr_lru, &free);
                }
        }
        spin_unlock(&cache->lock);
@@ -451,7 +487,6 @@ EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache);
 static long
 rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
 {
-       spinlock_t *cache_lock;
        struct rpc_cred *cred, *next;
        unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM;
        long freed = 0;
@@ -460,32 +495,24 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
 
                if (nr_to_scan-- == 0)
                        break;
+               if (refcount_read(&cred->cr_count) > 1) {
+                       rpcauth_lru_remove_locked(cred);
+                       continue;
+               }
                /*
                 * Enforce a 60 second garbage collection moratorium
                 * Note that the cred_unused list must be time-ordered.
                 */
-               if (time_in_range(cred->cr_expire, expired, jiffies) &&
-                   test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) {
-                       freed = SHRINK_STOP;
-                       break;
-               }
-
-               list_del_init(&cred->cr_lru);
-               number_cred_unused--;
-               freed++;
-               if (atomic_read(&cred->cr_count) != 0)
+               if (!time_in_range(cred->cr_expire, expired, jiffies))
+                       continue;
+               if (!rpcauth_unhash_cred(cred))
                        continue;
 
-               cache_lock = &cred->cr_auth->au_credcache->lock;
-               spin_lock(cache_lock);
-               if (atomic_read(&cred->cr_count) == 0) {
-                       get_rpccred(cred);
-                       list_add_tail(&cred->cr_lru, free);
-                       rpcauth_unhash_cred_locked(cred);
-               }
-               spin_unlock(cache_lock);
+               rpcauth_lru_remove_locked(cred);
+               freed++;
+               list_add_tail(&cred->cr_lru, free);
        }
-       return freed;
+       return freed ? freed : SHRINK_STOP;
 }
 
 static unsigned long
@@ -561,19 +588,15 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
                if (!entry->cr_ops->crmatch(acred, entry, flags))
                        continue;
                if (flags & RPCAUTH_LOOKUP_RCU) {
-                       if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) &&
-                           !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags))
-                               cred = entry;
+                       if (test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags) ||
+                           refcount_read(&entry->cr_count) == 0)
+                               continue;
+                       cred = entry;
                        break;
                }
-               spin_lock(&cache->lock);
-               if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) {
-                       spin_unlock(&cache->lock);
-                       continue;
-               }
                cred = get_rpccred(entry);
-               spin_unlock(&cache->lock);
-               break;
+               if (cred)
+                       break;
        }
        rcu_read_unlock();
 
@@ -594,11 +617,13 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
                if (!entry->cr_ops->crmatch(acred, entry, flags))
                        continue;
                cred = get_rpccred(entry);
-               break;
+               if (cred)
+                       break;
        }
        if (cred == NULL) {
                cred = new;
                set_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags);
+               refcount_inc(&cred->cr_count);
                hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]);
        } else
                list_add_tail(&new->cr_lru, &free);
@@ -645,7 +670,7 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
 {
        INIT_HLIST_NODE(&cred->cr_hash);
        INIT_LIST_HEAD(&cred->cr_lru);
-       atomic_set(&cred->cr_count, 1);
+       refcount_set(&cred->cr_count, 1);
        cred->cr_auth = auth;
        cred->cr_ops = ops;
        cred->cr_expire = jiffies;
@@ -713,36 +738,29 @@ put_rpccred(struct rpc_cred *cred)
 {
        if (cred == NULL)
                return;
-       /* Fast path for unhashed credentials */
-       if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) {
-               if (atomic_dec_and_test(&cred->cr_count))
-                       cred->cr_ops->crdestroy(cred);
-               return;
-       }
-
-       if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock))
-               return;
-       if (!list_empty(&cred->cr_lru)) {
-               number_cred_unused--;
-               list_del_init(&cred->cr_lru);
-       }
-       if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) {
-               if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) {
-                       cred->cr_expire = jiffies;
-                       list_add_tail(&cred->cr_lru, &cred_unused);
-                       number_cred_unused++;
-                       goto out_nodestroy;
-               }
-               if (!rpcauth_unhash_cred(cred)) {
-                       /* We were hashed and someone looked us up... */
-                       goto out_nodestroy;
-               }
+       rcu_read_lock();
+       if (refcount_dec_and_test(&cred->cr_count))
+               goto destroy;
+       if (refcount_read(&cred->cr_count) != 1 ||
+           !test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))
+               goto out;
+       if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) {
+               cred->cr_expire = jiffies;
+               rpcauth_lru_add(cred);
+               /* Race breaker */
+               if (unlikely(!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)))
+                       rpcauth_lru_remove(cred);
+       } else if (rpcauth_unhash_cred(cred)) {
+               rpcauth_lru_remove(cred);
+               if (refcount_dec_and_test(&cred->cr_count))
+                       goto destroy;
        }
-       spin_unlock(&rpc_credcache_lock);
-       cred->cr_ops->crdestroy(cred);
+out:
+       rcu_read_unlock();
        return;
-out_nodestroy:
-       spin_unlock(&rpc_credcache_lock);
+destroy:
+       rcu_read_unlock();
+       cred->cr_ops->crdestroy(cred);
 }
 EXPORT_SYMBOL_GPL(put_rpccred);
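
The "race breaker" re-check guards the window between adding the cred to the LRU and a concurrent unhash; a sketch of the interleaving it covers (hypothetical CPUs):

        /* CPU0: put_rpccred() drops to refcount 1, sees CRED_HASHED set,
         *       and adds the cred to the LRU for lazy reuse.
         * CPU1: rpcauth_clear_credcache() unhashes the cred meanwhile.
         * CPU0: re-checks CRED_HASHED, finds it clear, and pulls the cred
         *       back off the LRU so it cannot outlive its cache.
         */
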
 
@@ -817,6 +835,16 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp,
        return rpcauth_unwrap_req_decode(decode, rqstp, data, obj);
 }
 
+bool
+rpcauth_xmit_need_reencode(struct rpc_task *task)
+{
+       struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+
+       if (!cred || !cred->cr_ops->crneed_reencode)
+               return false;
+       return cred->cr_ops->crneed_reencode(task);
+}
+
 int
 rpcauth_refreshcred(struct rpc_task *task)
 {
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index f1df9837f1acaf0161d360c6e4f02122dc6ff5d8..d8831b988b1e7a3273c73ee2457615b26c24b529 100644 (file)
@@ -274,7 +274,7 @@ static const struct rpc_authops generic_auth_ops = {
 
 static struct rpc_auth generic_auth = {
        .au_ops = &generic_auth_ops,
-       .au_count = ATOMIC_INIT(0),
+       .au_count = REFCOUNT_INIT(1),
 };
 
 static bool generic_key_to_expire(struct rpc_cred *cred)
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 21c0aa0a0d1d4fde901aed91a51fc65481f72b7b..30f970cdc7f66375d45e5363dfa07cab233f4978 100644 (file)
@@ -1058,7 +1058,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
        auth->au_flavor = flavor;
        if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor))
                auth->au_flags |= RPCAUTH_AUTH_DATATOUCH;
-       atomic_set(&auth->au_count, 1);
+       refcount_set(&auth->au_count, 1);
        kref_init(&gss_auth->kref);
 
        err = rpcauth_init_credcache(auth);
@@ -1187,7 +1187,7 @@ gss_auth_find_or_add_hashed(const struct rpc_auth_create_args *args,
                        if (strcmp(gss_auth->target_name, args->target_name))
                                continue;
                }
-               if (!atomic_inc_not_zero(&gss_auth->rpc_auth.au_count))
+               if (!refcount_inc_not_zero(&gss_auth->rpc_auth.au_count))
                        continue;
                goto out;
        }
@@ -1984,6 +1984,46 @@ gss_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp,
        return decode(rqstp, &xdr, obj);
 }
 
+static bool
+gss_seq_is_newer(u32 new, u32 old)
+{
+       return (s32)(new - old) > 0;
+}
+
+static bool
+gss_xmit_need_reencode(struct rpc_task *task)
+{
+       struct rpc_rqst *req = task->tk_rqstp;
+       struct rpc_cred *cred = req->rq_cred;
+       struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
+       u32 win, seq_xmit;
+       bool ret = true;
+
+       if (!ctx)
+               return true;
+
+       if (gss_seq_is_newer(req->rq_seqno, READ_ONCE(ctx->gc_seq)))
+               goto out;
+
+       seq_xmit = READ_ONCE(ctx->gc_seq_xmit);
+       while (gss_seq_is_newer(req->rq_seqno, seq_xmit)) {
+               u32 tmp = seq_xmit;
+
+               seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, req->rq_seqno);
+               if (seq_xmit == tmp) {
+                       ret = false;
+                       goto out;
+               }
+       }
+
+       win = ctx->gc_win;
+       if (win > 0)
+               ret = !gss_seq_is_newer(req->rq_seqno, seq_xmit - win);
+out:
+       gss_put_ctx(ctx);
+       return ret;
+}
+
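
gss_xmit_need_reencode() decides whether a retransmission still fits the GSS sequence window. Worked example with hypothetical numbers:

        /* gc_win = 128, highest transmitted seqno seq_xmit = 1000:
         *   rq_seqno = 900: gss_seq_is_newer(900, 1000 - 128) is true, so
         *                   the request is inside the window -> no re-encode.
         *   rq_seqno = 850: 850 is not newer than 872 -> the seqno has aged
         *                   out; re-encode with a fresh sequence number
         *                   before retransmitting.
         */
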
 static int
 gss_unwrap_resp(struct rpc_task *task,
                kxdrdproc_t decode, void *rqstp, __be32 *p, void *obj)
@@ -2052,6 +2092,7 @@ static const struct rpc_credops gss_credops = {
        .crunwrap_resp          = gss_unwrap_resp,
        .crkey_timeout          = gss_key_timeout,
        .crstringify_acceptor   = gss_stringify_acceptor,
+       .crneed_reencode        = gss_xmit_need_reencode,
 };
 
 static const struct rpc_credops gss_nullops = {
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index eaad9bc7a0bdc956ef25e40121bb17449723f6d1..b4adeb06660b15f6ffad21e5e0d79a88b61b1fd5 100644 (file)
 #include <linux/sunrpc/gss_krb5.h>
 #include <linux/random.h>
 #include <linux/crypto.h>
+#include <linux/atomic.h>
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY        RPCDBG_AUTH
 #endif
 
-DEFINE_SPINLOCK(krb5_seq_lock);
-
 static void *
 setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token)
 {
@@ -124,6 +123,30 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token)
        return krb5_hdr;
 }
 
+u32
+gss_seq_send_fetch_and_inc(struct krb5_ctx *ctx)
+{
+       u32 old, seq_send = READ_ONCE(ctx->seq_send);
+
+       do {
+               old = seq_send;
+               seq_send = cmpxchg(&ctx->seq_send, old, old + 1);
+       } while (old != seq_send);
+       return seq_send;
+}
+
+u64
+gss_seq_send64_fetch_and_inc(struct krb5_ctx *ctx)
+{
+       u64 old, seq_send = READ_ONCE(ctx->seq_send64);
+
+       do {
+               old = seq_send;
+               seq_send = cmpxchg64(&ctx->seq_send64, old, old + 1);
+       } while (old != seq_send);
+       return seq_send;
+}
+
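
The cmpxchg() loop behaves like an atomic fetch-and-increment: each caller gets a unique pre-increment value without taking the old global krb5_seq_lock. Sketch:

        u32 a = gss_seq_send_fetch_and_inc(ctx);   /* e.g. returns 41 */
        u32 b = gss_seq_send_fetch_and_inc(ctx);   /* then returns 42; a
                                                    * concurrent caller never
                                                    * observes a duplicate */
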
 static u32
 gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text,
                struct xdr_netobj *token)
@@ -154,9 +177,7 @@ gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text,
 
        memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len);
 
-       spin_lock(&krb5_seq_lock);
-       seq_send = ctx->seq_send++;
-       spin_unlock(&krb5_seq_lock);
+       seq_send = gss_seq_send_fetch_and_inc(ctx);
 
        if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff,
                              seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8))
@@ -174,7 +195,6 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
                                       .data = cksumdata};
        void *krb5_hdr;
        s32 now;
-       u64 seq_send;
        u8 *cksumkey;
        unsigned int cksum_usage;
        __be64 seq_send_be64;
@@ -185,11 +205,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
 
        /* Set up the sequence number. Now 64-bits in clear
         * text and w/o direction indicator */
-       spin_lock(&krb5_seq_lock);
-       seq_send = ctx->seq_send64++;
-       spin_unlock(&krb5_seq_lock);
-
-       seq_send_be64 = cpu_to_be64(seq_send);
+       seq_send_be64 = cpu_to_be64(gss_seq_send64_fetch_and_inc(ctx));
        memcpy(krb5_hdr + 8, (char *) &seq_send_be64, 8);
 
        if (ctx->initiate) {
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 3d975a4013d2ffd0ee4a9ec60a74122989f3cd06..962fa84e6db114f95790f8d6bba485fe226ed43e 100644 (file)
@@ -228,9 +228,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
 
        memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len);
 
-       spin_lock(&krb5_seq_lock);
-       seq_send = kctx->seq_send++;
-       spin_unlock(&krb5_seq_lock);
+       seq_send = gss_seq_send_fetch_and_inc(kctx);
 
        /* XXX would probably be more efficient to compute checksum
         * and encrypt at the same time: */
@@ -477,9 +475,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
        *be16ptr++ = 0;
 
        be64ptr = (__be64 *)be16ptr;
-       spin_lock(&krb5_seq_lock);
-       *be64ptr = cpu_to_be64(kctx->seq_send64++);
-       spin_unlock(&krb5_seq_lock);
+       *be64ptr = cpu_to_be64(gss_seq_send64_fetch_and_inc(kctx));
 
        err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages);
        if (err)
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 5fec3abbe19bb640de31bf85bfc9becdeab5f359..16ac0f4cb7d8e1ebae658c29234778ee7678aabf 100644 (file)
@@ -117,7 +117,7 @@ int gss_mech_register(struct gss_api_mech *gm)
        if (status)
                return status;
        spin_lock(&registered_mechs_lock);
-       list_add(&gm->gm_list, &registered_mechs);
+       list_add_rcu(&gm->gm_list, &registered_mechs);
        spin_unlock(&registered_mechs_lock);
        dprintk("RPC:       registered gss mechanism %s\n", gm->gm_name);
        return 0;
@@ -132,7 +132,7 @@ EXPORT_SYMBOL_GPL(gss_mech_register);
 void gss_mech_unregister(struct gss_api_mech *gm)
 {
        spin_lock(&registered_mechs_lock);
-       list_del(&gm->gm_list);
+       list_del_rcu(&gm->gm_list);
        spin_unlock(&registered_mechs_lock);
        dprintk("RPC:       unregistered gss mechanism %s\n", gm->gm_name);
        gss_mech_free(gm);
@@ -151,15 +151,15 @@ _gss_mech_get_by_name(const char *name)
 {
        struct gss_api_mech     *pos, *gm = NULL;
 
-       spin_lock(&registered_mechs_lock);
-       list_for_each_entry(pos, &registered_mechs, gm_list) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(pos, &registered_mechs, gm_list) {
                if (0 == strcmp(name, pos->gm_name)) {
                        if (try_module_get(pos->gm_owner))
                                gm = pos;
                        break;
                }
        }
-       spin_unlock(&registered_mechs_lock);
+       rcu_read_unlock();
        return gm;
 
 }
@@ -186,8 +186,8 @@ struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj)
        dprintk("RPC:       %s(%s)\n", __func__, buf);
        request_module("rpc-auth-gss-%s", buf);
 
-       spin_lock(&registered_mechs_lock);
-       list_for_each_entry(pos, &registered_mechs, gm_list) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(pos, &registered_mechs, gm_list) {
                if (obj->len == pos->gm_oid.len) {
                        if (0 == memcmp(obj->data, pos->gm_oid.data, obj->len)) {
                                if (try_module_get(pos->gm_owner))
@@ -196,7 +196,7 @@ struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj)
                        }
                }
        }
-       spin_unlock(&registered_mechs_lock);
+       rcu_read_unlock();
        return gm;
 }
 
@@ -216,15 +216,15 @@ static struct gss_api_mech *_gss_mech_get_by_pseudoflavor(u32 pseudoflavor)
 {
        struct gss_api_mech *gm = NULL, *pos;
 
-       spin_lock(&registered_mechs_lock);
-       list_for_each_entry(pos, &registered_mechs, gm_list) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(pos, &registered_mechs, gm_list) {
                if (!mech_supports_pseudoflavor(pos, pseudoflavor))
                        continue;
                if (try_module_get(pos->gm_owner))
                        gm = pos;
                break;
        }
-       spin_unlock(&registered_mechs_lock);
+       rcu_read_unlock();
        return gm;
 }
 
@@ -257,8 +257,8 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size)
        struct gss_api_mech *pos = NULL;
        int j, i = 0;
 
-       spin_lock(&registered_mechs_lock);
-       list_for_each_entry(pos, &registered_mechs, gm_list) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(pos, &registered_mechs, gm_list) {
                for (j = 0; j < pos->gm_pf_num; j++) {
                        if (i >= size) {
-                               spin_unlock(&registered_mechs_lock);
+                               rcu_read_unlock();
@@ -267,7 +267,7 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size)
                        array_ptr[i++] = pos->gm_pfs[j].pseudoflavor;
                }
        }
-       spin_unlock(&registered_mechs_lock);
+       rcu_read_unlock();
        return i;
 }
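
The conversion above keeps registered_mechs_lock for updaters while lookups become lockless RCU walks. The pattern, sketched:

        /* Updater: publish/unpublish under the spinlock. */
        spin_lock(&registered_mechs_lock);
        list_add_rcu(&gm->gm_list, &registered_mechs);
        spin_unlock(&registered_mechs_lock);

        /* Reader: no lock; entries stay valid for the read-side section. */
        rcu_read_lock();
        list_for_each_entry_rcu(pos, &registered_mechs, gm_list) {
                if (try_module_get(pos->gm_owner)) {
                        /* pin the mech before leaving the RCU section */
                }
        }
        rcu_read_unlock();

The module reference taken inside the read-side section is what keeps the mech usable after rcu_read_unlock().
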
 
index 444380f968f1158660f6a01a10cd8223c9db6081..006062ad5f583adf07c30f13f1310ea85a3d729b 100644
@@ -784,6 +784,7 @@ void gssx_enc_accept_sec_context(struct rpc_rqst *req,
        xdr_inline_pages(&req->rq_rcv_buf,
                PAGE_SIZE/2 /* pretty arbitrary */,
                arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE);
+       req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
 done:
        if (err)
                dprintk("RPC:       gssx_enc_accept_sec_context: %d\n", err);
index 4b48228ee8c74a6d313289d7425c0c3d5262b2e4..2694a1bc026b89f8f53af6bebd257f652f3378d4 100644
@@ -21,7 +21,7 @@ static struct rpc_cred null_cred;
 static struct rpc_auth *
 nul_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 {
-       atomic_inc(&null_auth.au_count);
+       refcount_inc(&null_auth.au_count);
        return &null_auth;
 }
 
@@ -119,7 +119,7 @@ struct rpc_auth null_auth = {
        .au_flags       = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
        .au_ops         = &authnull_ops,
        .au_flavor      = RPC_AUTH_NULL,
-       .au_count       = ATOMIC_INIT(0),
+       .au_count       = REFCOUNT_INIT(1),
 };
 
 static
@@ -138,6 +138,6 @@ struct rpc_cred null_cred = {
        .cr_lru         = LIST_HEAD_INIT(null_cred.cr_lru),
        .cr_auth        = &null_auth,
        .cr_ops         = &null_credops,
-       .cr_count       = ATOMIC_INIT(1),
+       .cr_count       = REFCOUNT_INIT(2),
        .cr_flags       = 1UL << RPCAUTH_CRED_UPTODATE,
 };
index 185e56d4f9aee4da149dc73556ae0e35abed4424..4c1c7e56288f50fe268c49c0c6f10e2bcb4be401 100644
@@ -34,7 +34,7 @@ unx_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 {
        dprintk("RPC:       creating UNIX authenticator for client %p\n",
                        clnt);
-       atomic_inc(&unix_auth.au_count);
+       refcount_inc(&unix_auth.au_count);
        return &unix_auth;
 }
 
@@ -239,7 +239,7 @@ struct rpc_auth             unix_auth = {
        .au_flags       = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
        .au_ops         = &authunix_ops,
        .au_flavor      = RPC_AUTH_UNIX,
-       .au_count       = ATOMIC_INIT(0),
+       .au_count       = REFCOUNT_INIT(1),
 };
 
 static
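
The ATOMIC_INIT(0) -> REFCOUNT_INIT(1) and ATOMIC_INIT(1) -> REFCOUNT_INIT(2) shifts in these two files are deliberate, not off-by-one: refcount_t warns and saturates on an increment from zero, so a statically allocated, never-freed object must bake in one reference for its own existence. A sketch of the invariant, with hypothetical names:

        static struct demo {
                refcount_t count;
        } demo_obj = { .count = REFCOUNT_INIT(1) };     /* static => alive */

        static void demo_get(void)
        {
                refcount_inc(&demo_obj.count);  /* would WARN if count == 0 */
        }

        static void demo_put(void)
        {
                /* a static object must never actually reach zero */
                WARN_ON(refcount_dec_and_test(&demo_obj.count));
        }

Under the old atomic_t scheme the same objects simply started one lower, since nothing policed increments from zero.
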
index 3c15a99b97003d1441a532d18c9ffe4f3980b5b9..fa5ba6ed3197a8be56bb401109759941ac0618b0 100644
@@ -91,7 +91,6 @@ struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags)
                return NULL;
 
        req->rq_xprt = xprt;
-       INIT_LIST_HEAD(&req->rq_list);
        INIT_LIST_HEAD(&req->rq_bc_list);
 
        /* Preallocate one XDR receive buffer */
index 8ea2f5fadd967c03a2fa87baa5c8c40710d8fedc..ae3b8145da35a236cb24a7aff544b3f99d67547d 100644
@@ -61,6 +61,7 @@ static void   call_start(struct rpc_task *task);
 static void    call_reserve(struct rpc_task *task);
 static void    call_reserveresult(struct rpc_task *task);
 static void    call_allocate(struct rpc_task *task);
+static void    call_encode(struct rpc_task *task);
 static void    call_decode(struct rpc_task *task);
 static void    call_bind(struct rpc_task *task);
 static void    call_bind_status(struct rpc_task *task);
@@ -1137,10 +1138,10 @@ EXPORT_SYMBOL_GPL(rpc_call_async);
 struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
 {
        struct rpc_task *task;
-       struct xdr_buf *xbufp = &req->rq_snd_buf;
        struct rpc_task_setup task_setup_data = {
                .callback_ops = &rpc_default_ops,
-               .flags = RPC_TASK_SOFTCONN,
+               .flags = RPC_TASK_SOFTCONN |
+                       RPC_TASK_NO_RETRANS_TIMEOUT,
        };
 
        dprintk("RPC: rpc_run_bc_task req= %p\n", req);
@@ -1148,14 +1149,7 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
         * Create an rpc_task to send the data
         */
        task = rpc_new_task(&task_setup_data);
-       task->tk_rqstp = req;
-
-       /*
-        * Set up the xdr_buf length.
-        * This also indicates that the buffer is XDR encoded already.
-        */
-       xbufp->len = xbufp->head[0].iov_len + xbufp->page_len +
-                       xbufp->tail[0].iov_len;
+       xprt_init_bc_request(req, task);
 
        task->tk_action = call_bc_transmit;
        atomic_inc(&task->tk_count);
@@ -1558,7 +1552,6 @@ call_reserveresult(struct rpc_task *task)
        task->tk_status = 0;
        if (status >= 0) {
                if (task->tk_rqstp) {
-                       xprt_request_init(task);
                        task->tk_action = call_refresh;
                        return;
                }
@@ -1680,7 +1673,7 @@ call_allocate(struct rpc_task *task)
        dprint_status(task);
 
        task->tk_status = 0;
-       task->tk_action = call_bind;
+       task->tk_action = call_encode;
 
        if (req->rq_buffer)
                return;
@@ -1721,22 +1714,15 @@ call_allocate(struct rpc_task *task)
        rpc_exit(task, -ERESTARTSYS);
 }
 
-static inline int
+static int
 rpc_task_need_encode(struct rpc_task *task)
 {
-       return task->tk_rqstp->rq_snd_buf.len == 0;
+       return test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) == 0 &&
+               (!(task->tk_flags & RPC_TASK_SENT) ||
+                !(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) ||
+                xprt_request_need_retransmit(task));
 }
 
-static inline void
-rpc_task_force_reencode(struct rpc_task *task)
-{
-       task->tk_rqstp->rq_snd_buf.len = 0;
-       task->tk_rqstp->rq_bytes_sent = 0;
-}
-
-/*
- * 3.  Encode arguments of an RPC call
- */
 static void
 rpc_xdr_encode(struct rpc_task *task)
 {
@@ -1752,6 +1738,7 @@ rpc_xdr_encode(struct rpc_task *task)
        xdr_buf_init(&req->rq_rcv_buf,
                     req->rq_rbuffer,
                     req->rq_rcvsize);
+       req->rq_bytes_sent = 0;
 
        p = rpc_encode_header(task);
        if (p == NULL) {
@@ -1766,6 +1753,36 @@ rpc_xdr_encode(struct rpc_task *task)
 
        task->tk_status = rpcauth_wrap_req(task, encode, req, p,
                        task->tk_msg.rpc_argp);
+       if (task->tk_status == 0)
+               xprt_request_prepare(req);
+}
+
+/*
+ * 3.  Encode arguments of an RPC call
+ */
+static void
+call_encode(struct rpc_task *task)
+{
+       if (!rpc_task_need_encode(task))
+               goto out;
+       /* Encode here so that rpcsec_gss can use the correct sequence number. */
+       rpc_xdr_encode(task);
+       /* Did the encode result in an error condition? */
+       if (task->tk_status != 0) {
+               /* Was the error nonfatal? */
+               if (task->tk_status == -EAGAIN || task->tk_status == -ENOMEM)
+                       rpc_delay(task, HZ >> 4);
+               else
+                       rpc_exit(task, task->tk_status);
+               return;
+       }
+
+       /* Add task to reply queue before transmission to avoid races */
+       if (rpc_reply_expected(task))
+               xprt_request_enqueue_receive(task);
+       xprt_request_enqueue_transmit(task);
+out:
+       task->tk_action = call_bind;
 }
 
 /*
@@ -1947,43 +1964,16 @@ call_connect_status(struct rpc_task *task)
 static void
 call_transmit(struct rpc_task *task)
 {
-       int is_retrans = RPC_WAS_SENT(task);
-
        dprint_status(task);
 
-       task->tk_action = call_status;
-       if (task->tk_status < 0)
-               return;
-       if (!xprt_prepare_transmit(task))
-               return;
-       task->tk_action = call_transmit_status;
-       /* Encode here so that rpcsec_gss can use correct sequence number. */
-       if (rpc_task_need_encode(task)) {
-               rpc_xdr_encode(task);
-               /* Did the encode result in an error condition? */
-               if (task->tk_status != 0) {
-                       /* Was the error nonfatal? */
-                       if (task->tk_status == -EAGAIN)
-                               rpc_delay(task, HZ >> 4);
-                       else
-                               rpc_exit(task, task->tk_status);
+       task->tk_status = 0;
+       if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) {
+               if (!xprt_prepare_transmit(task))
                        return;
-               }
+               xprt_transmit(task);
        }
-       xprt_transmit(task);
-       if (task->tk_status < 0)
-               return;
-       if (is_retrans)
-               task->tk_client->cl_stats->rpcretrans++;
-       /*
-        * On success, ensure that we call xprt_end_transmit() before sleeping
-        * in order to allow access to the socket to other RPC requests.
-        */
-       call_transmit_status(task);
-       if (rpc_reply_expected(task))
-               return;
-       task->tk_action = rpc_exit_task;
-       rpc_wake_up_queued_task(&task->tk_rqstp->rq_xprt->pending, task);
+       task->tk_action = call_transmit_status;
+       xprt_end_transmit(task);
 }
 
 /*
@@ -1999,19 +1989,17 @@ call_transmit_status(struct rpc_task *task)
         * test first.
         */
        if (task->tk_status == 0) {
-               xprt_end_transmit(task);
-               rpc_task_force_reencode(task);
+               xprt_request_wait_receive(task);
                return;
        }
 
        switch (task->tk_status) {
-       case -EAGAIN:
-       case -ENOBUFS:
-               break;
        default:
                dprint_status(task);
-               xprt_end_transmit(task);
-               rpc_task_force_reencode(task);
+               break;
+       case -EBADMSG:
+               task->tk_status = 0;
+               task->tk_action = call_encode;
                break;
                /*
                 * Special cases: if we've been waiting on the
@@ -2019,6 +2007,14 @@ call_transmit_status(struct rpc_task *task)
                 * socket just returned a connection error,
                 * then hold onto the transport lock.
                 */
+       case -ENOBUFS:
+               rpc_delay(task, HZ>>2);
+               /* fall through */
+       case -EBADSLT:
+       case -EAGAIN:
+               task->tk_action = call_transmit;
+               task->tk_status = 0;
+               break;
        case -ECONNREFUSED:
        case -EHOSTDOWN:
        case -ENETDOWN:
@@ -2026,7 +2022,6 @@ call_transmit_status(struct rpc_task *task)
        case -ENETUNREACH:
        case -EPERM:
                if (RPC_IS_SOFTCONN(task)) {
-                       xprt_end_transmit(task);
                        if (!task->tk_msg.rpc_proc->p_proc)
                                trace_xprt_ping(task->tk_xprt,
                                                task->tk_status);
@@ -2039,7 +2034,7 @@ call_transmit_status(struct rpc_task *task)
        case -EADDRINUSE:
        case -ENOTCONN:
        case -EPIPE:
-               rpc_task_force_reencode(task);
+               break;
        }
 }
 
@@ -2053,6 +2048,11 @@ call_bc_transmit(struct rpc_task *task)
 {
        struct rpc_rqst *req = task->tk_rqstp;
 
+       if (rpc_task_need_encode(task))
+               xprt_request_enqueue_transmit(task);
+       if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
+               goto out_wakeup;
+
        if (!xprt_prepare_transmit(task))
                goto out_retry;
 
@@ -2061,14 +2061,9 @@ call_bc_transmit(struct rpc_task *task)
                        "error: %d\n", task->tk_status);
                goto out_done;
        }
-       if (req->rq_connect_cookie != req->rq_xprt->connect_cookie)
-               req->rq_bytes_sent = 0;
 
        xprt_transmit(task);
 
-       if (task->tk_status == -EAGAIN)
-               goto out_nospace;
-
        xprt_end_transmit(task);
        dprint_status(task);
        switch (task->tk_status) {
@@ -2084,6 +2079,8 @@ call_bc_transmit(struct rpc_task *task)
        case -ENOTCONN:
        case -EPIPE:
                break;
+       case -EAGAIN:
+               goto out_retry;
        case -ETIMEDOUT:
                /*
                 * Problem reaching the server.  Disconnect and let the
@@ -2107,12 +2104,11 @@ call_bc_transmit(struct rpc_task *task)
                        "error: %d\n", task->tk_status);
                break;
        }
+out_wakeup:
        rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
 out_done:
        task->tk_action = rpc_exit_task;
        return;
-out_nospace:
-       req->rq_connect_cookie = req->rq_xprt->connect_cookie;
 out_retry:
        task->tk_status = 0;
 }
@@ -2125,15 +2121,11 @@ static void
 call_status(struct rpc_task *task)
 {
        struct rpc_clnt *clnt = task->tk_client;
-       struct rpc_rqst *req = task->tk_rqstp;
        int             status;
 
        if (!task->tk_msg.rpc_proc->p_proc)
                trace_xprt_ping(task->tk_xprt, task->tk_status);
 
-       if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent)
-               task->tk_status = req->rq_reply_bytes_recvd;
-
        dprint_status(task);
 
        status = task->tk_status;
@@ -2173,13 +2165,8 @@ call_status(struct rpc_task *task)
                /* fall through */
        case -EPIPE:
        case -ENOTCONN:
-               task->tk_action = call_bind;
-               break;
-       case -ENOBUFS:
-               rpc_delay(task, HZ>>2);
-               /* fall through */
        case -EAGAIN:
-               task->tk_action = call_transmit;
+               task->tk_action = call_encode;
                break;
        case -EIO:
                /* shutdown or soft timeout */
@@ -2244,7 +2231,7 @@ call_timeout(struct rpc_task *task)
        rpcauth_invalcred(task);
 
 retry:
-       task->tk_action = call_bind;
+       task->tk_action = call_encode;
        task->tk_status = 0;
 }
 
@@ -2261,6 +2248,11 @@ call_decode(struct rpc_task *task)
 
        dprint_status(task);
 
+       if (!decode) {
+               task->tk_action = rpc_exit_task;
+               return;
+       }
+
        if (task->tk_flags & RPC_CALL_MAJORSEEN) {
                if (clnt->cl_chatty) {
                        printk(KERN_NOTICE "%s: server %s OK\n",
@@ -2283,7 +2275,7 @@ call_decode(struct rpc_task *task)
 
        if (req->rq_rcv_buf.len < 12) {
                if (!RPC_IS_SOFT(task)) {
-                       task->tk_action = call_bind;
+                       task->tk_action = call_encode;
                        goto out_retry;
                }
                dprintk("RPC:       %s: too small RPC reply size (%d bytes)\n",
@@ -2298,13 +2290,11 @@ call_decode(struct rpc_task *task)
                        goto out_retry;
                return;
        }
-
        task->tk_action = rpc_exit_task;
 
-       if (decode) {
-               task->tk_status = rpcauth_unwrap_resp(task, decode, req, p,
-                                                     task->tk_msg.rpc_resp);
-       }
+       task->tk_status = rpcauth_unwrap_resp(task, decode, req, p,
+                                             task->tk_msg.rpc_resp);
+
        dprintk("RPC: %5u call_decode result %d\n", task->tk_pid,
                        task->tk_status);
        return;
@@ -2416,7 +2406,7 @@ rpc_verify_header(struct rpc_task *task)
                        task->tk_garb_retry--;
                        dprintk("RPC: %5u %s: retry garbled creds\n",
                                        task->tk_pid, __func__);
-                       task->tk_action = call_bind;
+                       task->tk_action = call_encode;
                        goto out_retry;
                case RPC_AUTH_TOOWEAK:
                        printk(KERN_NOTICE "RPC: server %s requires stronger "
@@ -2485,7 +2475,7 @@ out_garbage:
                task->tk_garb_retry--;
                dprintk("RPC: %5u %s: retrying\n",
                                task->tk_pid, __func__);
-               task->tk_action = call_bind;
+               task->tk_action = call_encode;
 out_retry:
                return ERR_PTR(-EAGAIN);
        }
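
Taken together, the clnt.c hunks split encoding out of call_transmit into a dedicated call_encode state and queue each request for receive and transmit before it touches the wire. The resulting state order, as it can be read out of the hunks above:

        /*
         * call_start -> call_reserve -> call_refresh -> call_allocate
         *   -> call_encode           (encode once; enqueue recv + xmit)
         *   -> call_bind -> call_connect
         *   -> call_transmit         (drain the transmit queue)
         *   -> call_transmit_status  (wait for the reply, or requeue)
         *   -> call_status -> call_decode
         */

Retry paths that used to jump back to call_bind or call_transmit now jump to call_encode, so a retransmission is always re-queued (and, for RPCSEC_GSS, re-encoded with a fresh sequence number) through a single path.
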
index 3fe5d60ab0e2c6ecd24a4946c32e07dfd941411f..57ca5bead1cb4d7c58020243997c7f1fdf741ed4 100644
@@ -99,64 +99,78 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
        list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list);
 }
 
-static void rpc_rotate_queue_owner(struct rpc_wait_queue *queue)
-{
-       struct list_head *q = &queue->tasks[queue->priority];
-       struct rpc_task *task;
-
-       if (!list_empty(q)) {
-               task = list_first_entry(q, struct rpc_task, u.tk_wait.list);
-               if (task->tk_owner == queue->owner)
-                       list_move_tail(&task->u.tk_wait.list, q);
-       }
-}
-
 static void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority)
 {
        if (queue->priority != priority) {
-               /* Fairness: rotate the list when changing priority */
-               rpc_rotate_queue_owner(queue);
                queue->priority = priority;
+               queue->nr = 1U << priority;
        }
 }
 
-static void rpc_set_waitqueue_owner(struct rpc_wait_queue *queue, pid_t pid)
-{
-       queue->owner = pid;
-       queue->nr = RPC_BATCH_COUNT;
-}
-
 static void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue)
 {
        rpc_set_waitqueue_priority(queue, queue->maxpriority);
-       rpc_set_waitqueue_owner(queue, 0);
 }
 
 /*
- * Add new request to a priority queue.
+ * Add a request to a queue list
  */
-static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue,
-               struct rpc_task *task,
-               unsigned char queue_priority)
+static void
+__rpc_list_enqueue_task(struct list_head *q, struct rpc_task *task)
 {
-       struct list_head *q;
        struct rpc_task *t;
 
-       INIT_LIST_HEAD(&task->u.tk_wait.links);
-       if (unlikely(queue_priority > queue->maxpriority))
-               queue_priority = queue->maxpriority;
-       if (queue_priority > queue->priority)
-               rpc_set_waitqueue_priority(queue, queue_priority);
-       q = &queue->tasks[queue_priority];
        list_for_each_entry(t, q, u.tk_wait.list) {
                if (t->tk_owner == task->tk_owner) {
-                       list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links);
+                       list_add_tail(&task->u.tk_wait.links,
+                                       &t->u.tk_wait.links);
+                       /* Cache the queue head in task->u.tk_wait.list */
+                       task->u.tk_wait.list.next = q;
+                       task->u.tk_wait.list.prev = NULL;
                        return;
                }
        }
+       INIT_LIST_HEAD(&task->u.tk_wait.links);
        list_add_tail(&task->u.tk_wait.list, q);
 }
 
+/*
+ * Remove request from a queue list
+ */
+static void
+__rpc_list_dequeue_task(struct rpc_task *task)
+{
+       struct list_head *q;
+       struct rpc_task *t;
+
+       if (task->u.tk_wait.list.prev == NULL) {
+               list_del(&task->u.tk_wait.links);
+               return;
+       }
+       if (!list_empty(&task->u.tk_wait.links)) {
+               t = list_first_entry(&task->u.tk_wait.links,
+                               struct rpc_task,
+                               u.tk_wait.links);
+               /* Assume __rpc_list_enqueue_task() cached the queue head */
+               q = t->u.tk_wait.list.next;
+               list_add_tail(&t->u.tk_wait.list, q);
+               list_del(&task->u.tk_wait.links);
+       }
+       list_del(&task->u.tk_wait.list);
+}
+
+/*
+ * Add new request to a priority queue.
+ */
+static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue,
+               struct rpc_task *task,
+               unsigned char queue_priority)
+{
+       if (unlikely(queue_priority > queue->maxpriority))
+               queue_priority = queue->maxpriority;
+       __rpc_list_enqueue_task(&queue->tasks[queue_priority], task);
+}
+
 /*
  * Add new request to wait queue.
  *
@@ -194,13 +208,7 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
  */
 static void __rpc_remove_wait_queue_priority(struct rpc_task *task)
 {
-       struct rpc_task *t;
-
-       if (!list_empty(&task->u.tk_wait.links)) {
-               t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list);
-               list_move(&t->u.tk_wait.list, &task->u.tk_wait.list);
-               list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links);
-       }
+       __rpc_list_dequeue_task(task);
 }
 
 /*
@@ -212,7 +220,8 @@ static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_tas
        __rpc_disable_timer(queue, task);
        if (RPC_IS_PRIORITY(queue))
                __rpc_remove_wait_queue_priority(task);
-       list_del(&task->u.tk_wait.list);
+       else
+               list_del(&task->u.tk_wait.list);
        queue->qlen--;
        dprintk("RPC: %5u removed from queue %p \"%s\"\n",
                        task->tk_pid, queue, rpc_qname(queue));
@@ -440,14 +449,28 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
 /*
  * Wake up a queued task while the queue lock is being held
  */
-static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq,
-               struct rpc_wait_queue *queue, struct rpc_task *task)
+static struct rpc_task *
+rpc_wake_up_task_on_wq_queue_action_locked(struct workqueue_struct *wq,
+               struct rpc_wait_queue *queue, struct rpc_task *task,
+               bool (*action)(struct rpc_task *, void *), void *data)
 {
        if (RPC_IS_QUEUED(task)) {
                smp_rmb();
-               if (task->tk_waitqueue == queue)
-                       __rpc_do_wake_up_task_on_wq(wq, queue, task);
+               if (task->tk_waitqueue == queue) {
+                       if (action == NULL || action(task, data)) {
+                               __rpc_do_wake_up_task_on_wq(wq, queue, task);
+                               return task;
+                       }
+               }
        }
+       return NULL;
+}
+
+static void
+rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq,
+               struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+       rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, task, NULL, NULL);
 }
 
 /*
@@ -465,6 +488,8 @@ void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq,
                struct rpc_wait_queue *queue,
                struct rpc_task *task)
 {
+       if (!RPC_IS_QUEUED(task))
+               return;
        spin_lock_bh(&queue->lock);
        rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
        spin_unlock_bh(&queue->lock);
@@ -475,12 +500,48 @@ void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq,
  */
 void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task)
 {
+       if (!RPC_IS_QUEUED(task))
+               return;
        spin_lock_bh(&queue->lock);
        rpc_wake_up_task_queue_locked(queue, task);
        spin_unlock_bh(&queue->lock);
 }
 EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task);
 
+static bool rpc_task_action_set_status(struct rpc_task *task, void *status)
+{
+       task->tk_status = *(int *)status;
+       return true;
+}
+
+static void
+rpc_wake_up_task_queue_set_status_locked(struct rpc_wait_queue *queue,
+               struct rpc_task *task, int status)
+{
+       rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue,
+                       task, rpc_task_action_set_status, &status);
+}
+
+/**
+ * rpc_wake_up_queued_task_set_status - wake up a task and set task->tk_status
+ * @queue: pointer to rpc_wait_queue
+ * @task: pointer to rpc_task
+ * @status: integer error value
+ *
+ * If @task is queued on @queue, then it is woken up, and @task->tk_status is
+ * set to the value of @status.
+ */
+void
+rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *queue,
+               struct rpc_task *task, int status)
+{
+       if (!RPC_IS_QUEUED(task))
+               return;
+       spin_lock_bh(&queue->lock);
+       rpc_wake_up_task_queue_set_status_locked(queue, task, status);
+       spin_unlock_bh(&queue->lock);
+}
+
 /*
  * Wake up the next task on a priority queue.
  */
@@ -493,17 +554,9 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q
-        * Service a batch of tasks from a single owner.
+        * Service a batch of tasks from the current priority level.
         */
        q = &queue->tasks[queue->priority];
-       if (!list_empty(q)) {
-               task = list_entry(q->next, struct rpc_task, u.tk_wait.list);
-               if (queue->owner == task->tk_owner) {
-                       if (--queue->nr)
-                               goto out;
-                       list_move_tail(&task->u.tk_wait.list, q);
-               }
-               /*
-                * Check if we need to switch queues.
-                */
-               goto new_owner;
+       if (!list_empty(q) && --queue->nr) {
+               task = list_first_entry(q, struct rpc_task, u.tk_wait.list);
+               goto out;
        }
 
        /*
@@ -515,7 +568,7 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q
                else
                        q = q - 1;
                if (!list_empty(q)) {
-                       task = list_entry(q->next, struct rpc_task, u.tk_wait.list);
+                       task = list_first_entry(q, struct rpc_task, u.tk_wait.list);
                        goto new_queue;
                }
        } while (q != &queue->tasks[queue->priority]);
@@ -525,8 +578,6 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q
 
 new_queue:
        rpc_set_waitqueue_priority(queue, (unsigned int)(q - &queue->tasks[0]));
-new_owner:
-       rpc_set_waitqueue_owner(queue, task->tk_owner);
 out:
        return task;
 }
@@ -553,12 +604,9 @@ struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
                        queue, rpc_qname(queue));
        spin_lock_bh(&queue->lock);
        task = __rpc_find_next_queued(queue);
-       if (task != NULL) {
-               if (func(task, data))
-                       rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
-               else
-                       task = NULL;
-       }
+       if (task != NULL)
+               task = rpc_wake_up_task_on_wq_queue_action_locked(wq, queue,
+                               task, func, data);
        spin_unlock_bh(&queue->lock);
 
        return task;
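
The wait-queue rework drops the queue->owner rotation in favour of grouping queued tasks by owner: only the first task of each owner sits on the queue proper; later tasks of the same owner hang off its u.tk_wait.links list, and their own (otherwise idle) list entry is reused to remember the queue head. A stripped-down sketch of the enqueue side, with hypothetical names:

        struct demo_task {
                struct list_head list;   /* on the queue (leaders only) */
                struct list_head links;  /* same-owner followers */
                int owner;
        };

        static void demo_enqueue(struct list_head *q, struct demo_task *new)
        {
                struct demo_task *t;

                list_for_each_entry(t, q, list) {
                        if (t->owner == new->owner) {
                                list_add_tail(&new->links, &t->links);
                                new->list.next = q;     /* cache queue head */
                                new->list.prev = NULL;  /* flag: follower */
                                return;
                        }
                }
                INIT_LIST_HEAD(&new->links);
                list_add_tail(&new->list, q);
        }

The prev == NULL flag is exactly what __rpc_list_dequeue_task() tests to tell a leader (on the queue) from a follower (only on a leader's links list).
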
index f217c348b34100b4cd47e4864b22cb648c4fc141..9062967575c4921b799b6ca4220bfb0e1f9cd35d 100644
@@ -26,7 +26,8 @@
  * Possibly called several times to iterate over an sk_buff and copy
  * data out of it.
  */
-size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len)
+static size_t
+xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len)
 {
        if (len > desc->count)
                len = desc->count;
@@ -36,7 +37,6 @@ size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len)
        desc->offset += len;
        return len;
 }
-EXPORT_SYMBOL_GPL(xdr_skb_read_bits);
 
 /**
  * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer
@@ -69,7 +69,8 @@ static size_t xdr_skb_read_and_csum_bits(struct xdr_skb_reader *desc, void *to,
  * @copy_actor: virtual method for copying data
  *
  */
-ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor)
+static ssize_t
+xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor)
 {
        struct page     **ppage = xdr->pages;
        unsigned int    len, pglen = xdr->page_len;
@@ -104,7 +105,7 @@ ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct
 
                /* ACL likes to be lazy in allocating pages - ACLs
                 * are small by default but can get huge. */
-               if (unlikely(*ppage == NULL)) {
+               if ((xdr->flags & XDRBUF_SPARSE_PAGES) && *ppage == NULL) {
                        *ppage = alloc_page(GFP_ATOMIC);
                        if (unlikely(*ppage == NULL)) {
                                if (copied == 0)
@@ -140,7 +141,6 @@ copy_tail:
 out:
        return copied;
 }
-EXPORT_SYMBOL_GPL(xdr_partial_copy_from_skb);
 
 /**
  * csum_partial_copy_to_xdr - checksum and copy data
index 5185efb9027b7da153cb7a6d430d096a6c3cefde..87533fbb96cfa89b2fdbc9bfa4ba240d881903d4 100644
@@ -171,7 +171,6 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl,
        mutex_init(&xprt->xpt_mutex);
        spin_lock_init(&xprt->xpt_lock);
        set_bit(XPT_BUSY, &xprt->xpt_flags);
-       rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending");
        xprt->xpt_net = get_net(net);
        strcpy(xprt->xpt_remotebuf, "uninitialized");
 }
@@ -895,7 +894,6 @@ int svc_send(struct svc_rqst *rqstp)
        else
                len = xprt->xpt_ops->xpo_sendto(rqstp);
        mutex_unlock(&xprt->xpt_mutex);
-       rpc_wake_up(&xprt->xpt_bc_pending);
        trace_svc_send(rqstp, len);
        svc_xprt_release(rqstp);
 
index 5445145e639c9c82f8cc53f80b327da9260b0658..db8bb6b3a2b0faf387868b80539c33ae9c5626dd 100644
@@ -1004,7 +1004,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
 
        if (!bc_xprt)
                return -EAGAIN;
-       spin_lock(&bc_xprt->recv_lock);
+       spin_lock(&bc_xprt->queue_lock);
        req = xprt_lookup_rqst(bc_xprt, xid);
        if (!req)
                goto unlock_notfound;
@@ -1022,7 +1022,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
        memcpy(dst->iov_base, src->iov_base, src->iov_len);
        xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len);
        rqstp->rq_arg.len = 0;
-       spin_unlock(&bc_xprt->recv_lock);
+       spin_unlock(&bc_xprt->queue_lock);
        return 0;
 unlock_notfound:
        printk(KERN_NOTICE
@@ -1031,7 +1031,7 @@ unlock_notfound:
                __func__, ntohl(calldir),
                bc_xprt, ntohl(xid));
 unlock_eagain:
-       spin_unlock(&bc_xprt->recv_lock);
+       spin_unlock(&bc_xprt->queue_lock);
        return -EAGAIN;
 }
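
With recv_lock renamed to queue_lock, the backchannel path above matches the shape used elsewhere for reply processing. When the copy is long enough that holding the lock throughout would hurt, the pinning API (see the xprt.c hunks below) allows dropping it mid-way; a sketch of that fuller pattern:

        spin_lock(&xprt->queue_lock);
        req = xprt_lookup_rqst(xprt, xid);
        if (req) {
                xprt_pin_rqst(req);     /* keep req alive while unlocked */
                spin_unlock(&xprt->queue_lock);
                /* ... copy the reply into req->rq_private_buf ... */
                spin_lock(&xprt->queue_lock);
                xprt_complete_rqst(req->rq_task, copied);
                xprt_unpin_rqst(req);
        }
        spin_unlock(&xprt->queue_lock);

receive_cb_reply() copies only a small callback reply, so it simply keeps the lock held from lookup to completion.
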
 
index 30afbd236656672e0e9e87d6bfc07a76b742bf4d..2bbb8d38d2bf5f6eeb87a5771aeb92683d25543f 100644
@@ -15,6 +15,7 @@
 #include <linux/errno.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/msg_prot.h>
+#include <linux/bvec.h>
 
 /*
  * XDR functions for basic NFS types
@@ -128,6 +129,39 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len)
 }
 EXPORT_SYMBOL_GPL(xdr_terminate_string);
 
+size_t
+xdr_buf_pagecount(struct xdr_buf *buf)
+{
+       if (!buf->page_len)
+               return 0;
+       return (buf->page_base + buf->page_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+}
+
+int
+xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp)
+{
+       size_t i, n = xdr_buf_pagecount(buf);
+
+       if (n != 0 && buf->bvec == NULL) {
+               buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp);
+               if (!buf->bvec)
+                       return -ENOMEM;
+               for (i = 0; i < n; i++) {
+                       buf->bvec[i].bv_page = buf->pages[i];
+                       buf->bvec[i].bv_len = PAGE_SIZE;
+                       buf->bvec[i].bv_offset = 0;
+               }
+       }
+       return 0;
+}
+
+void
+xdr_free_bvec(struct xdr_buf *buf)
+{
+       kfree(buf->bvec);
+       buf->bvec = NULL;
+}
+
 void
 xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
                 struct page **pages, unsigned int base, unsigned int len)
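
xdr_alloc_bvec() materializes the buffer's page list as a bio_vec array, which is what lets a socket transport hand the pages directly to an iov_iter. A hedged sketch of the intended use (the actual iterator setup lives in the transport code, not in this hunk):

        struct iov_iter iter;

        if (xdr_alloc_bvec(buf, GFP_KERNEL) < 0)
                return -ENOMEM;
        iov_iter_bvec(&iter, READ, buf->bvec,
                      xdr_buf_pagecount(buf), buf->page_len);
        /* ... sock_recvmsg() into &iter ... */
        xdr_free_bvec(buf);

Note that every bv_len is initialized to a full PAGE_SIZE, including the last page; the iterator's count, not bv_len, bounds how many bytes are actually transferred.
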
index a8db2e3f89044460741fe41dcb6dcadfd7b2d669..86bea4520c4d1fb3db7249bbd69f87721d02e6b5 100644
@@ -68,8 +68,6 @@
 static void     xprt_init(struct rpc_xprt *xprt, struct net *net);
 static __be32  xprt_alloc_xid(struct rpc_xprt *xprt);
 static void    xprt_connect_status(struct rpc_task *task);
-static int      __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
-static void     __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *);
 static void     xprt_destroy(struct rpc_xprt *xprt);
 
 static DEFINE_SPINLOCK(xprt_list_lock);
@@ -171,6 +169,17 @@ out:
 }
 EXPORT_SYMBOL_GPL(xprt_load_transport);
 
+static void xprt_clear_locked(struct rpc_xprt *xprt)
+{
+       xprt->snd_task = NULL;
+       if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) {
+               smp_mb__before_atomic();
+               clear_bit(XPRT_LOCKED, &xprt->state);
+               smp_mb__after_atomic();
+       } else
+               queue_work(xprtiod_workqueue, &xprt->task_cleanup);
+}
+
 /**
  * xprt_reserve_xprt - serialize write access to transports
  * @task: task that is requesting access to the transport
@@ -183,44 +192,53 @@ EXPORT_SYMBOL_GPL(xprt_load_transport);
 int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
 {
        struct rpc_rqst *req = task->tk_rqstp;
-       int priority;
 
        if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
                if (task == xprt->snd_task)
                        return 1;
                goto out_sleep;
        }
+       if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
+               goto out_unlock;
        xprt->snd_task = task;
-       if (req != NULL)
-               req->rq_ntrans++;
 
        return 1;
 
+out_unlock:
+       xprt_clear_locked(xprt);
 out_sleep:
        dprintk("RPC: %5u failed to lock transport %p\n",
                        task->tk_pid, xprt);
-       task->tk_timeout = 0;
+       task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0;
        task->tk_status = -EAGAIN;
-       if (req == NULL)
-               priority = RPC_PRIORITY_LOW;
-       else if (!req->rq_ntrans)
-               priority = RPC_PRIORITY_NORMAL;
-       else
-               priority = RPC_PRIORITY_HIGH;
-       rpc_sleep_on_priority(&xprt->sending, task, NULL, priority);
+       rpc_sleep_on(&xprt->sending, task, NULL);
        return 0;
 }
 EXPORT_SYMBOL_GPL(xprt_reserve_xprt);
 
-static void xprt_clear_locked(struct rpc_xprt *xprt)
+static bool
+xprt_need_congestion_window_wait(struct rpc_xprt *xprt)
 {
-       xprt->snd_task = NULL;
-       if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) {
-               smp_mb__before_atomic();
-               clear_bit(XPRT_LOCKED, &xprt->state);
-               smp_mb__after_atomic();
-       } else
-               queue_work(xprtiod_workqueue, &xprt->task_cleanup);
+       return test_bit(XPRT_CWND_WAIT, &xprt->state);
+}
+
+static void
+xprt_set_congestion_window_wait(struct rpc_xprt *xprt)
+{
+       if (!list_empty(&xprt->xmit_queue)) {
+               /* Peek at head of queue to see if it can make progress */
+               if (list_first_entry(&xprt->xmit_queue, struct rpc_rqst,
+                                       rq_xmit)->rq_cong)
+                       return;
+       }
+       set_bit(XPRT_CWND_WAIT, &xprt->state);
+}
+
+static void
+xprt_test_and_clear_congestion_window_wait(struct rpc_xprt *xprt)
+{
+       if (!RPCXPRT_CONGESTED(xprt))
+               clear_bit(XPRT_CWND_WAIT, &xprt->state);
 }
 
 /*
@@ -230,11 +248,11 @@ static void xprt_clear_locked(struct rpc_xprt *xprt)
  * Same as xprt_reserve_xprt, but Van Jacobson congestion control is
  * integrated into the decision of whether a request is allowed to be
  * woken up and given access to the transport.
+ * Note that the lock is only granted if we know there are free slots.
  */
 int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
 {
        struct rpc_rqst *req = task->tk_rqstp;
-       int priority;
 
        if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
                if (task == xprt->snd_task)
@@ -245,25 +263,19 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
                xprt->snd_task = task;
                return 1;
        }
-       if (__xprt_get_cong(xprt, task)) {
+       if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
+               goto out_unlock;
+       if (!xprt_need_congestion_window_wait(xprt)) {
                xprt->snd_task = task;
-               req->rq_ntrans++;
                return 1;
        }
+out_unlock:
        xprt_clear_locked(xprt);
 out_sleep:
-       if (req)
-               __xprt_put_cong(xprt, req);
        dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt);
-       task->tk_timeout = 0;
+       task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0;
        task->tk_status = -EAGAIN;
-       if (req == NULL)
-               priority = RPC_PRIORITY_LOW;
-       else if (!req->rq_ntrans)
-               priority = RPC_PRIORITY_NORMAL;
-       else
-               priority = RPC_PRIORITY_HIGH;
-       rpc_sleep_on_priority(&xprt->sending, task, NULL, priority);
+       rpc_sleep_on(&xprt->sending, task, NULL);
        return 0;
 }
 EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong);
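
Congestion control moves from a per-task gamble at lock time to an explicit XPRT_CWND_WAIT flag: holding the transport lock no longer implies owning a congestion slot, and a request that cannot get a slot marks the whole transport as waiting. The underlying Van Jacobson admission test, as it survives in __xprt_get_cong() below, is just:

        if (xprt->cong >= xprt->cwnd) {         /* RPCXPRT_CONGESTED() */
                set_bit(XPRT_CWND_WAIT, &xprt->state);
                return 0;                       /* caller must wait */
        }
        req->rq_cong = 1;
        xprt->cong += RPC_CWNDSCALE;            /* consume one slot */
        return 1;

with the flag cleared again as completed requests return their slots through __xprt_put_cong().
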
@@ -272,6 +284,8 @@ static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
 {
        int retval;
 
+       if (test_bit(XPRT_LOCKED, &xprt->state) && xprt->snd_task == task)
+               return 1;
        spin_lock_bh(&xprt->transport_lock);
        retval = xprt->ops->reserve_xprt(xprt, task);
        spin_unlock_bh(&xprt->transport_lock);
@@ -281,12 +295,8 @@ static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
 static bool __xprt_lock_write_func(struct rpc_task *task, void *data)
 {
        struct rpc_xprt *xprt = data;
-       struct rpc_rqst *req;
 
-       req = task->tk_rqstp;
        xprt->snd_task = task;
-       if (req)
-               req->rq_ntrans++;
        return true;
 }
 
@@ -294,53 +304,30 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt)
 {
        if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
                return;
-
+       if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
+               goto out_unlock;
        if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
                                __xprt_lock_write_func, xprt))
                return;
+out_unlock:
        xprt_clear_locked(xprt);
 }
 
-static bool __xprt_lock_write_cong_func(struct rpc_task *task, void *data)
-{
-       struct rpc_xprt *xprt = data;
-       struct rpc_rqst *req;
-
-       req = task->tk_rqstp;
-       if (req == NULL) {
-               xprt->snd_task = task;
-               return true;
-       }
-       if (__xprt_get_cong(xprt, task)) {
-               xprt->snd_task = task;
-               req->rq_ntrans++;
-               return true;
-       }
-       return false;
-}
-
 static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt)
 {
        if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
                return;
-       if (RPCXPRT_CONGESTED(xprt))
+       if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
+               goto out_unlock;
+       if (xprt_need_congestion_window_wait(xprt))
                goto out_unlock;
        if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
-                               __xprt_lock_write_cong_func, xprt))
+                               __xprt_lock_write_func, xprt))
                return;
 out_unlock:
        xprt_clear_locked(xprt);
 }
 
-static void xprt_task_clear_bytes_sent(struct rpc_task *task)
-{
-       if (task != NULL) {
-               struct rpc_rqst *req = task->tk_rqstp;
-               if (req != NULL)
-                       req->rq_bytes_sent = 0;
-       }
-}
-
 /**
  * xprt_release_xprt - allow other requests to use a transport
  * @xprt: transport with other tasks potentially waiting
@@ -351,7 +338,6 @@ static void xprt_task_clear_bytes_sent(struct rpc_task *task)
 void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
 {
        if (xprt->snd_task == task) {
-               xprt_task_clear_bytes_sent(task);
                xprt_clear_locked(xprt);
                __xprt_lock_write_next(xprt);
        }
@@ -369,7 +355,6 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt);
 void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
 {
        if (xprt->snd_task == task) {
-               xprt_task_clear_bytes_sent(task);
                xprt_clear_locked(xprt);
                __xprt_lock_write_next_cong(xprt);
        }
@@ -378,6 +363,8 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt_cong);
 
 static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
 {
+       if (xprt->snd_task != task)
+               return;
        spin_lock_bh(&xprt->transport_lock);
        xprt->ops->release_xprt(xprt, task);
        spin_unlock_bh(&xprt->transport_lock);
@@ -388,16 +375,16 @@ static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *ta
  * overflowed. Put the task to sleep if this is the case.
  */
 static int
-__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_task *task)
+__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
 {
-       struct rpc_rqst *req = task->tk_rqstp;
-
        if (req->rq_cong)
                return 1;
        dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n",
-                       task->tk_pid, xprt->cong, xprt->cwnd);
-       if (RPCXPRT_CONGESTED(xprt))
+                       req->rq_task->tk_pid, xprt->cong, xprt->cwnd);
+       if (RPCXPRT_CONGESTED(xprt)) {
+               xprt_set_congestion_window_wait(xprt);
                return 0;
+       }
        req->rq_cong = 1;
        xprt->cong += RPC_CWNDSCALE;
        return 1;
@@ -414,9 +401,31 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
                return;
        req->rq_cong = 0;
        xprt->cong -= RPC_CWNDSCALE;
+       xprt_test_and_clear_congestion_window_wait(xprt);
        __xprt_lock_write_next_cong(xprt);
 }
 
+/**
+ * xprt_request_get_cong - Request congestion control credits
+ * @xprt: pointer to transport
+ * @req: pointer to RPC request
+ *
+ * Useful for transports that require congestion control.
+ */
+bool
+xprt_request_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
+{
+       bool ret = false;
+
+       if (req->rq_cong)
+               return true;
+       spin_lock_bh(&xprt->transport_lock);
+       ret = __xprt_get_cong(xprt, req) != 0;
+       spin_unlock_bh(&xprt->transport_lock);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(xprt_request_get_cong);
+
 /**
  * xprt_release_rqst_cong - housekeeping when request is complete
  * @task: RPC request that recently completed
@@ -431,6 +440,20 @@ void xprt_release_rqst_cong(struct rpc_task *task)
 }
 EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
 
+/*
+ * Clear the congestion window wait flag and wake up the next
+ * entry on xprt->sending
+ */
+static void
+xprt_clear_congestion_window_wait(struct rpc_xprt *xprt)
+{
+       if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state)) {
+               spin_lock_bh(&xprt->transport_lock);
+               __xprt_lock_write_next_cong(xprt);
+               spin_unlock_bh(&xprt->transport_lock);
+       }
+}
+
 /**
  * xprt_adjust_cwnd - adjust transport congestion window
  * @xprt: pointer to xprt
@@ -488,39 +511,46 @@ EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks);
 
 /**
  * xprt_wait_for_buffer_space - wait for transport output buffer to clear
- * @task: task to be put to sleep
- * @action: function pointer to be executed after wait
+ * @xprt: transport
  *
- * Note that we only set the timer for the case of RPC_IS_SOFT(), since
- * we don't in general want to force a socket disconnection due to
- * an incomplete RPC call transmission.
+ * Note that this only sets a flag: the task is not put to sleep here,
+ * and it is the write space callback that is responsible for waking
+ * the next sender via xprt_write_space().
  */
-void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action)
+void xprt_wait_for_buffer_space(struct rpc_xprt *xprt)
 {
-       struct rpc_rqst *req = task->tk_rqstp;
-       struct rpc_xprt *xprt = req->rq_xprt;
-
-       task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0;
-       rpc_sleep_on(&xprt->pending, task, action);
+       set_bit(XPRT_WRITE_SPACE, &xprt->state);
 }
 EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
 
+static bool
+xprt_clear_write_space_locked(struct rpc_xprt *xprt)
+{
+       if (test_and_clear_bit(XPRT_WRITE_SPACE, &xprt->state)) {
+               __xprt_lock_write_next(xprt);
+               dprintk("RPC:       write space: waking waiting task on "
+                               "xprt %p\n", xprt);
+               return true;
+       }
+       return false;
+}
+
 /**
  * xprt_write_space - wake the task waiting for transport output buffer space
  * @xprt: transport with waiting tasks
  *
  * Can be called in a soft IRQ context, so xprt_write_space never sleeps.
  */
-void xprt_write_space(struct rpc_xprt *xprt)
+bool xprt_write_space(struct rpc_xprt *xprt)
 {
+       bool ret;
+
+       if (!test_bit(XPRT_WRITE_SPACE, &xprt->state))
+               return false;
        spin_lock_bh(&xprt->transport_lock);
-       if (xprt->snd_task) {
-               dprintk("RPC:       write space: waking waiting task on "
-                               "xprt %p\n", xprt);
-               rpc_wake_up_queued_task_on_wq(xprtiod_workqueue,
-                               &xprt->pending, xprt->snd_task);
-       }
+       ret = xprt_clear_write_space_locked(xprt);
        spin_unlock_bh(&xprt->transport_lock);
+       return ret;
 }
 EXPORT_SYMBOL_GPL(xprt_write_space);
 
@@ -631,6 +661,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt)
        dprintk("RPC:       disconnected transport %p\n", xprt);
        spin_lock_bh(&xprt->transport_lock);
        xprt_clear_connected(xprt);
+       xprt_clear_write_space_locked(xprt);
        xprt_wake_pending_tasks(xprt, -EAGAIN);
        spin_unlock_bh(&xprt->transport_lock);
 }
@@ -654,6 +685,22 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
 }
 EXPORT_SYMBOL_GPL(xprt_force_disconnect);
 
+static unsigned int
+xprt_connect_cookie(struct rpc_xprt *xprt)
+{
+       return READ_ONCE(xprt->connect_cookie);
+}
+
+static bool
+xprt_request_retransmit_after_disconnect(struct rpc_task *task)
+{
+       struct rpc_rqst *req = task->tk_rqstp;
+       struct rpc_xprt *xprt = req->rq_xprt;
+
+       return req->rq_connect_cookie != xprt_connect_cookie(xprt) ||
+               !xprt_connected(xprt);
+}
+
 /**
  * xprt_conditional_disconnect - force a transport to disconnect
  * @xprt: transport to disconnect
@@ -692,7 +739,7 @@ static void
 xprt_schedule_autodisconnect(struct rpc_xprt *xprt)
        __must_hold(&xprt->transport_lock)
 {
-       if (list_empty(&xprt->recv) && xprt_has_timer(xprt))
+       if (RB_EMPTY_ROOT(&xprt->recv_queue) && xprt_has_timer(xprt))
                mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout);
 }
 
@@ -702,7 +749,7 @@ xprt_init_autodisconnect(struct timer_list *t)
        struct rpc_xprt *xprt = from_timer(xprt, t, timer);
 
        spin_lock(&xprt->transport_lock);
-       if (!list_empty(&xprt->recv))
+       if (!RB_EMPTY_ROOT(&xprt->recv_queue))
                goto out_abort;
        /* Reset xprt->last_used to avoid connect/autodisconnect cycling */
        xprt->last_used = jiffies;
@@ -726,7 +773,6 @@ bool xprt_lock_connect(struct rpc_xprt *xprt,
                goto out;
        if (xprt->snd_task != task)
                goto out;
-       xprt_task_clear_bytes_sent(task);
        xprt->snd_task = cookie;
        ret = true;
 out:
@@ -772,7 +818,6 @@ void xprt_connect(struct rpc_task *task)
                xprt->ops->close(xprt);
 
        if (!xprt_connected(xprt)) {
-               task->tk_rqstp->rq_bytes_sent = 0;
                task->tk_timeout = task->tk_rqstp->rq_timeout;
                task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie;
                rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
@@ -789,17 +834,11 @@ void xprt_connect(struct rpc_task *task)
 
 static void xprt_connect_status(struct rpc_task *task)
 {
-       struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
-
-       if (task->tk_status == 0) {
-               xprt->stat.connect_count++;
-               xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start;
+       switch (task->tk_status) {
+       case 0:
                dprintk("RPC: %5u xprt_connect_status: connection established\n",
                                task->tk_pid);
-               return;
-       }
-
-       switch (task->tk_status) {
+               break;
        case -ECONNREFUSED:
        case -ECONNRESET:
        case -ECONNABORTED:
@@ -816,28 +855,97 @@ static void xprt_connect_status(struct rpc_task *task)
        default:
                dprintk("RPC: %5u xprt_connect_status: error %d connecting to "
                                "server %s\n", task->tk_pid, -task->tk_status,
-                               xprt->servername);
+                               task->tk_rqstp->rq_xprt->servername);
                task->tk_status = -EIO;
        }
 }
 
+enum xprt_xid_rb_cmp {
+       XID_RB_EQUAL,
+       XID_RB_LEFT,
+       XID_RB_RIGHT,
+};
+static enum xprt_xid_rb_cmp
+xprt_xid_cmp(__be32 xid1, __be32 xid2)
+{
+       if (xid1 == xid2)
+               return XID_RB_EQUAL;
+       if ((__force u32)xid1 < (__force u32)xid2)
+               return XID_RB_LEFT;
+       return XID_RB_RIGHT;
+}
+
+static struct rpc_rqst *
+xprt_request_rb_find(struct rpc_xprt *xprt, __be32 xid)
+{
+       struct rb_node *n = xprt->recv_queue.rb_node;
+       struct rpc_rqst *req;
+
+       while (n != NULL) {
+               req = rb_entry(n, struct rpc_rqst, rq_recv);
+               switch (xprt_xid_cmp(xid, req->rq_xid)) {
+               case XID_RB_LEFT:
+                       n = n->rb_left;
+                       break;
+               case XID_RB_RIGHT:
+                       n = n->rb_right;
+                       break;
+               case XID_RB_EQUAL:
+                       return req;
+               }
+       }
+       return NULL;
+}
+
+static void
+xprt_request_rb_insert(struct rpc_xprt *xprt, struct rpc_rqst *new)
+{
+       struct rb_node **p = &xprt->recv_queue.rb_node;
+       struct rb_node *n = NULL;
+       struct rpc_rqst *req;
+
+       while (*p != NULL) {
+               n = *p;
+               req = rb_entry(n, struct rpc_rqst, rq_recv);
+               switch (xprt_xid_cmp(new->rq_xid, req->rq_xid)) {
+               case XID_RB_LEFT:
+                       p = &n->rb_left;
+                       break;
+               case XID_RB_RIGHT:
+                       p = &n->rb_right;
+                       break;
+               case XID_RB_EQUAL:
+                       WARN_ON_ONCE(new != req);
+                       return;
+               }
+       }
+       rb_link_node(&new->rq_recv, n, p);
+       rb_insert_color(&new->rq_recv, &xprt->recv_queue);
+}
+
+static void
+xprt_request_rb_remove(struct rpc_xprt *xprt, struct rpc_rqst *req)
+{
+       rb_erase(&req->rq_recv, &xprt->recv_queue);
+}
+
 /**
  * xprt_lookup_rqst - find an RPC request corresponding to an XID
  * @xprt: transport on which the original request was transmitted
  * @xid: RPC XID of incoming reply
  *
- * Caller holds xprt->recv_lock.
+ * Caller holds xprt->queue_lock.
  */
 struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
 {
        struct rpc_rqst *entry;
 
-       list_for_each_entry(entry, &xprt->recv, rq_list)
-               if (entry->rq_xid == xid) {
-                       trace_xprt_lookup_rqst(xprt, xid, 0);
-                       entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime);
-                       return entry;
-               }
+       entry = xprt_request_rb_find(xprt, xid);
+       if (entry != NULL) {
+               trace_xprt_lookup_rqst(xprt, xid, 0);
+               entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime);
+               return entry;
+       }
 
        dprintk("RPC:       xprt_lookup_rqst did not find xid %08x\n",
                        ntohl(xid));
@@ -847,16 +955,22 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
 }
 EXPORT_SYMBOL_GPL(xprt_lookup_rqst);
 
+static bool
+xprt_is_pinned_rqst(struct rpc_rqst *req)
+{
+       return atomic_read(&req->rq_pin) != 0;
+}
+
 /**
  * xprt_pin_rqst - Pin a request on the transport receive list
  * @req: Request to pin
  *
  * Caller must ensure this is atomic with the call to xprt_lookup_rqst()
- * so should be holding the xprt transport lock.
+ * so should be holding the xprt receive lock.
  */
 void xprt_pin_rqst(struct rpc_rqst *req)
 {
-       set_bit(RPC_TASK_MSG_RECV, &req->rq_task->tk_runstate);
+       atomic_inc(&req->rq_pin);
 }
 EXPORT_SYMBOL_GPL(xprt_pin_rqst);
 
@@ -864,38 +978,87 @@ EXPORT_SYMBOL_GPL(xprt_pin_rqst);
  * xprt_unpin_rqst - Unpin a request on the transport receive list
  * @req: Request to unpin
  *
- * Caller should be holding the xprt transport lock.
+ * Caller should be holding the xprt receive lock.
  */
 void xprt_unpin_rqst(struct rpc_rqst *req)
 {
-       struct rpc_task *task = req->rq_task;
-
-       clear_bit(RPC_TASK_MSG_RECV, &task->tk_runstate);
-       if (test_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate))
-               wake_up_bit(&task->tk_runstate, RPC_TASK_MSG_RECV);
+       if (!test_bit(RPC_TASK_MSG_PIN_WAIT, &req->rq_task->tk_runstate)) {
+               atomic_dec(&req->rq_pin);
+               return;
+       }
+       if (atomic_dec_and_test(&req->rq_pin))
+               wake_up_var(&req->rq_pin);
 }
 EXPORT_SYMBOL_GPL(xprt_unpin_rqst);
 
 static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req)
-__must_hold(&req->rq_xprt->recv_lock)
 {
-       struct rpc_task *task = req->rq_task;
+       wait_var_event(&req->rq_pin, !xprt_is_pinned_rqst(req));
+}
 
-       if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) {
-               spin_unlock(&req->rq_xprt->recv_lock);
-               set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate);
-               wait_on_bit(&task->tk_runstate, RPC_TASK_MSG_RECV,
-                               TASK_UNINTERRUPTIBLE);
-               clear_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate);
-               spin_lock(&req->rq_xprt->recv_lock);
-       }
+static bool
+xprt_request_data_received(struct rpc_task *task)
+{
+       return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) &&
+               READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) != 0;
+}
+
+static bool
+xprt_request_need_enqueue_receive(struct rpc_task *task, struct rpc_rqst *req)
+{
+       return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) &&
+               READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) == 0;
+}
+
+/**
+ * xprt_request_enqueue_receive - Add a request to the receive queue
+ * @task: RPC task
+ *
+ */
+void
+xprt_request_enqueue_receive(struct rpc_task *task)
+{
+       struct rpc_rqst *req = task->tk_rqstp;
+       struct rpc_xprt *xprt = req->rq_xprt;
+
+       if (!xprt_request_need_enqueue_receive(task, req))
+               return;
+       spin_lock(&xprt->queue_lock);
+
+       /* Update the softirq receive buffer */
+       memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
+                       sizeof(req->rq_private_buf));
+
+       /* Add request to the receive list */
+       xprt_request_rb_insert(xprt, req);
+       set_bit(RPC_TASK_NEED_RECV, &task->tk_runstate);
+       spin_unlock(&xprt->queue_lock);
+
+       xprt_reset_majortimeo(req);
+       /* Turn off autodisconnect */
+       del_singleshot_timer_sync(&xprt->timer);
+}
+
+/**
+ * xprt_request_dequeue_receive_locked - Remove a request from the receive queue
+ * @task: RPC task
+ *
+ * Caller must hold xprt->queue_lock.
+ */
+static void
+xprt_request_dequeue_receive_locked(struct rpc_task *task)
+{
+       struct rpc_rqst *req = task->tk_rqstp;
+
+       if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate))
+               xprt_request_rb_remove(req->rq_xprt, req);
 }
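
xprt_request_rb_insert() and xprt_request_rb_find() are introduced earlier in
this patch; conceptually they are a textbook rbtree keyed on the raw XID,
roughly along these lines (a sketch of the find side only, assuming the rq_recv
rb_node field added to struct rpc_rqst elsewhere in this series; the real
insert helper also does the rb_link_node()/rb_insert_color() rebalancing):

	static struct rpc_rqst *
	xprt_request_rb_find(struct rpc_xprt *xprt, __be32 xid)
	{
		struct rb_node *n = xprt->recv_queue.rb_node;
		struct rpc_rqst *req;

		while (n != NULL) {
			req = rb_entry(n, struct rpc_rqst, rq_recv);
			if ((__force u32)xid < (__force u32)req->rq_xid)
				n = n->rb_left;
			else if ((__force u32)xid > (__force u32)req->rq_xid)
				n = n->rb_right;
			else
				return req;	/* found the matching request */
		}
		return NULL;
	}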
 
 /**
  * xprt_update_rtt - Update RPC RTT statistics
  * @task: RPC request that recently completed
  *
- * Caller holds xprt->recv_lock.
+ * Caller holds xprt->queue_lock.
  */
 void xprt_update_rtt(struct rpc_task *task)
 {
@@ -917,7 +1080,7 @@ EXPORT_SYMBOL_GPL(xprt_update_rtt);
  * @task: RPC request that recently completed
  * @copied: actual number of bytes received from the transport
  *
- * Caller holds xprt->recv_lock.
+ * Caller holds xprt->queue_lock.
  */
 void xprt_complete_rqst(struct rpc_task *task, int copied)
 {
@@ -930,12 +1093,12 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
 
        xprt->stat.recvs++;
 
-       list_del_init(&req->rq_list);
        req->rq_private_buf.len = copied;
        /* Ensure all writes are done before we update */
        /* req->rq_reply_bytes_recvd */
        smp_wmb();
        req->rq_reply_bytes_recvd = copied;
+       xprt_request_dequeue_receive_locked(task);
        rpc_wake_up_queued_task(&xprt->pending, task);
 }
 EXPORT_SYMBOL_GPL(xprt_complete_rqst);
@@ -956,6 +1119,172 @@ static void xprt_timer(struct rpc_task *task)
                task->tk_status = 0;
 }
 
+/**
+ * xprt_request_wait_receive - wait for the reply to an RPC request
+ * @task: RPC task awaiting a reply
+ *
+ */
+void xprt_request_wait_receive(struct rpc_task *task)
+{
+       struct rpc_rqst *req = task->tk_rqstp;
+       struct rpc_xprt *xprt = req->rq_xprt;
+
+       if (!test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate))
+               return;
+       /*
+        * Sleep on the pending queue if we're expecting a reply.
+        * The spinlock ensures atomicity between the test of
+        * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on().
+        */
+       spin_lock(&xprt->queue_lock);
+       if (test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) {
+               xprt->ops->set_retrans_timeout(task);
+               rpc_sleep_on(&xprt->pending, task, xprt_timer);
+               /*
+                * Send an extra queue wakeup call if the
+                * connection was dropped in case the call to
+                * rpc_sleep_on() raced.
+                */
+               if (xprt_request_retransmit_after_disconnect(task))
+                       rpc_wake_up_queued_task_set_status(&xprt->pending,
+                                       task, -ENOTCONN);
+       }
+       spin_unlock(&xprt->queue_lock);
+}
+
+static bool
+xprt_request_need_enqueue_transmit(struct rpc_task *task, struct rpc_rqst *req)
+{
+       return !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate);
+}
+
+/**
+ * xprt_request_enqueue_transmit - queue a task for transmission
+ * @task: pointer to rpc_task
+ *
+ * Add a task to the transmission queue.
+ */
+void
+xprt_request_enqueue_transmit(struct rpc_task *task)
+{
+       struct rpc_rqst *pos, *req = task->tk_rqstp;
+       struct rpc_xprt *xprt = req->rq_xprt;
+
+       if (xprt_request_need_enqueue_transmit(task, req)) {
+               spin_lock(&xprt->queue_lock);
+               /*
+                * Requests that carry congestion control credits are added
+                * to the head of the list to avoid starvation issues.
+                */
+               if (req->rq_cong) {
+                       xprt_clear_congestion_window_wait(xprt);
+                       list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) {
+                               if (pos->rq_cong)
+                                       continue;
+                               /* Note: req is added _before_ pos */
+                               list_add_tail(&req->rq_xmit, &pos->rq_xmit);
+                               INIT_LIST_HEAD(&req->rq_xmit2);
+                               goto out;
+                       }
+               } else if (RPC_IS_SWAPPER(task)) {
+                       list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) {
+                               if (pos->rq_cong || pos->rq_bytes_sent)
+                                       continue;
+                               if (RPC_IS_SWAPPER(pos->rq_task))
+                                       continue;
+                               /* Note: req is added _before_ pos */
+                               list_add_tail(&req->rq_xmit, &pos->rq_xmit);
+                               INIT_LIST_HEAD(&req->rq_xmit2);
+                               goto out;
+                       }
+               } else {
+                       list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) {
+                               if (pos->rq_task->tk_owner != task->tk_owner)
+                                       continue;
+                               list_add_tail(&req->rq_xmit2, &pos->rq_xmit2);
+                               INIT_LIST_HEAD(&req->rq_xmit);
+                               goto out;
+                       }
+               }
+               list_add_tail(&req->rq_xmit, &xprt->xmit_queue);
+               INIT_LIST_HEAD(&req->rq_xmit2);
+out:
+               set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate);
+               spin_unlock(&xprt->queue_lock);
+       }
+}
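
The transmit queue is two-level: rq_xmit threads one request per "group" onto
xprt->xmit_queue, while rq_xmit2 chains further requests sharing the group
head's tk_owner behind it. Roughly (an illustrative comment, not code from the
patch):

	/*
	 *  xprt->xmit_queue (rq_xmit)     group members (rq_xmit2)
	 *
	 *  req A (owner 1)  ----------->  req B (owner 1) --> req C (owner 1)
	 *    |
	 *  req D (owner 2)
	 *    |
	 *  req E (owner 3)  ----------->  req F (owner 3)
	 */

When a group head is dequeued, xprt_request_dequeue_transmit_locked() below
promotes the first rq_xmit2 member back onto the xmit_queue, so a group is
never orphaned.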
+
+/**
+ * xprt_request_dequeue_transmit_locked - remove a task from the transmission queue
+ * @task: pointer to rpc_task
+ *
+ * Remove a task from the transmission queue
+ * Caller must hold xprt->queue_lock
+ */
+static void
+xprt_request_dequeue_transmit_locked(struct rpc_task *task)
+{
+       struct rpc_rqst *req = task->tk_rqstp;
+
+       if (!test_and_clear_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
+               return;
+       if (!list_empty(&req->rq_xmit)) {
+               list_del(&req->rq_xmit);
+               if (!list_empty(&req->rq_xmit2)) {
+                       struct rpc_rqst *next = list_first_entry(&req->rq_xmit2,
+                                       struct rpc_rqst, rq_xmit2);
+                       list_del(&req->rq_xmit2);
+                       list_add_tail(&next->rq_xmit, &next->rq_xprt->xmit_queue);
+               }
+       } else {
+               list_del(&req->rq_xmit2);
+       }
+}
+
+/**
+ * xprt_request_dequeue_transmit - remove a task from the transmission queue
+ * @task: pointer to rpc_task
+ *
+ * Remove a task from the transmission queue
+ */
+static void
+xprt_request_dequeue_transmit(struct rpc_task *task)
+{
+       struct rpc_rqst *req = task->tk_rqstp;
+       struct rpc_xprt *xprt = req->rq_xprt;
+
+       spin_lock(&xprt->queue_lock);
+       xprt_request_dequeue_transmit_locked(task);
+       spin_unlock(&xprt->queue_lock);
+}
+
+/**
+ * xprt_request_prepare - prepare an encoded request for transport
+ * @req: pointer to rpc_rqst
+ *
+ * Calls into the transport layer to do whatever is needed to prepare
+ * the request for transmission or receive.
+ */
+void
+xprt_request_prepare(struct rpc_rqst *req)
+{
+       struct rpc_xprt *xprt = req->rq_xprt;
+
+       if (xprt->ops->prepare_request)
+               xprt->ops->prepare_request(req);
+}
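
The ->prepare_request() hook gives a transport a chance to do per-request setup
before the request is queued. As one plausible example, the socket transports
in this series use it to allocate the bio_vec array consumed by the new
iov_iter receive path, which the matching xdr_free_bvec() call added to
xprt_release() below tears down; a sketch of that shape:

	static void
	xs_stream_prepare_request(struct rpc_rqst *req)
	{
		/* Allocate bvecs for the receive buffer; released later
		 * by xdr_free_bvec() in xprt_release(). */
		req->rq_task->tk_status = xdr_alloc_bvec(&req->rq_rcv_buf,
							 GFP_NOIO);
	}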
+
+/**
+ * xprt_request_need_retransmit - Test if a task needs retransmission
+ * @task: pointer to rpc_task
+ *
+ * Test for whether a connection breakage requires the task to retransmit
+ */
+bool
+xprt_request_need_retransmit(struct rpc_task *task)
+{
+       return xprt_request_retransmit_after_disconnect(task);
+}
+
 /**
  * xprt_prepare_transmit - reserve the transport before sending a request
  * @task: RPC task about to send a request
@@ -965,32 +1294,18 @@ bool xprt_prepare_transmit(struct rpc_task *task)
 {
        struct rpc_rqst *req = task->tk_rqstp;
        struct rpc_xprt *xprt = req->rq_xprt;
-       bool ret = false;
 
        dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid);
 
-       spin_lock_bh(&xprt->transport_lock);
-       if (!req->rq_bytes_sent) {
-               if (req->rq_reply_bytes_recvd) {
-                       task->tk_status = req->rq_reply_bytes_recvd;
-                       goto out_unlock;
-               }
-               if ((task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT)
-                   && xprt_connected(xprt)
-                   && req->rq_connect_cookie == xprt->connect_cookie) {
-                       xprt->ops->set_retrans_timeout(task);
-                       rpc_sleep_on(&xprt->pending, task, xprt_timer);
-                       goto out_unlock;
-               }
-       }
-       if (!xprt->ops->reserve_xprt(xprt, task)) {
-               task->tk_status = -EAGAIN;
-               goto out_unlock;
+       if (!xprt_lock_write(xprt, task)) {
+               /* Race breaker: another task may have already sent this request */
+               if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
+                       rpc_wake_up_queued_task_set_status(&xprt->sending,
+                                       task, 0);
+               return false;
        }
-       ret = true;
-out_unlock:
-       spin_unlock_bh(&xprt->transport_lock);
-       return ret;
+       return true;
 }
 
 void xprt_end_transmit(struct rpc_task *task)
@@ -999,54 +1314,62 @@ void xprt_end_transmit(struct rpc_task *task)
 }
 
 /**
- * xprt_transmit - send an RPC request on a transport
- * @task: controlling RPC task
+ * xprt_request_transmit - send an RPC request on a transport
+ * @req: pointer to request to transmit
+ * @snd_task: RPC task that owns the transport lock
  *
- * We have to copy the iovec because sendmsg fiddles with its contents.
+ * This performs the transmission of a single request.
+ * Note that if the request is not owned by @snd_task, the caller must
+ * ensure that it remains pinned for the duration of the call.
+ * Returns '0' on success.
  */
-void xprt_transmit(struct rpc_task *task)
+static int
+xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
 {
-       struct rpc_rqst *req = task->tk_rqstp;
-       struct rpc_xprt *xprt = req->rq_xprt;
+       struct rpc_xprt *xprt = req->rq_xprt;
+       struct rpc_task *task = req->rq_task;
        unsigned int connect_cookie;
+       int is_retrans = RPC_WAS_SENT(task);
        int status;
 
        dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen);
 
-       if (!req->rq_reply_bytes_recvd) {
-               if (list_empty(&req->rq_list) && rpc_reply_expected(task)) {
-                       /*
-                        * Add to the list only if we're expecting a reply
-                        */
-                       /* Update the softirq receive buffer */
-                       memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
-                                       sizeof(req->rq_private_buf));
-                       /* Add request to the receive list */
-                       spin_lock(&xprt->recv_lock);
-                       list_add_tail(&req->rq_list, &xprt->recv);
-                       spin_unlock(&xprt->recv_lock);
-                       xprt_reset_majortimeo(req);
-                       /* Turn off autodisconnect */
-                       del_singleshot_timer_sync(&xprt->timer);
+       if (!req->rq_bytes_sent) {
+               if (xprt_request_data_received(task)) {
+                       status = 0;
+                       goto out_dequeue;
                }
-       } else if (!req->rq_bytes_sent)
-               return;
+               /* Verify that our message lies in the RPCSEC_GSS window */
+               if (rpcauth_xmit_need_reencode(task)) {
+                       status = -EBADMSG;
+                       goto out_dequeue;
+               }
+       }
+
+       /*
+        * Update req->rq_ntrans before transmitting to avoid races with
+        * xprt_update_rtt(), which needs to know that it is recording a
+        * reply to the first transmission.
+        */
+       req->rq_ntrans++;
 
        connect_cookie = xprt->connect_cookie;
-       status = xprt->ops->send_request(task);
+       status = xprt->ops->send_request(req);
        trace_xprt_transmit(xprt, req->rq_xid, status);
        if (status != 0) {
-               task->tk_status = status;
-               return;
+               req->rq_ntrans--;
+               return status;
        }
+
+       if (is_retrans)
+               task->tk_client->cl_stats->rpcretrans++;
+
        xprt_inject_disconnect(xprt);
 
        dprintk("RPC: %5u xmit complete\n", task->tk_pid);
        task->tk_flags |= RPC_TASK_SENT;
        spin_lock_bh(&xprt->transport_lock);
 
-       xprt->ops->set_retrans_timeout(task);
-
        xprt->stat.sends++;
        xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs;
        xprt->stat.bklog_u += xprt->backlog.qlen;
@@ -1055,25 +1378,49 @@ void xprt_transmit(struct rpc_task *task)
        spin_unlock_bh(&xprt->transport_lock);
 
        req->rq_connect_cookie = connect_cookie;
-       if (rpc_reply_expected(task) && !READ_ONCE(req->rq_reply_bytes_recvd)) {
-               /*
-                * Sleep on the pending queue if we're expecting a reply.
-                * The spinlock ensures atomicity between the test of
-                * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on().
-                */
-               spin_lock(&xprt->recv_lock);
-               if (!req->rq_reply_bytes_recvd) {
-                       rpc_sleep_on(&xprt->pending, task, xprt_timer);
-                       /*
-                        * Send an extra queue wakeup call if the
-                        * connection was dropped in case the call to
-                        * rpc_sleep_on() raced.
-                        */
-                       if (!xprt_connected(xprt))
-                               xprt_wake_pending_tasks(xprt, -ENOTCONN);
-               }
-               spin_unlock(&xprt->recv_lock);
+out_dequeue:
+       xprt_request_dequeue_transmit(task);
+       rpc_wake_up_queued_task_set_status(&xprt->sending, task, status);
+       return status;
+}
+
+/**
+ * xprt_transmit - send an RPC request on a transport
+ * @task: controlling RPC task
+ *
+ * Attempts to drain the transmit queue. On exit, either the transport
+ * signalled an error that needs to be handled before transmission can
+ * resume, or @task has finished transmitting and has detected that it
+ * already received a reply.
+ */
+void
+xprt_transmit(struct rpc_task *task)
+{
+       struct rpc_rqst *next, *req = task->tk_rqstp;
+       struct rpc_xprt *xprt = req->rq_xprt;
+       int status;
+
+       spin_lock(&xprt->queue_lock);
+       while (!list_empty(&xprt->xmit_queue)) {
+               next = list_first_entry(&xprt->xmit_queue,
+                               struct rpc_rqst, rq_xmit);
+               xprt_pin_rqst(next);
+               spin_unlock(&xprt->queue_lock);
+               status = xprt_request_transmit(next, task);
+               if (status == -EBADMSG && next != req)
+                       status = 0;
+               cond_resched();
+               spin_lock(&xprt->queue_lock);
+               xprt_unpin_rqst(next);
+               if (status == 0) {
+                       if (!xprt_request_data_received(task) ||
+                           test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
+                               continue;
+               } else if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
+                       task->tk_status = status;
+               break;
        }
+       spin_unlock(&xprt->queue_lock);
 }
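
From the RPC state machine's point of view, the new entry points compose
roughly as follows (an illustrative ordering, simplified from the client's
call_transmit() path; error handling and status checks omitted):

	/* queue on the receive side first, if a reply is expected */
	xprt_request_enqueue_receive(task);
	/* queue the encoded request for transmission */
	xprt_request_enqueue_transmit(task);
	/* take the send lock; may fail if another task will send for us */
	if (xprt_prepare_transmit(task)) {
		xprt_transmit(task);		/* drain xprt->xmit_queue */
		xprt_end_transmit(task);	/* drop the send lock */
	}
	/* sleep until the reply arrives or the timer fires */
	xprt_request_wait_receive(task);

Note that xprt_transmit() drains the whole queue, so the task holding the send
lock may transmit requests owned by other tasks; this is why each request is
pinned while the queue_lock is dropped around ->send_request().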
 
 static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
@@ -1170,20 +1517,6 @@ out_init_req:
 }
 EXPORT_SYMBOL_GPL(xprt_alloc_slot);
 
-void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
-{
-       /* Note: grabbing the xprt_lock_write() ensures that we throttle
-        * new slot allocation if the transport is congested (i.e. when
-        * reconnecting a stream transport or when out of socket write
-        * buffer space).
-        */
-       if (xprt_lock_write(xprt, task)) {
-               xprt_alloc_slot(xprt, task);
-               xprt_release_write(xprt, task);
-       }
-}
-EXPORT_SYMBOL_GPL(xprt_lock_and_alloc_slot);
-
 void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
 {
        spin_lock(&xprt->reserve_lock);
@@ -1250,6 +1583,60 @@ void xprt_free(struct rpc_xprt *xprt)
 }
 EXPORT_SYMBOL_GPL(xprt_free);
 
+static void
+xprt_init_connect_cookie(struct rpc_rqst *req, struct rpc_xprt *xprt)
+{
+       req->rq_connect_cookie = xprt_connect_cookie(xprt) - 1;
+}
+
+static __be32
+xprt_alloc_xid(struct rpc_xprt *xprt)
+{
+       __be32 xid;
+
+       spin_lock(&xprt->reserve_lock);
+       xid = (__force __be32)xprt->xid++;
+       spin_unlock(&xprt->reserve_lock);
+       return xid;
+}
+
+static void
+xprt_init_xid(struct rpc_xprt *xprt)
+{
+       xprt->xid = prandom_u32();
+}
+
+static void
+xprt_request_init(struct rpc_task *task)
+{
+       struct rpc_xprt *xprt = task->tk_xprt;
+       struct rpc_rqst *req = task->tk_rqstp;
+
+       req->rq_timeout = task->tk_client->cl_timeout->to_initval;
+       req->rq_task    = task;
+       req->rq_xprt    = xprt;
+       req->rq_buffer  = NULL;
+       req->rq_xid     = xprt_alloc_xid(xprt);
+       xprt_init_connect_cookie(req, xprt);
+       req->rq_bytes_sent = 0;
+       req->rq_snd_buf.len = 0;
+       req->rq_snd_buf.buflen = 0;
+       req->rq_rcv_buf.len = 0;
+       req->rq_rcv_buf.buflen = 0;
+       req->rq_release_snd_buf = NULL;
+       xprt_reset_majortimeo(req);
+       dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
+                       req, ntohl(req->rq_xid));
+}
+
+static void
+xprt_do_reserve(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+       xprt->ops->alloc_slot(xprt, task);
+       if (task->tk_rqstp != NULL)
+               xprt_request_init(task);
+}
+
 /**
  * xprt_reserve - allocate an RPC request slot
  * @task: RPC task requesting a slot allocation
@@ -1269,7 +1656,7 @@ void xprt_reserve(struct rpc_task *task)
        task->tk_timeout = 0;
        task->tk_status = -EAGAIN;
        if (!xprt_throttle_congested(xprt, task))
-               xprt->ops->alloc_slot(xprt, task);
+               xprt_do_reserve(xprt, task);
 }
 
 /**
@@ -1291,45 +1678,29 @@ void xprt_retry_reserve(struct rpc_task *task)
 
        task->tk_timeout = 0;
        task->tk_status = -EAGAIN;
-       xprt->ops->alloc_slot(xprt, task);
-}
-
-static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt)
-{
-       __be32 xid;
-
-       spin_lock(&xprt->reserve_lock);
-       xid = (__force __be32)xprt->xid++;
-       spin_unlock(&xprt->reserve_lock);
-       return xid;
+       xprt_do_reserve(xprt, task);
 }
 
-static inline void xprt_init_xid(struct rpc_xprt *xprt)
-{
-       xprt->xid = prandom_u32();
-}
-
-void xprt_request_init(struct rpc_task *task)
+static void
+xprt_request_dequeue_all(struct rpc_task *task, struct rpc_rqst *req)
 {
-       struct rpc_xprt *xprt = task->tk_xprt;
-       struct rpc_rqst *req = task->tk_rqstp;
+       struct rpc_xprt *xprt = req->rq_xprt;
 
-       INIT_LIST_HEAD(&req->rq_list);
-       req->rq_timeout = task->tk_client->cl_timeout->to_initval;
-       req->rq_task    = task;
-       req->rq_xprt    = xprt;
-       req->rq_buffer  = NULL;
-       req->rq_xid     = xprt_alloc_xid(xprt);
-       req->rq_connect_cookie = xprt->connect_cookie - 1;
-       req->rq_bytes_sent = 0;
-       req->rq_snd_buf.len = 0;
-       req->rq_snd_buf.buflen = 0;
-       req->rq_rcv_buf.len = 0;
-       req->rq_rcv_buf.buflen = 0;
-       req->rq_release_snd_buf = NULL;
-       xprt_reset_majortimeo(req);
-       dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
-                       req, ntohl(req->rq_xid));
+       if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) ||
+           test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) ||
+           xprt_is_pinned_rqst(req)) {
+               spin_lock(&xprt->queue_lock);
+               xprt_request_dequeue_transmit_locked(task);
+               xprt_request_dequeue_receive_locked(task);
+               while (xprt_is_pinned_rqst(req)) {
+                       set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
+                       spin_unlock(&xprt->queue_lock);
+                       xprt_wait_on_pinned_rqst(req);
+                       spin_lock(&xprt->queue_lock);
+                       clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
+               }
+               spin_unlock(&xprt->queue_lock);
+       }
 }
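
The RPC_TASK_MSG_PIN_WAIT bit keeps the common unpin path cheap:
xprt_unpin_rqst() only attempts a wake-up once a waiter has announced itself
here. The handshake, restated side by side (this is the pairing of the two
functions above, not new code):

	/* unpin side (xprt_unpin_rqst) */
	if (!test_bit(RPC_TASK_MSG_PIN_WAIT, &req->rq_task->tk_runstate)) {
		atomic_dec(&req->rq_pin);	/* fast path: nobody waiting */
		return;
	}
	if (atomic_dec_and_test(&req->rq_pin))
		wake_up_var(&req->rq_pin);	/* a waiter is parked below */

	/* wait side (xprt_request_dequeue_all, under xprt->queue_lock) */
	set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
	spin_unlock(&xprt->queue_lock);
	wait_var_event(&req->rq_pin, !xprt_is_pinned_rqst(req));
	spin_lock(&xprt->queue_lock);
	clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);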
 
 /**
@@ -1345,8 +1716,7 @@ void xprt_release(struct rpc_task *task)
        if (req == NULL) {
                if (task->tk_client) {
                        xprt = task->tk_xprt;
-                       if (xprt->snd_task == task)
-                               xprt_release_write(xprt, task);
+                       xprt_release_write(xprt, task);
                }
                return;
        }
@@ -1356,12 +1726,7 @@ void xprt_release(struct rpc_task *task)
                task->tk_ops->rpc_count_stats(task, task->tk_calldata);
        else if (task->tk_client)
                rpc_count_iostats(task, task->tk_client->cl_metrics);
-       spin_lock(&xprt->recv_lock);
-       if (!list_empty(&req->rq_list)) {
-               list_del_init(&req->rq_list);
-               xprt_wait_on_pinned_rqst(req);
-       }
-       spin_unlock(&xprt->recv_lock);
+       xprt_request_dequeue_all(task, req);
        spin_lock_bh(&xprt->transport_lock);
        xprt->ops->release_xprt(xprt, task);
        if (xprt->ops->release_request)
@@ -1372,6 +1737,7 @@ void xprt_release(struct rpc_task *task)
        if (req->rq_buffer)
                xprt->ops->buf_free(task);
        xprt_inject_disconnect(xprt);
+       xdr_free_bvec(&req->rq_rcv_buf);
        if (req->rq_cred != NULL)
                put_rpccred(req->rq_cred);
        task->tk_rqstp = NULL;
@@ -1385,16 +1751,36 @@ void xprt_release(struct rpc_task *task)
                xprt_free_bc_request(req);
 }
 
+#ifdef CONFIG_SUNRPC_BACKCHANNEL
+void
+xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task)
+{
+       struct xdr_buf *xbufp = &req->rq_snd_buf;
+
+       task->tk_rqstp = req;
+       req->rq_task = task;
+       xprt_init_connect_cookie(req, req->rq_xprt);
+       /*
+        * Set up the xdr_buf length.
+        * This also indicates that the buffer is XDR encoded already.
+        */
+       xbufp->len = xbufp->head[0].iov_len + xbufp->page_len +
+               xbufp->tail[0].iov_len;
+       req->rq_bytes_sent = 0;
+}
+#endif
+
 static void xprt_init(struct rpc_xprt *xprt, struct net *net)
 {
        kref_init(&xprt->kref);
 
        spin_lock_init(&xprt->transport_lock);
        spin_lock_init(&xprt->reserve_lock);
-       spin_lock_init(&xprt->recv_lock);
+       spin_lock_init(&xprt->queue_lock);
 
        INIT_LIST_HEAD(&xprt->free);
-       INIT_LIST_HEAD(&xprt->recv);
+       xprt->recv_queue = RB_ROOT;
+       INIT_LIST_HEAD(&xprt->xmit_queue);
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
        spin_lock_init(&xprt->bc_pa_lock);
        INIT_LIST_HEAD(&xprt->bc_pa_list);
@@ -1407,7 +1793,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net)
 
        rpc_init_wait_queue(&xprt->binding, "xprt_binding");
        rpc_init_wait_queue(&xprt->pending, "xprt_pending");
-       rpc_init_priority_wait_queue(&xprt->sending, "xprt_sending");
+       rpc_init_wait_queue(&xprt->sending, "xprt_sending");
        rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog");
 
        xprt_init_xid(xprt);
index 90adeff4c06b889391ae9d08e22ebe7d13dd9bbc..e5b367a3e517ac605efa0802976afebd8f365149 100644 (file)
@@ -51,12 +51,11 @@ static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt,
                rqst = &req->rl_slot;
 
                rqst->rq_xprt = xprt;
-               INIT_LIST_HEAD(&rqst->rq_list);
                INIT_LIST_HEAD(&rqst->rq_bc_list);
                __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
-               spin_lock_bh(&xprt->bc_pa_lock);
+               spin_lock(&xprt->bc_pa_lock);
                list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
-               spin_unlock_bh(&xprt->bc_pa_lock);
+               spin_unlock(&xprt->bc_pa_lock);
 
                size = r_xprt->rx_data.inline_rsize;
                rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL);
@@ -201,6 +200,9 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst)
        if (!xprt_connected(rqst->rq_xprt))
                goto drop_connection;
 
+       if (!xprt_request_get_cong(rqst->rq_xprt, rqst))
+               return -EBADSLT;
+
        rc = rpcrdma_bc_marshal_reply(rqst);
        if (rc < 0)
                goto failed_marshal;
@@ -228,16 +230,16 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs)
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
        struct rpc_rqst *rqst, *tmp;
 
-       spin_lock_bh(&xprt->bc_pa_lock);
+       spin_lock(&xprt->bc_pa_lock);
        list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
                list_del(&rqst->rq_bc_pa_list);
-               spin_unlock_bh(&xprt->bc_pa_lock);
+               spin_unlock(&xprt->bc_pa_lock);
 
                rpcrdma_bc_free_rqst(r_xprt, rqst);
 
-               spin_lock_bh(&xprt->bc_pa_lock);
+               spin_lock(&xprt->bc_pa_lock);
        }
-       spin_unlock_bh(&xprt->bc_pa_lock);
+       spin_unlock(&xprt->bc_pa_lock);
 }
 
 /**
@@ -255,9 +257,9 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
        rpcrdma_recv_buffer_put(req->rl_reply);
        req->rl_reply = NULL;
 
-       spin_lock_bh(&xprt->bc_pa_lock);
+       spin_lock(&xprt->bc_pa_lock);
        list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
-       spin_unlock_bh(&xprt->bc_pa_lock);
+       spin_unlock(&xprt->bc_pa_lock);
 }
 
 /**
index 0f7c465d9a5aa1abdd29ae583a19d52a9f092df6..7f5632cd5a48acad0e258043bd8affbac6406c05 100644 (file)
@@ -49,46 +49,7 @@ fmr_is_supported(struct rpcrdma_ia *ia)
        return true;
 }
 
-static int
-fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
-{
-       static struct ib_fmr_attr fmr_attr = {
-               .max_pages      = RPCRDMA_MAX_FMR_SGES,
-               .max_maps       = 1,
-               .page_shift     = PAGE_SHIFT
-       };
-
-       mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
-                                      sizeof(u64), GFP_KERNEL);
-       if (!mr->fmr.fm_physaddrs)
-               goto out_free;
-
-       mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
-                           sizeof(*mr->mr_sg), GFP_KERNEL);
-       if (!mr->mr_sg)
-               goto out_free;
-
-       sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES);
-
-       mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
-                                    &fmr_attr);
-       if (IS_ERR(mr->fmr.fm_mr))
-               goto out_fmr_err;
-
-       INIT_LIST_HEAD(&mr->mr_list);
-       return 0;
-
-out_fmr_err:
-       dprintk("RPC:       %s: ib_alloc_fmr returned %ld\n", __func__,
-               PTR_ERR(mr->fmr.fm_mr));
-
-out_free:
-       kfree(mr->mr_sg);
-       kfree(mr->fmr.fm_physaddrs);
-       return -ENOMEM;
-}
-
-static int
+static void
 __fmr_unmap(struct rpcrdma_mr *mr)
 {
        LIST_HEAD(l);
@@ -97,13 +58,16 @@ __fmr_unmap(struct rpcrdma_mr *mr)
        list_add(&mr->fmr.fm_mr->list, &l);
        rc = ib_unmap_fmr(&l);
        list_del(&mr->fmr.fm_mr->list);
-       return rc;
+       if (rc)
+               pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
+                      mr, rc);
 }
 
+/* Release an MR.
+ */
 static void
 fmr_op_release_mr(struct rpcrdma_mr *mr)
 {
-       LIST_HEAD(unmap_list);
        int rc;
 
        kfree(mr->fmr.fm_physaddrs);
@@ -112,10 +76,7 @@ fmr_op_release_mr(struct rpcrdma_mr *mr)
        /* In case this one was left mapped, try to unmap it
         * to prevent dealloc_fmr from failing with EBUSY
         */
-       rc = __fmr_unmap(mr);
-       if (rc)
-               pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
-                      mr, rc);
+       __fmr_unmap(mr);
 
        rc = ib_dealloc_fmr(mr->fmr.fm_mr);
        if (rc)
@@ -125,40 +86,68 @@ fmr_op_release_mr(struct rpcrdma_mr *mr)
        kfree(mr);
 }
 
-/* Reset of a single FMR.
+/* MRs are dynamically allocated, so simply clean up and release the MR.
+ * A replacement MR will subsequently be allocated on demand.
  */
 static void
-fmr_op_recover_mr(struct rpcrdma_mr *mr)
+fmr_mr_recycle_worker(struct work_struct *work)
 {
+       struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle);
        struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
-       int rc;
 
-       /* ORDER: invalidate first */
-       rc = __fmr_unmap(mr);
-       if (rc)
-               goto out_release;
-
-       /* ORDER: then DMA unmap */
-       rpcrdma_mr_unmap_and_put(mr);
+       trace_xprtrdma_mr_recycle(mr);
 
-       r_xprt->rx_stats.mrs_recovered++;
-       return;
-
-out_release:
-       pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr);
-       r_xprt->rx_stats.mrs_orphaned++;
-
-       trace_xprtrdma_dma_unmap(mr);
+       trace_xprtrdma_mr_unmap(mr);
        ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
                        mr->mr_sg, mr->mr_nents, mr->mr_dir);
 
        spin_lock(&r_xprt->rx_buf.rb_mrlock);
        list_del(&mr->mr_all);
+       r_xprt->rx_stats.mrs_recycled++;
        spin_unlock(&r_xprt->rx_buf.rb_mrlock);
-
        fmr_op_release_mr(mr);
 }
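
rpcrdma_mr_recycle() itself is a small helper added elsewhere in this series;
in essence it defers to the worker above so that teardown never runs in a
completion-handler context (a sketch, assuming the schedule_work()-based form):

	static inline void
	rpcrdma_mr_recycle(struct rpcrdma_mr *mr)
	{
		/* Defer unmap and release to process context */
		schedule_work(&mr->mr_recycle);
	}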
 
+static int
+fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
+{
+       static struct ib_fmr_attr fmr_attr = {
+               .max_pages      = RPCRDMA_MAX_FMR_SGES,
+               .max_maps       = 1,
+               .page_shift     = PAGE_SHIFT
+       };
+
+       mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
+                                      sizeof(u64), GFP_KERNEL);
+       if (!mr->fmr.fm_physaddrs)
+               goto out_free;
+
+       mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
+                           sizeof(*mr->mr_sg), GFP_KERNEL);
+       if (!mr->mr_sg)
+               goto out_free;
+
+       sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES);
+
+       mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
+                                    &fmr_attr);
+       if (IS_ERR(mr->fmr.fm_mr))
+               goto out_fmr_err;
+
+       INIT_LIST_HEAD(&mr->mr_list);
+       INIT_WORK(&mr->mr_recycle, fmr_mr_recycle_worker);
+       return 0;
+
+out_fmr_err:
+       dprintk("RPC:       %s: ib_alloc_fmr returned %ld\n", __func__,
+               PTR_ERR(mr->fmr.fm_mr));
+
+out_free:
+       kfree(mr->mr_sg);
+       kfree(mr->fmr.fm_physaddrs);
+       return -ENOMEM;
+}
+
 /* On success, sets:
  *     ep->rep_attr.cap.max_send_wr
  *     ep->rep_attr.cap.max_recv_wr
@@ -187,6 +176,7 @@ fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 
        ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
                                RPCRDMA_MAX_FMR_SGES);
+       ia->ri_max_segs += 2;   /* segments for head and tail buffers */
        return 0;
 }
 
@@ -244,7 +234,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
                                     mr->mr_sg, i, mr->mr_dir);
        if (!mr->mr_nents)
                goto out_dmamap_err;
-       trace_xprtrdma_dma_map(mr);
+       trace_xprtrdma_mr_map(mr);
 
        for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++)
                dma_pages[i] = sg_dma_address(&mr->mr_sg[i]);
@@ -305,13 +295,13 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
        list_for_each_entry(mr, mrs, mr_list) {
                dprintk("RPC:       %s: unmapping fmr %p\n",
                        __func__, &mr->fmr);
-               trace_xprtrdma_localinv(mr);
+               trace_xprtrdma_mr_localinv(mr);
                list_add_tail(&mr->fmr.fm_mr->list, &unmap_list);
        }
        r_xprt->rx_stats.local_inv_needed++;
        rc = ib_unmap_fmr(&unmap_list);
        if (rc)
-               goto out_reset;
+               goto out_release;
 
        /* ORDER: Now DMA unmap all of the req's MRs, and return
         * them to the free MW list.
@@ -324,13 +314,13 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
 
        return;
 
-out_reset:
+out_release:
        pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
 
        while (!list_empty(mrs)) {
                mr = rpcrdma_mr_pop(mrs);
                list_del(&mr->fmr.fm_mr->list);
-               fmr_op_recover_mr(mr);
+               rpcrdma_mr_recycle(mr);
        }
 }
 
@@ -338,7 +328,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
        .ro_map                         = fmr_op_map,
        .ro_send                        = fmr_op_send,
        .ro_unmap_sync                  = fmr_op_unmap_sync,
-       .ro_recover_mr                  = fmr_op_recover_mr,
        .ro_open                        = fmr_op_open,
        .ro_maxpages                    = fmr_op_maxpages,
        .ro_init_mr                     = fmr_op_init_mr,
index 1bb00dd6ccdb83b780328c935fd1d5ffcc7f2eef..fc6378cc0c1c70d9d5e0f148e28db3152ba5cfe3 100644 (file)
@@ -97,6 +97,44 @@ out_not_supported:
        return false;
 }
 
+static void
+frwr_op_release_mr(struct rpcrdma_mr *mr)
+{
+       int rc;
+
+       rc = ib_dereg_mr(mr->frwr.fr_mr);
+       if (rc)
+               pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
+                      mr, rc);
+       kfree(mr->mr_sg);
+       kfree(mr);
+}
+
+/* MRs are dynamically allocated, so simply clean up and release the MR.
+ * A replacement MR will subsequently be allocated on demand.
+ */
+static void
+frwr_mr_recycle_worker(struct work_struct *work)
+{
+       struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle);
+       enum rpcrdma_frwr_state state = mr->frwr.fr_state;
+       struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
+
+       trace_xprtrdma_mr_recycle(mr);
+
+       if (state != FRWR_FLUSHED_LI) {
+               trace_xprtrdma_mr_unmap(mr);
+               ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
+                               mr->mr_sg, mr->mr_nents, mr->mr_dir);
+       }
+
+       spin_lock(&r_xprt->rx_buf.rb_mrlock);
+       list_del(&mr->mr_all);
+       r_xprt->rx_stats.mrs_recycled++;
+       spin_unlock(&r_xprt->rx_buf.rb_mrlock);
+       frwr_op_release_mr(mr);
+}
+
 static int
 frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
 {
@@ -113,6 +151,7 @@ frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
                goto out_list_err;
 
        INIT_LIST_HEAD(&mr->mr_list);
+       INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker);
        sg_init_table(mr->mr_sg, depth);
        init_completion(&frwr->fr_linv_done);
        return 0;
@@ -131,79 +170,6 @@ out_list_err:
        return rc;
 }
 
-static void
-frwr_op_release_mr(struct rpcrdma_mr *mr)
-{
-       int rc;
-
-       rc = ib_dereg_mr(mr->frwr.fr_mr);
-       if (rc)
-               pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
-                      mr, rc);
-       kfree(mr->mr_sg);
-       kfree(mr);
-}
-
-static int
-__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
-{
-       struct rpcrdma_frwr *frwr = &mr->frwr;
-       int rc;
-
-       rc = ib_dereg_mr(frwr->fr_mr);
-       if (rc) {
-               pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n",
-                       rc, mr);
-               return rc;
-       }
-
-       frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype,
-                                 ia->ri_max_frwr_depth);
-       if (IS_ERR(frwr->fr_mr)) {
-               pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
-                       PTR_ERR(frwr->fr_mr), mr);
-               return PTR_ERR(frwr->fr_mr);
-       }
-
-       dprintk("RPC:       %s: recovered FRWR %p\n", __func__, frwr);
-       frwr->fr_state = FRWR_IS_INVALID;
-       return 0;
-}
-
-/* Reset of a single FRWR. Generate a fresh rkey by replacing the MR.
- */
-static void
-frwr_op_recover_mr(struct rpcrdma_mr *mr)
-{
-       enum rpcrdma_frwr_state state = mr->frwr.fr_state;
-       struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       int rc;
-
-       rc = __frwr_mr_reset(ia, mr);
-       if (state != FRWR_FLUSHED_LI) {
-               trace_xprtrdma_dma_unmap(mr);
-               ib_dma_unmap_sg(ia->ri_device,
-                               mr->mr_sg, mr->mr_nents, mr->mr_dir);
-       }
-       if (rc)
-               goto out_release;
-
-       rpcrdma_mr_put(mr);
-       r_xprt->rx_stats.mrs_recovered++;
-       return;
-
-out_release:
-       pr_err("rpcrdma: FRWR reset failed %d, %p released\n", rc, mr);
-       r_xprt->rx_stats.mrs_orphaned++;
-
-       spin_lock(&r_xprt->rx_buf.rb_mrlock);
-       list_del(&mr->mr_all);
-       spin_unlock(&r_xprt->rx_buf.rb_mrlock);
-
-       frwr_op_release_mr(mr);
-}
-
 /* On success, sets:
  *     ep->rep_attr.cap.max_send_wr
  *     ep->rep_attr.cap.max_recv_wr
@@ -276,6 +242,7 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 
        ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
                                ia->ri_max_frwr_depth);
+       ia->ri_max_segs += 2;   /* segments for head and tail buffers */
        return 0;
 }
 
@@ -384,7 +351,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        mr = NULL;
        do {
                if (mr)
-                       rpcrdma_mr_defer_recovery(mr);
+                       rpcrdma_mr_recycle(mr);
                mr = rpcrdma_mr_get(r_xprt);
                if (!mr)
                        return ERR_PTR(-EAGAIN);
@@ -417,7 +384,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir);
        if (!mr->mr_nents)
                goto out_dmamap_err;
-       trace_xprtrdma_dma_map(mr);
+       trace_xprtrdma_mr_map(mr);
 
        ibmr = frwr->fr_mr;
        n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE);
@@ -451,7 +418,7 @@ out_dmamap_err:
 out_mapmr_err:
        pr_err("rpcrdma: failed to map mr %p (%d/%d)\n",
               frwr->fr_mr, n, mr->mr_nents);
-       rpcrdma_mr_defer_recovery(mr);
+       rpcrdma_mr_recycle(mr);
        return ERR_PTR(-EIO);
 }
 
@@ -499,7 +466,7 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
        list_for_each_entry(mr, mrs, mr_list)
                if (mr->mr_handle == rep->rr_inv_rkey) {
                        list_del_init(&mr->mr_list);
-                       trace_xprtrdma_remoteinv(mr);
+                       trace_xprtrdma_mr_remoteinv(mr);
                        mr->frwr.fr_state = FRWR_IS_INVALID;
                        rpcrdma_mr_unmap_and_put(mr);
                        break;  /* only one invalidated MR per RPC */
@@ -536,7 +503,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
                mr->frwr.fr_state = FRWR_IS_INVALID;
 
                frwr = &mr->frwr;
-               trace_xprtrdma_localinv(mr);
+               trace_xprtrdma_mr_localinv(mr);
 
                frwr->fr_cqe.done = frwr_wc_localinv;
                last = &frwr->fr_invwr;
@@ -570,7 +537,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
        if (bad_wr != first)
                wait_for_completion(&frwr->fr_linv_done);
        if (rc)
-               goto reset_mrs;
+               goto out_release;
 
        /* ORDER: Now DMA unmap all of the MRs, and return
         * them to the free MR list.
@@ -582,22 +549,21 @@ unmap:
        }
        return;
 
-reset_mrs:
+out_release:
        pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc);
 
-       /* Find and reset the MRs in the LOCAL_INV WRs that did not
+       /* Unmap and release the MRs in the LOCAL_INV WRs that did not
         * get posted.
         */
        while (bad_wr) {
                frwr = container_of(bad_wr, struct rpcrdma_frwr,
                                    fr_invwr);
                mr = container_of(frwr, struct rpcrdma_mr, frwr);
-
-               __frwr_mr_reset(ia, mr);
-
                bad_wr = bad_wr->next;
+
+               list_del(&mr->mr_list);
+               frwr_op_release_mr(mr);
        }
-       goto unmap;
 }
 
 const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
@@ -605,7 +571,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
        .ro_send                        = frwr_op_send,
        .ro_reminv                      = frwr_op_reminv,
        .ro_unmap_sync                  = frwr_op_unmap_sync,
-       .ro_recover_mr                  = frwr_op_recover_mr,
        .ro_open                        = frwr_op_open,
        .ro_maxpages                    = frwr_op_maxpages,
        .ro_init_mr                     = frwr_op_init_mr,
index c8ae983c6cc017ae342ac71604625e25bb7fe777..9f53e0240035e4608ccb2f287d39f55664ca4862 100644 (file)
@@ -71,7 +71,6 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
        size = RPCRDMA_HDRLEN_MIN;
 
        /* Maximum Read list size */
-       maxsegs += 2;   /* segment for head and tail buffers */
        size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32);
 
        /* Minimal Read chunk size */
@@ -97,7 +96,6 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
        size = RPCRDMA_HDRLEN_MIN;
 
        /* Maximum Write list size */
-       maxsegs += 2;   /* segment for head and tail buffers */
        size = sizeof(__be32);          /* segment count */
        size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32);
        size += sizeof(__be32); /* list discriminator */
@@ -805,7 +803,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
                struct rpcrdma_mr *mr;
 
                mr = rpcrdma_mr_pop(&req->rl_registered);
-               rpcrdma_mr_defer_recovery(mr);
+               rpcrdma_mr_recycle(mr);
        }
 
        /* This implementation supports the following combinations
@@ -866,7 +864,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
 out_err:
        switch (ret) {
        case -EAGAIN:
-               xprt_wait_for_buffer_space(rqst->rq_task, NULL);
+               xprt_wait_for_buffer_space(rqst->rq_xprt);
                break;
        case -ENOBUFS:
                break;
@@ -1216,7 +1214,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
        struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
        struct rpc_xprt *xprt = &r_xprt->rx_xprt;
        struct rpc_rqst *rqst = rep->rr_rqst;
-       unsigned long cwnd;
        int status;
 
        xprt->reestablish_timeout = 0;
@@ -1238,15 +1235,10 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
                goto out_badheader;
 
 out:
-       spin_lock(&xprt->recv_lock);
-       cwnd = xprt->cwnd;
-       xprt->cwnd = r_xprt->rx_buf.rb_credits << RPC_CWNDSHIFT;
-       if (xprt->cwnd > cwnd)
-               xprt_release_rqst_cong(rqst->rq_task);
-
+       spin_lock(&xprt->queue_lock);
        xprt_complete_rqst(rqst->rq_task, status);
        xprt_unpin_rqst(rqst);
-       spin_unlock(&xprt->recv_lock);
+       spin_unlock(&xprt->queue_lock);
        return;
 
 /* If the incoming reply terminated a pending RPC, the next
@@ -1345,19 +1337,23 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
        /* Match incoming rpcrdma_rep to an rpcrdma_req to
         * get context for handling any incoming chunks.
         */
-       spin_lock(&xprt->recv_lock);
+       spin_lock(&xprt->queue_lock);
        rqst = xprt_lookup_rqst(xprt, rep->rr_xid);
        if (!rqst)
                goto out_norqst;
        xprt_pin_rqst(rqst);
+       spin_unlock(&xprt->queue_lock);
 
        if (credits == 0)
                credits = 1;    /* don't deadlock */
        else if (credits > buf->rb_max_requests)
                credits = buf->rb_max_requests;
-       buf->rb_credits = credits;
-
-       spin_unlock(&xprt->recv_lock);
+       if (buf->rb_credits != credits) {
+               spin_lock_bh(&xprt->transport_lock);
+               buf->rb_credits = credits;
+               xprt->cwnd = credits << RPC_CWNDSHIFT;
+               spin_unlock_bh(&xprt->transport_lock);
+       }
 
        req = rpcr_to_rdmar(rqst);
        req->rl_reply = rep;
@@ -1378,7 +1374,7 @@ out_badversion:
  * is corrupt.
  */
 out_norqst:
-       spin_unlock(&xprt->recv_lock);
+       spin_unlock(&xprt->queue_lock);
        trace_xprtrdma_reply_rqst(rep);
        goto repost;
 
index a68180090554f2f40ebb9ed5cd72a70cd9639541..d3a1a237cee6e4f49f6af104c8cb0c68c7463d65 100644 (file)
@@ -56,7 +56,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
        if (src->iov_len < 24)
                goto out_shortreply;
 
-       spin_lock(&xprt->recv_lock);
+       spin_lock(&xprt->queue_lock);
        req = xprt_lookup_rqst(xprt, xid);
        if (!req)
                goto out_notfound;
@@ -86,7 +86,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
        rcvbuf->len = 0;
 
 out_unlock:
-       spin_unlock(&xprt->recv_lock);
+       spin_unlock(&xprt->queue_lock);
 out:
        return ret;
 
@@ -215,9 +215,8 @@ drop_connection:
  * connection.
  */
 static int
-xprt_rdma_bc_send_request(struct rpc_task *task)
+xprt_rdma_bc_send_request(struct rpc_rqst *rqst)
 {
-       struct rpc_rqst *rqst = task->tk_rqstp;
        struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
        struct svcxprt_rdma *rdma;
        int ret;
@@ -225,12 +224,7 @@ xprt_rdma_bc_send_request(struct rpc_task *task)
        dprintk("svcrdma: sending bc call with xid: %08x\n",
                be32_to_cpu(rqst->rq_xid));
 
-       if (!mutex_trylock(&sxprt->xpt_mutex)) {
-               rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL);
-               if (!mutex_trylock(&sxprt->xpt_mutex))
-                       return -EAGAIN;
-               rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task);
-       }
+       mutex_lock(&sxprt->xpt_mutex);
 
        ret = -ENOTCONN;
        rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
@@ -248,6 +242,7 @@ static void
 xprt_rdma_bc_close(struct rpc_xprt *xprt)
 {
        dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
+       xprt->cwnd = RPC_CWNDSHIFT;
 }
 
 static void
index 143ce2579ba90cca47a87b5413ba35caf9d10792..ae2a83828953706e9b5cde03b8a3449b9aeb3df1 100644 (file)
@@ -225,69 +225,59 @@ xprt_rdma_free_addresses(struct rpc_xprt *xprt)
                }
 }
 
-void
-rpcrdma_conn_func(struct rpcrdma_ep *ep)
-{
-       schedule_delayed_work(&ep->rep_connect_worker, 0);
-}
-
-void
-rpcrdma_connect_worker(struct work_struct *work)
-{
-       struct rpcrdma_ep *ep =
-               container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
-       struct rpcrdma_xprt *r_xprt =
-               container_of(ep, struct rpcrdma_xprt, rx_ep);
-       struct rpc_xprt *xprt = &r_xprt->rx_xprt;
-
-       spin_lock_bh(&xprt->transport_lock);
-       if (ep->rep_connected > 0) {
-               if (!xprt_test_and_set_connected(xprt))
-                       xprt_wake_pending_tasks(xprt, 0);
-       } else {
-               if (xprt_test_and_clear_connected(xprt))
-                       xprt_wake_pending_tasks(xprt, -ENOTCONN);
-       }
-       spin_unlock_bh(&xprt->transport_lock);
-}
-
+/**
+ * xprt_rdma_connect_worker - establish connection in the background
+ * @work: worker thread context
+ *
+ * Requester holds the xprt's send lock to prevent activity on this
+ * transport while a fresh connection is being established. RPC tasks
+ * sleep on the xprt's pending queue waiting for connect to complete.
+ */
 static void
 xprt_rdma_connect_worker(struct work_struct *work)
 {
        struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt,
                                                   rx_connect_worker.work);
        struct rpc_xprt *xprt = &r_xprt->rx_xprt;
-       int rc = 0;
-
-       xprt_clear_connected(xprt);
+       int rc;
 
        rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
-       if (rc)
-               xprt_wake_pending_tasks(xprt, rc);
-
        xprt_clear_connecting(xprt);
+       if (r_xprt->rx_ep.rep_connected > 0) {
+               if (!xprt_test_and_set_connected(xprt)) {
+                       xprt->stat.connect_count++;
+                       xprt->stat.connect_time += (long)jiffies -
+                                                  xprt->stat.connect_start;
+                       xprt_wake_pending_tasks(xprt, -EAGAIN);
+               }
+       } else {
+               if (xprt_test_and_clear_connected(xprt))
+                       xprt_wake_pending_tasks(xprt, rc);
+       }
 }
 
+/**
+ * xprt_rdma_inject_disconnect - inject a connection fault
+ * @xprt: transport context
+ *
+ * If @xprt is connected, disconnect it to simulate spurious connection
+ * loss.
+ */
 static void
 xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
 {
-       struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt,
-                                                  rx_xprt);
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 
        trace_xprtrdma_inject_dsc(r_xprt);
        rdma_disconnect(r_xprt->rx_ia.ri_id);
 }
 
-/*
- * xprt_rdma_destroy
+/**
+ * xprt_rdma_destroy - Full tear down of transport
+ * @xprt: doomed transport context
  *
- * Destroy the xprt.
- * Free all memory associated with the object, including its own.
- * NOTE: none of the *destroy methods free memory for their top-level
- * objects, even though they may have allocated it (they do free
- * private memory). It's up to the caller to handle it. In this
- * case (RDMA transport), all structure memory is inlined with the
- * struct rpcrdma_xprt.
+ * Caller guarantees there will be no more calls to us with
+ * this @xprt.
  */
 static void
 xprt_rdma_destroy(struct rpc_xprt *xprt)
@@ -298,8 +288,6 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
 
        cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
 
-       xprt_clear_connected(xprt);
-
        rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
        rpcrdma_buffer_destroy(&r_xprt->rx_buf);
        rpcrdma_ia_close(&r_xprt->rx_ia);
@@ -442,11 +430,12 @@ out1:
 }
 
 /**
- * xprt_rdma_close - Close down RDMA connection
- * @xprt: generic transport to be closed
+ * xprt_rdma_close - close a transport connection
+ * @xprt: transport context
  *
- * Called during transport shutdown reconnect, or device
- * removal. Caller holds the transport's write lock.
+ * Called during transport shutdown, reconnect, or device removal.
+ * Caller holds @xprt's send lock to prevent activity on this
+ * transport while the connection is torn down.
  */
 static void
 xprt_rdma_close(struct rpc_xprt *xprt)
@@ -468,6 +457,12 @@ xprt_rdma_close(struct rpc_xprt *xprt)
                xprt->reestablish_timeout = 0;
        xprt_disconnect_done(xprt);
        rpcrdma_ep_disconnect(ep, ia);
+
+       /* Prepare @xprt for the next connection by reinitializing
+        * its credit grant to one (see RFC 8166, Section 3.3.3).
+        */
+       r_xprt->rx_buf.rb_credits = 1;
+       xprt->cwnd = RPC_CWNDSHIFT;
 }
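
The credit grant reaches the generic RPC layer through xprt->cwnd, as in the
reply handler's "xprt->cwnd = credits << RPC_CWNDSHIFT" earlier in this diff.
Assuming the usual definition of RPC_CWNDSHIFT as 8 (so RPC_CWNDSCALE == 256,
one congestion-window unit per in-flight request), the scaling works out as:

	credits = 1    =>  cwnd = 1  << 8 = 256    /* one request in flight */
	credits = 32   =>  cwnd = 32 << 8 = 8192   /* thirty-two in flight */

Resetting rb_credits to 1 here therefore limits the next connection to a
single outstanding RPC until the server advertises a fresh credit grant.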
 
 /**
@@ -519,6 +514,12 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task)
        xprt_force_disconnect(xprt);
 }
 
+/**
+ * xprt_rdma_connect - try to establish a transport connection
+ * @xprt: transport state
+ * @task: RPC scheduler context
+ *
+ */
 static void
 xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 {
@@ -638,13 +639,6 @@ rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
  *        0:   Success; rq_buffer points to RPC buffer to use
  *   ENOMEM:   Out of memory, call again later
  *      EIO:   A permanent error occurred, do not retry
- *
- * The RDMA allocate/free functions need the task structure as a place
- * to hide the struct rpcrdma_req, which is necessary for the actual
- * send/recv sequence.
- *
- * xprt_rdma_allocate provides buffers that are already mapped for
- * DMA, and a local DMA lkey is provided for each.
  */
 static int
 xprt_rdma_allocate(struct rpc_task *task)
@@ -693,7 +687,7 @@ xprt_rdma_free(struct rpc_task *task)
 
 /**
  * xprt_rdma_send_request - marshal and send an RPC request
- * @task: RPC task with an RPC message in rq_snd_buf
+ * @rqst: RPC request whose message is in rq_snd_buf
  *
  * Caller holds the transport's write lock.
  *
@@ -706,9 +700,8 @@ xprt_rdma_free(struct rpc_task *task)
  *             sent. Do not try to send this message again.
  */
 static int
-xprt_rdma_send_request(struct rpc_task *task)
+xprt_rdma_send_request(struct rpc_rqst *rqst)
 {
-       struct rpc_rqst *rqst = task->tk_rqstp;
        struct rpc_xprt *xprt = rqst->rq_xprt;
        struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
@@ -722,6 +715,9 @@ xprt_rdma_send_request(struct rpc_task *task)
        if (!xprt_connected(xprt))
                goto drop_connection;
 
+       if (!xprt_request_get_cong(xprt, rqst))
+               return -EBADSLT;
+
        rc = rpcrdma_marshal_req(r_xprt, rqst);
        if (rc < 0)
                goto failed_marshal;
@@ -741,7 +737,7 @@ xprt_rdma_send_request(struct rpc_task *task)
        /* An RPC with no reply will throw off credit accounting,
         * so drop the connection to reset the credit grant.
         */
-       if (!rpc_reply_expected(task))
+       if (!rpc_reply_expected(rqst->rq_task))
                goto drop_connection;
        return 0;
 
@@ -766,7 +762,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
                   0,   /* need a local port? */
                   xprt->stat.bind_count,
                   xprt->stat.connect_count,
-                  xprt->stat.connect_time,
+                  xprt->stat.connect_time / HZ,
                   idle_time,
                   xprt->stat.sends,
                   xprt->stat.recvs,
@@ -786,7 +782,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
                   r_xprt->rx_stats.bad_reply_count,
                   r_xprt->rx_stats.nomsg_call_count);
        seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n",
-                  r_xprt->rx_stats.mrs_recovered,
+                  r_xprt->rx_stats.mrs_recycled,
                   r_xprt->rx_stats.mrs_orphaned,
                   r_xprt->rx_stats.mrs_allocated,
                   r_xprt->rx_stats.local_inv_needed,
index 956a5ea47b58ee8a6009aa8b6315ce5887ec9110..3ddba94c939f64e223e375fdfc75d54aebe14a0b 100644 (file)
@@ -108,20 +108,48 @@ rpcrdma_destroy_wq(void)
        }
 }
 
+/**
+ * rpcrdma_disconnect_worker - Force a disconnect
+ * @work: endpoint to be disconnected
+ *
+ * Provider callbacks may run in an IRQ context. This function
+ * is invoked in a worker thread to guarantee that disconnect wake-up
+ * calls are always done in process context.
+ */
+static void
+rpcrdma_disconnect_worker(struct work_struct *work)
+{
+       struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep,
+                                            rep_disconnect_worker.work);
+       struct rpcrdma_xprt *r_xprt =
+               container_of(ep, struct rpcrdma_xprt, rx_ep);
+
+       xprt_force_disconnect(&r_xprt->rx_xprt);
+}
+
+/**
+ * rpcrdma_qp_event_handler - Handle one QP event (error notification)
+ * @event: details of the event
+ * @context: ep that owns QP where event occurred
+ *
+ * Called from the RDMA provider (device driver) possibly in an interrupt
+ * context.
+ */
 static void
-rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
+rpcrdma_qp_event_handler(struct ib_event *event, void *context)
 {
        struct rpcrdma_ep *ep = context;
        struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
                                                   rx_ep);
 
-       trace_xprtrdma_qp_error(r_xprt, event);
-       pr_err("rpcrdma: %s on device %s ep %p\n",
-              ib_event_msg(event->event), event->device->name, context);
+       trace_xprtrdma_qp_event(r_xprt, event);
+       pr_err("rpcrdma: %s on device %s connected to %s:%s\n",
+              ib_event_msg(event->event), event->device->name,
+              rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt));
 
        if (ep->rep_connected == 1) {
                ep->rep_connected = -EIO;
-               rpcrdma_conn_func(ep);
+               schedule_delayed_work(&ep->rep_disconnect_worker, 0);
                wake_up_all(&ep->rep_connect_wait);
        }
 }
@@ -219,38 +247,48 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
        rpcrdma_set_max_header_sizes(r_xprt);
 }
 
+/**
+ * rpcrdma_cm_event_handler - Handle RDMA CM events
+ * @id: rdma_cm_id on which an event has occurred
+ * @event: details of the event
+ *
+ * Called with @id's mutex held. Returns 1 if caller should
+ * destroy @id, otherwise 0.
+ */
 static int
-rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
+rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
 {
-       struct rpcrdma_xprt *xprt = id->context;
-       struct rpcrdma_ia *ia = &xprt->rx_ia;
-       struct rpcrdma_ep *ep = &xprt->rx_ep;
-       int connstate = 0;
+       struct rpcrdma_xprt *r_xprt = id->context;
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+       struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+       might_sleep();
 
-       trace_xprtrdma_conn_upcall(xprt, event);
+       trace_xprtrdma_cm_event(r_xprt, event);
        switch (event->event) {
        case RDMA_CM_EVENT_ADDR_RESOLVED:
        case RDMA_CM_EVENT_ROUTE_RESOLVED:
                ia->ri_async_rc = 0;
                complete(&ia->ri_done);
-               break;
+               return 0;
        case RDMA_CM_EVENT_ADDR_ERROR:
                ia->ri_async_rc = -EPROTO;
                complete(&ia->ri_done);
-               break;
+               return 0;
        case RDMA_CM_EVENT_ROUTE_ERROR:
                ia->ri_async_rc = -ENETUNREACH;
                complete(&ia->ri_done);
-               break;
+               return 0;
        case RDMA_CM_EVENT_DEVICE_REMOVAL:
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
                pr_info("rpcrdma: removing device %s for %s:%s\n",
                        ia->ri_device->name,
-                       rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt));
+                       rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt));
 #endif
                set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
                ep->rep_connected = -ENODEV;
-               xprt_force_disconnect(&xprt->rx_xprt);
+               xprt_force_disconnect(xprt);
                wait_for_completion(&ia->ri_remove_done);
 
                ia->ri_id = NULL;
@@ -258,41 +296,40 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
                /* Return 1 to ensure the core destroys the id. */
                return 1;
        case RDMA_CM_EVENT_ESTABLISHED:
-               ++xprt->rx_xprt.connect_cookie;
-               connstate = 1;
-               rpcrdma_update_connect_private(xprt, &event->param.conn);
-               goto connected;
+               ++xprt->connect_cookie;
+               ep->rep_connected = 1;
+               rpcrdma_update_connect_private(r_xprt, &event->param.conn);
+               wake_up_all(&ep->rep_connect_wait);
+               break;
        case RDMA_CM_EVENT_CONNECT_ERROR:
-               connstate = -ENOTCONN;
-               goto connected;
+               ep->rep_connected = -ENOTCONN;
+               goto disconnected;
        case RDMA_CM_EVENT_UNREACHABLE:
-               connstate = -ENETUNREACH;
-               goto connected;
+               ep->rep_connected = -ENETUNREACH;
+               goto disconnected;
        case RDMA_CM_EVENT_REJECTED:
                dprintk("rpcrdma: connection to %s:%s rejected: %s\n",
-                       rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt),
+                       rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
                        rdma_reject_msg(id, event->status));
-               connstate = -ECONNREFUSED;
+               ep->rep_connected = -ECONNREFUSED;
                if (event->status == IB_CM_REJ_STALE_CONN)
-                       connstate = -EAGAIN;
-               goto connected;
+                       ep->rep_connected = -EAGAIN;
+               goto disconnected;
        case RDMA_CM_EVENT_DISCONNECTED:
-               ++xprt->rx_xprt.connect_cookie;
-               connstate = -ECONNABORTED;
-connected:
-               ep->rep_connected = connstate;
-               rpcrdma_conn_func(ep);
+               ++xprt->connect_cookie;
+               ep->rep_connected = -ECONNABORTED;
+disconnected:
+               xprt_force_disconnect(xprt);
                wake_up_all(&ep->rep_connect_wait);
-               /*FALLTHROUGH*/
+               break;
        default:
-               dprintk("RPC:       %s: %s:%s on %s/%s (ep 0x%p): %s\n",
-                       __func__,
-                       rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt),
-                       ia->ri_device->name, ia->ri_ops->ro_displayname,
-                       ep, rdma_event_msg(event->event));
                break;
        }
 
+       dprintk("RPC:       %s: %s:%s on %s/%s: %s\n", __func__,
+               rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
+               ia->ri_device->name, ia->ri_ops->ro_displayname,
+               rdma_event_msg(event->event));
        return 0;
 }
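
The handler contract (return 1 and the RDMA core destroys the rdma_cm_id on
the handler's behalf) deserves a compact illustration. A hypothetical
skeleton, registered the same way this patch does with rdma_create_id():

#include <rdma/rdma_cm.h>

static int demo_cm_handler(struct rdma_cm_id *id,
			   struct rdma_cm_event *event)
{
	switch (event->event) {
	case RDMA_CM_EVENT_ESTABLISHED:
		/* mark connected, wake up waiters */
		return 0;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		/* tear down; caller must not touch @id afterwards */
		return 1;
	default:
		return 0;
	}
}

/* registration, as in rpcrdma_create_id() above:
 *	id = rdma_create_id(net, demo_cm_handler, context,
 *			    RDMA_PS_TCP, IB_QPT_RC);
 */
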
 
@@ -308,7 +345,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia)
        init_completion(&ia->ri_done);
        init_completion(&ia->ri_remove_done);
 
-       id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_conn_upcall,
+       id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler,
                            xprt, RDMA_PS_TCP, IB_QPT_RC);
        if (IS_ERR(id)) {
                rc = PTR_ERR(id);
@@ -519,7 +556,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        if (rc)
                return rc;
 
-       ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
+       ep->rep_attr.event_handler = rpcrdma_qp_event_handler;
        ep->rep_attr.qp_context = ep;
        ep->rep_attr.srq = NULL;
        ep->rep_attr.cap.max_send_sge = max_sge;
@@ -542,7 +579,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
                                   cdata->max_requests >> 2);
        ep->rep_send_count = ep->rep_send_batch;
        init_waitqueue_head(&ep->rep_connect_wait);
-       INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
+       INIT_DELAYED_WORK(&ep->rep_disconnect_worker,
+                         rpcrdma_disconnect_worker);
 
        sendcq = ib_alloc_cq(ia->ri_device, NULL,
                             ep->rep_attr.cap.max_send_wr + 1,
@@ -615,7 +653,7 @@ out1:
 void
 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
-       cancel_delayed_work_sync(&ep->rep_connect_worker);
+       cancel_delayed_work_sync(&ep->rep_disconnect_worker);
 
        if (ia->ri_id && ia->ri_id->qp) {
                rpcrdma_ep_disconnect(ep, ia);
@@ -728,6 +766,7 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
        struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
                                                   rx_ia);
+       struct rpc_xprt *xprt = &r_xprt->rx_xprt;
        int rc;
 
 retry:
@@ -754,6 +793,8 @@ retry:
        }
 
        ep->rep_connected = 0;
+       xprt_clear_connected(xprt);
+
        rpcrdma_post_recvs(r_xprt, true);
 
        rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
@@ -877,7 +918,6 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
                sc->sc_xprt = r_xprt;
                buf->rb_sc_ctxs[i] = sc;
        }
-       buf->rb_flags = 0;
 
        return 0;
 
@@ -977,39 +1017,6 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc)
        }
 }
 
-static void
-rpcrdma_mr_recovery_worker(struct work_struct *work)
-{
-       struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
-                                                 rb_recovery_worker.work);
-       struct rpcrdma_mr *mr;
-
-       spin_lock(&buf->rb_recovery_lock);
-       while (!list_empty(&buf->rb_stale_mrs)) {
-               mr = rpcrdma_mr_pop(&buf->rb_stale_mrs);
-               spin_unlock(&buf->rb_recovery_lock);
-
-               trace_xprtrdma_recover_mr(mr);
-               mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr);
-
-               spin_lock(&buf->rb_recovery_lock);
-       }
-       spin_unlock(&buf->rb_recovery_lock);
-}
-
-void
-rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr)
-{
-       struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
-       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-
-       spin_lock(&buf->rb_recovery_lock);
-       rpcrdma_mr_push(mr, &buf->rb_stale_mrs);
-       spin_unlock(&buf->rb_recovery_lock);
-
-       schedule_delayed_work(&buf->rb_recovery_worker, 0);
-}
-
 static void
 rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
 {
@@ -1019,7 +1026,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
        LIST_HEAD(free);
        LIST_HEAD(all);
 
-       for (count = 0; count < 3; count++) {
+       for (count = 0; count < ia->ri_max_segs; count++) {
                struct rpcrdma_mr *mr;
                int rc;
 
@@ -1138,18 +1145,15 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
        struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
        int i, rc;
 
+       buf->rb_flags = 0;
        buf->rb_max_requests = r_xprt->rx_data.max_requests;
        buf->rb_bc_srv_max_requests = 0;
        spin_lock_init(&buf->rb_mrlock);
        spin_lock_init(&buf->rb_lock);
-       spin_lock_init(&buf->rb_recovery_lock);
        INIT_LIST_HEAD(&buf->rb_mrs);
        INIT_LIST_HEAD(&buf->rb_all);
-       INIT_LIST_HEAD(&buf->rb_stale_mrs);
        INIT_DELAYED_WORK(&buf->rb_refresh_worker,
                          rpcrdma_mr_refresh_worker);
-       INIT_DELAYED_WORK(&buf->rb_recovery_worker,
-                         rpcrdma_mr_recovery_worker);
 
        rpcrdma_mrs_create(r_xprt);
 
@@ -1233,7 +1237,6 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf)
 void
 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
-       cancel_delayed_work_sync(&buf->rb_recovery_worker);
        cancel_delayed_work_sync(&buf->rb_refresh_worker);
 
        rpcrdma_sendctxs_destroy(buf);
@@ -1326,7 +1329,7 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr)
 {
        struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
 
-       trace_xprtrdma_dma_unmap(mr);
+       trace_xprtrdma_mr_unmap(mr);
        ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
                        mr->mr_sg, mr->mr_nents, mr->mr_dir);
        __rpcrdma_mr_put(&r_xprt->rx_buf, mr);
@@ -1518,9 +1521,11 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
        struct ib_recv_wr *wr, *bad_wr;
        int needed, count, rc;
 
+       rc = 0;
+       count = 0;
        needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
        if (buf->rb_posted_receives > needed)
-               return;
+               goto out;
        needed -= buf->rb_posted_receives;
 
        count = 0;
@@ -1556,7 +1561,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
                --needed;
        }
        if (!count)
-               return;
+               goto out;
 
        rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
                          (const struct ib_recv_wr **)&bad_wr);
@@ -1570,5 +1575,6 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
                }
        }
        buf->rb_posted_receives += count;
+out:
        trace_xprtrdma_post_recvs(r_xprt, count, rc);
 }
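
The hunk above turns early returns into jumps to a single exit so that
trace_xprtrdma_post_recvs() fires even when zero Receives were posted. The
shape of the refactor, reduced to a hypothetical sketch:

#include <linux/kernel.h>

static void demo_post(int posted, int needed)
{
	int count = 0, rc = 0;

	if (posted > needed)
		goto out;	/* previously a bare return: no trace event */

	/* ... post Receive WRs, updating count and rc ... */
out:
	trace_printk("posted %d, rc %d\n", count, rc);
}
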
index 2ca14f7c2d51adb89d8df5f313fc2d30c1e9c16c..a13ccb643ce07e0a3838079f914a20ee586c417e 100644 (file)
@@ -101,7 +101,7 @@ struct rpcrdma_ep {
        wait_queue_head_t       rep_connect_wait;
        struct rpcrdma_connect_private  rep_cm_private;
        struct rdma_conn_param  rep_remote_cma;
-       struct delayed_work     rep_connect_worker;
+       struct delayed_work     rep_disconnect_worker;
 };
 
 /* Pre-allocate extra Work Requests for handling backward receives
@@ -280,6 +280,7 @@ struct rpcrdma_mr {
        u32                     mr_handle;
        u32                     mr_length;
        u64                     mr_offset;
+       struct work_struct      mr_recycle;
        struct list_head        mr_all;
 };
 
@@ -411,9 +412,6 @@ struct rpcrdma_buffer {
 
        u32                     rb_bc_max_requests;
 
-       spinlock_t              rb_recovery_lock; /* protect rb_stale_mrs */
-       struct list_head        rb_stale_mrs;
-       struct delayed_work     rb_recovery_worker;
        struct delayed_work     rb_refresh_worker;
 };
 #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
@@ -452,7 +450,7 @@ struct rpcrdma_stats {
        unsigned long           hardway_register_count;
        unsigned long           failed_marshal_count;
        unsigned long           bad_reply_count;
-       unsigned long           mrs_recovered;
+       unsigned long           mrs_recycled;
        unsigned long           mrs_orphaned;
        unsigned long           mrs_allocated;
        unsigned long           empty_sendctx_q;
@@ -481,7 +479,6 @@ struct rpcrdma_memreg_ops {
                                     struct list_head *mrs);
        void            (*ro_unmap_sync)(struct rpcrdma_xprt *,
                                         struct list_head *);
-       void            (*ro_recover_mr)(struct rpcrdma_mr *mr);
        int             (*ro_open)(struct rpcrdma_ia *,
                                   struct rpcrdma_ep *,
                                   struct rpcrdma_create_data_internal *);
@@ -559,7 +556,6 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
                                struct rpcrdma_create_data_internal *);
 void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
 int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
-void rpcrdma_conn_func(struct rpcrdma_ep *ep);
 void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
 
 int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
@@ -578,7 +574,12 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf);
 struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt);
 void rpcrdma_mr_put(struct rpcrdma_mr *mr);
 void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr);
-void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr);
+
+static inline void
+rpcrdma_mr_recycle(struct rpcrdma_mr *mr)
+{
+       schedule_work(&mr->mr_recycle);
+}
 
 struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
 void rpcrdma_buffer_put(struct rpcrdma_req *);
@@ -652,7 +653,6 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
 extern unsigned int xprt_rdma_max_inline_read;
 void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
 void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
-void rpcrdma_connect_worker(struct work_struct *work);
 void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
 int xprt_rdma_init(void);
 void xprt_rdma_cleanup(void);
index 6b7539c0466e85b164512de3845f11dec404c16d..1b51e04d356609f37b9e8042768a083d31391600 100644 (file)
 #include <net/checksum.h>
 #include <net/udp.h>
 #include <net/tcp.h>
+#include <linux/bvec.h>
+#include <linux/uio.h>
 
 #include <trace/events/sunrpc.h>
 
 #include "sunrpc.h"
 
-#define RPC_TCP_READ_CHUNK_SZ  (3*512*1024)
-
 static void xs_close(struct rpc_xprt *xprt);
 static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
                struct socket *sock);
@@ -129,7 +129,7 @@ static struct ctl_table xs_tunables_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &xprt_min_resvport_limit,
-               .extra2         = &xprt_max_resvport
+               .extra2         = &xprt_max_resvport_limit
        },
        {
                .procname       = "max_resvport",
@@ -137,7 +137,7 @@ static struct ctl_table xs_tunables_table[] = {
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &xprt_min_resvport,
+               .extra1         = &xprt_min_resvport_limit,
                .extra2         = &xprt_max_resvport_limit
        },
        {
@@ -325,6 +325,362 @@ static void xs_free_peer_addresses(struct rpc_xprt *xprt)
                }
 }
 
+static size_t
+xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp)
+{
+       size_t i, n;
+
+       if (!(buf->flags & XDRBUF_SPARSE_PAGES))
+               return want;
+       if (want > buf->page_len)
+               want = buf->page_len;
+       n = (buf->page_base + want + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       for (i = 0; i < n; i++) {
+               if (buf->pages[i])
+                       continue;
+               buf->bvec[i].bv_page = buf->pages[i] = alloc_page(gfp);
+               if (!buf->pages[i]) {
+                       buf->page_len = (i * PAGE_SIZE) - buf->page_base;
+                       return buf->page_len;
+               }
+       }
+       return want;
+}
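
The page-count computation rounds the byte span up to whole pages, remembering
that the span may start mid-page at page_base. A quick userspace check of the
same arithmetic, assuming 4 KiB pages (the values are made up):

#include <stdio.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_PAGE_SIZE  (1UL << DEMO_PAGE_SHIFT)

int main(void)
{
	unsigned long page_base = 100;	/* offset into the first page */
	unsigned long want = 8192;	/* bytes the caller asked for */
	unsigned long n;

	n = (page_base + want + DEMO_PAGE_SIZE - 1) >> DEMO_PAGE_SHIFT;
	printf("need %lu pages\n", n);	/* 8292 bytes -> 3 pages */
	return 0;
}
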
+
+static ssize_t
+xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek)
+{
+       ssize_t ret;
+       if (seek != 0)
+               iov_iter_advance(&msg->msg_iter, seek);
+       ret = sock_recvmsg(sock, msg, flags);
+       return ret > 0 ? ret + seek : ret;
+}
+
+static ssize_t
+xs_read_kvec(struct socket *sock, struct msghdr *msg, int flags,
+               struct kvec *kvec, size_t count, size_t seek)
+{
+       iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, kvec, 1, count);
+       return xs_sock_recvmsg(sock, msg, flags, seek);
+}
+
+static ssize_t
+xs_read_bvec(struct socket *sock, struct msghdr *msg, int flags,
+               struct bio_vec *bvec, unsigned long nr, size_t count,
+               size_t seek)
+{
+       iov_iter_bvec(&msg->msg_iter, READ | ITER_BVEC, bvec, nr, count);
+       return xs_sock_recvmsg(sock, msg, flags, seek);
+}
+
+static ssize_t
+xs_read_discard(struct socket *sock, struct msghdr *msg, int flags,
+               size_t count)
+{
+       struct kvec kvec = { 0 };
+       return xs_read_kvec(sock, msg, flags | MSG_TRUNC, &kvec, count, 0);
+}
+
+static ssize_t
+xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags,
+               struct xdr_buf *buf, size_t count, size_t seek, size_t *read)
+{
+       size_t want, seek_init = seek, offset = 0;
+       ssize_t ret;
+
+       if (seek < buf->head[0].iov_len) {
+               want = min_t(size_t, count, buf->head[0].iov_len);
+               ret = xs_read_kvec(sock, msg, flags, &buf->head[0], want, seek);
+               if (ret <= 0)
+                       goto sock_err;
+               offset += ret;
+               if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC))
+                       goto out;
+               if (ret != want)
+                       goto eagain;
+               seek = 0;
+       } else {
+               seek -= buf->head[0].iov_len;
+               offset += buf->head[0].iov_len;
+       }
+       if (seek < buf->page_len) {
+               want = xs_alloc_sparse_pages(buf,
+                               min_t(size_t, count - offset, buf->page_len),
+                               GFP_NOWAIT);
+               ret = xs_read_bvec(sock, msg, flags, buf->bvec,
+                               xdr_buf_pagecount(buf),
+                               want + buf->page_base,
+                               seek + buf->page_base);
+               if (ret <= 0)
+                       goto sock_err;
+               offset += ret - buf->page_base;
+               if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC))
+                       goto out;
+               if (ret != want)
+                       goto eagain;
+               seek = 0;
+       } else {
+               seek -= buf->page_len;
+               offset += buf->page_len;
+       }
+       if (seek < buf->tail[0].iov_len) {
+               want = min_t(size_t, count - offset, buf->tail[0].iov_len);
+               ret = xs_read_kvec(sock, msg, flags, &buf->tail[0], want, seek);
+               if (ret <= 0)
+                       goto sock_err;
+               offset += ret;
+               if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC))
+                       goto out;
+               if (ret != want)
+                       goto eagain;
+       } else
+               offset += buf->tail[0].iov_len;
+       ret = -EMSGSIZE;
+       msg->msg_flags |= MSG_TRUNC;
+out:
+       *read = offset - seek_init;
+       return ret;
+eagain:
+       ret = -EAGAIN;
+       goto out;
+sock_err:
+       offset += seek;
+       goto out;
+}
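
xs_read_xdr_buf() walks the three regions of an xdr_buf -- head kvec, page
array, tail kvec -- in order, resuming "seek" bytes into the buffer. Stripped
of the socket, sparse-page, and short-read handling, the walk reduces to this
hypothetical sketch:

#include <stddef.h>

struct demo_region {
	char *base;
	size_t len;
};

static size_t demo_read_regions(struct demo_region *r, int nregions,
				size_t seek, size_t count,
				size_t (*rd)(char *dst, size_t len))
{
	size_t done = 0;
	int i;

	for (i = 0; i < nregions && done < count; i++) {
		size_t n;

		if (seek >= r[i].len) {	/* resume point lies further on */
			seek -= r[i].len;
			continue;
		}
		n = r[i].len - seek;
		if (n > count - done)
			n = count - done;
		done += rd(r[i].base + seek, n);
		seek = 0;		/* later regions start at offset 0 */
	}
	return done;
}
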
+
+static void
+xs_read_header(struct sock_xprt *transport, struct xdr_buf *buf)
+{
+       if (!transport->recv.copied) {
+               if (buf->head[0].iov_len >= transport->recv.offset)
+                       memcpy(buf->head[0].iov_base,
+                                       &transport->recv.xid,
+                                       transport->recv.offset);
+               transport->recv.copied = transport->recv.offset;
+       }
+}
+
+static bool
+xs_read_stream_request_done(struct sock_xprt *transport)
+{
+       return transport->recv.fraghdr & cpu_to_be32(RPC_LAST_STREAM_FRAGMENT);
+}
+
+static ssize_t
+xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg,
+               int flags, struct rpc_rqst *req)
+{
+       struct xdr_buf *buf = &req->rq_private_buf;
+       size_t want, read;
+       ssize_t ret;
+
+       xs_read_header(transport, buf);
+
+       want = transport->recv.len - transport->recv.offset;
+       ret = xs_read_xdr_buf(transport->sock, msg, flags, buf,
+                       transport->recv.copied + want, transport->recv.copied,
+                       &read);
+       transport->recv.offset += read;
+       transport->recv.copied += read;
+       if (transport->recv.offset == transport->recv.len) {
+               if (xs_read_stream_request_done(transport))
+                       msg->msg_flags |= MSG_EOR;
+               return transport->recv.copied;
+       }
+
+       switch (ret) {
+       case -EMSGSIZE:
+               return transport->recv.copied;
+       case 0:
+               return -ESHUTDOWN;
+       default:
+               if (ret < 0)
+                       return ret;
+       }
+       return -EAGAIN;
+}
+
+static size_t
+xs_read_stream_headersize(bool isfrag)
+{
+       if (isfrag)
+               return sizeof(__be32);
+       return 3 * sizeof(__be32);
+}
+
+static ssize_t
+xs_read_stream_header(struct sock_xprt *transport, struct msghdr *msg,
+               int flags, size_t want, size_t seek)
+{
+       struct kvec kvec = {
+               .iov_base = &transport->recv.fraghdr,
+               .iov_len = want,
+       };
+       return xs_read_kvec(transport->sock, msg, flags, &kvec, want, seek);
+}
+
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+static ssize_t
+xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags)
+{
+       struct rpc_xprt *xprt = &transport->xprt;
+       struct rpc_rqst *req;
+       ssize_t ret;
+
+       /* Look up and lock the request corresponding to the given XID */
+       req = xprt_lookup_bc_request(xprt, transport->recv.xid);
+       if (!req) {
+               printk(KERN_WARNING "Callback slot table overflowed\n");
+               return -ESHUTDOWN;
+       }
+
+       ret = xs_read_stream_request(transport, msg, flags, req);
+       if (msg->msg_flags & (MSG_EOR|MSG_TRUNC))
+               xprt_complete_bc_request(req, ret);
+
+       return ret;
+}
+#else /* CONFIG_SUNRPC_BACKCHANNEL */
+static ssize_t
+xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags)
+{
+       return -ESHUTDOWN;
+}
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
+static ssize_t
+xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags)
+{
+       struct rpc_xprt *xprt = &transport->xprt;
+       struct rpc_rqst *req;
+       ssize_t ret = 0;
+
+       /* Look up and lock the request corresponding to the given XID */
+       spin_lock(&xprt->queue_lock);
+       req = xprt_lookup_rqst(xprt, transport->recv.xid);
+       if (!req) {
+               msg->msg_flags |= MSG_TRUNC;
+               goto out;
+       }
+       xprt_pin_rqst(req);
+       spin_unlock(&xprt->queue_lock);
+
+       ret = xs_read_stream_request(transport, msg, flags, req);
+
+       spin_lock(&xprt->queue_lock);
+       if (msg->msg_flags & (MSG_EOR|MSG_TRUNC))
+               xprt_complete_rqst(req->rq_task, ret);
+       xprt_unpin_rqst(req);
+out:
+       spin_unlock(&xprt->queue_lock);
+       return ret;
+}
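
Note the locking discipline here: the request is looked up and pinned under
queue_lock, the lock is dropped for the potentially sleeping socket read, then
retaken to complete and unpin. A hypothetical reduction of that shape (the
real pin/unpin calls are xprt_pin_rqst()/xprt_unpin_rqst()):

#include <linux/spinlock.h>

struct demo_req {
	int pin_count;
};

static DEFINE_SPINLOCK(demo_queue_lock);

static int demo_receive(struct demo_req *req)
{
	int ret;

	spin_lock(&demo_queue_lock);
	req->pin_count++;		/* keep req alive while unlocked */
	spin_unlock(&demo_queue_lock);

	ret = 0;			/* ... sleeping socket read ... */

	spin_lock(&demo_queue_lock);
	req->pin_count--;		/* drop the pin under the lock */
	spin_unlock(&demo_queue_lock);
	return ret;
}
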
+
+static ssize_t
+xs_read_stream(struct sock_xprt *transport, int flags)
+{
+       struct msghdr msg = { 0 };
+       size_t want, read = 0;
+       ssize_t ret = 0;
+
+       if (transport->recv.len == 0) {
+               want = xs_read_stream_headersize(transport->recv.copied != 0);
+               ret = xs_read_stream_header(transport, &msg, flags, want,
+                               transport->recv.offset);
+               if (ret <= 0)
+                       goto out_err;
+               transport->recv.offset = ret;
+               if (ret != want) {
+                       ret = -EAGAIN;
+                       goto out_err;
+               }
+               transport->recv.len = be32_to_cpu(transport->recv.fraghdr) &
+                       RPC_FRAGMENT_SIZE_MASK;
+               transport->recv.offset -= sizeof(transport->recv.fraghdr);
+               read = ret;
+       }
+
+       switch (be32_to_cpu(transport->recv.calldir)) {
+       case RPC_CALL:
+               ret = xs_read_stream_call(transport, &msg, flags);
+               break;
+       case RPC_REPLY:
+               ret = xs_read_stream_reply(transport, &msg, flags);
+       }
+       if (msg.msg_flags & MSG_TRUNC) {
+               transport->recv.calldir = cpu_to_be32(-1);
+               transport->recv.copied = -1;
+       }
+       if (ret < 0)
+               goto out_err;
+       read += ret;
+       if (transport->recv.offset < transport->recv.len) {
+               ret = xs_read_discard(transport->sock, &msg, flags,
+                               transport->recv.len - transport->recv.offset);
+               if (ret <= 0)
+                       goto out_err;
+               transport->recv.offset += ret;
+               read += ret;
+               if (transport->recv.offset != transport->recv.len)
+                       return -EAGAIN;
+       }
+       if (xs_read_stream_request_done(transport)) {
+               trace_xs_stream_read_request(transport);
+               transport->recv.copied = 0;
+       }
+       transport->recv.offset = 0;
+       transport->recv.len = 0;
+       return read;
+out_err:
+       switch (ret) {
+       case 0:
+       case -ESHUTDOWN:
+               xprt_force_disconnect(&transport->xprt);
+               return -ESHUTDOWN;
+       }
+       return ret;
+}
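
The stream header consumed above is the RFC 1831 record marker: the top bit
flags the last fragment of a record and the low 31 bits carry the fragment
length (compare the RPC_LAST_STREAM_FRAGMENT and RPC_FRAGMENT_SIZE_MASK uses
in this patch). A tiny userspace decode demo with a made-up header word:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define DEMO_LAST_FRAGMENT 0x80000000UL
#define DEMO_SIZE_MASK     0x7fffffffUL

int main(void)
{
	uint32_t wire = htonl(0x80000070);	/* hypothetical marker */
	uint32_t host = ntohl(wire);

	printf("last=%d len=%lu\n",
	       !!(host & DEMO_LAST_FRAGMENT),
	       (unsigned long)(host & DEMO_SIZE_MASK));
	return 0;	/* prints: last=1 len=112 */
}
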
+
+static void xs_stream_data_receive(struct sock_xprt *transport)
+{
+       size_t read = 0;
+       ssize_t ret = 0;
+
+       mutex_lock(&transport->recv_mutex);
+       if (transport->sock == NULL)
+               goto out;
+       clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
+       for (;;) {
+               ret = xs_read_stream(transport, MSG_DONTWAIT);
+               if (ret <= 0)
+                       break;
+               read += ret;
+               cond_resched();
+       }
+out:
+       mutex_unlock(&transport->recv_mutex);
+       trace_xs_stream_read_data(&transport->xprt, ret, read);
+}
+
+static void xs_stream_data_receive_workfn(struct work_struct *work)
+{
+       struct sock_xprt *transport =
+               container_of(work, struct sock_xprt, recv_worker);
+       xs_stream_data_receive(transport);
+}
+
+static void
+xs_stream_reset_connect(struct sock_xprt *transport)
+{
+       transport->recv.offset = 0;
+       transport->recv.len = 0;
+       transport->recv.copied = 0;
+       transport->xmit.offset = 0;
+       transport->xprt.stat.connect_count++;
+       transport->xprt.stat.connect_start = jiffies;
+}
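
connect_start is sampled in jiffies here; whenever the stats are printed, the
elapsed time has to be scaled by HZ to yield seconds -- the transport.c hunk
earlier in this patch fixes exactly that for connect_time. A one-line
illustration (hypothetical helper):

#include <linux/jiffies.h>

static unsigned long demo_connect_seconds(unsigned long connect_start)
{
	return (jiffies - connect_start) / HZ;
}
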
+
 #define XS_SENDMSG_FLAGS       (MSG_DONTWAIT | MSG_NOSIGNAL)
 
 static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen, struct kvec *vec, unsigned int base, int more)
@@ -440,28 +796,21 @@ out:
        return err;
 }
 
-static void xs_nospace_callback(struct rpc_task *task)
-{
-       struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt);
-
-       transport->inet->sk_write_pending--;
-}
-
 /**
- * xs_nospace - place task on wait queue if transmit was incomplete
- * @task: task to put to sleep
+ * xs_nospace - handle an incomplete transmit
+ * @req: pointer to RPC request
  *
  */
-static int xs_nospace(struct rpc_task *task)
+static int xs_nospace(struct rpc_rqst *req)
 {
-       struct rpc_rqst *req = task->tk_rqstp;
        struct rpc_xprt *xprt = req->rq_xprt;
        struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
        struct sock *sk = transport->inet;
        int ret = -EAGAIN;
 
        dprintk("RPC: %5u xmit incomplete (%u left of %u)\n",
-                       task->tk_pid, req->rq_slen - req->rq_bytes_sent,
+                       req->rq_task->tk_pid,
+                       req->rq_slen - transport->xmit.offset,
                        req->rq_slen);
 
        /* Protect against races with write_space */
@@ -471,7 +820,7 @@ static int xs_nospace(struct rpc_task *task)
        if (xprt_connected(xprt)) {
                /* wait for more buffer space */
                sk->sk_write_pending++;
-               xprt_wait_for_buffer_space(task, xs_nospace_callback);
+               xprt_wait_for_buffer_space(xprt);
        } else
                ret = -ENOTCONN;
 
@@ -491,6 +840,22 @@ static int xs_nospace(struct rpc_task *task)
        return ret;
 }
 
+static void
+xs_stream_prepare_request(struct rpc_rqst *req)
+{
+       req->rq_task->tk_status = xdr_alloc_bvec(&req->rq_rcv_buf, GFP_NOIO);
+}
+
+/*
+ * Determine if the previous message in the stream was aborted before it
+ * could complete transmission.
+ */
+static bool
+xs_send_request_was_aborted(struct sock_xprt *transport, struct rpc_rqst *req)
+{
+       return transport->xmit.offset != 0 && req->rq_bytes_sent == 0;
+}
+
 /*
  * Construct a stream transport record marker in @buf.
  */
@@ -503,7 +868,7 @@ static inline void xs_encode_stream_record_marker(struct xdr_buf *buf)
 
 /**
  * xs_local_send_request - write an RPC request to an AF_LOCAL socket
- * @task: RPC task that manages the state of an RPC request
+ * @req: pointer to RPC request
  *
  * Return values:
  *        0:   The request has been sent
@@ -512,9 +877,8 @@ static inline void xs_encode_stream_record_marker(struct xdr_buf *buf)
  * ENOTCONN:   Caller needs to invoke connect logic then call again
 *    other:   Some other error occurred, the request was not sent
  */
-static int xs_local_send_request(struct rpc_task *task)
+static int xs_local_send_request(struct rpc_rqst *req)
 {
-       struct rpc_rqst *req = task->tk_rqstp;
        struct rpc_xprt *xprt = req->rq_xprt;
        struct sock_xprt *transport =
                                container_of(xprt, struct sock_xprt, xprt);
@@ -522,25 +886,34 @@ static int xs_local_send_request(struct rpc_task *task)
        int status;
        int sent = 0;
 
+       /* Close the stream if the previous transmission was incomplete */
+       if (xs_send_request_was_aborted(transport, req)) {
+               xs_close(xprt);
+               return -ENOTCONN;
+       }
+
        xs_encode_stream_record_marker(&req->rq_snd_buf);
 
        xs_pktdump("packet data:",
                        req->rq_svec->iov_base, req->rq_svec->iov_len);
 
        req->rq_xtime = ktime_get();
-       status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent,
+       status = xs_sendpages(transport->sock, NULL, 0, xdr,
+                             transport->xmit.offset,
                              true, &sent);
        dprintk("RPC:       %s(%u) = %d\n",
-                       __func__, xdr->len - req->rq_bytes_sent, status);
+                       __func__, xdr->len - transport->xmit.offset, status);
 
        if (status == -EAGAIN && sock_writeable(transport->inet))
                status = -ENOBUFS;
 
        if (likely(sent > 0) || status == 0) {
-               req->rq_bytes_sent += sent;
-               req->rq_xmit_bytes_sent += sent;
+               transport->xmit.offset += sent;
+               req->rq_bytes_sent = transport->xmit.offset;
                if (likely(req->rq_bytes_sent >= req->rq_slen)) {
+                       req->rq_xmit_bytes_sent += transport->xmit.offset;
                        req->rq_bytes_sent = 0;
+                       transport->xmit.offset = 0;
                        return 0;
                }
                status = -EAGAIN;
@@ -550,7 +923,7 @@ static int xs_local_send_request(struct rpc_task *task)
        case -ENOBUFS:
                break;
        case -EAGAIN:
-               status = xs_nospace(task);
+               status = xs_nospace(req);
                break;
        default:
                dprintk("RPC:       sendmsg returned unrecognized error %d\n",
@@ -566,7 +939,7 @@ static int xs_local_send_request(struct rpc_task *task)
 
 /**
  * xs_udp_send_request - write an RPC request to a UDP socket
- * @task: address of RPC task that manages the state of an RPC request
+ * @req: pointer to RPC request
  *
  * Return values:
  *        0:   The request has been sent
@@ -575,9 +948,8 @@ static int xs_local_send_request(struct rpc_task *task)
  * ENOTCONN:   Caller needs to invoke connect logic then call again
  *    other:   Some other error occurred, the request was not sent
  */
-static int xs_udp_send_request(struct rpc_task *task)
+static int xs_udp_send_request(struct rpc_rqst *req)
 {
-       struct rpc_rqst *req = task->tk_rqstp;
        struct rpc_xprt *xprt = req->rq_xprt;
        struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
        struct xdr_buf *xdr = &req->rq_snd_buf;
@@ -590,12 +962,16 @@ static int xs_udp_send_request(struct rpc_task *task)
 
        if (!xprt_bound(xprt))
                return -ENOTCONN;
+
+       if (!xprt_request_get_cong(xprt, req))
+               return -EBADSLT;
+
        req->rq_xtime = ktime_get();
        status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen,
-                             xdr, req->rq_bytes_sent, true, &sent);
+                             xdr, 0, true, &sent);
 
        dprintk("RPC:       xs_udp_send_request(%u) = %d\n",
-                       xdr->len - req->rq_bytes_sent, status);
+                       xdr->len, status);
 
        /* firewall is blocking us, don't return -EAGAIN or we end up looping */
        if (status == -EPERM)
@@ -619,7 +995,7 @@ process_status:
                /* Should we call xs_close() here? */
                break;
        case -EAGAIN:
-               status = xs_nospace(task);
+               status = xs_nospace(req);
                break;
        case -ENETUNREACH:
        case -ENOBUFS:
@@ -639,7 +1015,7 @@ process_status:
 
 /**
  * xs_tcp_send_request - write an RPC request to a TCP socket
- * @task: address of RPC task that manages the state of an RPC request
+ * @req: pointer to RPC request
  *
  * Return values:
  *        0:   The request has been sent
@@ -651,9 +1027,8 @@ process_status:
  * XXX: In the case of soft timeouts, should we eventually give up
  *     if sendmsg is not able to make progress?
  */
-static int xs_tcp_send_request(struct rpc_task *task)
+static int xs_tcp_send_request(struct rpc_rqst *req)
 {
-       struct rpc_rqst *req = task->tk_rqstp;
        struct rpc_xprt *xprt = req->rq_xprt;
        struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
        struct xdr_buf *xdr = &req->rq_snd_buf;
@@ -662,6 +1037,13 @@ static int xs_tcp_send_request(struct rpc_task *task)
        int status;
        int sent;
 
+       /* Close the stream if the previous transmission was incomplete */
+       if (xs_send_request_was_aborted(transport, req)) {
+               if (transport->sock != NULL)
+                       kernel_sock_shutdown(transport->sock, SHUT_RDWR);
+               return -ENOTCONN;
+       }
+
        xs_encode_stream_record_marker(&req->rq_snd_buf);
 
        xs_pktdump("packet data:",
@@ -671,7 +1053,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
         * completes while the socket holds a reference to the pages,
         * then we may end up resending corrupted data.
         */
-       if (task->tk_flags & RPC_TASK_SENT)
+       if (req->rq_task->tk_flags & RPC_TASK_SENT)
                zerocopy = false;
 
        if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state))
@@ -684,17 +1066,20 @@ static int xs_tcp_send_request(struct rpc_task *task)
        while (1) {
                sent = 0;
                status = xs_sendpages(transport->sock, NULL, 0, xdr,
-                                     req->rq_bytes_sent, zerocopy, &sent);
+                                     transport->xmit.offset,
+                                     zerocopy, &sent);
 
                dprintk("RPC:       xs_tcp_send_request(%u) = %d\n",
-                               xdr->len - req->rq_bytes_sent, status);
+                               xdr->len - transport->xmit.offset, status);
 
                /* If we've sent the entire packet, immediately
                 * reset the count of bytes sent. */
-               req->rq_bytes_sent += sent;
-               req->rq_xmit_bytes_sent += sent;
+               transport->xmit.offset += sent;
+               req->rq_bytes_sent = transport->xmit.offset;
                if (likely(req->rq_bytes_sent >= req->rq_slen)) {
+                       req->rq_xmit_bytes_sent += transport->xmit.offset;
                        req->rq_bytes_sent = 0;
+                       transport->xmit.offset = 0;
                        return 0;
                }
 
@@ -732,7 +1117,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
                /* Should we call xs_close() here? */
                break;
        case -EAGAIN:
-               status = xs_nospace(task);
+               status = xs_nospace(req);
                break;
        case -ECONNRESET:
        case -ECONNREFUSED:
@@ -749,35 +1134,6 @@ static int xs_tcp_send_request(struct rpc_task *task)
        return status;
 }
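
The recurring change in these send paths is that the resume cursor moves from
the per-request rq_bytes_sent to the connection-scoped transport->xmit.offset:
a stream socket carries a single byte stream, so a partially transmitted
record belongs to the connection, not to any one request. That is also what
xs_send_request_was_aborted() tests -- a nonzero cursor paired with a fresh
request means the previous record was cut short. A hypothetical reduction of
the bookkeeping:

#include <errno.h>
#include <stddef.h>

struct demo_stream {
	size_t xmit_offset;	/* bytes of the current record already sent */
};

static int demo_send_record(struct demo_stream *t, const char *rec,
			    size_t len,
			    size_t (*send_fn)(const char *, size_t))
{
	size_t sent = send_fn(rec + t->xmit_offset, len - t->xmit_offset);

	t->xmit_offset += sent;
	if (t->xmit_offset < len)
		return -EAGAIN;	/* partial: resume from xmit_offset later */
	t->xmit_offset = 0;	/* whole record on the wire: reset cursor */
	return 0;
}
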
 
-/**
- * xs_tcp_release_xprt - clean up after a tcp transmission
- * @xprt: transport
- * @task: rpc task
- *
- * This cleans up if an error causes us to abort the transmission of a request.
- * In this case, the socket may need to be reset in order to avoid confusing
- * the server.
- */
-static void xs_tcp_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
-{
-       struct rpc_rqst *req;
-
-       if (task != xprt->snd_task)
-               return;
-       if (task == NULL)
-               goto out_release;
-       req = task->tk_rqstp;
-       if (req == NULL)
-               goto out_release;
-       if (req->rq_bytes_sent == 0)
-               goto out_release;
-       if (req->rq_bytes_sent == req->rq_snd_buf.len)
-               goto out_release;
-       set_bit(XPRT_CLOSE_WAIT, &xprt->state);
-out_release:
-       xprt_release_xprt(xprt, task);
-}
-
 static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk)
 {
        transport->old_data_ready = sk->sk_data_ready;
@@ -921,114 +1277,6 @@ static void xs_destroy(struct rpc_xprt *xprt)
        module_put(THIS_MODULE);
 }
 
-static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
-{
-       struct xdr_skb_reader desc = {
-               .skb            = skb,
-               .offset         = sizeof(rpc_fraghdr),
-               .count          = skb->len - sizeof(rpc_fraghdr),
-       };
-
-       if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0)
-               return -1;
-       if (desc.count)
-               return -1;
-       return 0;
-}
-
-/**
- * xs_local_data_read_skb
- * @xprt: transport
- * @sk: socket
- * @skb: skbuff
- *
- * Currently this assumes we can read the whole reply in a single gulp.
- */
-static void xs_local_data_read_skb(struct rpc_xprt *xprt,
-               struct sock *sk,
-               struct sk_buff *skb)
-{
-       struct rpc_task *task;
-       struct rpc_rqst *rovr;
-       int repsize, copied;
-       u32 _xid;
-       __be32 *xp;
-
-       repsize = skb->len - sizeof(rpc_fraghdr);
-       if (repsize < 4) {
-               dprintk("RPC:       impossible RPC reply size %d\n", repsize);
-               return;
-       }
-
-       /* Copy the XID from the skb... */
-       xp = skb_header_pointer(skb, sizeof(rpc_fraghdr), sizeof(_xid), &_xid);
-       if (xp == NULL)
-               return;
-
-       /* Look up and lock the request corresponding to the given XID */
-       spin_lock(&xprt->recv_lock);
-       rovr = xprt_lookup_rqst(xprt, *xp);
-       if (!rovr)
-               goto out_unlock;
-       xprt_pin_rqst(rovr);
-       spin_unlock(&xprt->recv_lock);
-       task = rovr->rq_task;
-
-       copied = rovr->rq_private_buf.buflen;
-       if (copied > repsize)
-               copied = repsize;
-
-       if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) {
-               dprintk("RPC:       sk_buff copy failed\n");
-               spin_lock(&xprt->recv_lock);
-               goto out_unpin;
-       }
-
-       spin_lock(&xprt->recv_lock);
-       xprt_complete_rqst(task, copied);
-out_unpin:
-       xprt_unpin_rqst(rovr);
- out_unlock:
-       spin_unlock(&xprt->recv_lock);
-}
-
-static void xs_local_data_receive(struct sock_xprt *transport)
-{
-       struct sk_buff *skb;
-       struct sock *sk;
-       int err;
-
-restart:
-       mutex_lock(&transport->recv_mutex);
-       sk = transport->inet;
-       if (sk == NULL)
-               goto out;
-       for (;;) {
-               skb = skb_recv_datagram(sk, 0, 1, &err);
-               if (skb != NULL) {
-                       xs_local_data_read_skb(&transport->xprt, sk, skb);
-                       skb_free_datagram(sk, skb);
-                       continue;
-               }
-               if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
-                       break;
-               if (need_resched()) {
-                       mutex_unlock(&transport->recv_mutex);
-                       cond_resched();
-                       goto restart;
-               }
-       }
-out:
-       mutex_unlock(&transport->recv_mutex);
-}
-
-static void xs_local_data_receive_workfn(struct work_struct *work)
-{
-       struct sock_xprt *transport =
-               container_of(work, struct sock_xprt, recv_worker);
-       xs_local_data_receive(transport);
-}
-
 /**
  * xs_udp_data_read_skb - receive callback for UDP sockets
  * @xprt: transport
@@ -1058,13 +1306,13 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
                return;
 
        /* Look up and lock the request corresponding to the given XID */
-       spin_lock(&xprt->recv_lock);
+       spin_lock(&xprt->queue_lock);
        rovr = xprt_lookup_rqst(xprt, *xp);
        if (!rovr)
                goto out_unlock;
        xprt_pin_rqst(rovr);
        xprt_update_rtt(rovr->rq_task);
-       spin_unlock(&xprt->recv_lock);
+       spin_unlock(&xprt->queue_lock);
        task = rovr->rq_task;
 
        if ((copied = rovr->rq_private_buf.buflen) > repsize)
@@ -1072,7 +1320,7 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
 
        /* Suck it into the iovec, verify checksum if not done by hw. */
        if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) {
-               spin_lock(&xprt->recv_lock);
+               spin_lock(&xprt->queue_lock);
                __UDPX_INC_STATS(sk, UDP_MIB_INERRORS);
                goto out_unpin;
        }
@@ -1081,13 +1329,13 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
        spin_lock_bh(&xprt->transport_lock);
        xprt_adjust_cwnd(xprt, task, copied);
        spin_unlock_bh(&xprt->transport_lock);
-       spin_lock(&xprt->recv_lock);
+       spin_lock(&xprt->queue_lock);
        xprt_complete_rqst(task, copied);
        __UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS);
 out_unpin:
        xprt_unpin_rqst(rovr);
  out_unlock:
-       spin_unlock(&xprt->recv_lock);
+       spin_unlock(&xprt->queue_lock);
 }
 
 static void xs_udp_data_receive(struct sock_xprt *transport)
@@ -1096,25 +1344,18 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
        struct sock *sk;
        int err;
 
-restart:
        mutex_lock(&transport->recv_mutex);
        sk = transport->inet;
        if (sk == NULL)
                goto out;
+       clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
        for (;;) {
                skb = skb_recv_udp(sk, 0, 1, &err);
-               if (skb != NULL) {
-                       xs_udp_data_read_skb(&transport->xprt, sk, skb);
-                       consume_skb(skb);
-                       continue;
-               }
-               if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
+               if (skb == NULL)
                        break;
-               if (need_resched()) {
-                       mutex_unlock(&transport->recv_mutex);
-                       cond_resched();
-                       goto restart;
-               }
+               xs_udp_data_read_skb(&transport->xprt, sk, skb);
+               consume_skb(skb);
+               cond_resched();
        }
 out:
        mutex_unlock(&transport->recv_mutex);
@@ -1163,263 +1404,7 @@ static void xs_tcp_force_close(struct rpc_xprt *xprt)
        xprt_force_disconnect(xprt);
 }
 
-static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc)
-{
-       struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
-       size_t len, used;
-       char *p;
-
-       p = ((char *) &transport->tcp_fraghdr) + transport->tcp_offset;
-       len = sizeof(transport->tcp_fraghdr) - transport->tcp_offset;
-       used = xdr_skb_read_bits(desc, p, len);
-       transport->tcp_offset += used;
-       if (used != len)
-               return;
-
-       transport->tcp_reclen = ntohl(transport->tcp_fraghdr);
-       if (transport->tcp_reclen & RPC_LAST_STREAM_FRAGMENT)
-               transport->tcp_flags |= TCP_RCV_LAST_FRAG;
-       else
-               transport->tcp_flags &= ~TCP_RCV_LAST_FRAG;
-       transport->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK;
-
-       transport->tcp_flags &= ~TCP_RCV_COPY_FRAGHDR;
-       transport->tcp_offset = 0;
-
-       /* Sanity check of the record length */
-       if (unlikely(transport->tcp_reclen < 8)) {
-               dprintk("RPC:       invalid TCP record fragment length\n");
-               xs_tcp_force_close(xprt);
-               return;
-       }
-       dprintk("RPC:       reading TCP record fragment of length %d\n",
-                       transport->tcp_reclen);
-}
-
-static void xs_tcp_check_fraghdr(struct sock_xprt *transport)
-{
-       if (transport->tcp_offset == transport->tcp_reclen) {
-               transport->tcp_flags |= TCP_RCV_COPY_FRAGHDR;
-               transport->tcp_offset = 0;
-               if (transport->tcp_flags & TCP_RCV_LAST_FRAG) {
-                       transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
-                       transport->tcp_flags |= TCP_RCV_COPY_XID;
-                       transport->tcp_copied = 0;
-               }
-       }
-}
-
-static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_reader *desc)
-{
-       size_t len, used;
-       char *p;
-
-       len = sizeof(transport->tcp_xid) - transport->tcp_offset;
-       dprintk("RPC:       reading XID (%zu bytes)\n", len);
-       p = ((char *) &transport->tcp_xid) + transport->tcp_offset;
-       used = xdr_skb_read_bits(desc, p, len);
-       transport->tcp_offset += used;
-       if (used != len)
-               return;
-       transport->tcp_flags &= ~TCP_RCV_COPY_XID;
-       transport->tcp_flags |= TCP_RCV_READ_CALLDIR;
-       transport->tcp_copied = 4;
-       dprintk("RPC:       reading %s XID %08x\n",
-                       (transport->tcp_flags & TCP_RPC_REPLY) ? "reply for"
-                                                             : "request with",
-                       ntohl(transport->tcp_xid));
-       xs_tcp_check_fraghdr(transport);
-}
-
-static inline void xs_tcp_read_calldir(struct sock_xprt *transport,
-                                      struct xdr_skb_reader *desc)
-{
-       size_t len, used;
-       u32 offset;
-       char *p;
-
-       /*
-        * We want transport->tcp_offset to be 8 at the end of this routine
-        * (4 bytes for the xid and 4 bytes for the call/reply flag).
-        * When this function is called for the first time,
-        * transport->tcp_offset is 4 (after having already read the xid).
-        */
-       offset = transport->tcp_offset - sizeof(transport->tcp_xid);
-       len = sizeof(transport->tcp_calldir) - offset;
-       dprintk("RPC:       reading CALL/REPLY flag (%zu bytes)\n", len);
-       p = ((char *) &transport->tcp_calldir) + offset;
-       used = xdr_skb_read_bits(desc, p, len);
-       transport->tcp_offset += used;
-       if (used != len)
-               return;
-       transport->tcp_flags &= ~TCP_RCV_READ_CALLDIR;
-       /*
-        * We don't yet have the XDR buffer, so we will write the calldir
-        * out after we get the buffer from the 'struct rpc_rqst'
-        */
-       switch (ntohl(transport->tcp_calldir)) {
-       case RPC_REPLY:
-               transport->tcp_flags |= TCP_RCV_COPY_CALLDIR;
-               transport->tcp_flags |= TCP_RCV_COPY_DATA;
-               transport->tcp_flags |= TCP_RPC_REPLY;
-               break;
-       case RPC_CALL:
-               transport->tcp_flags |= TCP_RCV_COPY_CALLDIR;
-               transport->tcp_flags |= TCP_RCV_COPY_DATA;
-               transport->tcp_flags &= ~TCP_RPC_REPLY;
-               break;
-       default:
-               dprintk("RPC:       invalid request message type\n");
-               xs_tcp_force_close(&transport->xprt);
-       }
-       xs_tcp_check_fraghdr(transport);
-}
-
-static inline void xs_tcp_read_common(struct rpc_xprt *xprt,
-                                    struct xdr_skb_reader *desc,
-                                    struct rpc_rqst *req)
-{
-       struct sock_xprt *transport =
-                               container_of(xprt, struct sock_xprt, xprt);
-       struct xdr_buf *rcvbuf;
-       size_t len;
-       ssize_t r;
-
-       rcvbuf = &req->rq_private_buf;
-
-       if (transport->tcp_flags & TCP_RCV_COPY_CALLDIR) {
-               /*
-                * Save the RPC direction in the XDR buffer
-                */
-               memcpy(rcvbuf->head[0].iov_base + transport->tcp_copied,
-                       &transport->tcp_calldir,
-                       sizeof(transport->tcp_calldir));
-               transport->tcp_copied += sizeof(transport->tcp_calldir);
-               transport->tcp_flags &= ~TCP_RCV_COPY_CALLDIR;
-       }
-
-       len = desc->count;
-       if (len > transport->tcp_reclen - transport->tcp_offset)
-               desc->count = transport->tcp_reclen - transport->tcp_offset;
-       r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied,
-                                         desc, xdr_skb_read_bits);
-
-       if (desc->count) {
-               /* Error when copying to the receive buffer,
-                * usually because we weren't able to allocate
-                * additional buffer pages. All we can do now
-                * is turn off TCP_RCV_COPY_DATA, so the request
-                * will not receive any additional updates,
-                * and time out.
-                * Any remaining data from this record will
-                * be discarded.
-                */
-               transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
-               dprintk("RPC:       XID %08x truncated request\n",
-                               ntohl(transport->tcp_xid));
-               dprintk("RPC:       xprt = %p, tcp_copied = %lu, "
-                               "tcp_offset = %u, tcp_reclen = %u\n",
-                               xprt, transport->tcp_copied,
-                               transport->tcp_offset, transport->tcp_reclen);
-               return;
-       }
-
-       transport->tcp_copied += r;
-       transport->tcp_offset += r;
-       desc->count = len - r;
-
-       dprintk("RPC:       XID %08x read %zd bytes\n",
-                       ntohl(transport->tcp_xid), r);
-       dprintk("RPC:       xprt = %p, tcp_copied = %lu, tcp_offset = %u, "
-                       "tcp_reclen = %u\n", xprt, transport->tcp_copied,
-                       transport->tcp_offset, transport->tcp_reclen);
-
-       if (transport->tcp_copied == req->rq_private_buf.buflen)
-               transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
-       else if (transport->tcp_offset == transport->tcp_reclen) {
-               if (transport->tcp_flags & TCP_RCV_LAST_FRAG)
-                       transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
-       }
-}
-
-/*
- * Finds the request corresponding to the RPC xid and invokes the common
- * tcp read code to read the data.
- */
-static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
-                                   struct xdr_skb_reader *desc)
-{
-       struct sock_xprt *transport =
-                               container_of(xprt, struct sock_xprt, xprt);
-       struct rpc_rqst *req;
-
-       dprintk("RPC:       read reply XID %08x\n", ntohl(transport->tcp_xid));
-
-       /* Find and lock the request corresponding to this xid */
-       spin_lock(&xprt->recv_lock);
-       req = xprt_lookup_rqst(xprt, transport->tcp_xid);
-       if (!req) {
-               dprintk("RPC:       XID %08x request not found!\n",
-                               ntohl(transport->tcp_xid));
-               spin_unlock(&xprt->recv_lock);
-               return -1;
-       }
-       xprt_pin_rqst(req);
-       spin_unlock(&xprt->recv_lock);
-
-       xs_tcp_read_common(xprt, desc, req);
-
-       spin_lock(&xprt->recv_lock);
-       if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
-               xprt_complete_rqst(req->rq_task, transport->tcp_copied);
-       xprt_unpin_rqst(req);
-       spin_unlock(&xprt->recv_lock);
-       return 0;
-}
-
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
-/*
- * Obtains an rpc_rqst previously allocated and invokes the common
- * tcp read code to read the data.  The result is placed in the callback
- * queue.
- * If we're unable to obtain the rpc_rqst we schedule the closing of the
- * connection and return -1.
- */
-static int xs_tcp_read_callback(struct rpc_xprt *xprt,
-                                      struct xdr_skb_reader *desc)
-{
-       struct sock_xprt *transport =
-                               container_of(xprt, struct sock_xprt, xprt);
-       struct rpc_rqst *req;
-
-       /* Look up the request corresponding to the given XID */
-       req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
-       if (req == NULL) {
-               printk(KERN_WARNING "Callback slot table overflowed\n");
-               xprt_force_disconnect(xprt);
-               return -1;
-       }
-
-       dprintk("RPC:       read callback  XID %08x\n", ntohl(req->rq_xid));
-       xs_tcp_read_common(xprt, desc, req);
-
-       if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
-               xprt_complete_bc_request(req, transport->tcp_copied);
-
-       return 0;
-}
-
-static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
-                                       struct xdr_skb_reader *desc)
-{
-       struct sock_xprt *transport =
-                               container_of(xprt, struct sock_xprt, xprt);
-
-       return (transport->tcp_flags & TCP_RPC_REPLY) ?
-               xs_tcp_read_reply(xprt, desc) :
-               xs_tcp_read_callback(xprt, desc);
-}
-
 static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
 {
        int ret;
@@ -1435,145 +1420,8 @@ static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
 {
        return PAGE_SIZE;
 }
-#else
-static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
-                                       struct xdr_skb_reader *desc)
-{
-       return xs_tcp_read_reply(xprt, desc);
-}
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 
-/*
- * Read data off the transport.  This can be either an RPC_CALL or an
- * RPC_REPLY.  Relay the processing to helper functions.
- */
-static void xs_tcp_read_data(struct rpc_xprt *xprt,
-                                   struct xdr_skb_reader *desc)
-{
-       struct sock_xprt *transport =
-                               container_of(xprt, struct sock_xprt, xprt);
-
-       if (_xs_tcp_read_data(xprt, desc) == 0)
-               xs_tcp_check_fraghdr(transport);
-       else {
-               /*
-                * The transport_lock protects the request handling.
-                * There's no need to hold it to update the tcp_flags.
-                */
-               transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
-       }
-}
-
-static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_skb_reader *desc)
-{
-       size_t len;
-
-       len = transport->tcp_reclen - transport->tcp_offset;
-       if (len > desc->count)
-               len = desc->count;
-       desc->count -= len;
-       desc->offset += len;
-       transport->tcp_offset += len;
-       dprintk("RPC:       discarded %zu bytes\n", len);
-       xs_tcp_check_fraghdr(transport);
-}
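
[Editor's note] The clamp in xs_tcp_read_discard() above is just min(record remainder, bytes available in this skb). A tiny standalone sketch of that arithmetic, with hypothetical names:

#include <assert.h>
#include <stddef.h>

static size_t discard_len(size_t reclen, size_t offset, size_t avail)
{
	size_t len = reclen - offset;	/* bytes left in the record */

	if (len > avail)
		len = avail;		/* bounded by what the skb holds */
	return len;
}

int main(void)
{
	assert(discard_len(100, 90, 50) == 10);	/* record ends first */
	assert(discard_len(100, 20, 50) == 50);	/* skb runs out first */
	return 0;
}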
-
-static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len)
-{
-       struct rpc_xprt *xprt = rd_desc->arg.data;
-       struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
-       struct xdr_skb_reader desc = {
-               .skb    = skb,
-               .offset = offset,
-               .count  = len,
-       };
-       size_t ret;
-
-       dprintk("RPC:       xs_tcp_data_recv started\n");
-       do {
-               trace_xs_tcp_data_recv(transport);
-               /* Read in a new fragment marker if necessary */
-               /* Can we ever really expect to get completely empty fragments? */
-               if (transport->tcp_flags & TCP_RCV_COPY_FRAGHDR) {
-                       xs_tcp_read_fraghdr(xprt, &desc);
-                       continue;
-               }
-               /* Read in the xid if necessary */
-               if (transport->tcp_flags & TCP_RCV_COPY_XID) {
-                       xs_tcp_read_xid(transport, &desc);
-                       continue;
-               }
-               /* Read in the call/reply flag */
-               if (transport->tcp_flags & TCP_RCV_READ_CALLDIR) {
-                       xs_tcp_read_calldir(transport, &desc);
-                       continue;
-               }
-               /* Read in the request data */
-               if (transport->tcp_flags & TCP_RCV_COPY_DATA) {
-                       xs_tcp_read_data(xprt, &desc);
-                       continue;
-               }
-               /* Skip over any trailing bytes on short reads */
-               xs_tcp_read_discard(transport, &desc);
-       } while (desc.count);
-       ret = len - desc.count;
-       if (ret < rd_desc->count)
-               rd_desc->count -= ret;
-       else
-               rd_desc->count = 0;
-       trace_xs_tcp_data_recv(transport);
-       dprintk("RPC:       xs_tcp_data_recv done\n");
-       return ret;
-}
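
[Editor's note] The loop above is a flag-driven state machine: each pass handles exactly one pending stage (fragment marker, XID, call direction, data) and continues, so a record that straddles skbs resumes wherever it left off. A compact userspace sketch of that dispatch order follows; the flag names mirror the TCP_RCV_* bits, but everything here is a hypothetical model that consumes one abstract unit per stage.

#include <stdio.h>

enum {
	RCV_COPY_FRAGHDR = 1 << 0,
	RCV_COPY_XID     = 1 << 1,
	RCV_READ_CALLDIR = 1 << 2,
	RCV_COPY_DATA    = 1 << 3,
};

static void process(unsigned *flags, size_t *count)
{
	while (*count) {
		if (*flags & RCV_COPY_FRAGHDR) {
			puts("read fragment marker");
			*flags &= ~RCV_COPY_FRAGHDR;
			*flags |= RCV_COPY_XID;
			(*count)--;
			continue;
		}
		if (*flags & RCV_COPY_XID) {
			puts("read XID");
			*flags &= ~RCV_COPY_XID;
			*flags |= RCV_READ_CALLDIR;
			(*count)--;
			continue;
		}
		if (*flags & RCV_READ_CALLDIR) {
			puts("read call direction");
			*flags &= ~RCV_READ_CALLDIR;
			*flags |= RCV_COPY_DATA;
			(*count)--;
			continue;
		}
		if (*flags & RCV_COPY_DATA) {
			puts("copy request data");
			*flags &= ~RCV_COPY_DATA;
			(*count)--;
			continue;
		}
		puts("discard trailing bytes");
		(*count)--;
	}
}

int main(void)
{
	unsigned flags = RCV_COPY_FRAGHDR;
	size_t count = 6;

	process(&flags, &count);
	return 0;
}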
-
-static void xs_tcp_data_receive(struct sock_xprt *transport)
-{
-       struct rpc_xprt *xprt = &transport->xprt;
-       struct sock *sk;
-       read_descriptor_t rd_desc = {
-               .arg.data = xprt,
-       };
-       unsigned long total = 0;
-       int read = 0;
-
-restart:
-       mutex_lock(&transport->recv_mutex);
-       sk = transport->inet;
-       if (sk == NULL)
-               goto out;
-
-       /* We use rd_desc to pass the struct rpc_xprt to xs_tcp_data_recv */
-       for (;;) {
-               rd_desc.count = RPC_TCP_READ_CHUNK_SZ;
-               lock_sock(sk);
-               read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
-               if (rd_desc.count != 0 || read < 0) {
-                       clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
-                       release_sock(sk);
-                       break;
-               }
-               release_sock(sk);
-               total += read;
-               if (need_resched()) {
-                       mutex_unlock(&transport->recv_mutex);
-                       cond_resched();
-                       goto restart;
-               }
-       }
-       if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
-               queue_work(xprtiod_workqueue, &transport->recv_worker);
-out:
-       mutex_unlock(&transport->recv_mutex);
-       trace_xs_tcp_data_ready(xprt, read, total);
-}
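
[Editor's note] xs_tcp_data_receive() above bounds each tcp_read_sock() pass to RPC_TCP_READ_CHUNK_SZ and, when need_resched() fires, drops recv_mutex, yields, and restarts. A loose userspace analogue of that lock-drop-and-restart shape, with hypothetical names, yielding on every full chunk rather than on need_resched():

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define READ_CHUNK_SZ 4

static pthread_mutex_t recv_mutex = PTHREAD_MUTEX_INITIALIZER;

static int read_chunk(int *remaining, int max)
{
	int n = *remaining < max ? *remaining : max;

	*remaining -= n;
	return n;
}

static void data_receive(int remaining)
{
	unsigned long total = 0;

restart:
	pthread_mutex_lock(&recv_mutex);
	for (;;) {
		int read = read_chunk(&remaining, READ_CHUNK_SZ);

		if (read < READ_CHUNK_SZ)	/* short read: no more data */
			break;
		total += read;
		/* Stand-in for need_resched(): yield after every chunk. */
		pthread_mutex_unlock(&recv_mutex);
		sched_yield();
		goto restart;
	}
	pthread_mutex_unlock(&recv_mutex);
	printf("received %lu bytes in full chunks\n", total);
}

int main(void)
{
	data_receive(10);
	return 0;
}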
-
-static void xs_tcp_data_receive_workfn(struct work_struct *work)
-{
-       struct sock_xprt *transport =
-               container_of(work, struct sock_xprt, recv_worker);
-       xs_tcp_data_receive(transport);
-}
-
 /**
  * xs_tcp_state_change - callback to handle TCP socket state changes
  * @sk: socket whose state has changed
@@ -1600,17 +1448,13 @@ static void xs_tcp_state_change(struct sock *sk)
        case TCP_ESTABLISHED:
                spin_lock(&xprt->transport_lock);
                if (!xprt_test_and_set_connected(xprt)) {
-
-                       /* Reset TCP record info */
-                       transport->tcp_offset = 0;
-                       transport->tcp_reclen = 0;
-                       transport->tcp_copied = 0;
-                       transport->tcp_flags =
-                               TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID;
                        xprt->connect_cookie++;
                        clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state);
                        xprt_clear_connecting(xprt);
 
+                       xprt->stat.connect_count++;
+                       xprt->stat.connect_time += (long)jiffies -
+                                                  xprt->stat.connect_start;
                        xprt_wake_pending_tasks(xprt, -EAGAIN);
                }
                spin_unlock(&xprt->transport_lock);
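
[Editor's note] The two lines added above accumulate connect_time in jiffies; it is only converted to seconds when printed (note the new "/ HZ" in the print_stats hunks further down). A trivial sketch of that accounting, with an illustrative HZ:

#include <stdio.h>

#define HZ 250

int main(void)
{
	unsigned long connect_start = 1000;	/* jiffies when connect began */
	unsigned long jiffies = 1625;		/* jiffies at TCP_ESTABLISHED */
	unsigned long connect_time = 0;
	unsigned int connect_count = 0;

	connect_count++;
	connect_time += (long)jiffies - connect_start;

	printf("connects=%u total_connect_time=%lus\n",
	       connect_count, connect_time / HZ);	/* 625/250 = 2s */
	return 0;
}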
@@ -1675,7 +1519,8 @@ static void xs_write_space(struct sock *sk)
        if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0)
                goto out;
 
-       xprt_write_space(xprt);
+       if (xprt_write_space(xprt))
+               sk->sk_write_pending--;
 out:
        rcu_read_unlock();
 }
@@ -1773,11 +1618,17 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
        spin_unlock_bh(&xprt->transport_lock);
 }
 
-static unsigned short xs_get_random_port(void)
+static int xs_get_random_port(void)
 {
-       unsigned short range = xprt_max_resvport - xprt_min_resvport + 1;
-       unsigned short rand = (unsigned short) prandom_u32() % range;
-       return rand + xprt_min_resvport;
+       unsigned short min = xprt_min_resvport, max = xprt_max_resvport;
+       unsigned short range;
+       unsigned short rand;
+
+       if (max < min)
+               return -EADDRINUSE;
+       range = max - min + 1;
+       rand = (unsigned short) prandom_u32() % range;
+       return rand + min;
 }
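
[Editor's note] This is the "safely reallow resvport min/max inversion" fix from the changelog: the sysctl setters (see param_set_portnr at the end of this patch) no longer prevent min > max, so the inverted range is rejected here with -EADDRINUSE instead of feeding a bogus range to the modulus. A standalone sketch, with rand() standing in for prandom_u32():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int get_random_port(unsigned short min, unsigned short max)
{
	unsigned short range, r;

	if (max < min)
		return -EADDRINUSE;	/* inverted range: no usable port */
	range = max - min + 1;
	r = (unsigned short)rand() % range;
	return r + min;
}

int main(void)
{
	printf("%d\n", get_random_port(665, 1023));	/* a port in range */
	printf("%d\n", get_random_port(800, 700));	/* -EADDRINUSE */
	return 0;
}

xs_bind() below simply propagates the non-positive value ("if (port <= 0) return port;"), so an inverted range fails the connection attempt rather than binding to a garbage port.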
 
 /**
@@ -1833,9 +1684,9 @@ static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock)
                transport->srcport = xs_sock_getport(sock);
 }
 
-static unsigned short xs_get_srcport(struct sock_xprt *transport)
+static int xs_get_srcport(struct sock_xprt *transport)
 {
-       unsigned short port = transport->srcport;
+       int port = transport->srcport;
 
        if (port == 0 && transport->xprt.resvport)
                port = xs_get_random_port();
@@ -1856,7 +1707,7 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
 {
        struct sockaddr_storage myaddr;
        int err, nloop = 0;
-       unsigned short port = xs_get_srcport(transport);
+       int port = xs_get_srcport(transport);
        unsigned short last;
 
        /*
@@ -1874,8 +1725,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
         * transport->xprt.resvport == 1) xs_get_srcport above will
         * ensure that port is non-zero and we will bind as needed.
         */
-       if (port == 0)
-               return 0;
+       if (port <= 0)
+               return port;
 
        memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen);
        do {
@@ -2028,9 +1879,8 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
                write_unlock_bh(&sk->sk_callback_lock);
        }
 
-       /* Tell the socket layer to start connecting... */
-       xprt->stat.connect_count++;
-       xprt->stat.connect_start = jiffies;
+       xs_stream_reset_connect(transport);
+
        return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0);
 }
 
@@ -2062,6 +1912,9 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
        case 0:
                dprintk("RPC:       xprt %p connected to %s\n",
                                xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
+               xprt->stat.connect_count++;
+               xprt->stat.connect_time += (long)jiffies -
+                                          xprt->stat.connect_start;
                xprt_set_connected(xprt);
        case -ENOBUFS:
                break;
@@ -2386,9 +2239,10 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 
        xs_set_memalloc(xprt);
 
+       /* Reset TCP record info */
+       xs_stream_reset_connect(transport);
+
        /* Tell the socket layer to start connecting... */
-       xprt->stat.connect_count++;
-       xprt->stat.connect_start = jiffies;
        set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state);
        ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);
        switch (ret) {
@@ -2561,7 +2415,7 @@ static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
                        "%llu %llu %lu %llu %llu\n",
                        xprt->stat.bind_count,
                        xprt->stat.connect_count,
-                       xprt->stat.connect_time,
+                       xprt->stat.connect_time / HZ,
                        idle_time,
                        xprt->stat.sends,
                        xprt->stat.recvs,
@@ -2616,7 +2470,7 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
                        transport->srcport,
                        xprt->stat.bind_count,
                        xprt->stat.connect_count,
-                       xprt->stat.connect_time,
+                       xprt->stat.connect_time / HZ,
                        idle_time,
                        xprt->stat.sends,
                        xprt->stat.recvs,
@@ -2704,9 +2558,8 @@ static int bc_sendto(struct rpc_rqst *req)
 /*
  * The send routine. Borrows from svc_send
  */
-static int bc_send_request(struct rpc_task *task)
+static int bc_send_request(struct rpc_rqst *req)
 {
-       struct rpc_rqst *req = task->tk_rqstp;
        struct svc_xprt *xprt;
        int len;
 
@@ -2720,12 +2573,7 @@ static int bc_send_request(struct rpc_task *task)
         * Grab the mutex to serialize data as the connection is shared
         * with the fore channel
         */
-       if (!mutex_trylock(&xprt->xpt_mutex)) {
-               rpc_sleep_on(&xprt->xpt_bc_pending, task, NULL);
-               if (!mutex_trylock(&xprt->xpt_mutex))
-                       return -EAGAIN;
-               rpc_wake_up_queued_task(&xprt->xpt_bc_pending, task);
-       }
+       mutex_lock(&xprt->xpt_mutex);
        if (test_bit(XPT_DEAD, &xprt->xpt_flags))
                len = -ENOTCONN;
        else
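
[Editor's note] The hunk above replaces the trylock/rpc_sleep_on/retry dance with a plain blocking mutex_lock(); the apparent rationale (my reading of this series, not stated in the hunk) is that bc_send_request() is now invoked from a context that may sleep. A hypothetical userspace before/after sketch of the two serialization styles:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t xpt_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Old style: fail fast and let the caller requeue the task. */
static int send_nonblocking(void)
{
	if (pthread_mutex_trylock(&xpt_mutex) != 0)
		return -EAGAIN;		/* caller retries later */
	puts("sent (trylock)");
	pthread_mutex_unlock(&xpt_mutex);
	return 0;
}

/* New style: just wait for the fore channel to release the mutex. */
static int send_blocking(void)
{
	pthread_mutex_lock(&xpt_mutex);
	puts("sent (blocking)");
	pthread_mutex_unlock(&xpt_mutex);
	return 0;
}

int main(void)
{
	send_nonblocking();
	send_blocking();
	return 0;
}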
@@ -2761,7 +2609,7 @@ static void bc_destroy(struct rpc_xprt *xprt)
 
 static const struct rpc_xprt_ops xs_local_ops = {
        .reserve_xprt           = xprt_reserve_xprt,
-       .release_xprt           = xs_tcp_release_xprt,
+       .release_xprt           = xprt_release_xprt,
        .alloc_slot             = xprt_alloc_slot,
        .free_slot              = xprt_free_slot,
        .rpcbind                = xs_local_rpcbind,
@@ -2769,6 +2617,7 @@ static const struct rpc_xprt_ops xs_local_ops = {
        .connect                = xs_local_connect,
        .buf_alloc              = rpc_malloc,
        .buf_free               = rpc_free,
+       .prepare_request        = xs_stream_prepare_request,
        .send_request           = xs_local_send_request,
        .set_retrans_timeout    = xprt_set_retrans_timeout_def,
        .close                  = xs_close,
@@ -2803,14 +2652,15 @@ static const struct rpc_xprt_ops xs_udp_ops = {
 
 static const struct rpc_xprt_ops xs_tcp_ops = {
        .reserve_xprt           = xprt_reserve_xprt,
-       .release_xprt           = xs_tcp_release_xprt,
-       .alloc_slot             = xprt_lock_and_alloc_slot,
+       .release_xprt           = xprt_release_xprt,
+       .alloc_slot             = xprt_alloc_slot,
        .free_slot              = xprt_free_slot,
        .rpcbind                = rpcb_getport_async,
        .set_port               = xs_set_port,
        .connect                = xs_connect,
        .buf_alloc              = rpc_malloc,
        .buf_free               = rpc_free,
+       .prepare_request        = xs_stream_prepare_request,
        .send_request           = xs_tcp_send_request,
        .set_retrans_timeout    = xprt_set_retrans_timeout_def,
        .close                  = xs_tcp_shutdown,
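
[Editor's note] Besides switching release_xprt and alloc_slot to the generic helpers, this hunk adds an optional .prepare_request hook to the ops table. The sketch below shows the general optional-callback pattern such a member implies; the struct and caller are hypothetical, not the kernel's rpc_xprt_ops.

#include <stdio.h>

struct xprt_ops {
	void (*prepare_request)(void);	/* optional */
	int  (*send_request)(void);	/* required */
};

static void stream_prepare(void) { puts("prepare: set up receive state"); }
static int  stream_send(void)    { puts("send"); return 0; }

static const struct xprt_ops stream_ops = {
	.prepare_request = stream_prepare,
	.send_request    = stream_send,
};

static int transmit(const struct xprt_ops *ops)
{
	if (ops->prepare_request)	/* skipped by transports without it */
		ops->prepare_request();
	return ops->send_request();
}

int main(void)
{
	return transmit(&stream_ops);
}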
@@ -2952,9 +2802,8 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
        xprt->ops = &xs_local_ops;
        xprt->timeout = &xs_local_default_timeout;
 
-       INIT_WORK(&transport->recv_worker, xs_local_data_receive_workfn);
-       INIT_DELAYED_WORK(&transport->connect_worker,
-                       xs_dummy_setup_socket);
+       INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn);
+       INIT_DELAYED_WORK(&transport->connect_worker, xs_dummy_setup_socket);
 
        switch (sun->sun_family) {
        case AF_LOCAL:
@@ -3106,7 +2955,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
        xprt->connect_timeout = xprt->timeout->to_initval *
                (xprt->timeout->to_retries + 1);
 
-       INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn);
+       INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn);
        INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
 
        switch (addr->sa_family) {
@@ -3317,12 +3166,8 @@ static int param_set_uint_minmax(const char *val,
 
 static int param_set_portnr(const char *val, const struct kernel_param *kp)
 {
-       if (kp->arg == &xprt_min_resvport)
-               return param_set_uint_minmax(val, kp,
-                       RPC_MIN_RESVPORT,
-                       xprt_max_resvport);
        return param_set_uint_minmax(val, kp,
-                       xprt_min_resvport,
+                       RPC_MIN_RESVPORT,
                        RPC_MAX_RESVPORT);
 }