xprtrdma: Reduce the number of hardway buffer allocations
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 93726560eaa8864465b72a03e11d3dfa6d97703e..c80995af82de819dee79f463ed7254b75f08191e 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -50,6 +50,7 @@
 #include <linux/interrupt.h>
 #include <linux/pci.h> /* for Tavor hack below */
 #include <linux/slab.h>
+#include <linux/bitops.h>	/* for fls() */
 
 #include "xprt_rdma.h"
 
@@ -142,98 +143,139 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
        }
 }
 
-static inline
-void rpcrdma_event_process(struct ib_wc *wc)
+static void
+rpcrdma_sendcq_process_wc(struct ib_wc *wc)
 {
-       struct rpcrdma_mw *frmr;
-       struct rpcrdma_rep *rep =
-                       (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
+       struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
 
-       dprintk("RPC:       %s: event rep %p status %X opcode %X length %u\n",
-               __func__, rep, wc->status, wc->opcode, wc->byte_len);
+       dprintk("RPC:       %s: frmr %p status %X opcode %d\n",
+               __func__, frmr, wc->status, wc->opcode);
 
-       if (!rep) /* send or bind completion that we don't care about */
+       if (wc->wr_id == 0ULL)
                return;
-
-       if (IB_WC_SUCCESS != wc->status) {
-               dprintk("RPC:       %s: WC opcode %d status %X, connection lost\n",
-                       __func__, wc->opcode, wc->status);
-               rep->rr_len = ~0U;
-               if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
-                       rpcrdma_schedule_tasklet(rep);
+       if (wc->status != IB_WC_SUCCESS)
                return;
-       }
 
-       switch (wc->opcode) {
-       case IB_WC_FAST_REG_MR:
-               frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+       if (wc->opcode == IB_WC_FAST_REG_MR)
                frmr->r.frmr.state = FRMR_IS_VALID;
-               break;
-       case IB_WC_LOCAL_INV:
-               frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+       else if (wc->opcode == IB_WC_LOCAL_INV)
                frmr->r.frmr.state = FRMR_IS_INVALID;
-               break;
-       case IB_WC_RECV:
-               rep->rr_len = wc->byte_len;
-               ib_dma_sync_single_for_cpu(
-                       rdmab_to_ia(rep->rr_buffer)->ri_id->device,
-                       rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
-               /* Keep (only) the most recent credits, after check validity */
-               if (rep->rr_len >= 16) {
-                       struct rpcrdma_msg *p =
-                                       (struct rpcrdma_msg *) rep->rr_base;
-                       unsigned int credits = ntohl(p->rm_credit);
-                       if (credits == 0) {
-                               dprintk("RPC:       %s: server"
-                                       " dropped credits to 0!\n", __func__);
-                               /* don't deadlock */
-                               credits = 1;
-                       } else if (credits > rep->rr_buffer->rb_max_requests) {
-                               dprintk("RPC:       %s: server"
-                                       " over-crediting: %d (%d)\n",
-                                       __func__, credits,
-                                       rep->rr_buffer->rb_max_requests);
-                               credits = rep->rr_buffer->rb_max_requests;
-                       }
-                       atomic_set(&rep->rr_buffer->rb_credits, credits);
-               }
-               /* fall through */
-       case IB_WC_BIND_MW:
-               rpcrdma_schedule_tasklet(rep);
-               break;
-       default:
-               dprintk("RPC:       %s: unexpected WC event %X\n",
-                       __func__, wc->opcode);
-               break;
-       }
 }
 
-static inline int
-rpcrdma_cq_poll(struct ib_cq *cq)
+static int
+rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
 {
-       struct ib_wc wc;
-       int rc;
+       struct ib_wc *wcs;
+       int budget, count, rc;
 
-       for (;;) {
-               rc = ib_poll_cq(cq, 1, &wc);
-               if (rc < 0) {
-                       dprintk("RPC:       %s: ib_poll_cq failed %i\n",
-                               __func__, rc);
+       budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
+       do {
+               wcs = ep->rep_send_wcs;
+
+               rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
+               if (rc <= 0)
                        return rc;
-               }
-               if (rc == 0)
-                       break;
 
-               rpcrdma_event_process(&wc);
+               count = rc;
+               while (count-- > 0)
+                       rpcrdma_sendcq_process_wc(wcs++);
+       } while (rc == RPCRDMA_POLLSIZE && --budget);
+       return 0;
+}
+
+/*
+ * Handle send, fast_reg_mr, and local_inv completions.
+ *
+ * Send events are typically suppressed and thus do not result
+ * in an upcall. Occasionally one is signaled, however. This
+ * prevents the provider's completion queue from wrapping and
+ * losing a completion.
+ */
+static void
+rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
+{
+       struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
+       int rc;
+
+       rc = rpcrdma_sendcq_poll(cq, ep);
+       if (rc) {
+               dprintk("RPC:       %s: ib_poll_cq failed: %i\n",
+                       __func__, rc);
+               return;
+       }
+
+       rc = ib_req_notify_cq(cq,
+                       IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+       if (rc == 0)
+               return;
+       if (rc < 0) {
+               dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
+                       __func__, rc);
+               return;
+       }
+
+       rpcrdma_sendcq_poll(cq, ep);
+}
+
+static void
+rpcrdma_recvcq_process_wc(struct ib_wc *wc)
+{
+       struct rpcrdma_rep *rep =
+                       (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
+
+       dprintk("RPC:       %s: rep %p status %X opcode %X length %u\n",
+               __func__, rep, wc->status, wc->opcode, wc->byte_len);
+
+       if (wc->status != IB_WC_SUCCESS) {
+               rep->rr_len = ~0U;
+               goto out_schedule;
        }
+       if (wc->opcode != IB_WC_RECV)
+               return;
+
+       rep->rr_len = wc->byte_len;
+       ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
+                       rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
+
+       if (rep->rr_len >= 16) {
+               struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
+               unsigned int credits = ntohl(p->rm_credit);
+
+               if (credits == 0)
+                       credits = 1;    /* don't deadlock */
+               else if (credits > rep->rr_buffer->rb_max_requests)
+                       credits = rep->rr_buffer->rb_max_requests;
+               atomic_set(&rep->rr_buffer->rb_credits, credits);
+       }
+
+out_schedule:
+       rpcrdma_schedule_tasklet(rep);
+}
 
+static int
+rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+{
+       struct ib_wc *wcs;
+       int budget, count, rc;
+
+       budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
+       do {
+               wcs = ep->rep_recv_wcs;
+
+               rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
+               if (rc <= 0)
+                       return rc;
+
+               count = rc;
+               while (count-- > 0)
+                       rpcrdma_recvcq_process_wc(wcs++);
+       } while (rc == RPCRDMA_POLLSIZE && --budget);
        return 0;
 }
 
 /*
- * rpcrdma_cq_event_upcall
+ * Handle receive completions.
  *
- * This upcall handles recv, send, bind and unbind events.
  * It is reentrant but processes single events in order to maintain
  * ordering of receives to keep server credits.
  *
@@ -242,26 +284,31 @@ rpcrdma_cq_poll(struct ib_cq *cq)
  * connection shutdown. That is, the structures required for
  * the completion of the reply handler must remain intact until
  * all memory has been reclaimed.
- *
- * Note that send events are suppressed and do not result in an upcall.
  */
 static void
-rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
+rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
 {
+       struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
        int rc;
 
-       rc = rpcrdma_cq_poll(cq);
-       if (rc)
+       rc = rpcrdma_recvcq_poll(cq, ep);
+       if (rc) {
+               dprintk("RPC:       %s: ib_poll_cq failed: %i\n",
+                       __func__, rc);
                return;
+       }
 
-       rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
-       if (rc) {
-               dprintk("RPC:       %s: ib_req_notify_cq failed %i\n",
+       rc = ib_req_notify_cq(cq,
+                       IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+       if (rc == 0)
+               return;
+       if (rc < 0) {
+               dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
                        __func__, rc);
                return;
        }
 
-       rpcrdma_cq_poll(cq);
+       rpcrdma_recvcq_poll(cq, ep);
 }
 
 #ifdef RPC_DEBUG
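
The two upcalls above share a drain-arm-drain shape: poll the CQ in batches of
RPCRDMA_POLLSIZE work completions, stop after RPCRDMA_WC_BUDGET completions so
a single upcall cannot monopolize the interrupt context, then re-arm with
IB_CQ_REPORT_MISSED_EVENTS and drain once more if the notify reports a race.
A condensed sketch of that pattern, with WC_BUDGET and POLLSIZE as
illustrative stand-ins for the RPCRDMA_* constants in xprt_rdma.h and
process_wc() standing in for rpcrdma_{send,recv}cq_process_wc():

#include <rdma/ib_verbs.h>

#define WC_BUDGET	128	/* illustrative; see RPCRDMA_WC_BUDGET */
#define POLLSIZE	16	/* illustrative; see RPCRDMA_POLLSIZE */

static int cq_drain(struct ib_cq *cq, struct ib_wc *wcs,
		    void (*process_wc)(struct ib_wc *))
{
	int budget = WC_BUDGET / POLLSIZE;
	int i, rc;

	do {
		/* wcs must point at an array of at least POLLSIZE entries */
		rc = ib_poll_cq(cq, POLLSIZE, wcs);
		if (rc <= 0)
			return rc;
		for (i = 0; i < rc; i++)
			process_wc(&wcs[i]);
	} while (rc == POLLSIZE && --budget);
	return 0;
}

static void cq_upcall(struct ib_cq *cq, struct ib_wc *wcs,
		      void (*process_wc)(struct ib_wc *))
{
	if (cq_drain(cq, wcs, process_wc) < 0)
		return;
	/* A positive return means completions slipped in between the
	 * final poll and the notify request; drain them now rather than
	 * waiting for an interrupt that will never fire. */
	if (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
				 IB_CQ_REPORT_MISSED_EVENTS) > 0)
		cq_drain(cq, wcs, process_wc);
}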
@@ -493,54 +540,32 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
                ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
        }
 
-       switch (memreg) {
-       case RPCRDMA_MEMWINDOWS:
-       case RPCRDMA_MEMWINDOWS_ASYNC:
-               if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
-                       dprintk("RPC:       %s: MEMWINDOWS registration "
-                               "specified but not supported by adapter, "
-                               "using slower RPCRDMA_REGISTER\n",
-                               __func__);
-                       memreg = RPCRDMA_REGISTER;
-               }
-               break;
-       case RPCRDMA_MTHCAFMR:
-               if (!ia->ri_id->device->alloc_fmr) {
-#if RPCRDMA_PERSISTENT_REGISTRATION
-                       dprintk("RPC:       %s: MTHCAFMR registration "
-                               "specified but not supported by adapter, "
-                               "using riskier RPCRDMA_ALLPHYSICAL\n",
-                               __func__);
-                       memreg = RPCRDMA_ALLPHYSICAL;
-#else
-                       dprintk("RPC:       %s: MTHCAFMR registration "
-                               "specified but not supported by adapter, "
-                               "using slower RPCRDMA_REGISTER\n",
-                               __func__);
-                       memreg = RPCRDMA_REGISTER;
-#endif
-               }
-               break;
-       case RPCRDMA_FRMR:
+       if (memreg == RPCRDMA_FRMR) {
                /* Requires both frmr reg and local dma lkey */
                if ((devattr.device_cap_flags &
                     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
                    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
-#if RPCRDMA_PERSISTENT_REGISTRATION
                        dprintk("RPC:       %s: FRMR registration "
-                               "specified but not supported by adapter, "
-                               "using riskier RPCRDMA_ALLPHYSICAL\n",
-                               __func__);
+                               "not supported by HCA\n", __func__);
+                       memreg = RPCRDMA_MTHCAFMR;
+               } else {
+                       /* Mind the ia limit on FRMR page list depth */
+                       ia->ri_max_frmr_depth = min_t(unsigned int,
+                               RPCRDMA_MAX_DATA_SEGS,
+                               devattr.max_fast_reg_page_list_len);
+               }
+       }
+       if (memreg == RPCRDMA_MTHCAFMR) {
+               if (!ia->ri_id->device->alloc_fmr) {
+                       dprintk("RPC:       %s: MTHCAFMR registration "
+                               "not supported by HCA\n", __func__);
+#if RPCRDMA_PERSISTENT_REGISTRATION
                        memreg = RPCRDMA_ALLPHYSICAL;
 #else
-                       dprintk("RPC:       %s: FRMR registration "
-                               "specified but not supported by adapter, "
-                               "using slower RPCRDMA_REGISTER\n",
-                               __func__);
-                       memreg = RPCRDMA_REGISTER;
+                       rc = -ENOMEM;
+                       goto out2;
 #endif
                }
-               break;
        }
 
        /*
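
The effect of the two probes above is a fixed degradation order when the
default RPCRDMA_FRMR mode is requested: FRMR if the HCA advertises both
fast-registration and a local DMA lkey, otherwise FMR if the verbs provider
implements alloc_fmr, otherwise ALLPHYSICAL when that mode is compiled in,
and a hard -ENOMEM failure when it is not. A sketch of that decision, where
choose_memreg() is a hypothetical helper and the capability fields come from
struct ib_device_attr:

static int choose_memreg(const struct ib_device_attr *attr,
			 bool have_alloc_fmr)
{
	const int frmr_caps = IB_DEVICE_MEM_MGT_EXTENSIONS |
			      IB_DEVICE_LOCAL_DMA_LKEY;

	if ((attr->device_cap_flags & frmr_caps) == frmr_caps)
		return RPCRDMA_FRMR;	/* preferred: fast registration */
	if (have_alloc_fmr)
		return RPCRDMA_MTHCAFMR;
#if RPCRDMA_PERSISTENT_REGISTRATION
	return RPCRDMA_ALLPHYSICAL;	/* riskier: exposes all memory */
#else
	return -ENOMEM;			/* no usable registration mode */
#endif
}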
@@ -552,8 +577,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
         * adapter.
         */
        switch (memreg) {
-       case RPCRDMA_BOUNCEBUFFERS:
-       case RPCRDMA_REGISTER:
        case RPCRDMA_FRMR:
                break;
 #if RPCRDMA_PERSISTENT_REGISTRATION
@@ -563,30 +586,26 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
                                IB_ACCESS_REMOTE_READ;
                goto register_setup;
 #endif
-       case RPCRDMA_MEMWINDOWS_ASYNC:
-       case RPCRDMA_MEMWINDOWS:
-               mem_priv = IB_ACCESS_LOCAL_WRITE |
-                               IB_ACCESS_MW_BIND;
-               goto register_setup;
        case RPCRDMA_MTHCAFMR:
                if (ia->ri_have_dma_lkey)
                        break;
                mem_priv = IB_ACCESS_LOCAL_WRITE;
+#if RPCRDMA_PERSISTENT_REGISTRATION
        register_setup:
+#endif
                ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
                if (IS_ERR(ia->ri_bind_mem)) {
                        printk(KERN_ALERT "%s: ib_get_dma_mr for "
-                               "phys register failed with %lX\n\t"
-                               "Will continue with degraded performance\n",
+                               "phys register failed with %lX\n",
                                __func__, PTR_ERR(ia->ri_bind_mem));
-                       memreg = RPCRDMA_REGISTER;
-                       ia->ri_bind_mem = NULL;
+                       rc = -ENOMEM;
+                       goto out2;
                }
                break;
        default:
-               printk(KERN_ERR "%s: invalid memory registration mode %d\n",
-                               __func__, memreg);
-               rc = -EINVAL;
+               printk(KERN_ERR "RPC: Unsupported memory "
+                               "registration mode: %d\n", memreg);
+               rc = -ENOMEM;
                goto out2;
        }
        dprintk("RPC:       %s: memory registration strategy is %d\n",
@@ -640,6 +659,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
                                struct rpcrdma_create_data_internal *cdata)
 {
        struct ib_device_attr devattr;
+       struct ib_cq *sendcq, *recvcq;
        int rc, err;
 
        rc = ib_query_device(ia->ri_id->device, &devattr);
@@ -659,32 +679,42 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        ep->rep_attr.srq = NULL;
        ep->rep_attr.cap.max_send_wr = cdata->max_requests;
        switch (ia->ri_memreg_strategy) {
-       case RPCRDMA_FRMR:
+       case RPCRDMA_FRMR: {
+               int depth = 7;
+
                /* Add room for frmr register and invalidate WRs.
                 * 1. FRMR reg WR for head
                 * 2. FRMR invalidate WR for head
-                * 3. FRMR reg WR for pagelist
-                * 4. FRMR invalidate WR for pagelist
+                * 3. N FRMR reg WRs for pagelist
+                * 4. N FRMR invalidate WRs for pagelist
                 * 5. FRMR reg WR for tail
                 * 6. FRMR invalidate WR for tail
                 * 7. The RDMA_SEND WR
                 */
-               ep->rep_attr.cap.max_send_wr *= 7;
+
+               /* Calculate N if the device max FRMR depth is smaller than
+                * RPCRDMA_MAX_DATA_SEGS.
+                */
+               if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
+                       int delta = RPCRDMA_MAX_DATA_SEGS -
+                                   ia->ri_max_frmr_depth;
+
+                       do {
+                               depth += 2; /* FRMR reg + invalidate */
+                               delta -= ia->ri_max_frmr_depth;
+                       } while (delta > 0);
+               }
+               ep->rep_attr.cap.max_send_wr *= depth;
                if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
-                       cdata->max_requests = devattr.max_qp_wr / 7;
+                       cdata->max_requests = devattr.max_qp_wr / depth;
                        if (!cdata->max_requests)
                                return -EINVAL;
-                       ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
+                       ep->rep_attr.cap.max_send_wr = cdata->max_requests *
+                                                      depth;
                }
                break;
-       case RPCRDMA_MEMWINDOWS_ASYNC:
-       case RPCRDMA_MEMWINDOWS:
-               /* Add room for mw_binds+unbinds - overkill! */
-               ep->rep_attr.cap.max_send_wr++;
-               ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
-               if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
-                       return -EINVAL;
-               break;
+       }
        default:
                break;
        }
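
To make the new send-WR arithmetic concrete: assuming RPCRDMA_MAX_DATA_SEGS
is 64, a device that caps max_fast_reg_page_list_len at 16 needs
DIV_ROUND_UP(64, 16) = 4 FRMRs to cover one request's pagelist, i.e. three
reg/invalidate pairs beyond the single pair already counted in the base
depth of 7. The loop above computes exactly that; restated standalone with
illustrative parameter values:

static int frmr_send_wr_depth(int max_data_segs, int max_frmr_depth)
{
	int depth = 7;	/* head reg+inv, one pagelist reg+inv,
			 * tail reg+inv, and the RDMA_SEND itself */

	if (max_frmr_depth < max_data_segs) {
		int delta = max_data_segs - max_frmr_depth;

		do {
			depth += 2;	/* one extra pagelist reg+inv */
			delta -= max_frmr_depth;
		} while (delta > 0);
	}
	return depth;
}

/* frmr_send_wr_depth(64, 16) == 13, so with a devattr.max_qp_wr of,
 * say, 16384, cdata->max_requests would be clamped to 16384 / 13 == 1260. */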
@@ -705,46 +735,51 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
                ep->rep_attr.cap.max_recv_sge);
 
        /* set trigger for requesting send completion */
-       ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /*  - 1*/;
-       switch (ia->ri_memreg_strategy) {
-       case RPCRDMA_MEMWINDOWS_ASYNC:
-       case RPCRDMA_MEMWINDOWS:
-               ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
-               break;
-       default:
-               break;
-       }
+       ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
        if (ep->rep_cqinit <= 2)
                ep->rep_cqinit = 0;
        INIT_CQCOUNT(ep);
        ep->rep_ia = ia;
        init_waitqueue_head(&ep->rep_connect_wait);
+       INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
 
-       /*
-        * Create a single cq for receive dto and mw_bind (only ever
-        * care about unbind, really). Send completions are suppressed.
-        * Use single threaded tasklet upcalls to maintain ordering.
-        */
-       ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
-                                 rpcrdma_cq_async_error_upcall, NULL,
-                                 ep->rep_attr.cap.max_recv_wr +
+       sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
+                                 rpcrdma_cq_async_error_upcall, ep,
                                  ep->rep_attr.cap.max_send_wr + 1, 0);
-       if (IS_ERR(ep->rep_cq)) {
-               rc = PTR_ERR(ep->rep_cq);
-               dprintk("RPC:       %s: ib_create_cq failed: %i\n",
+       if (IS_ERR(sendcq)) {
+               rc = PTR_ERR(sendcq);
+               dprintk("RPC:       %s: failed to create send CQ: %i\n",
                        __func__, rc);
                goto out1;
        }
 
-       rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
+       rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
        if (rc) {
                dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
                        __func__, rc);
                goto out2;
        }
 
-       ep->rep_attr.send_cq = ep->rep_cq;
-       ep->rep_attr.recv_cq = ep->rep_cq;
+       recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
+                                 rpcrdma_cq_async_error_upcall, ep,
+                                 ep->rep_attr.cap.max_recv_wr + 1, 0);
+       if (IS_ERR(recvcq)) {
+               rc = PTR_ERR(recvcq);
+               dprintk("RPC:       %s: failed to create recv CQ: %i\n",
+                       __func__, rc);
+               goto out2;
+       }
+
+       rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
+       if (rc) {
+               dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
+                       __func__, rc);
+               ib_destroy_cq(recvcq);
+               goto out2;
+       }
+
+       ep->rep_attr.send_cq = sendcq;
+       ep->rep_attr.recv_cq = recvcq;
 
        /* Initialize cma parameters */
 
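
The rep_cqinit trigger set above feeds the CQCOUNT accounting that decides
when a send WR is posted signaled. Assuming the macros in xprt_rdma.h have
their usual shape (rep_cqcount being an atomic_t in struct rpcrdma_ep), the
mechanism is:

#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)

Every posted send decrements the counter; when it reaches zero the WR is
posted with IB_SEND_SIGNALED and the counter resets, so at most one in every
rep_cqinit sends is signaled. Setting the trigger to max_send_wr / 2 - 1
ensures the send CQ, sized at max_send_wr + 1 entries, drains well before it
can wrap.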
@@ -754,9 +789,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 
        /* Client offers RDMA Read but does not initiate */
        ep->rep_remote_cma.initiator_depth = 0;
-       if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
-               ep->rep_remote_cma.responder_resources = 0;
-       else if (devattr.max_qp_rd_atom > 32)   /* arbitrary but <= 255 */
+       if (devattr.max_qp_rd_atom > 32)        /* arbitrary but <= 255 */
                ep->rep_remote_cma.responder_resources = 32;
        else
                ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
@@ -768,7 +801,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        return 0;
 
 out2:
-       err = ib_destroy_cq(ep->rep_cq);
+       err = ib_destroy_cq(sendcq);
        if (err)
                dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
                        __func__, err);
@@ -782,11 +815,8 @@ out1:
  * Disconnect and destroy endpoint. After this, the only
  * valid operations on the ep are to free it (if dynamically
  * allocated) or re-create it.
- *
- * The caller's error handling must be sure to not leak the endpoint
- * if this function fails.
  */
-int
+void
 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
        int rc;
@@ -794,6 +824,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
        dprintk("RPC:       %s: entering, connected is %d\n",
                __func__, ep->rep_connected);
 
+       cancel_delayed_work_sync(&ep->rep_connect_worker);
+
        if (ia->ri_id->qp) {
                rc = rpcrdma_ep_disconnect(ep, ia);
                if (rc)
@@ -809,13 +841,17 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
                ep->rep_pad_mr = NULL;
        }
 
-       rpcrdma_clean_cq(ep->rep_cq);
-       rc = ib_destroy_cq(ep->rep_cq);
+       rpcrdma_clean_cq(ep->rep_attr.recv_cq);
+       rc = ib_destroy_cq(ep->rep_attr.recv_cq);
        if (rc)
                dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
                        __func__, rc);
 
-       return rc;
+       rpcrdma_clean_cq(ep->rep_attr.send_cq);
+       rc = ib_destroy_cq(ep->rep_attr.send_cq);
+       if (rc)
+               dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
+                       __func__, rc);
 }
 
 /*
@@ -835,7 +871,9 @@ retry:
                if (rc && rc != -ENOTCONN)
                        dprintk("RPC:       %s: rpcrdma_ep_disconnect"
                                " status %i\n", __func__, rc);
-               rpcrdma_clean_cq(ep->rep_cq);
+
+               rpcrdma_clean_cq(ep->rep_attr.recv_cq);
+               rpcrdma_clean_cq(ep->rep_attr.send_cq);
 
                xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
                id = rpcrdma_create_id(xprt, ia,
@@ -944,7 +982,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
        int rc;
 
-       rpcrdma_clean_cq(ep->rep_cq);
+       rpcrdma_clean_cq(ep->rep_attr.recv_cq);
+       rpcrdma_clean_cq(ep->rep_attr.send_cq);
        rc = rdma_disconnect(ia->ri_id);
        if (!rc) {
                /* returns without wait if not connected */
@@ -967,7 +1006,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
        struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
 {
        char *p;
-       size_t len;
+       size_t len, rlen, wlen;
        int i, rc;
        struct rpcrdma_mw *r;
 
@@ -997,11 +1036,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
                len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
                                sizeof(struct rpcrdma_mw);
                break;
-       case RPCRDMA_MEMWINDOWS_ASYNC:
-       case RPCRDMA_MEMWINDOWS:
-               len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
-                               sizeof(struct rpcrdma_mw);
-               break;
        default:
                break;
        }
@@ -1032,32 +1066,29 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
        }
        p += cdata->padding;
 
-       /*
-        * Allocate the fmr's, or mw's for mw_bind chunk registration.
-        * We "cycle" the mw's in order to minimize rkey reuse,
-        * and also reduce unbind-to-bind collision.
-        */
        INIT_LIST_HEAD(&buf->rb_mws);
        r = (struct rpcrdma_mw *)p;
        switch (ia->ri_memreg_strategy) {
        case RPCRDMA_FRMR:
                for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
                        r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
-                                                        RPCRDMA_MAX_SEGS);
+                                               ia->ri_max_frmr_depth);
                        if (IS_ERR(r->r.frmr.fr_mr)) {
                                rc = PTR_ERR(r->r.frmr.fr_mr);
                                dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
                                        " failed %i\n", __func__, rc);
                                goto out;
                        }
-                       r->r.frmr.fr_pgl =
-                               ib_alloc_fast_reg_page_list(ia->ri_id->device,
-                                                           RPCRDMA_MAX_SEGS);
+                       r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
+                                               ia->ri_id->device,
+                                               ia->ri_max_frmr_depth);
                        if (IS_ERR(r->r.frmr.fr_pgl)) {
                                rc = PTR_ERR(r->r.frmr.fr_pgl);
                                dprintk("RPC:       %s: "
                                        "ib_alloc_fast_reg_page_list "
                                        "failed %i\n", __func__, rc);
+
+                               ib_dereg_mr(r->r.frmr.fr_mr);
                                goto out;
                        }
                        list_add(&r->mw_list, &buf->rb_mws);
@@ -1082,21 +1113,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
                        ++r;
                }
                break;
-       case RPCRDMA_MEMWINDOWS_ASYNC:
-       case RPCRDMA_MEMWINDOWS:
-               /* Allocate one extra request's worth, for full cycling */
-               for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
-                       r->r.mw = ib_alloc_mw(ia->ri_pd, IB_MW_TYPE_1);
-                       if (IS_ERR(r->r.mw)) {
-                               rc = PTR_ERR(r->r.mw);
-                               dprintk("RPC:       %s: ib_alloc_mw"
-                                       " failed %i\n", __func__, rc);
-                               goto out;
-                       }
-                       list_add(&r->mw_list, &buf->rb_mws);
-                       ++r;
-               }
-               break;
        default:
                break;
        }
@@ -1105,16 +1121,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
         * Allocate/init the request/reply buffers. Doing this
         * using kmalloc for now -- one for each buf.
         */
+       wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
+       rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
+       dprintk("RPC:       %s: wlen = %zu, rlen = %zu\n",
+               __func__, wlen, rlen);
+
        for (i = 0; i < buf->rb_max_requests; i++) {
                struct rpcrdma_req *req;
                struct rpcrdma_rep *rep;
 
-               len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
-               /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
-               /* Typical ~2400b, so rounding up saves work later */
-               if (len < 4096)
-                       len = 4096;
-               req = kmalloc(len, GFP_KERNEL);
+               req = kmalloc(wlen, GFP_KERNEL);
                if (req == NULL) {
                        dprintk("RPC:       %s: request buffer %d alloc"
                                " failed\n", __func__, i);
@@ -1126,16 +1142,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
                buf->rb_send_bufs[i]->rl_buffer = buf;
 
                rc = rpcrdma_register_internal(ia, req->rl_base,
-                               len - offsetof(struct rpcrdma_req, rl_base),
+                               wlen - offsetof(struct rpcrdma_req, rl_base),
                                &buf->rb_send_bufs[i]->rl_handle,
                                &buf->rb_send_bufs[i]->rl_iov);
                if (rc)
                        goto out;
 
-               buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
+               buf->rb_send_bufs[i]->rl_size = wlen -
+                                               sizeof(struct rpcrdma_req);
 
-               len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
-               rep = kmalloc(len, GFP_KERNEL);
+               rep = kmalloc(rlen, GFP_KERNEL);
                if (rep == NULL) {
                        dprintk("RPC:       %s: reply buffer %d alloc failed\n",
                                __func__, i);
@@ -1145,10 +1161,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
                memset(rep, 0, sizeof(struct rpcrdma_rep));
                buf->rb_recv_bufs[i] = rep;
                buf->rb_recv_bufs[i]->rr_buffer = buf;
-               init_waitqueue_head(&rep->rr_unbind);
 
                rc = rpcrdma_register_internal(ia, rep->rr_base,
-                               len - offsetof(struct rpcrdma_rep, rr_base),
+                               rlen - offsetof(struct rpcrdma_rep, rr_base),
                                &buf->rb_recv_bufs[i]->rr_handle,
                                &buf->rb_recv_bufs[i]->rr_iov);
                if (rc)
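
The old sizing open-coded a 4KB floor; the new one rounds the full request
and reply footprints up to a power of two so each allocation lands on a
kmalloc slab boundary instead of being rounded up behind the caller's back.
Note the semantics of the expression: for n > 0, 1 << fls(n) is the smallest
power of two strictly greater than n, so an exact power of two doubles. A
worked sketch (the 2400-byte figure comes from the comment deleted above):

#include <linux/bitops.h>

/*
 * fls(2400) == 12  ->  1 << 12 == 4096 (the old hand-coded floor)
 * fls(4096) == 13  ->  1 << 13 == 8192 (exact powers double)
 */
static size_t round_up_pow2(size_t n)
{
	return 1UL << fls(n);
}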
@@ -1179,7 +1194,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 
        /* clean up in reverse order from create
         *   1.  recv mr memory (mr free, then kfree)
-        *   1a. bind mw memory
         *   2.  send mr memory (mr free, then kfree)
         *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
         *   4.  arrays
@@ -1194,41 +1208,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
                        kfree(buf->rb_recv_bufs[i]);
                }
                if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
-                       while (!list_empty(&buf->rb_mws)) {
-                               r = list_entry(buf->rb_mws.next,
-                                       struct rpcrdma_mw, mw_list);
-                               list_del(&r->mw_list);
-                               switch (ia->ri_memreg_strategy) {
-                               case RPCRDMA_FRMR:
-                                       rc = ib_dereg_mr(r->r.frmr.fr_mr);
-                                       if (rc)
-                                               dprintk("RPC:       %s:"
-                                                       " ib_dereg_mr"
-                                                       " failed %i\n",
-                                                       __func__, rc);
-                                       ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
-                                       break;
-                               case RPCRDMA_MTHCAFMR:
-                                       rc = ib_dealloc_fmr(r->r.fmr);
-                                       if (rc)
-                                               dprintk("RPC:       %s:"
-                                                       " ib_dealloc_fmr"
-                                                       " failed %i\n",
-                                                       __func__, rc);
-                                       break;
-                               case RPCRDMA_MEMWINDOWS_ASYNC:
-                               case RPCRDMA_MEMWINDOWS:
-                                       rc = ib_dealloc_mw(r->r.mw);
-                                       if (rc)
-                                               dprintk("RPC:       %s:"
-                                                       " ib_dealloc_mw"
-                                                       " failed %i\n",
-                                                       __func__, rc);
-                                       break;
-                               default:
-                                       break;
-                               }
-                       }
                        rpcrdma_deregister_internal(ia,
                                        buf->rb_send_bufs[i]->rl_handle,
                                        &buf->rb_send_bufs[i]->rl_iov);
@@ -1236,6 +1215,33 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
                }
        }
 
+       while (!list_empty(&buf->rb_mws)) {
+               r = list_entry(buf->rb_mws.next,
+                       struct rpcrdma_mw, mw_list);
+               list_del(&r->mw_list);
+               switch (ia->ri_memreg_strategy) {
+               case RPCRDMA_FRMR:
+                       rc = ib_dereg_mr(r->r.frmr.fr_mr);
+                       if (rc)
+                               dprintk("RPC:       %s:"
+                                       " ib_dereg_mr"
+                                       " failed %i\n",
+                                       __func__, rc);
+                       ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+                       break;
+               case RPCRDMA_MTHCAFMR:
+                       rc = ib_dealloc_fmr(r->r.fmr);
+                       if (rc)
+                               dprintk("RPC:       %s:"
+                                       " ib_dealloc_fmr"
+                                       " failed %i\n",
+                                       __func__, rc);
+                       break;
+               default:
+                       break;
+               }
+       }
+
        kfree(buf->rb_pool);
 }
 
@@ -1305,15 +1311,12 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
        req->rl_niovs = 0;
        if (req->rl_reply) {
                buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
-               init_waitqueue_head(&req->rl_reply->rr_unbind);
                req->rl_reply->rr_func = NULL;
                req->rl_reply = NULL;
        }
        switch (ia->ri_memreg_strategy) {
        case RPCRDMA_FRMR:
        case RPCRDMA_MTHCAFMR:
-       case RPCRDMA_MEMWINDOWS_ASYNC:
-       case RPCRDMA_MEMWINDOWS:
                /*
                 * Cycle mw's back in reverse order, and "spin" them.
                 * This delays and scrambles reuse as much as possible.
@@ -1358,8 +1361,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
 
 /*
  * Put reply buffers back into pool when not attached to
- * request. This happens in error conditions, and when
- * aborting unbinds. Pre-decrement counter/array index.
+ * request. This happens in error conditions.
  */
 void
 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
@@ -1498,8 +1500,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
        seg1->mr_offset -= pageoff;     /* start of page */
        seg1->mr_len += pageoff;
        len = -pageoff;
-       if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
-               *nsegs = RPCRDMA_MAX_DATA_SEGS;
+       if (*nsegs > ia->ri_max_frmr_depth)
+               *nsegs = ia->ri_max_frmr_depth;
        for (page_no = i = 0; i < *nsegs;) {
                rpcrdma_map_one(ia, seg, writing);
                pa = seg->mr_dma;
@@ -1661,135 +1663,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
        return rc;
 }
 
-static int
-rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
-                       int *nsegs, int writing, struct rpcrdma_ia *ia,
-                       struct rpcrdma_xprt *r_xprt)
-{
-       int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
-                                 IB_ACCESS_REMOTE_READ);
-       struct ib_mw_bind param;
-       int rc;
-
-       *nsegs = 1;
-       rpcrdma_map_one(ia, seg, writing);
-       param.bind_info.mr = ia->ri_bind_mem;
-       param.wr_id = 0ULL;     /* no send cookie */
-       param.bind_info.addr = seg->mr_dma;
-       param.bind_info.length = seg->mr_len;
-       param.send_flags = 0;
-       param.bind_info.mw_access_flags = mem_priv;
-
-       DECR_CQCOUNT(&r_xprt->rx_ep);
-       rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
-       if (rc) {
-               dprintk("RPC:       %s: failed ib_bind_mw "
-                       "%u@0x%llx status %i\n",
-                       __func__, seg->mr_len,
-                       (unsigned long long)seg->mr_dma, rc);
-               rpcrdma_unmap_one(ia, seg);
-       } else {
-               seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
-               seg->mr_base = param.bind_info.addr;
-               seg->mr_nsegs = 1;
-       }
-       return rc;
-}
-
-static int
-rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
-                       struct rpcrdma_ia *ia,
-                       struct rpcrdma_xprt *r_xprt, void **r)
-{
-       struct ib_mw_bind param;
-       LIST_HEAD(l);
-       int rc;
-
-       BUG_ON(seg->mr_nsegs != 1);
-       param.bind_info.mr = ia->ri_bind_mem;
-       param.bind_info.addr = 0ULL;    /* unbind */
-       param.bind_info.length = 0;
-       param.bind_info.mw_access_flags = 0;
-       if (*r) {
-               param.wr_id = (u64) (unsigned long) *r;
-               param.send_flags = IB_SEND_SIGNALED;
-               INIT_CQCOUNT(&r_xprt->rx_ep);
-       } else {
-               param.wr_id = 0ULL;
-               param.send_flags = 0;
-               DECR_CQCOUNT(&r_xprt->rx_ep);
-       }
-       rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
-       rpcrdma_unmap_one(ia, seg);
-       if (rc)
-               dprintk("RPC:       %s: failed ib_(un)bind_mw,"
-                       " status %i\n", __func__, rc);
-       else
-               *r = NULL;      /* will upcall on completion */
-       return rc;
-}
-
-static int
-rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
-                       int *nsegs, int writing, struct rpcrdma_ia *ia)
-{
-       int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
-                                 IB_ACCESS_REMOTE_READ);
-       struct rpcrdma_mr_seg *seg1 = seg;
-       struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
-       int len, i, rc = 0;
-
-       if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
-               *nsegs = RPCRDMA_MAX_DATA_SEGS;
-       for (len = 0, i = 0; i < *nsegs;) {
-               rpcrdma_map_one(ia, seg, writing);
-               ipb[i].addr = seg->mr_dma;
-               ipb[i].size = seg->mr_len;
-               len += seg->mr_len;
-               ++seg;
-               ++i;
-               /* Check for holes */
-               if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
-                   offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
-                       break;
-       }
-       seg1->mr_base = seg1->mr_dma;
-       seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
-                               ipb, i, mem_priv, &seg1->mr_base);
-       if (IS_ERR(seg1->mr_chunk.rl_mr)) {
-               rc = PTR_ERR(seg1->mr_chunk.rl_mr);
-               dprintk("RPC:       %s: failed ib_reg_phys_mr "
-                       "%u@0x%llx (%d)... status %i\n",
-                       __func__, len,
-                       (unsigned long long)seg1->mr_dma, i, rc);
-               while (i--)
-                       rpcrdma_unmap_one(ia, --seg);
-       } else {
-               seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
-               seg1->mr_nsegs = i;
-               seg1->mr_len = len;
-       }
-       *nsegs = i;
-       return rc;
-}
-
-static int
-rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
-                       struct rpcrdma_ia *ia)
-{
-       struct rpcrdma_mr_seg *seg1 = seg;
-       int rc;
-
-       rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
-       seg1->mr_chunk.rl_mr = NULL;
-       while (seg1->mr_nsegs--)
-               rpcrdma_unmap_one(ia, seg++);
-       if (rc)
-               dprintk("RPC:       %s: failed ib_dereg_mr,"
-                       " status %i\n", __func__, rc);
-       return rc;
-}
-
 int
 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
                        int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
@@ -1819,16 +1692,8 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
                rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
                break;
 
-       /* Registration using memory windows */
-       case RPCRDMA_MEMWINDOWS_ASYNC:
-       case RPCRDMA_MEMWINDOWS:
-               rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
-               break;
-
-       /* Default registration each time */
        default:
-               rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
-               break;
+               return -1;
        }
        if (rc)
                return -1;
@@ -1838,7 +1703,7 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
 
 int
 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
-               struct rpcrdma_xprt *r_xprt, void *r)
+               struct rpcrdma_xprt *r_xprt)
 {
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
        int nsegs = seg->mr_nsegs, rc;
@@ -1861,21 +1726,9 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
                rc = rpcrdma_deregister_fmr_external(seg, ia);
                break;
 
-       case RPCRDMA_MEMWINDOWS_ASYNC:
-       case RPCRDMA_MEMWINDOWS:
-               rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
-               break;
-
        default:
-               rc = rpcrdma_deregister_default_external(seg, ia);
                break;
        }
-       if (r) {
-               struct rpcrdma_rep *rep = r;
-               void (*func)(struct rpcrdma_rep *) = rep->rr_func;
-               rep->rr_func = NULL;
-               func(rep);      /* dereg done, callback now */
-       }
        return nsegs;
 }
 
@@ -1950,7 +1803,6 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
        ib_dma_sync_single_for_cpu(ia->ri_id->device,
                rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
 
-       DECR_CQCOUNT(ep);
        rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
 
        if (rc)
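
With receives now completing on their own CQ, posting a receive no longer
consumes send-completion budget, hence the DECR_CQCOUNT(ep) dropped above.
The send path keeps the pattern; rpcrdma_ep_post() decides signaling roughly
as follows (a sketch of the decision, not the verbatim function):

	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;		/* suppress completion */
	else {
		/* provider must report a completion now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}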