xprtrdma: Remove rl_mr field, and the mr_chunk union
net/sunrpc/xprtrdma/verbs.c
1 /*
2  * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the BSD-type
8  * license below:
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  *      Redistributions of source code must retain the above copyright
15  *      notice, this list of conditions and the following disclaimer.
16  *
17  *      Redistributions in binary form must reproduce the above
18  *      copyright notice, this list of conditions and the following
19  *      disclaimer in the documentation and/or other materials provided
20  *      with the distribution.
21  *
22  *      Neither the name of the Network Appliance, Inc. nor the names of
23  *      its contributors may be used to endorse or promote products
24  *      derived from this software without specific prior written
25  *      permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38  */
39
40 /*
41  * verbs.c
42  *
43  * Encapsulates the major functions managing:
44  *  o adapters
45  *  o endpoints
46  *  o connections
47  *  o buffer memory
48  */
49
50 #include <linux/interrupt.h>
51 #include <linux/slab.h>
52 #include <asm/bitops.h>
53
54 #include "xprt_rdma.h"
55
56 /*
57  * Globals/Macros
58  */
59
60 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
61 # define RPCDBG_FACILITY        RPCDBG_TRANS
62 #endif
63
64 static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
65 static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
66
67 /*
68  * internal functions
69  */
70
71 /*
72  * Handle replies in tasklet context, using a single, global list.
73  * The rdma tasklet function simply turns around and calls the reply
74  * handler for each reply on the list.
75  */
76
77 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
78 static LIST_HEAD(rpcrdma_tasklets_g);
79
80 static void
81 rpcrdma_run_tasklet(unsigned long data)
82 {
83         struct rpcrdma_rep *rep;
84         void (*func)(struct rpcrdma_rep *);
85         unsigned long flags;
86
87         data = data;            /* tasklet argument is unused */
88         spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
89         while (!list_empty(&rpcrdma_tasklets_g)) {
90                 rep = list_entry(rpcrdma_tasklets_g.next,
91                                  struct rpcrdma_rep, rr_list);
92                 list_del(&rep->rr_list);
93                 func = rep->rr_func;
94                 rep->rr_func = NULL;
95                 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
96
97                 if (func)
98                         func(rep);
99                 else
100                         rpcrdma_recv_buffer_put(rep);
101
102                 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
103         }
104         spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
105 }
106
107 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
108
109 static const char * const async_event[] = {
110         "CQ error",
111         "QP fatal error",
112         "QP request error",
113         "QP access error",
114         "communication established",
115         "send queue drained",
116         "path migration successful",
117         "path mig error",
118         "device fatal error",
119         "port active",
120         "port error",
121         "LID change",
122         "P_key change",
123         "SM change",
124         "SRQ error",
125         "SRQ limit reached",
126         "last WQE reached",
127         "client reregister",
128         "GID change",
129 };
130
131 #define ASYNC_MSG(status)                                       \
132         ((status) < ARRAY_SIZE(async_event) ?                   \
133                 async_event[(status)] : "unknown async error")
134
135 static void
136 rpcrdma_schedule_tasklet(struct list_head *sched_list)
137 {
138         unsigned long flags;
139
140         spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
141         list_splice_tail(sched_list, &rpcrdma_tasklets_g);
142         spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
143         tasklet_schedule(&rpcrdma_tasklet_g);
144 }
145
146 static void
147 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
148 {
149         struct rpcrdma_ep *ep = context;
150
151         pr_err("RPC:       %s: %s on device %s ep %p\n",
152                __func__, ASYNC_MSG(event->event),
153                event->device->name, context);
154         if (ep->rep_connected == 1) {
155                 ep->rep_connected = -EIO;
156                 ep->rep_func(ep);
157                 wake_up_all(&ep->rep_connect_wait);
158         }
159 }
160
161 static void
162 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
163 {
164         struct rpcrdma_ep *ep = context;
165
166         pr_err("RPC:       %s: %s on device %s ep %p\n",
167                __func__, ASYNC_MSG(event->event),
168                event->device->name, context);
169         if (ep->rep_connected == 1) {
170                 ep->rep_connected = -EIO;
171                 ep->rep_func(ep);
172                 wake_up_all(&ep->rep_connect_wait);
173         }
174 }
175
176 static const char * const wc_status[] = {
177         "success",
178         "local length error",
179         "local QP operation error",
180         "local EE context operation error",
181         "local protection error",
182         "WR flushed",
183         "memory management operation error",
184         "bad response error",
185         "local access error",
186         "remote invalid request error",
187         "remote access error",
188         "remote operation error",
189         "transport retry counter exceeded",
190         "RNR retry counter exceeded",
191         "local RDD violation error",
192         "remote invalid RD request",
193         "operation aborted",
194         "invalid EE context number",
195         "invalid EE context state",
196         "fatal error",
197         "response timeout error",
198         "general error",
199 };
200
201 #define COMPLETION_MSG(status)                                  \
202         ((status) < ARRAY_SIZE(wc_status) ?                     \
203                 wc_status[(status)] : "unexpected completion error")
204
205 static void
206 rpcrdma_sendcq_process_wc(struct ib_wc *wc)
207 {
208         if (likely(wc->status == IB_WC_SUCCESS))
209                 return;
210
211         /* WARNING: Only wr_id and status are reliable at this point */
212         if (wc->wr_id == 0ULL) {
213                 if (wc->status != IB_WC_WR_FLUSH_ERR)
214                         pr_err("RPC:       %s: SEND: %s\n",
215                                __func__, COMPLETION_MSG(wc->status));
216         } else {
217                 struct rpcrdma_mw *r;
218
219                 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
220                 r->r.frmr.fr_state = FRMR_IS_STALE;
221                 pr_err("RPC:       %s: frmr %p (stale): %s\n",
222                        __func__, r, COMPLETION_MSG(wc->status));
223         }
224 }
225
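/* Poll the send CQ in batches of RPCRDMA_POLLSIZE completions. The loop
 * below stops either when a batch comes back short (the CQ is drained)
 * or when roughly RPCRDMA_WC_BUDGET completions have been reaped in a
 * single upcall, whichever happens first.
 */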
226 static int
227 rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
228 {
229         struct ib_wc *wcs;
230         int budget, count, rc;
231
232         budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
233         do {
234                 wcs = ep->rep_send_wcs;
235
236                 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
237                 if (rc <= 0)
238                         return rc;
239
240                 count = rc;
241                 while (count-- > 0)
242                         rpcrdma_sendcq_process_wc(wcs++);
243         } while (rc == RPCRDMA_POLLSIZE && --budget);
244         return 0;
245 }
246
247 /*
248  * Handle send, fast_reg_mr, and local_inv completions.
249  *
250  * Send events are typically suppressed and thus do not result
251  * in an upcall. Occasionally one is signaled, however. This
252  * prevents the provider's completion queue from wrapping and
253  * losing a completion.
254  */
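
/* Note on wr_id usage, as handled in rpcrdma_sendcq_process_wc() above:
 * a zero wr_id identifies a plain SEND completion, while a nonzero
 * wr_id carries the rpcrdma_mw posted with a FAST_REG_MR or LOCAL_INV
 * WR; on error, that FRMR is marked stale.
 */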
255 static void
256 rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
257 {
258         struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
259         int rc;
260
261         rc = rpcrdma_sendcq_poll(cq, ep);
262         if (rc) {
263                 dprintk("RPC:       %s: ib_poll_cq failed: %i\n",
264                         __func__, rc);
265                 return;
266         }
267
268         rc = ib_req_notify_cq(cq,
269                         IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
270         if (rc == 0)
271                 return;
272         if (rc < 0) {
273                 dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
274                         __func__, rc);
275                 return;
276         }
277
278         rpcrdma_sendcq_poll(cq, ep);
279 }
280
281 static void
282 rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
283 {
284         struct rpcrdma_rep *rep =
285                         (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
286
287         /* WARNING: Only wr_id and status are reliable at this point */
288         if (wc->status != IB_WC_SUCCESS)
289                 goto out_fail;
290
291         /* status == SUCCESS means all fields in wc are trustworthy */
292         if (wc->opcode != IB_WC_RECV)
293                 return;
294
295         dprintk("RPC:       %s: rep %p opcode 'recv', length %u: success\n",
296                 __func__, rep, wc->byte_len);
297
298         rep->rr_len = wc->byte_len;
299         ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
300                         rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
301
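        /* A reply of at least 16 bytes contains the fixed portion of the
         * RPC-over-RDMA header; rm_credit below is the server's credit
         * grant from that header, clamped to the range
         * [1, rb_max_requests] before it is published via rb_credits.
         */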
302         if (rep->rr_len >= 16) {
303                 struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
304                 unsigned int credits = ntohl(p->rm_credit);
305
306                 if (credits == 0)
307                         credits = 1;    /* don't deadlock */
308                 else if (credits > rep->rr_buffer->rb_max_requests)
309                         credits = rep->rr_buffer->rb_max_requests;
310                 atomic_set(&rep->rr_buffer->rb_credits, credits);
311         }
312
313 out_schedule:
314         list_add_tail(&rep->rr_list, sched_list);
315         return;
316 out_fail:
317         if (wc->status != IB_WC_WR_FLUSH_ERR)
318                 pr_err("RPC:       %s: rep %p: %s\n",
319                        __func__, rep, COMPLETION_MSG(wc->status));
320         rep->rr_len = ~0U;
321         goto out_schedule;
322 }
323
324 static int
325 rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
326 {
327         struct list_head sched_list;
328         struct ib_wc *wcs;
329         int budget, count, rc;
330
331         INIT_LIST_HEAD(&sched_list);
332         budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
333         do {
334                 wcs = ep->rep_recv_wcs;
335
336                 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
337                 if (rc <= 0)
338                         goto out_schedule;
339
340                 count = rc;
341                 while (count-- > 0)
342                         rpcrdma_recvcq_process_wc(wcs++, &sched_list);
343         } while (rc == RPCRDMA_POLLSIZE && --budget);
344         rc = 0;
345
346 out_schedule:
347         rpcrdma_schedule_tasklet(&sched_list);
348         return rc;
349 }
350
351 /*
352  * Handle receive completions.
353  *
354  * It is reentrant but processes single events in order to preserve
355  * the ordering of receives, which keeps server credit accounting correct.
356  *
357  * It is the responsibility of the scheduled tasklet to return
358  * recv buffers to the pool. NOTE: this affects synchronization of
359  * connection shutdown. That is, the structures required for
360  * the completion of the reply handler must remain intact until
361  * all memory has been reclaimed.
362  */
363 static void
364 rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
365 {
366         struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
367         int rc;
368
369         rc = rpcrdma_recvcq_poll(cq, ep);
370         if (rc) {
371                 dprintk("RPC:       %s: ib_poll_cq failed: %i\n",
372                         __func__, rc);
373                 return;
374         }
375
376         rc = ib_req_notify_cq(cq,
377                         IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
378         if (rc == 0)
379                 return;
380         if (rc < 0) {
381                 dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
382                         __func__, rc);
383                 return;
384         }
385
386         rpcrdma_recvcq_poll(cq, ep);
387 }
388
389 static void
390 rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
391 {
392         struct ib_wc wc;
393         LIST_HEAD(sched_list);
394
395         while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
396                 rpcrdma_recvcq_process_wc(&wc, &sched_list);
397         if (!list_empty(&sched_list))
398                 rpcrdma_schedule_tasklet(&sched_list);
399         while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
400                 rpcrdma_sendcq_process_wc(&wc);
401 }
402
403 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
404 static const char * const conn[] = {
405         "address resolved",
406         "address error",
407         "route resolved",
408         "route error",
409         "connect request",
410         "connect response",
411         "connect error",
412         "unreachable",
413         "rejected",
414         "established",
415         "disconnected",
416         "device removal",
417         "multicast join",
418         "multicast error",
419         "address change",
420         "timewait exit",
421 };
422
423 #define CONNECTION_MSG(status)                                          \
424         ((status) < ARRAY_SIZE(conn) ?                                  \
425                 conn[(status)] : "unrecognized connection error")
426 #endif
427
428 static int
429 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
430 {
431         struct rpcrdma_xprt *xprt = id->context;
432         struct rpcrdma_ia *ia = &xprt->rx_ia;
433         struct rpcrdma_ep *ep = &xprt->rx_ep;
434 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
435         struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
436 #endif
437         struct ib_qp_attr attr;
438         struct ib_qp_init_attr iattr;
439         int connstate = 0;
440
441         switch (event->event) {
442         case RDMA_CM_EVENT_ADDR_RESOLVED:
443         case RDMA_CM_EVENT_ROUTE_RESOLVED:
444                 ia->ri_async_rc = 0;
445                 complete(&ia->ri_done);
446                 break;
447         case RDMA_CM_EVENT_ADDR_ERROR:
448                 ia->ri_async_rc = -EHOSTUNREACH;
449                 dprintk("RPC:       %s: CM address resolution error, ep 0x%p\n",
450                         __func__, ep);
451                 complete(&ia->ri_done);
452                 break;
453         case RDMA_CM_EVENT_ROUTE_ERROR:
454                 ia->ri_async_rc = -ENETUNREACH;
455                 dprintk("RPC:       %s: CM route resolution error, ep 0x%p\n",
456                         __func__, ep);
457                 complete(&ia->ri_done);
458                 break;
459         case RDMA_CM_EVENT_ESTABLISHED:
460                 connstate = 1;
461                 ib_query_qp(ia->ri_id->qp, &attr,
462                         IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
463                         &iattr);
464                 dprintk("RPC:       %s: %d responder resources"
465                         " (%d initiator)\n",
466                         __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
467                 goto connected;
468         case RDMA_CM_EVENT_CONNECT_ERROR:
469                 connstate = -ENOTCONN;
470                 goto connected;
471         case RDMA_CM_EVENT_UNREACHABLE:
472                 connstate = -ENETDOWN;
473                 goto connected;
474         case RDMA_CM_EVENT_REJECTED:
475                 connstate = -ECONNREFUSED;
476                 goto connected;
477         case RDMA_CM_EVENT_DISCONNECTED:
478                 connstate = -ECONNABORTED;
479                 goto connected;
480         case RDMA_CM_EVENT_DEVICE_REMOVAL:
481                 connstate = -ENODEV;
482 connected:
483                 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
484                 dprintk("RPC:       %s: %sconnected\n",
485                                         __func__, connstate > 0 ? "" : "dis");
486                 ep->rep_connected = connstate;
487                 ep->rep_func(ep);
488                 wake_up_all(&ep->rep_connect_wait);
489                 /*FALLTHROUGH*/
490         default:
491                 dprintk("RPC:       %s: %pI4:%u (ep 0x%p): %s\n",
492                         __func__, &addr->sin_addr.s_addr,
493                         ntohs(addr->sin_port), ep,
494                         CONNECTION_MSG(event->event));
495                 break;
496         }
497
498 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
499         if (connstate == 1) {
500                 int ird = attr.max_dest_rd_atomic;
501                 int tird = ep->rep_remote_cma.responder_resources;
502                 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
503                         "on %s, memreg %d slots %d ird %d%s\n",
504                         &addr->sin_addr.s_addr,
505                         ntohs(addr->sin_port),
506                         ia->ri_id->device->name,
507                         ia->ri_memreg_strategy,
508                         xprt->rx_buf.rb_max_requests,
509                         ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
510         } else if (connstate < 0) {
511                 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
512                         &addr->sin_addr.s_addr,
513                         ntohs(addr->sin_port),
514                         connstate);
515         }
516 #endif
517
518         return 0;
519 }
520
521 static struct rdma_cm_id *
522 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
523                         struct rpcrdma_ia *ia, struct sockaddr *addr)
524 {
525         struct rdma_cm_id *id;
526         int rc;
527
528         init_completion(&ia->ri_done);
529
530         id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
531         if (IS_ERR(id)) {
532                 rc = PTR_ERR(id);
533                 dprintk("RPC:       %s: rdma_create_id() failed %i\n",
534                         __func__, rc);
535                 return id;
536         }
537
538         ia->ri_async_rc = -ETIMEDOUT;
539         rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
540         if (rc) {
541                 dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
542                         __func__, rc);
543                 goto out;
544         }
545         wait_for_completion_interruptible_timeout(&ia->ri_done,
546                                 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
547         rc = ia->ri_async_rc;
548         if (rc)
549                 goto out;
550
551         ia->ri_async_rc = -ETIMEDOUT;
552         rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
553         if (rc) {
554                 dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
555                         __func__, rc);
556                 goto out;
557         }
558         wait_for_completion_interruptible_timeout(&ia->ri_done,
559                                 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
560         rc = ia->ri_async_rc;
561         if (rc)
562                 goto out;
563
564         return id;
565
566 out:
567         rdma_destroy_id(id);
568         return ERR_PTR(rc);
569 }
570
571 /*
572  * Drain any CQ prior to teardown.
573  */
574 static void
575 rpcrdma_clean_cq(struct ib_cq *cq)
576 {
577         struct ib_wc wc;
578         int count = 0;
579
580         while (1 == ib_poll_cq(cq, 1, &wc))
581                 ++count;
582
583         if (count)
584                 dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
585                         __func__, count, wc.opcode);
586 }
587
588 /*
589  * Exported functions.
590  */
591
592 /*
593  * Open and initialize an Interface Adapter.
594  *  o initializes fields of struct rpcrdma_ia, including
595  *    interface and provider attributes and protection domain.
596  */
597 int
598 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
599 {
600         int rc, mem_priv;
601         struct ib_device_attr devattr;
602         struct rpcrdma_ia *ia = &xprt->rx_ia;
603
604         ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
605         if (IS_ERR(ia->ri_id)) {
606                 rc = PTR_ERR(ia->ri_id);
607                 goto out1;
608         }
609
610         ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
611         if (IS_ERR(ia->ri_pd)) {
612                 rc = PTR_ERR(ia->ri_pd);
613                 dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
614                         __func__, rc);
615                 goto out2;
616         }
617
618         /*
619          * Query the device to determine if the requested memory
620          * registration strategy is supported. If it isn't, set the
621          * strategy to a globally supported model.
622          */
623         rc = ib_query_device(ia->ri_id->device, &devattr);
624         if (rc) {
625                 dprintk("RPC:       %s: ib_query_device failed %d\n",
626                         __func__, rc);
627                 goto out2;
628         }
629
630         if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
631                 ia->ri_have_dma_lkey = 1;
632                 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
633         }
634
635         if (memreg == RPCRDMA_FRMR) {
636                 /* Requires both frmr reg and local dma lkey */
637                 if ((devattr.device_cap_flags &
638                      (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
639                     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
640                         dprintk("RPC:       %s: FRMR registration "
641                                 "not supported by HCA\n", __func__);
642                         memreg = RPCRDMA_MTHCAFMR;
643                 } else {
644                         /* Mind the ia limit on FRMR page list depth */
645                         ia->ri_max_frmr_depth = min_t(unsigned int,
646                                 RPCRDMA_MAX_DATA_SEGS,
647                                 devattr.max_fast_reg_page_list_len);
648                 }
649         }
650         if (memreg == RPCRDMA_MTHCAFMR) {
651                 if (!ia->ri_id->device->alloc_fmr) {
652                         dprintk("RPC:       %s: MTHCAFMR registration "
653                                 "not supported by HCA\n", __func__);
654                         memreg = RPCRDMA_ALLPHYSICAL;
655                 }
656         }
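
        /* At this point memreg names a mode the device can support:
         * FRMR needs both MEM_MGT_EXTENSIONS and a local DMA lkey,
         * MTHCAFMR needs the provider's alloc_fmr verb, and
         * ALLPHYSICAL is the fallback of last resort.
         */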
657
658         /*
659          * Optionally obtain an underlying physical identity mapping in
660          * order to do a memory window-based bind. This base registration
661          * is protected from remote access - that is enabled only by binding
662          * for the specific bytes targeted during each RPC operation, and
663          * revoked after the corresponding completion similar to a storage
664          * adapter.
665          */
666         switch (memreg) {
667         case RPCRDMA_FRMR:
668                 break;
669         case RPCRDMA_ALLPHYSICAL:
670                 mem_priv = IB_ACCESS_LOCAL_WRITE |
671                                 IB_ACCESS_REMOTE_WRITE |
672                                 IB_ACCESS_REMOTE_READ;
673                 goto register_setup;
674         case RPCRDMA_MTHCAFMR:
675                 if (ia->ri_have_dma_lkey)
676                         break;
677                 mem_priv = IB_ACCESS_LOCAL_WRITE;
678         register_setup:
679                 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
680                 if (IS_ERR(ia->ri_bind_mem)) {
681                         printk(KERN_ALERT "%s: ib_get_dma_mr for "
682                                 "phys register failed with %lX\n",
683                                 __func__, PTR_ERR(ia->ri_bind_mem));
684                         rc = -ENOMEM;
685                         goto out2;
686                 }
687                 break;
688         default:
689                 printk(KERN_ERR "RPC: Unsupported memory "
690                                 "registration mode: %d\n", memreg);
691                 rc = -ENOMEM;
692                 goto out2;
693         }
694         dprintk("RPC:       %s: memory registration strategy is %d\n",
695                 __func__, memreg);
696
697         /* Else will do memory reg/dereg for each chunk */
698         ia->ri_memreg_strategy = memreg;
699
700         rwlock_init(&ia->ri_qplock);
701         return 0;
702 out2:
703         rdma_destroy_id(ia->ri_id);
704         ia->ri_id = NULL;
705 out1:
706         return rc;
707 }
708
709 /*
710  * Clean up/close an IA.
711  *   o if event handles and PD have been initialized, free them.
712  *   o close the IA
713  */
714 void
715 rpcrdma_ia_close(struct rpcrdma_ia *ia)
716 {
717         int rc;
718
719         dprintk("RPC:       %s: entering\n", __func__);
720         if (ia->ri_bind_mem != NULL) {
721                 rc = ib_dereg_mr(ia->ri_bind_mem);
722                 dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
723                         __func__, rc);
724         }
725         if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
726                 if (ia->ri_id->qp)
727                         rdma_destroy_qp(ia->ri_id);
728                 rdma_destroy_id(ia->ri_id);
729                 ia->ri_id = NULL;
730         }
731         if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
732                 rc = ib_dealloc_pd(ia->ri_pd);
733                 dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
734                         __func__, rc);
735         }
736 }
737
738 /*
739  * Create unconnected endpoint.
740  */
741 int
742 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
743                                 struct rpcrdma_create_data_internal *cdata)
744 {
745         struct ib_device_attr devattr;
746         struct ib_cq *sendcq, *recvcq;
747         int rc, err;
748
749         rc = ib_query_device(ia->ri_id->device, &devattr);
750         if (rc) {
751                 dprintk("RPC:       %s: ib_query_device failed %d\n",
752                         __func__, rc);
753                 return rc;
754         }
755
756         /* check provider's send/recv wr limits */
757         if (cdata->max_requests > devattr.max_qp_wr)
758                 cdata->max_requests = devattr.max_qp_wr;
759
760         ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
761         ep->rep_attr.qp_context = ep;
762         /* send_cq and recv_cq initialized below */
763         ep->rep_attr.srq = NULL;
764         ep->rep_attr.cap.max_send_wr = cdata->max_requests;
765         switch (ia->ri_memreg_strategy) {
766         case RPCRDMA_FRMR: {
767                 int depth = 7;
768
769                 /* Add room for frmr register and invalidate WRs.
770                  * 1. FRMR reg WR for head
771                  * 2. FRMR invalidate WR for head
772                  * 3. N FRMR reg WRs for pagelist
773                  * 4. N FRMR invalidate WRs for pagelist
774                  * 5. FRMR reg WR for tail
775                  * 6. FRMR invalidate WR for tail
776                  * 7. The RDMA_SEND WR
777                  */
778
779                 /* Calculate N if the device max FRMR depth is smaller than
780                  * RPCRDMA_MAX_DATA_SEGS.
781                  */
782                 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
783                         int delta = RPCRDMA_MAX_DATA_SEGS -
784                                     ia->ri_max_frmr_depth;
785
786                         do {
787                                 depth += 2; /* FRMR reg + invalidate */
788                                 delta -= ia->ri_max_frmr_depth;
789                         } while (delta > 0);
790
791                 }
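
                /* For example, assuming ri_max_frmr_depth = 16 and
                 * RPCRDMA_MAX_DATA_SEGS = 64 (illustrative values
                 * only), delta starts at 48, the loop runs three
                 * times, and depth ends up as 7 + 3 * 2 = 13 WRs
                 * per RPC.
                 */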
792                 ep->rep_attr.cap.max_send_wr *= depth;
793                 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
794                         cdata->max_requests = devattr.max_qp_wr / depth;
795                         if (!cdata->max_requests)
796                                 return -EINVAL;
797                         ep->rep_attr.cap.max_send_wr = cdata->max_requests *
798                                                        depth;
799                 }
800                 break;
801         }
802         default:
803                 break;
804         }
805         ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
806         ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
807         ep->rep_attr.cap.max_recv_sge = 1;
808         ep->rep_attr.cap.max_inline_data = 0;
809         ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
810         ep->rep_attr.qp_type = IB_QPT_RC;
811         ep->rep_attr.port_num = ~0;
812
813         dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
814                 "iovs: send %d recv %d\n",
815                 __func__,
816                 ep->rep_attr.cap.max_send_wr,
817                 ep->rep_attr.cap.max_recv_wr,
818                 ep->rep_attr.cap.max_send_sge,
819                 ep->rep_attr.cap.max_recv_sge);
820
821         /* set trigger for requesting send completion */
822         ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
823         if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
824                 ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
825         else if (ep->rep_cqinit <= 2)
826                 ep->rep_cqinit = 0;
827         INIT_CQCOUNT(ep);
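        /* For example, with max_send_wr = 128 (an assumed value),
         * rep_cqinit starts at 63 and is then capped at
         * RPCRDMA_MAX_UNSIGNALED_SENDS if that limit is smaller;
         * a value of 2 or less disables suppression altogether.
         */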
828         init_waitqueue_head(&ep->rep_connect_wait);
829         INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
830
831         sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
832                                   rpcrdma_cq_async_error_upcall, ep,
833                                   ep->rep_attr.cap.max_send_wr + 1, 0);
834         if (IS_ERR(sendcq)) {
835                 rc = PTR_ERR(sendcq);
836                 dprintk("RPC:       %s: failed to create send CQ: %i\n",
837                         __func__, rc);
838                 goto out1;
839         }
840
841         rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
842         if (rc) {
843                 dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
844                         __func__, rc);
845                 goto out2;
846         }
847
848         recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
849                                   rpcrdma_cq_async_error_upcall, ep,
850                                   ep->rep_attr.cap.max_recv_wr + 1, 0);
851         if (IS_ERR(recvcq)) {
852                 rc = PTR_ERR(recvcq);
853                 dprintk("RPC:       %s: failed to create recv CQ: %i\n",
854                         __func__, rc);
855                 goto out2;
856         }
857
858         rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
859         if (rc) {
860                 dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
861                         __func__, rc);
862                 ib_destroy_cq(recvcq);
863                 goto out2;
864         }
865
866         ep->rep_attr.send_cq = sendcq;
867         ep->rep_attr.recv_cq = recvcq;
868
869         /* Initialize cma parameters */
870
871         /* RPC/RDMA does not use private data */
872         ep->rep_remote_cma.private_data = NULL;
873         ep->rep_remote_cma.private_data_len = 0;
874
875         /* Client offers RDMA Read but does not initiate */
876         ep->rep_remote_cma.initiator_depth = 0;
877         if (devattr.max_qp_rd_atom > 32)        /* arbitrary but <= 255 */
878                 ep->rep_remote_cma.responder_resources = 32;
879         else
880                 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
881
882         ep->rep_remote_cma.retry_count = 7;
883         ep->rep_remote_cma.flow_control = 0;
884         ep->rep_remote_cma.rnr_retry_count = 0;
885
886         return 0;
887
888 out2:
889         err = ib_destroy_cq(sendcq);
890         if (err)
891                 dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
892                         __func__, err);
893 out1:
894         return rc;
895 }
896
897 /*
898  * rpcrdma_ep_destroy
899  *
900  * Disconnect and destroy endpoint. After this, the only
901  * valid operations on the ep are to free it (if dynamically
902  * allocated) or re-create it.
903  */
904 void
905 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
906 {
907         int rc;
908
909         dprintk("RPC:       %s: entering, connected is %d\n",
910                 __func__, ep->rep_connected);
911
912         cancel_delayed_work_sync(&ep->rep_connect_worker);
913
914         if (ia->ri_id->qp) {
915                 rpcrdma_ep_disconnect(ep, ia);
916                 rdma_destroy_qp(ia->ri_id);
917                 ia->ri_id->qp = NULL;
918         }
919
920         /* padding - could be done in rpcrdma_buffer_destroy... */
921         if (ep->rep_pad_mr) {
922                 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
923                 ep->rep_pad_mr = NULL;
924         }
925
926         rpcrdma_clean_cq(ep->rep_attr.recv_cq);
927         rc = ib_destroy_cq(ep->rep_attr.recv_cq);
928         if (rc)
929                 dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
930                         __func__, rc);
931
932         rpcrdma_clean_cq(ep->rep_attr.send_cq);
933         rc = ib_destroy_cq(ep->rep_attr.send_cq);
934         if (rc)
935                 dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
936                         __func__, rc);
937 }
938
939 /*
940  * Connect unconnected endpoint.
941  */
942 int
943 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
944 {
945         struct rdma_cm_id *id, *old;
946         int rc = 0;
947         int retry_count = 0;
948
949         if (ep->rep_connected != 0) {
950                 struct rpcrdma_xprt *xprt;
951 retry:
952                 dprintk("RPC:       %s: reconnecting...\n", __func__);
953
954                 rpcrdma_ep_disconnect(ep, ia);
955                 rpcrdma_flush_cqs(ep);
956
957                 switch (ia->ri_memreg_strategy) {
958                 case RPCRDMA_FRMR:
959                         rpcrdma_reset_frmrs(ia);
960                         break;
961                 case RPCRDMA_MTHCAFMR:
962                         rpcrdma_reset_fmrs(ia);
963                         break;
964                 case RPCRDMA_ALLPHYSICAL:
965                         break;
966                 default:
967                         rc = -EIO;
968                         goto out;
969                 }
970
971                 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
972                 id = rpcrdma_create_id(xprt, ia,
973                                 (struct sockaddr *)&xprt->rx_data.addr);
974                 if (IS_ERR(id)) {
975                         rc = -EHOSTUNREACH;
976                         goto out;
977                 }
978                 /* TEMP TEMP TEMP - fail if new device:
979                  * Deregister/remarshal *all* requests!
980                  * Close and recreate adapter, pd, etc!
981                  * Re-determine all attributes still sane!
982                  * More stuff I haven't thought of!
983                  * Rrrgh!
984                  */
985                 if (ia->ri_id->device != id->device) {
986                         printk("RPC:       %s: can't reconnect on "
987                                 "different device!\n", __func__);
988                         rdma_destroy_id(id);
989                         rc = -ENETUNREACH;
990                         goto out;
991                 }
992                 /* END TEMP */
993                 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
994                 if (rc) {
995                         dprintk("RPC:       %s: rdma_create_qp failed %i\n",
996                                 __func__, rc);
997                         rdma_destroy_id(id);
998                         rc = -ENETUNREACH;
999                         goto out;
1000                 }
1001
1002                 write_lock(&ia->ri_qplock);
1003                 old = ia->ri_id;
1004                 ia->ri_id = id;
1005                 write_unlock(&ia->ri_qplock);
1006
1007                 rdma_destroy_qp(old);
1008                 rdma_destroy_id(old);
1009         } else {
1010                 dprintk("RPC:       %s: connecting...\n", __func__);
1011                 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
1012                 if (rc) {
1013                         dprintk("RPC:       %s: rdma_create_qp failed %i\n",
1014                                 __func__, rc);
1015                         /* do not update ep->rep_connected */
1016                         return -ENETUNREACH;
1017                 }
1018         }
1019
1020         ep->rep_connected = 0;
1021
1022         rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
1023         if (rc) {
1024                 dprintk("RPC:       %s: rdma_connect() failed with %i\n",
1025                                 __func__, rc);
1026                 goto out;
1027         }
1028
1029         wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
1030
1031         /*
1032          * Check state. A non-peer reject indicates no listener
1033          * (ECONNREFUSED), which may be a transient state. All other
1034          * errors indicate a transport condition for which a
1035          * best-effort connection attempt has already been made.
1036          */
1037         if (ep->rep_connected == -ECONNREFUSED &&
1038             ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
1039                 dprintk("RPC:       %s: non-peer_reject, retry\n", __func__);
1040                 goto retry;
1041         }
1042         if (ep->rep_connected <= 0) {
1043                 /* Sometimes, the only way to reliably connect to remote
1044                  * CMs is to use the same nonzero values for ORD and IRD. */
1045                 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
1046                     (ep->rep_remote_cma.responder_resources == 0 ||
1047                      ep->rep_remote_cma.initiator_depth !=
1048                                 ep->rep_remote_cma.responder_resources)) {
1049                         if (ep->rep_remote_cma.responder_resources == 0)
1050                                 ep->rep_remote_cma.responder_resources = 1;
1051                         ep->rep_remote_cma.initiator_depth =
1052                                 ep->rep_remote_cma.responder_resources;
1053                         goto retry;
1054                 }
1055                 rc = ep->rep_connected;
1056         } else {
1057                 dprintk("RPC:       %s: connected\n", __func__);
1058         }
1059
1060 out:
1061         if (rc)
1062                 ep->rep_connected = rc;
1063         return rc;
1064 }
1065
1066 /*
1067  * rpcrdma_ep_disconnect
1068  *
1069  * This is separate from destroy to facilitate the ability
1070  * to reconnect without recreating the endpoint.
1071  *
1072  * This call is not reentrant, and must not be made in parallel
1073  * on the same endpoint.
1074  */
1075 void
1076 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
1077 {
1078         int rc;
1079
1080         rpcrdma_flush_cqs(ep);
1081         rc = rdma_disconnect(ia->ri_id);
1082         if (!rc) {
1083                 /* returns without wait if not connected */
1084                 wait_event_interruptible(ep->rep_connect_wait,
1085                                                         ep->rep_connected != 1);
1086                 dprintk("RPC:       %s: after wait, %sconnected\n", __func__,
1087                         (ep->rep_connected == 1) ? "still " : "dis");
1088         } else {
1089                 dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
1090                 ep->rep_connected = rc;
1091         }
1092 }
1093
1094 static int
1095 rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1096 {
1097         int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
1098         struct ib_fmr_attr fmr_attr = {
1099                 .max_pages      = RPCRDMA_MAX_DATA_SEGS,
1100                 .max_maps       = 1,
1101                 .page_shift     = PAGE_SHIFT
1102         };
1103         struct rpcrdma_mw *r;
1104         int i, rc;
1105
1106         i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1107         dprintk("RPC:       %s: initializing %d FMRs\n", __func__, i);
1108
1109         while (i--) {
1110                 r = kzalloc(sizeof(*r), GFP_KERNEL);
1111                 if (r == NULL)
1112                         return -ENOMEM;
1113
1114                 r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
1115                 if (IS_ERR(r->r.fmr)) {
1116                         rc = PTR_ERR(r->r.fmr);
1117                         dprintk("RPC:       %s: ib_alloc_fmr failed %i\n",
1118                                 __func__, rc);
1119                         goto out_free;
1120                 }
1121
1122                 list_add(&r->mw_list, &buf->rb_mws);
1123                 list_add(&r->mw_all, &buf->rb_all);
1124         }
1125         return 0;
1126
1127 out_free:
1128         kfree(r);
1129         return rc;
1130 }
1131
1132 static int
1133 rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1134 {
1135         struct rpcrdma_frmr *f;
1136         struct rpcrdma_mw *r;
1137         int i, rc;
1138
1139         i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1140         dprintk("RPC:       %s: initializing %d FRMRs\n", __func__, i);
1141
1142         while (i--) {
1143                 r = kzalloc(sizeof(*r), GFP_KERNEL);
1144                 if (r == NULL)
1145                         return -ENOMEM;
1146                 f = &r->r.frmr;
1147
1148                 f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1149                                                 ia->ri_max_frmr_depth);
1150                 if (IS_ERR(f->fr_mr)) {
1151                         rc = PTR_ERR(f->fr_mr);
1152                         dprintk("RPC:       %s: ib_alloc_fast_reg_mr "
1153                                 "failed %i\n", __func__, rc);
1154                         goto out_free;
1155                 }
1156
1157                 f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
1158                                                         ia->ri_max_frmr_depth);
1159                 if (IS_ERR(f->fr_pgl)) {
1160                         rc = PTR_ERR(f->fr_pgl);
1161                         dprintk("RPC:       %s: ib_alloc_fast_reg_page_list "
1162                                 "failed %i\n", __func__, rc);
1163
1164                         ib_dereg_mr(f->fr_mr);
1165                         goto out_free;
1166                 }
1167
1168                 list_add(&r->mw_list, &buf->rb_mws);
1169                 list_add(&r->mw_all, &buf->rb_all);
1170         }
1171
1172         return 0;
1173
1174 out_free:
1175         kfree(r);
1176         return rc;
1177 }
1178
1179 int
1180 rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1181         struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
1182 {
1183         char *p;
1184         size_t len, rlen, wlen;
1185         int i, rc;
1186
1187         buf->rb_max_requests = cdata->max_requests;
1188         spin_lock_init(&buf->rb_lock);
1189         atomic_set(&buf->rb_credits, 1);
1190
1191         /* Need to allocate:
1192          *   1.  arrays for send and recv pointers
1193          *   2.  arrays of struct rpcrdma_req to fill in pointers
1194          *   3.  array of struct rpcrdma_rep for replies
1195          *   4.  padding, if any
1196          * Send/recv buffers in req/rep need to be registered
1197          */
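        /* The single allocation below is laid out, in order, as an array
         * of rb_max_requests rpcrdma_req pointers, an array of
         * rb_max_requests rpcrdma_rep pointers, and then the optional
         * zeroed pad buffer. The req and rep structures themselves are
         * kmalloc'd individually further down.
         */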
1198         len = buf->rb_max_requests *
1199                 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1200         len += cdata->padding;
1201
1202         p = kzalloc(len, GFP_KERNEL);
1203         if (p == NULL) {
1204                 dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1205                         __func__, len);
1206                 rc = -ENOMEM;
1207                 goto out;
1208         }
1209         buf->rb_pool = p;       /* for freeing it later */
1210
1211         buf->rb_send_bufs = (struct rpcrdma_req **) p;
1212         p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1213         buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1214         p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1215
1216         /*
1217          * Register the zeroed pad buffer, if any.
1218          */
1219         if (cdata->padding) {
1220                 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1221                                             &ep->rep_pad_mr, &ep->rep_pad);
1222                 if (rc)
1223                         goto out;
1224         }
1225         p += cdata->padding;
1226
1227         INIT_LIST_HEAD(&buf->rb_mws);
1228         INIT_LIST_HEAD(&buf->rb_all);
1229         switch (ia->ri_memreg_strategy) {
1230         case RPCRDMA_FRMR:
1231                 rc = rpcrdma_init_frmrs(ia, buf);
1232                 if (rc)
1233                         goto out;
1234                 break;
1235         case RPCRDMA_MTHCAFMR:
1236                 rc = rpcrdma_init_fmrs(ia, buf);
1237                 if (rc)
1238                         goto out;
1239                 break;
1240         default:
1241                 break;
1242         }
1243
1244         /*
1245          * Allocate/init the request/reply buffers. Doing this
1246          * using kmalloc for now -- one for each buf.
1247          */
1248         wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
1249         rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
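        /* 1 << fls(x) rounds x up to a power of two (doubling x when it
         * is already an exact power). For instance, assuming
         * inline_wsize = 1024 and a struct rpcrdma_req of a few hundred
         * bytes, wlen works out to 2048.
         */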
1250         dprintk("RPC:       %s: wlen = %zu, rlen = %zu\n",
1251                 __func__, wlen, rlen);
1252
1253         for (i = 0; i < buf->rb_max_requests; i++) {
1254                 struct rpcrdma_req *req;
1255                 struct rpcrdma_rep *rep;
1256
1257                 req = kmalloc(wlen, GFP_KERNEL);
1258                 if (req == NULL) {
1259                         dprintk("RPC:       %s: request buffer %d alloc"
1260                                 " failed\n", __func__, i);
1261                         rc = -ENOMEM;
1262                         goto out;
1263                 }
1264                 memset(req, 0, sizeof(struct rpcrdma_req));
1265                 buf->rb_send_bufs[i] = req;
1266                 buf->rb_send_bufs[i]->rl_buffer = buf;
1267
1268                 rc = rpcrdma_register_internal(ia, req->rl_base,
1269                                 wlen - offsetof(struct rpcrdma_req, rl_base),
1270                                 &buf->rb_send_bufs[i]->rl_handle,
1271                                 &buf->rb_send_bufs[i]->rl_iov);
1272                 if (rc)
1273                         goto out;
1274
1275                 buf->rb_send_bufs[i]->rl_size = wlen -
1276                                                 sizeof(struct rpcrdma_req);
1277
1278                 rep = kmalloc(rlen, GFP_KERNEL);
1279                 if (rep == NULL) {
1280                         dprintk("RPC:       %s: reply buffer %d alloc failed\n",
1281                                 __func__, i);
1282                         rc = -ENOMEM;
1283                         goto out;
1284                 }
1285                 memset(rep, 0, sizeof(struct rpcrdma_rep));
1286                 buf->rb_recv_bufs[i] = rep;
1287                 buf->rb_recv_bufs[i]->rr_buffer = buf;
1288
1289                 rc = rpcrdma_register_internal(ia, rep->rr_base,
1290                                 rlen - offsetof(struct rpcrdma_rep, rr_base),
1291                                 &buf->rb_recv_bufs[i]->rr_handle,
1292                                 &buf->rb_recv_bufs[i]->rr_iov);
1293                 if (rc)
1294                         goto out;
1295
1296         }
1297         dprintk("RPC:       %s: max_requests %d\n",
1298                 __func__, buf->rb_max_requests);
1299         /* done */
1300         return 0;
1301 out:
1302         rpcrdma_buffer_destroy(buf);
1303         return rc;
1304 }
1305
1306 static void
1307 rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1308 {
1309         struct rpcrdma_mw *r;
1310         int rc;
1311
1312         while (!list_empty(&buf->rb_all)) {
1313                 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1314                 list_del(&r->mw_all);
1315                 list_del(&r->mw_list);
1316
1317                 rc = ib_dealloc_fmr(r->r.fmr);
1318                 if (rc)
1319                         dprintk("RPC:       %s: ib_dealloc_fmr failed %i\n",
1320                                 __func__, rc);
1321
1322                 kfree(r);
1323         }
1324 }
1325
1326 static void
1327 rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
1328 {
1329         struct rpcrdma_mw *r;
1330         int rc;
1331
1332         while (!list_empty(&buf->rb_all)) {
1333                 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1334                 list_del(&r->mw_all);
1335                 list_del(&r->mw_list);
1336
1337                 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1338                 if (rc)
1339                         dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
1340                                 __func__, rc);
1341                 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1342
1343                 kfree(r);
1344         }
1345 }
1346
1347 void
1348 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1349 {
1350         struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1351         int i;
1352
1353         /* clean up in reverse order from create
1354          *   1.  recv mr memory (mr free, then kfree)
1355          *   2.  send mr memory (mr free, then kfree)
1356          *   3.  MWs
1357          */
1358         dprintk("RPC:       %s: entering\n", __func__);
1359
1360         for (i = 0; i < buf->rb_max_requests; i++) {
1361                 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1362                         rpcrdma_deregister_internal(ia,
1363                                         buf->rb_recv_bufs[i]->rr_handle,
1364                                         &buf->rb_recv_bufs[i]->rr_iov);
1365                         kfree(buf->rb_recv_bufs[i]);
1366                 }
1367                 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1368                         rpcrdma_deregister_internal(ia,
1369                                         buf->rb_send_bufs[i]->rl_handle,
1370                                         &buf->rb_send_bufs[i]->rl_iov);
1371                         kfree(buf->rb_send_bufs[i]);
1372                 }
1373         }
1374
1375         switch (ia->ri_memreg_strategy) {
1376         case RPCRDMA_FRMR:
1377                 rpcrdma_destroy_frmrs(buf);
1378                 break;
1379         case RPCRDMA_MTHCAFMR:
1380                 rpcrdma_destroy_fmrs(buf);
1381                 break;
1382         default:
1383                 break;
1384         }
1385
1386         kfree(buf->rb_pool);
1387 }
1388
1389 /* After a disconnect, unmap all FMRs.
1390  *
1391  * This is invoked only in the transport connect worker in order
1392  * to serialize with rpcrdma_register_fmr_external().
1393  */
1394 static void
1395 rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
1396 {
1397         struct rpcrdma_xprt *r_xprt =
1398                                 container_of(ia, struct rpcrdma_xprt, rx_ia);
1399         struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1400         struct list_head *pos;
1401         struct rpcrdma_mw *r;
1402         LIST_HEAD(l);
1403         int rc;
1404
1405         list_for_each(pos, &buf->rb_all) {
1406                 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1407
1408                 INIT_LIST_HEAD(&l);
1409                 list_add(&r->r.fmr->list, &l);
1410                 rc = ib_unmap_fmr(&l);
1411                 if (rc)
1412                         dprintk("RPC:       %s: ib_unmap_fmr failed %i\n",
1413                                 __func__, rc);
1414         }
1415 }
1416
1417 /* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
1418  * an unusable state. Find FRMRs in this state and dereg / reg
1419  * each.  FRMRs that are VALID and attached to an rpcrdma_req are
1420  * also torn down.
1421  *
1422  * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
1423  *
1424  * This is invoked only in the transport connect worker in order
1425  * to serialize with rpcrdma_register_frmr_external().
1426  */
1427 static void
1428 rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
1429 {
1430         struct rpcrdma_xprt *r_xprt =
1431                                 container_of(ia, struct rpcrdma_xprt, rx_ia);
1432         struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1433         struct list_head *pos;
1434         struct rpcrdma_mw *r;
1435         int rc;
1436
1437         list_for_each(pos, &buf->rb_all) {
1438                 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1439
1440                 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
1441                         continue;
1442
1443                 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1444                 if (rc)
1445                         dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
1446                                 __func__, rc);
1447                 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1448
1449                 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1450                                         ia->ri_max_frmr_depth);
1451                 if (IS_ERR(r->r.frmr.fr_mr)) {
1452                         rc = PTR_ERR(r->r.frmr.fr_mr);
1453                         dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
1454                                 " failed %i\n", __func__, rc);
1455                         continue;
1456                 }
1457                 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1458                                         ia->ri_id->device,
1459                                         ia->ri_max_frmr_depth);
1460                 if (IS_ERR(r->r.frmr.fr_pgl)) {
1461                         rc = PTR_ERR(r->r.frmr.fr_pgl);
1462                         dprintk("RPC:       %s: "
1463                                 "ib_alloc_fast_reg_page_list "
1464                                 "failed %i\n", __func__, rc);
1465
1466                         ib_dereg_mr(r->r.frmr.fr_mr);
1467                         continue;
1468                 }
1469                 r->r.frmr.fr_state = FRMR_IS_INVALID;
1470         }
1471 }
1472
1473 /* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1474  * some req segments uninitialized.
1475  */
1476 static void
1477 rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
1478 {
1479         if (*mw) {
1480                 list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
1481                 *mw = NULL;
1482         }
1483 }
1484
1485 /* Cycle MWs back in reverse order, and "spin" them.
1486  * This delays and scrambles reuse as much as possible.
1487  */
1488 static void
1489 rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1490 {
1491         struct rpcrdma_mr_seg *seg = req->rl_segments;
1492         struct rpcrdma_mr_seg *seg1 = seg;
1493         int i;
1494
1495         for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
1496                 rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
1497         rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
1498 }
1499
1500 static void
1501 rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1502 {
1503         buf->rb_send_bufs[--buf->rb_send_index] = req;
1504         req->rl_niovs = 0;
1505         if (req->rl_reply) {
1506                 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
1507                 req->rl_reply->rr_func = NULL;
1508                 req->rl_reply = NULL;
1509         }
1510 }
1511
1512 /* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
1513  * Redo only the ib_post_send().
1514  */
1515 static void
1516 rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
1517 {
1518         struct rpcrdma_xprt *r_xprt =
1519                                 container_of(ia, struct rpcrdma_xprt, rx_ia);
1520         struct ib_send_wr invalidate_wr, *bad_wr;
1521         int rc;
1522
1523         dprintk("RPC:       %s: FRMR %p is stale\n", __func__, r);
1524
1525         /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
1526         r->r.frmr.fr_state = FRMR_IS_INVALID;
1527
1528         memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1529         invalidate_wr.wr_id = (unsigned long)(void *)r;
1530         invalidate_wr.opcode = IB_WR_LOCAL_INV;
1531         invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
1532         DECR_CQCOUNT(&r_xprt->rx_ep);
1533
1534         dprintk("RPC:       %s: frmr %p invalidating rkey %08x\n",
1535                 __func__, r, r->r.frmr.fr_mr->rkey);
1536
1537         read_lock(&ia->ri_qplock);
1538         rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1539         read_unlock(&ia->ri_qplock);
1540         if (rc) {
1541                 /* Force rpcrdma_buffer_get() to retry */
1542                 r->r.frmr.fr_state = FRMR_IS_STALE;
1543                 dprintk("RPC:       %s: ib_post_send failed, %i\n",
1544                         __func__, rc);
1545         }
1546 }
1547
1548 static void
1549 rpcrdma_retry_flushed_linv(struct list_head *stale,
1550                            struct rpcrdma_buffer *buf)
1551 {
1552         struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1553         struct list_head *pos;
1554         struct rpcrdma_mw *r;
1555         unsigned long flags;
1556
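             /* Post a LOCAL_INV for each stale FRMR, then splice them
              * all back onto rb_mws under rb_lock so they can be
              * handed out again.
              */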
1557         list_for_each(pos, stale) {
1558                 r = list_entry(pos, struct rpcrdma_mw, mw_list);
1559                 rpcrdma_retry_local_inv(r, ia);
1560         }
1561
1562         spin_lock_irqsave(&buf->rb_lock, flags);
1563         list_splice_tail(stale, &buf->rb_mws);
1564         spin_unlock_irqrestore(&buf->rb_lock, flags);
1565 }
1566
1567 static struct rpcrdma_req *
1568 rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1569                          struct list_head *stale)
1570 {
1571         struct rpcrdma_mw *r;
1572         int i;
1573
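             /* Fill req->rl_segments from the last entry downward.
              * FRMRs found in the STALE state are parked on the
              * caller's "stale" list for LOCAL_INV recovery instead
              * of being attached to the request.
              */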
1574         i = RPCRDMA_MAX_SEGS - 1;
1575         while (!list_empty(&buf->rb_mws)) {
1576                 r = list_entry(buf->rb_mws.next,
1577                                struct rpcrdma_mw, mw_list);
1578                 list_del(&r->mw_list);
1579                 if (r->r.frmr.fr_state == FRMR_IS_STALE) {
1580                         list_add(&r->mw_list, stale);
1581                         continue;
1582                 }
1583                 req->rl_segments[i].rl_mw = r;
1584                 if (unlikely(i-- == 0))
1585                         return req;     /* Success */
1586         }
1587
1588         /* Not enough entries on rb_mws for this req */
1589         rpcrdma_buffer_put_sendbuf(req, buf);
1590         rpcrdma_buffer_put_mrs(req, buf);
1591         return NULL;
1592 }
1593
1594 static struct rpcrdma_req *
1595 rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1596 {
1597         struct rpcrdma_mw *r;
1598         int i;
1599
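             /* Same walk as rpcrdma_buffer_get_frmrs(), but FMRs
              * carry no state flag, so every MW on rb_mws is usable.
              */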
1600         i = RPCRDMA_MAX_SEGS - 1;
1601         while (!list_empty(&buf->rb_mws)) {
1602                 r = list_entry(buf->rb_mws.next,
1603                                struct rpcrdma_mw, mw_list);
1604                 list_del(&r->mw_list);
1605                 req->rl_segments[i].rl_mw = r;
1606                 if (unlikely(i-- == 0))
1607                         return req;     /* Success */
1608         }
1609
1610         /* Not enough entries on rb_mws for this req */
1611         rpcrdma_buffer_put_sendbuf(req, buf);
1612         rpcrdma_buffer_put_mrs(req, buf);
1613         return NULL;
1614 }
1615
1616 /*
1617  * Get a set of request/reply buffers.
1618  *
1619  * Reply buffer (if needed) is attached to send buffer upon return.
1620  * Rule:
1621  *    rb_send_index and rb_recv_index MUST always be pointing to the
1622  *    *next* available buffer (non-NULL). They are incremented after
1623  *    removing buffers, and decremented *before* returning them.
1624  */
1625 struct rpcrdma_req *
1626 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1627 {
1628         struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1629         struct list_head stale;
1630         struct rpcrdma_req *req;
1631         unsigned long flags;
1632
1633         spin_lock_irqsave(&buffers->rb_lock, flags);
1634         if (buffers->rb_send_index == buffers->rb_max_requests) {
1635                 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1636                 dprintk("RPC:       %s: out of request buffers\n", __func__);
1637                 return NULL;
1638         }
1639
1640         req = buffers->rb_send_bufs[buffers->rb_send_index];
1641         if (buffers->rb_send_index < buffers->rb_recv_index) {
1642                 dprintk("RPC:       %s: %d extra receives outstanding (ok)\n",
1643                         __func__,
1644                         buffers->rb_recv_index - buffers->rb_send_index);
1645                 req->rl_reply = NULL;
1646         } else {
1647                 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1648                 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1649         }
1650         buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1651
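             /* Attach MWs to the request while rb_lock is still held;
              * any stale FRMRs are collected and recovered after the
              * lock is dropped.
              */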
1652         INIT_LIST_HEAD(&stale);
1653         switch (ia->ri_memreg_strategy) {
1654         case RPCRDMA_FRMR:
1655                 req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
1656                 break;
1657         case RPCRDMA_MTHCAFMR:
1658                 req = rpcrdma_buffer_get_fmrs(req, buffers);
1659                 break;
1660         default:
1661                 break;
1662         }
1663         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1664         if (!list_empty(&stale))
1665                 rpcrdma_retry_flushed_linv(&stale, buffers);
1666         return req;
1667 }
1668
1669 /*
1670  * Put request/reply buffers back into pool.
1671  * Pre-decrement counter/array index.
1672  */
1673 void
1674 rpcrdma_buffer_put(struct rpcrdma_req *req)
1675 {
1676         struct rpcrdma_buffer *buffers = req->rl_buffer;
1677         struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1678         unsigned long flags;
1679
1680         spin_lock_irqsave(&buffers->rb_lock, flags);
1681         rpcrdma_buffer_put_sendbuf(req, buffers);
1682         switch (ia->ri_memreg_strategy) {
1683         case RPCRDMA_FRMR:
1684         case RPCRDMA_MTHCAFMR:
1685                 rpcrdma_buffer_put_mrs(req, buffers);
1686                 break;
1687         default:
1688                 break;
1689         }
1690         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1691 }
1692
1693 /*
1694  * Recover reply buffers from pool.
1695  * This happens when recovering from error conditions.
1696  * Post-increment counter/array index.
1697  */
1698 void
1699 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1700 {
1701         struct rpcrdma_buffer *buffers = req->rl_buffer;
1702         unsigned long flags;
1703
1704         if (req->rl_iov.length == 0)    /* special case xprt_rdma_allocate() */
1705                 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1706         spin_lock_irqsave(&buffers->rb_lock, flags);
1707         if (buffers->rb_recv_index < buffers->rb_max_requests) {
1708                 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1709                 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1710         }
1711         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1712 }
1713
1714 /*
1715  * Put reply buffers back into pool when not attached to
1716  * request. This happens in error conditions.
1717  */
1718 void
1719 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1720 {
1721         struct rpcrdma_buffer *buffers = rep->rr_buffer;
1722         unsigned long flags;
1723
1724         rep->rr_func = NULL;
1725         spin_lock_irqsave(&buffers->rb_lock, flags);
1726         buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1727         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1728 }
1729
1730 /*
1731  * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1732  */
1733
1734 int
1735 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1736                                 struct ib_mr **mrp, struct ib_sge *iov)
1737 {
1738         struct ib_phys_buf ipb;
1739         struct ib_mr *mr;
1740         int rc;
1741
1742         /*
1743          * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1744          */
1745         iov->addr = ib_dma_map_single(ia->ri_id->device,
1746                         va, len, DMA_BIDIRECTIONAL);
1747         if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
1748                 return -ENOMEM;
1749
1750         iov->length = len;
1751
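             /* Prefer the device's global DMA lkey; otherwise fall
              * back to the lkey of ri_bind_mem. Only when neither is
              * available is this buffer registered on its own below.
              */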
1752         if (ia->ri_have_dma_lkey) {
1753                 *mrp = NULL;
1754                 iov->lkey = ia->ri_dma_lkey;
1755                 return 0;
1756         } else if (ia->ri_bind_mem != NULL) {
1757                 *mrp = NULL;
1758                 iov->lkey = ia->ri_bind_mem->lkey;
1759                 return 0;
1760         }
1761
1762         ipb.addr = iov->addr;
1763         ipb.size = iov->length;
1764         mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1765                         IB_ACCESS_LOCAL_WRITE, &iov->addr);
1766
1767         dprintk("RPC:       %s: phys convert: 0x%llx "
1768                         "registered 0x%llx length %d\n",
1769                         __func__, (unsigned long long)ipb.addr,
1770                         (unsigned long long)iov->addr, len);
1771
1772         if (IS_ERR(mr)) {
1773                 *mrp = NULL;
1774                 rc = PTR_ERR(mr);
1775                 dprintk("RPC:       %s: failed with %i\n", __func__, rc);
1776         } else {
1777                 *mrp = mr;
1778                 iov->lkey = mr->lkey;
1779                 rc = 0;
1780         }
1781
1782         return rc;
1783 }
1784
1785 int
1786 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1787                                 struct ib_mr *mr, struct ib_sge *iov)
1788 {
1789         int rc;
1790
1791         ib_dma_unmap_single(ia->ri_id->device,
1792                         iov->addr, iov->length, DMA_BIDIRECTIONAL);
1793
1794         if (mr == NULL)
1795                 return 0;
1796
1797         rc = ib_dereg_mr(mr);
1798         if (rc)
1799                 dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
1800         return rc;
1801 }
1802
1803 /*
1804  * Wrappers for chunk registration, shared by read/write chunk code.
1805  */
1806
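     /* DMA-map one chunk segment. "writing" selects DMA_FROM_DEVICE
      * (the peer will place data in this memory); otherwise the data
      * is outbound and DMA_TO_DEVICE is used.
      */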
1807 static void
1808 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1809 {
1810         seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1811         seg->mr_dmalen = seg->mr_len;
1812         if (seg->mr_page)
1813                 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1814                                 seg->mr_page, offset_in_page(seg->mr_offset),
1815                                 seg->mr_dmalen, seg->mr_dir);
1816         else
1817                 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1818                                 seg->mr_offset,
1819                                 seg->mr_dmalen, seg->mr_dir);
1820         if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1821                 dprintk("RPC:       %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1822                         __func__,
1823                         (unsigned long long)seg->mr_dma,
1824                         seg->mr_offset, seg->mr_dmalen);
1825         }
1826 }
1827
1828 static void
1829 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1830 {
1831         if (seg->mr_page)
1832                 ib_dma_unmap_page(ia->ri_id->device,
1833                                 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1834         else
1835                 ib_dma_unmap_single(ia->ri_id->device,
1836                                 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1837 }
1838
1839 static int
1840 rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1841                         int *nsegs, int writing, struct rpcrdma_ia *ia,
1842                         struct rpcrdma_xprt *r_xprt)
1843 {
1844         struct rpcrdma_mr_seg *seg1 = seg;
1845         struct rpcrdma_mw *mw = seg1->rl_mw;
1846         struct rpcrdma_frmr *frmr = &mw->r.frmr;
1847         struct ib_mr *mr = frmr->fr_mr;
1848         struct ib_send_wr fastreg_wr, *bad_wr;
1849         u8 key;
1850         int len, pageoff;
1851         int i, rc;
1852         int seg_len;
1853         u64 pa;
1854         int page_no;
1855
1856         pageoff = offset_in_page(seg1->mr_offset);
1857         seg1->mr_offset -= pageoff;     /* start of page */
1858         seg1->mr_len += pageoff;
1859         len = -pageoff;
1860         if (*nsegs > ia->ri_max_frmr_depth)
1861                 *nsegs = ia->ri_max_frmr_depth;
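             /* DMA-map each segment and flatten it into page-sized
              * entries in the FRMR's page list, stopping early if a
              * segment would leave a hole in the mapped region.
              */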
1862         for (page_no = i = 0; i < *nsegs;) {
1863                 rpcrdma_map_one(ia, seg, writing);
1864                 pa = seg->mr_dma;
1865                 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1866                         frmr->fr_pgl->page_list[page_no++] = pa;
1867                         pa += PAGE_SIZE;
1868                 }
1869                 len += seg->mr_len;
1870                 ++seg;
1871                 ++i;
1872                 /* Check for holes: each interior boundary must be page-aligned */
1873                 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1874                     offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1875                         break;
1876         }
1877         dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
1878                 __func__, mw, i);
1879
1880         frmr->fr_state = FRMR_IS_VALID;
1881
1882         memset(&fastreg_wr, 0, sizeof(fastreg_wr));
1883         fastreg_wr.wr_id = (unsigned long)(void *)mw;
1884         fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1885         fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1886         fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
1887         fastreg_wr.wr.fast_reg.page_list_len = page_no;
1888         fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1889         fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1890         if (fastreg_wr.wr.fast_reg.length < len) {
1891                 rc = -EIO;
1892                 goto out_err;
1893         }
1894
1895         /* Bump the key */
1896         key = (u8)(mr->rkey & 0x000000FF);
1897         ib_update_fast_reg_key(mr, ++key);
1898
1899         fastreg_wr.wr.fast_reg.access_flags = (writing ?
1900                                 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1901                                 IB_ACCESS_REMOTE_READ);
1902         fastreg_wr.wr.fast_reg.rkey = mr->rkey;
1903         DECR_CQCOUNT(&r_xprt->rx_ep);
1904
1905         rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
1906         if (rc) {
1907                 dprintk("RPC:       %s: failed ib_post_send for register,"
1908                         " status %i\n", __func__, rc);
1909                 ib_update_fast_reg_key(mr, --key);
1910                 goto out_err;
1911         } else {
1912                 seg1->mr_rkey = mr->rkey;
1913                 seg1->mr_base = seg1->mr_dma + pageoff;
1914                 seg1->mr_nsegs = i;
1915                 seg1->mr_len = len;
1916         }
1917         *nsegs = i;
1918         return 0;
1919 out_err:
1920         frmr->fr_state = FRMR_IS_INVALID;
1921         while (i--)
1922                 rpcrdma_unmap_one(ia, --seg);
1923         return rc;
1924 }
1925
1926 static int
1927 rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1928                         struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1929 {
1930         struct rpcrdma_mr_seg *seg1 = seg;
1931         struct ib_send_wr invalidate_wr, *bad_wr;
1932         int rc;
1933
1934         seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
1935
1936         memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1937         invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
1938         invalidate_wr.opcode = IB_WR_LOCAL_INV;
1939         invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
1940         DECR_CQCOUNT(&r_xprt->rx_ep);
1941
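             /* Take ri_qplock for reading so ia->ri_id->qp stays
              * stable across the unmap and the LOCAL_INV post.
              */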
1942         read_lock(&ia->ri_qplock);
1943         while (seg1->mr_nsegs--)
1944                 rpcrdma_unmap_one(ia, seg++);
1945         rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1946         read_unlock(&ia->ri_qplock);
1947         if (rc) {
1948                 /* Force rpcrdma_buffer_get() to retry */
1949                 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
1950                 dprintk("RPC:       %s: failed ib_post_send for invalidate,"
1951                         " status %i\n", __func__, rc);
1952         }
1953         return rc;
1954 }
1955
1956 static int
1957 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1958                         int *nsegs, int writing, struct rpcrdma_ia *ia)
1959 {
1960         struct rpcrdma_mr_seg *seg1 = seg;
1961         u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1962         int len, pageoff, i, rc;
1963
1964         pageoff = offset_in_page(seg1->mr_offset);
1965         seg1->mr_offset -= pageoff;     /* start of page */
1966         seg1->mr_len += pageoff;
1967         len = -pageoff;
1968         if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1969                 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1970         for (i = 0; i < *nsegs;) {
1971                 rpcrdma_map_one(ia, seg, writing);
1972                 physaddrs[i] = seg->mr_dma;
1973                 len += seg->mr_len;
1974                 ++seg;
1975                 ++i;
1976                 /* Check for holes: each interior boundary must be page-aligned */
1977                 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1978                     offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1979                         break;
1980         }
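             /* Hand the collected page addresses to the pre-allocated
              * FMR; seg1->mr_dma doubles as the requested I/O virtual
              * address.
              */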
1981         rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma);
1982         if (rc) {
1983                 dprintk("RPC:       %s: failed ib_map_phys_fmr "
1984                         "%u@0x%llx+%i (%d)... status %i\n", __func__,
1985                         len, (unsigned long long)seg1->mr_dma,
1986                         pageoff, i, rc);
1987                 while (i--)
1988                         rpcrdma_unmap_one(ia, --seg);
1989         } else {
1990                 seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey;
1991                 seg1->mr_base = seg1->mr_dma + pageoff;
1992                 seg1->mr_nsegs = i;
1993                 seg1->mr_len = len;
1994         }
1995         *nsegs = i;
1996         return rc;
1997 }
1998
1999 static int
2000 rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
2001                         struct rpcrdma_ia *ia)
2002 {
2003         struct rpcrdma_mr_seg *seg1 = seg;
2004         LIST_HEAD(l);
2005         int rc;
2006
2007         list_add(&seg1->rl_mw->r.fmr->list, &l);
2008         rc = ib_unmap_fmr(&l);
2009         read_lock(&ia->ri_qplock);
2010         while (seg1->mr_nsegs--)
2011                 rpcrdma_unmap_one(ia, seg++);
2012         read_unlock(&ia->ri_qplock);
2013         if (rc)
2014                 dprintk("RPC:       %s: failed ib_unmap_fmr,"
2015                         " status %i\n", __func__, rc);
2016         return rc;
2017 }
2018
2019 int
2020 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
2021                         int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
2022 {
2023         struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2024         int rc = 0;
2025
2026         switch (ia->ri_memreg_strategy) {
2027
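             /* No registration work at all: DMA-map the segment and
              * expose it through the rkey of the pre-registered
              * ri_bind_mem MR.
              */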
2028         case RPCRDMA_ALLPHYSICAL:
2029                 rpcrdma_map_one(ia, seg, writing);
2030                 seg->mr_rkey = ia->ri_bind_mem->rkey;
2031                 seg->mr_base = seg->mr_dma;
2032                 seg->mr_nsegs = 1;
2033                 nsegs = 1;
2034                 break;
2035
2036         /* Registration using frmr registration */
2037         case RPCRDMA_FRMR:
2038                 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
2039                 break;
2040
2041         /* Registration using fmr memory registration */
2042         case RPCRDMA_MTHCAFMR:
2043                 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
2044                 break;
2045
2046         default:
2047                 return -EIO;
2048         }
2049         if (rc)
2050                 return rc;
2051
2052         return nsegs;
2053 }
2054
2055 int
2056 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
2057                 struct rpcrdma_xprt *r_xprt)
2058 {
2059         struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2060         int nsegs = seg->mr_nsegs, rc;
2061
2062         switch (ia->ri_memreg_strategy) {
2063
2064         case RPCRDMA_ALLPHYSICAL:
2065                 read_lock(&ia->ri_qplock);
2066                 rpcrdma_unmap_one(ia, seg);
2067                 read_unlock(&ia->ri_qplock);
2068                 break;
2069
2070         case RPCRDMA_FRMR:
2071                 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
2072                 break;
2073
2074         case RPCRDMA_MTHCAFMR:
2075                 rc = rpcrdma_deregister_fmr_external(seg, ia);
2076                 break;
2077
2078         default:
2079                 break;
2080         }
2081         return nsegs;
2082 }
2083
2084 /*
2085  * Prepost any receive buffer, then post send.
2086  *
2087  * Receive buffer is donated to hardware, reclaimed upon recv completion.
2088  */
2089 int
2090 rpcrdma_ep_post(struct rpcrdma_ia *ia,
2091                 struct rpcrdma_ep *ep,
2092                 struct rpcrdma_req *req)
2093 {
2094         struct ib_send_wr send_wr, *send_wr_fail;
2095         struct rpcrdma_rep *rep = req->rl_reply;
2096         int rc;
2097
2098         if (rep) {
2099                 rc = rpcrdma_ep_post_recv(ia, ep, rep);
2100                 if (rc)
2101                         goto out;
2102                 req->rl_reply = NULL;
2103         }
2104
2105         send_wr.next = NULL;
2106         send_wr.wr_id = 0ULL;   /* no send cookie */
2107         send_wr.sg_list = req->rl_send_iov;
2108         send_wr.num_sge = req->rl_niovs;
2109         send_wr.opcode = IB_WR_SEND;
2110         if (send_wr.num_sge == 4)       /* sync the tail sge; the constant pad needs no sync */
2111                 ib_dma_sync_single_for_device(ia->ri_id->device,
2112                         req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
2113                         DMA_TO_DEVICE);
2114         ib_dma_sync_single_for_device(ia->ri_id->device,
2115                 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
2116                 DMA_TO_DEVICE);
2117         ib_dma_sync_single_for_device(ia->ri_id->device,
2118                 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
2119                 DMA_TO_DEVICE);
2120
2121         if (DECR_CQCOUNT(ep) > 0)
2122                 send_wr.send_flags = 0;
2123         else { /* Provider must take a send completion every now and then */
2124                 INIT_CQCOUNT(ep);
2125                 send_wr.send_flags = IB_SEND_SIGNALED;
2126         }
2127
2128         rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
2129         if (rc)
2130                 dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
2131                         rc);
2132 out:
2133         return rc;
2134 }
2135
2136 /*
2137  * (Re)post a receive buffer.
2138  */
2139 int
2140 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
2141                      struct rpcrdma_ep *ep,
2142                      struct rpcrdma_rep *rep)
2143 {
2144         struct ib_recv_wr recv_wr, *recv_wr_fail;
2145         int rc;
2146
2147         recv_wr.next = NULL;
2148         recv_wr.wr_id = (u64) (unsigned long) rep;
2149         recv_wr.sg_list = &rep->rr_iov;
2150         recv_wr.num_sge = 1;
2151
2152         ib_dma_sync_single_for_cpu(ia->ri_id->device,
2153                 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
2154
2155         rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
2156
2157         if (rc)
2158                 dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
2159                         rc);
2160         return rc;
2161 }
2162
2163 /* Physical mapping means one Read/Write list entry per page.
2164  * All list entries must fit within an inline buffer.
2165  *
2166  * NB: The server must return a Write list for NFS READ,
2167  *     which has the same constraint. Factor in the inline
2168  *     rsize as well.
2169  */
2170 static size_t
2171 rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
2172 {
2173         struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
2174         unsigned int inline_size, pages;
2175
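             /* One struct rpcrdma_segment is needed for each page of
              * payload, so the cap is however many segments fit in the
              * smaller inline buffer once the header is subtracted.
              */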
2176         inline_size = min_t(unsigned int,
2177                             cdata->inline_wsize, cdata->inline_rsize);
2178         inline_size -= RPCRDMA_HDRLEN_MIN;
2179         pages = inline_size / sizeof(struct rpcrdma_segment);
2180         return pages << PAGE_SHIFT;
2181 }
2182
2183 static size_t
2184 rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
2185 {
2186         return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
2187 }
2188
2189 size_t
2190 rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
2191 {
2192         size_t result;
2193
2194         switch (r_xprt->rx_ia.ri_memreg_strategy) {
2195         case RPCRDMA_ALLPHYSICAL:
2196                 result = rpcrdma_physical_max_payload(r_xprt);
2197                 break;
2198         default:
2199                 result = rpcrdma_mr_max_payload(r_xprt);
2200         }
2201         return result;
2202 }