xprtrdma: Allocate each struct rpcrdma_mw separately
net/sunrpc/xprtrdma/verbs.c (linux-2.6-block.git)
1/*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50#include <linux/interrupt.h>
51#include <linux/slab.h>
52#include <asm/bitops.h>
53
54#include "xprt_rdma.h"
55
56/*
57 * Globals/Macros
58 */
59
60#ifdef RPC_DEBUG
61# define RPCDBG_FACILITY RPCDBG_TRANS
62#endif
63
64static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
65
66/*
67 * internal functions
68 */
69
70/*
71 * handle replies in tasklet context, using a single, global list
72 * rdma tasklet function -- just turn around and call the func
73 * for all replies on the list
74 */
75
76static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
77static LIST_HEAD(rpcrdma_tasklets_g);
78
79static void
80rpcrdma_run_tasklet(unsigned long data)
81{
82 struct rpcrdma_rep *rep;
83 void (*func)(struct rpcrdma_rep *);
84 unsigned long flags;
85
86 data = data;
87 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
88 while (!list_empty(&rpcrdma_tasklets_g)) {
89 rep = list_entry(rpcrdma_tasklets_g.next,
90 struct rpcrdma_rep, rr_list);
91 list_del(&rep->rr_list);
92 func = rep->rr_func;
93 rep->rr_func = NULL;
94 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
95
96 if (func)
97 func(rep);
98 else
99 rpcrdma_recv_buffer_put(rep);
100
101 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
102 }
103 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
104}
105
106static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
107
108static inline void
109rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
110{
111 unsigned long flags;
112
113 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
114 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
115 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
116 tasklet_schedule(&rpcrdma_tasklet_g);
117}
118
119static void
120rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
121{
122 struct rpcrdma_ep *ep = context;
123
124 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
125 __func__, event->event, event->device->name, context);
126 if (ep->rep_connected == 1) {
127 ep->rep_connected = -EIO;
128 ep->rep_func(ep);
129 wake_up_all(&ep->rep_connect_wait);
130 }
131}
132
133static void
134rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
135{
136 struct rpcrdma_ep *ep = context;
137
138 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
139 __func__, event->event, event->device->name, context);
140 if (ep->rep_connected == 1) {
141 ep->rep_connected = -EIO;
142 ep->rep_func(ep);
143 wake_up_all(&ep->rep_connect_wait);
144 }
145}
146
147static void
148rpcrdma_sendcq_process_wc(struct ib_wc *wc)
149{
150 struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
151
152 dprintk("RPC: %s: frmr %p status %X opcode %d\n",
153 __func__, frmr, wc->status, wc->opcode);
154
155 if (wc->wr_id == 0ULL)
156 return;
157 if (wc->status != IB_WC_SUCCESS)
158 frmr->r.frmr.fr_state = FRMR_IS_STALE;
159}
160
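/* Poll the send CQ in batches of up to RPCRDMA_POLLSIZE completions,
 * bounded by a total budget of RPCRDMA_WC_BUDGET, so that one upcall
 * cannot process an unbounded number of send completions.
 */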
161static int
162rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
163{
164 struct ib_wc *wcs;
165 int budget, count, rc;
166
167 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
168 do {
169 wcs = ep->rep_send_wcs;
170
171 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
172 if (rc <= 0)
173 return rc;
174
175 count = rc;
176 while (count-- > 0)
177 rpcrdma_sendcq_process_wc(wcs++);
178 } while (rc == RPCRDMA_POLLSIZE && --budget);
179 return 0;
180}
181
182/*
183 * Handle send, fast_reg_mr, and local_inv completions.
184 *
185 * Send events are typically suppressed and thus do not result
186 * in an upcall. Occasionally one is signaled, however. This
187 * prevents the provider's completion queue from wrapping and
188 * losing a completion.
189 */
190static void
191rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
192{
193 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
194 int rc;
195
196 rc = rpcrdma_sendcq_poll(cq, ep);
197 if (rc) {
198 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
199 __func__, rc);
200 return;
201 }
202
203 rc = ib_req_notify_cq(cq,
204 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
205 if (rc == 0)
206 return;
207 if (rc < 0) {
208 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
209 __func__, rc);
210 return;
211 }
212
213 rpcrdma_sendcq_poll(cq, ep);
214}
215
216static void
217rpcrdma_recvcq_process_wc(struct ib_wc *wc)
218{
219 struct rpcrdma_rep *rep =
220 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
221
222 dprintk("RPC: %s: rep %p status %X opcode %X length %u\n",
223 __func__, rep, wc->status, wc->opcode, wc->byte_len);
224
225 if (wc->status != IB_WC_SUCCESS) {
226 rep->rr_len = ~0U;
227 goto out_schedule;
228 }
229 if (wc->opcode != IB_WC_RECV)
230 return;
231
232 rep->rr_len = wc->byte_len;
233 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
234 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
235
236 if (rep->rr_len >= 16) {
237 struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
238 unsigned int credits = ntohl(p->rm_credit);
239
240 if (credits == 0)
241 credits = 1; /* don't deadlock */
242 else if (credits > rep->rr_buffer->rb_max_requests)
243 credits = rep->rr_buffer->rb_max_requests;
244 atomic_set(&rep->rr_buffer->rb_credits, credits);
245 }
246
247out_schedule:
248 rpcrdma_schedule_tasklet(rep);
249}
250
251static int
252rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
253{
254 struct ib_wc *wcs;
255 int budget, count, rc;
256
257 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
258 do {
259 wcs = ep->rep_recv_wcs;
260
261 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
262 if (rc <= 0)
263 return rc;
264
265 count = rc;
266 while (count-- > 0)
267 rpcrdma_recvcq_process_wc(wcs++);
268 } while (rc == RPCRDMA_POLLSIZE && --budget);
269 return 0;
270}
271
272/*
273 * Handle receive completions.
274 *
275 * It is reentrant but processes single events in order to maintain
276 * ordering of receives to keep server credits.
277 *
278 * It is the responsibility of the scheduled tasklet to return
279 * recv buffers to the pool. NOTE: this affects synchronization of
280 * connection shutdown. That is, the structures required for
281 * the completion of the reply handler must remain intact until
282 * all memory has been reclaimed.
283 */
284static void
285rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
286{
287 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
288 int rc;
289
290 rc = rpcrdma_recvcq_poll(cq, ep);
291 if (rc) {
292 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
293 __func__, rc);
294 return;
295 }
296
297 rc = ib_req_notify_cq(cq,
298 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
299 if (rc == 0)
300 return;
301 if (rc < 0) {
302 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
303 __func__, rc);
304 return;
305 }
306
307 rpcrdma_recvcq_poll(cq, ep);
308}
309
310static void
311rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
312{
313 rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep);
314 rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep);
315}
316
317#ifdef RPC_DEBUG
318static const char * const conn[] = {
319 "address resolved",
320 "address error",
321 "route resolved",
322 "route error",
323 "connect request",
324 "connect response",
325 "connect error",
326 "unreachable",
327 "rejected",
328 "established",
329 "disconnected",
330 "device removal"
331};
332#endif
333
334static int
335rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
336{
337 struct rpcrdma_xprt *xprt = id->context;
338 struct rpcrdma_ia *ia = &xprt->rx_ia;
339 struct rpcrdma_ep *ep = &xprt->rx_ep;
340#ifdef RPC_DEBUG
341 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
342#endif
343 struct ib_qp_attr attr;
344 struct ib_qp_init_attr iattr;
345 int connstate = 0;
346
347 switch (event->event) {
348 case RDMA_CM_EVENT_ADDR_RESOLVED:
349 case RDMA_CM_EVENT_ROUTE_RESOLVED:
350 ia->ri_async_rc = 0;
351 complete(&ia->ri_done);
352 break;
353 case RDMA_CM_EVENT_ADDR_ERROR:
354 ia->ri_async_rc = -EHOSTUNREACH;
355 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
356 __func__, ep);
357 complete(&ia->ri_done);
358 break;
359 case RDMA_CM_EVENT_ROUTE_ERROR:
360 ia->ri_async_rc = -ENETUNREACH;
361 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
362 __func__, ep);
363 complete(&ia->ri_done);
364 break;
365 case RDMA_CM_EVENT_ESTABLISHED:
366 connstate = 1;
367 ib_query_qp(ia->ri_id->qp, &attr,
368 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
369 &iattr);
370 dprintk("RPC: %s: %d responder resources"
371 " (%d initiator)\n",
372 __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
373 goto connected;
374 case RDMA_CM_EVENT_CONNECT_ERROR:
375 connstate = -ENOTCONN;
376 goto connected;
377 case RDMA_CM_EVENT_UNREACHABLE:
378 connstate = -ENETDOWN;
379 goto connected;
380 case RDMA_CM_EVENT_REJECTED:
381 connstate = -ECONNREFUSED;
382 goto connected;
383 case RDMA_CM_EVENT_DISCONNECTED:
384 connstate = -ECONNABORTED;
385 goto connected;
386 case RDMA_CM_EVENT_DEVICE_REMOVAL:
387 connstate = -ENODEV;
388connected:
389 dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
390 __func__,
391 (event->event <= 11) ? conn[event->event] :
392 "unknown connection error",
393 &addr->sin_addr.s_addr,
394 ntohs(addr->sin_port),
395 ep, event->event);
396 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
397 dprintk("RPC: %s: %sconnected\n",
398 __func__, connstate > 0 ? "" : "dis");
399 ep->rep_connected = connstate;
400 ep->rep_func(ep);
401 wake_up_all(&ep->rep_connect_wait);
402 break;
403 default:
404 dprintk("RPC: %s: unexpected CM event %d\n",
405 __func__, event->event);
406 break;
407 }
408
409#ifdef RPC_DEBUG
410 if (connstate == 1) {
411 int ird = attr.max_dest_rd_atomic;
412 int tird = ep->rep_remote_cma.responder_resources;
413 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
414 "on %s, memreg %d slots %d ird %d%s\n",
415 &addr->sin_addr.s_addr,
416 ntohs(addr->sin_port),
417 ia->ri_id->device->name,
418 ia->ri_memreg_strategy,
419 xprt->rx_buf.rb_max_requests,
420 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
421 } else if (connstate < 0) {
422 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
423 &addr->sin_addr.s_addr,
424 ntohs(addr->sin_port),
425 connstate);
426 }
427#endif
428
429 return 0;
430}
431
432static struct rdma_cm_id *
433rpcrdma_create_id(struct rpcrdma_xprt *xprt,
434 struct rpcrdma_ia *ia, struct sockaddr *addr)
435{
436 struct rdma_cm_id *id;
437 int rc;
438
439 init_completion(&ia->ri_done);
440
441 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
442 if (IS_ERR(id)) {
443 rc = PTR_ERR(id);
444 dprintk("RPC: %s: rdma_create_id() failed %i\n",
445 __func__, rc);
446 return id;
447 }
448
449 ia->ri_async_rc = -ETIMEDOUT;
450 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
451 if (rc) {
452 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
453 __func__, rc);
454 goto out;
455 }
456 wait_for_completion_interruptible_timeout(&ia->ri_done,
457 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
458 rc = ia->ri_async_rc;
459 if (rc)
460 goto out;
461
462 ia->ri_async_rc = -ETIMEDOUT;
463 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
464 if (rc) {
465 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
466 __func__, rc);
467 goto out;
468 }
469 wait_for_completion_interruptible_timeout(&ia->ri_done,
470 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
471 rc = ia->ri_async_rc;
472 if (rc)
473 goto out;
474
475 return id;
476
477out:
478 rdma_destroy_id(id);
479 return ERR_PTR(rc);
480}
481
482/*
483 * Drain any cq, prior to teardown.
484 */
485static void
486rpcrdma_clean_cq(struct ib_cq *cq)
487{
488 struct ib_wc wc;
489 int count = 0;
490
491 while (1 == ib_poll_cq(cq, 1, &wc))
492 ++count;
493
494 if (count)
495 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
496 __func__, count, wc.opcode);
497}
498
499/*
500 * Exported functions.
501 */
502
503/*
504 * Open and initialize an Interface Adapter.
505 * o initializes fields of struct rpcrdma_ia, including
506 * interface and provider attributes and protection zone.
507 */
508int
509rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
510{
511 int rc, mem_priv;
512 struct ib_device_attr devattr;
513 struct rpcrdma_ia *ia = &xprt->rx_ia;
514
515 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
516 if (IS_ERR(ia->ri_id)) {
517 rc = PTR_ERR(ia->ri_id);
518 goto out1;
519 }
520
521 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
522 if (IS_ERR(ia->ri_pd)) {
523 rc = PTR_ERR(ia->ri_pd);
524 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
525 __func__, rc);
526 goto out2;
527 }
528
529 /*
530 * Query the device to determine if the requested memory
531 * registration strategy is supported. If it isn't, set the
532 * strategy to a globally supported model.
533 */
534 rc = ib_query_device(ia->ri_id->device, &devattr);
535 if (rc) {
536 dprintk("RPC: %s: ib_query_device failed %d\n",
537 __func__, rc);
538 goto out2;
539 }
540
541 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
542 ia->ri_have_dma_lkey = 1;
543 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
544 }
545
546 if (memreg == RPCRDMA_FRMR) {
547 /* Requires both frmr reg and local dma lkey */
548 if ((devattr.device_cap_flags &
549 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
550 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
551 dprintk("RPC: %s: FRMR registration "
552 "not supported by HCA\n", __func__);
553 memreg = RPCRDMA_MTHCAFMR;
554 } else {
555 /* Mind the ia limit on FRMR page list depth */
556 ia->ri_max_frmr_depth = min_t(unsigned int,
557 RPCRDMA_MAX_DATA_SEGS,
558 devattr.max_fast_reg_page_list_len);
559 }
560 }
561 if (memreg == RPCRDMA_MTHCAFMR) {
562 if (!ia->ri_id->device->alloc_fmr) {
563 dprintk("RPC: %s: MTHCAFMR registration "
564 "not supported by HCA\n", __func__);
565#if RPCRDMA_PERSISTENT_REGISTRATION
566 memreg = RPCRDMA_ALLPHYSICAL;
567#else
568 rc = -ENOMEM;
569 goto out2;
570#endif
571 }
572 }
573
574 /*
575 * Optionally obtain an underlying physical identity mapping in
576 * order to do a memory window-based bind. This base registration
577 * is protected from remote access - that is enabled only by binding
578 * for the specific bytes targeted during each RPC operation, and
579 * revoked after the corresponding completion similar to a storage
580 * adapter.
581 */
582 switch (memreg) {
583 case RPCRDMA_FRMR:
584 break;
585#if RPCRDMA_PERSISTENT_REGISTRATION
586 case RPCRDMA_ALLPHYSICAL:
587 mem_priv = IB_ACCESS_LOCAL_WRITE |
588 IB_ACCESS_REMOTE_WRITE |
589 IB_ACCESS_REMOTE_READ;
590 goto register_setup;
591#endif
592 case RPCRDMA_MTHCAFMR:
593 if (ia->ri_have_dma_lkey)
594 break;
595 mem_priv = IB_ACCESS_LOCAL_WRITE;
596#if RPCRDMA_PERSISTENT_REGISTRATION
597 register_setup:
598#endif
599 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
600 if (IS_ERR(ia->ri_bind_mem)) {
601 printk(KERN_ALERT "%s: ib_get_dma_mr for "
602 "phys register failed with %lX\n",
603 __func__, PTR_ERR(ia->ri_bind_mem));
604 rc = -ENOMEM;
605 goto out2;
606 }
607 break;
608 default:
609 printk(KERN_ERR "RPC: Unsupported memory "
610 "registration mode: %d\n", memreg);
611 rc = -ENOMEM;
612 goto out2;
613 }
614 dprintk("RPC: %s: memory registration strategy is %d\n",
615 __func__, memreg);
616
617 /* Else will do memory reg/dereg for each chunk */
618 ia->ri_memreg_strategy = memreg;
619
620 rwlock_init(&ia->ri_qplock);
621 return 0;
622out2:
623 rdma_destroy_id(ia->ri_id);
624 ia->ri_id = NULL;
625out1:
626 return rc;
627}
628
629/*
630 * Clean up/close an IA.
631 * o if event handles and PD have been initialized, free them.
632 * o close the IA
633 */
634void
635rpcrdma_ia_close(struct rpcrdma_ia *ia)
636{
637 int rc;
638
639 dprintk("RPC: %s: entering\n", __func__);
640 if (ia->ri_bind_mem != NULL) {
641 rc = ib_dereg_mr(ia->ri_bind_mem);
642 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
643 __func__, rc);
644 }
645 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
646 if (ia->ri_id->qp)
647 rdma_destroy_qp(ia->ri_id);
648 rdma_destroy_id(ia->ri_id);
649 ia->ri_id = NULL;
650 }
651 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
652 rc = ib_dealloc_pd(ia->ri_pd);
653 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
654 __func__, rc);
655 }
656}
657
658/*
659 * Create unconnected endpoint.
660 */
661int
662rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
663 struct rpcrdma_create_data_internal *cdata)
664{
665 struct ib_device_attr devattr;
666 struct ib_cq *sendcq, *recvcq;
667 int rc, err;
668
669 rc = ib_query_device(ia->ri_id->device, &devattr);
670 if (rc) {
671 dprintk("RPC: %s: ib_query_device failed %d\n",
672 __func__, rc);
673 return rc;
674 }
675
676 /* check provider's send/recv wr limits */
677 if (cdata->max_requests > devattr.max_qp_wr)
678 cdata->max_requests = devattr.max_qp_wr;
679
680 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
681 ep->rep_attr.qp_context = ep;
682 /* send_cq and recv_cq initialized below */
683 ep->rep_attr.srq = NULL;
684 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
685 switch (ia->ri_memreg_strategy) {
686 case RPCRDMA_FRMR: {
687 int depth = 7;
688
689 /* Add room for frmr register and invalidate WRs.
690 * 1. FRMR reg WR for head
691 * 2. FRMR invalidate WR for head
692 * 3. N FRMR reg WRs for pagelist
693 * 4. N FRMR invalidate WRs for pagelist
694 * 5. FRMR reg WR for tail
695 * 6. FRMR invalidate WR for tail
696 * 7. The RDMA_SEND WR
697 */
698
699 /* Calculate N if the device max FRMR depth is smaller than
700 * RPCRDMA_MAX_DATA_SEGS.
701 */
702 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
703 int delta = RPCRDMA_MAX_DATA_SEGS -
704 ia->ri_max_frmr_depth;
705
706 do {
707 depth += 2; /* FRMR reg + invalidate */
708 delta -= ia->ri_max_frmr_depth;
709 } while (delta > 0);
710
711 }
712 ep->rep_attr.cap.max_send_wr *= depth;
713 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
714 cdata->max_requests = devattr.max_qp_wr / depth;
715 if (!cdata->max_requests)
716 return -EINVAL;
717 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
718 depth;
719 }
720 break;
721 }
722 default:
723 break;
724 }
725 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
726 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
727 ep->rep_attr.cap.max_recv_sge = 1;
728 ep->rep_attr.cap.max_inline_data = 0;
729 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
730 ep->rep_attr.qp_type = IB_QPT_RC;
731 ep->rep_attr.port_num = ~0;
732
733 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
734 "iovs: send %d recv %d\n",
735 __func__,
736 ep->rep_attr.cap.max_send_wr,
737 ep->rep_attr.cap.max_recv_wr,
738 ep->rep_attr.cap.max_send_sge,
739 ep->rep_attr.cap.max_recv_sge);
740
741 /* set trigger for requesting send completion */
742 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
743 if (ep->rep_cqinit <= 2)
744 ep->rep_cqinit = 0;
745 INIT_CQCOUNT(ep);
746 ep->rep_ia = ia;
747 init_waitqueue_head(&ep->rep_connect_wait);
748 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
749
750 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
751 rpcrdma_cq_async_error_upcall, ep,
752 ep->rep_attr.cap.max_send_wr + 1, 0);
753 if (IS_ERR(sendcq)) {
754 rc = PTR_ERR(sendcq);
755 dprintk("RPC: %s: failed to create send CQ: %i\n",
756 __func__, rc);
757 goto out1;
758 }
759
760 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
761 if (rc) {
762 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
763 __func__, rc);
764 goto out2;
765 }
766
767 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
768 rpcrdma_cq_async_error_upcall, ep,
769 ep->rep_attr.cap.max_recv_wr + 1, 0);
770 if (IS_ERR(recvcq)) {
771 rc = PTR_ERR(recvcq);
772 dprintk("RPC: %s: failed to create recv CQ: %i\n",
773 __func__, rc);
774 goto out2;
775 }
776
777 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
778 if (rc) {
779 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
780 __func__, rc);
781 ib_destroy_cq(recvcq);
782 goto out2;
783 }
784
785 ep->rep_attr.send_cq = sendcq;
786 ep->rep_attr.recv_cq = recvcq;
787
788 /* Initialize cma parameters */
789
790 /* RPC/RDMA does not use private data */
791 ep->rep_remote_cma.private_data = NULL;
792 ep->rep_remote_cma.private_data_len = 0;
793
794 /* Client offers RDMA Read but does not initiate */
795 ep->rep_remote_cma.initiator_depth = 0;
796 if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
797 ep->rep_remote_cma.responder_resources = 32;
798 else
799 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
800
801 ep->rep_remote_cma.retry_count = 7;
802 ep->rep_remote_cma.flow_control = 0;
803 ep->rep_remote_cma.rnr_retry_count = 0;
804
805 return 0;
806
807out2:
808 err = ib_destroy_cq(sendcq);
809 if (err)
810 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
811 __func__, err);
812out1:
813 return rc;
814}
815
816/*
817 * rpcrdma_ep_destroy
818 *
819 * Disconnect and destroy endpoint. After this, the only
820 * valid operations on the ep are to free it (if dynamically
821 * allocated) or re-create it.
822 */
823void
824rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
825{
826 int rc;
827
828 dprintk("RPC: %s: entering, connected is %d\n",
829 __func__, ep->rep_connected);
830
831 cancel_delayed_work_sync(&ep->rep_connect_worker);
832
833 if (ia->ri_id->qp) {
834 rc = rpcrdma_ep_disconnect(ep, ia);
835 if (rc)
836 dprintk("RPC: %s: rpcrdma_ep_disconnect"
837 " returned %i\n", __func__, rc);
838 rdma_destroy_qp(ia->ri_id);
839 ia->ri_id->qp = NULL;
840 }
841
842 /* padding - could be done in rpcrdma_buffer_destroy... */
843 if (ep->rep_pad_mr) {
844 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
845 ep->rep_pad_mr = NULL;
846 }
847
848 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
849 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
850 if (rc)
851 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
852 __func__, rc);
853
854 rpcrdma_clean_cq(ep->rep_attr.send_cq);
855 rc = ib_destroy_cq(ep->rep_attr.send_cq);
856 if (rc)
857 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
858 __func__, rc);
859}
860
861/*
862 * Connect unconnected endpoint.
863 */
864int
865rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
866{
867 struct rdma_cm_id *id, *old;
868 int rc = 0;
869 int retry_count = 0;
870
871 if (ep->rep_connected != 0) {
872 struct rpcrdma_xprt *xprt;
873retry:
874 dprintk("RPC: %s: reconnecting...\n", __func__);
875 rc = rpcrdma_ep_disconnect(ep, ia);
876 if (rc && rc != -ENOTCONN)
877 dprintk("RPC: %s: rpcrdma_ep_disconnect"
878 " status %i\n", __func__, rc);
879 rpcrdma_flush_cqs(ep);
880
881 if (ia->ri_memreg_strategy == RPCRDMA_FRMR)
882 rpcrdma_reset_frmrs(ia);
883
884 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
885 id = rpcrdma_create_id(xprt, ia,
886 (struct sockaddr *)&xprt->rx_data.addr);
887 if (IS_ERR(id)) {
888 rc = -EHOSTUNREACH;
889 goto out;
890 }
891 /* TEMP TEMP TEMP - fail if new device:
892 * Deregister/remarshal *all* requests!
893 * Close and recreate adapter, pd, etc!
894 * Re-determine all attributes still sane!
895 * More stuff I haven't thought of!
896 * Rrrgh!
897 */
898 if (ia->ri_id->device != id->device) {
899 printk("RPC: %s: can't reconnect on "
900 "different device!\n", __func__);
901 rdma_destroy_id(id);
902 rc = -ENETUNREACH;
903 goto out;
904 }
905 /* END TEMP */
906 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
907 if (rc) {
908 dprintk("RPC: %s: rdma_create_qp failed %i\n",
909 __func__, rc);
910 rdma_destroy_id(id);
911 rc = -ENETUNREACH;
912 goto out;
913 }
914
915 write_lock(&ia->ri_qplock);
916 old = ia->ri_id;
917 ia->ri_id = id;
918 write_unlock(&ia->ri_qplock);
919
920 rdma_destroy_qp(old);
921 rdma_destroy_id(old);
922 } else {
923 dprintk("RPC: %s: connecting...\n", __func__);
924 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
925 if (rc) {
926 dprintk("RPC: %s: rdma_create_qp failed %i\n",
927 __func__, rc);
928 /* do not update ep->rep_connected */
929 return -ENETUNREACH;
930 }
931 }
932
933 ep->rep_connected = 0;
934
935 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
936 if (rc) {
937 dprintk("RPC: %s: rdma_connect() failed with %i\n",
938 __func__, rc);
939 goto out;
940 }
941
942 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
943
944 /*
945 * Check state. A non-peer reject indicates no listener
946 * (ECONNREFUSED), which may be a transient state. All
947 * others indicate a transport condition for which a
948 * best-effort recovery attempt has already been made.
949 */
950 if (ep->rep_connected == -ECONNREFUSED &&
951 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
952 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
953 goto retry;
954 }
955 if (ep->rep_connected <= 0) {
956 /* Sometimes, the only way to reliably connect to remote
957 * CMs is to use same nonzero values for ORD and IRD. */
958 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
959 (ep->rep_remote_cma.responder_resources == 0 ||
960 ep->rep_remote_cma.initiator_depth !=
961 ep->rep_remote_cma.responder_resources)) {
962 if (ep->rep_remote_cma.responder_resources == 0)
963 ep->rep_remote_cma.responder_resources = 1;
964 ep->rep_remote_cma.initiator_depth =
965 ep->rep_remote_cma.responder_resources;
966 goto retry;
967 }
968 rc = ep->rep_connected;
969 } else {
970 dprintk("RPC: %s: connected\n", __func__);
971 }
972
973out:
974 if (rc)
975 ep->rep_connected = rc;
976 return rc;
977}
978
979/*
980 * rpcrdma_ep_disconnect
981 *
982 * This is separate from destroy to facilitate the ability
983 * to reconnect without recreating the endpoint.
984 *
985 * This call is not reentrant, and must not be made in parallel
986 * on the same endpoint.
987 */
988int
989rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
990{
991 int rc;
992
993 rpcrdma_flush_cqs(ep);
994 rc = rdma_disconnect(ia->ri_id);
995 if (!rc) {
996 /* returns without wait if not connected */
997 wait_event_interruptible(ep->rep_connect_wait,
998 ep->rep_connected != 1);
999 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
1000 (ep->rep_connected == 1) ? "still " : "dis");
1001 } else {
1002 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
1003 ep->rep_connected = rc;
1004 }
1005 return rc;
1006}
1007
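/* Allocate one struct rpcrdma_mw per FMR that the transport may need
 * ((rb_max_requests + 1) * RPCRDMA_MAX_SEGS), register each FMR with
 * the device, and queue it on rb_mws (the free list) and rb_all
 * (used at transport teardown).
 */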
1008static int
1009rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1010{
1011 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
1012 struct ib_fmr_attr fmr_attr = {
1013 .max_pages = RPCRDMA_MAX_DATA_SEGS,
1014 .max_maps = 1,
1015 .page_shift = PAGE_SHIFT
1016 };
1017 struct rpcrdma_mw *r;
1018 int i, rc;
1019
1020 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1021 dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
1022
1023 while (i--) {
1024 r = kzalloc(sizeof(*r), GFP_KERNEL);
1025 if (r == NULL)
1026 return -ENOMEM;
1027
1028 r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
1029 if (IS_ERR(r->r.fmr)) {
1030 rc = PTR_ERR(r->r.fmr);
1031 dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
1032 __func__, rc);
1033 goto out_free;
1034 }
1035
1036 list_add(&r->mw_list, &buf->rb_mws);
1037 list_add(&r->mw_all, &buf->rb_all);
1038 }
1039 return 0;
1040
1041out_free:
1042 kfree(r);
1043 return rc;
1044}
1045
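/* As above, but for FRMRs: each rpcrdma_mw gets a fast_reg MR and a
 * page list sized to ia->ri_max_frmr_depth.
 */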
1046static int
1047rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1048{
1049 struct rpcrdma_frmr *f;
1050 struct rpcrdma_mw *r;
1051 int i, rc;
1052
1053 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1054 dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
1055
1056 while (i--) {
1057 r = kzalloc(sizeof(*r), GFP_KERNEL);
1058 if (r == NULL)
1059 return -ENOMEM;
1060 f = &r->r.frmr;
1061
1062 f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1063 ia->ri_max_frmr_depth);
1064 if (IS_ERR(f->fr_mr)) {
1065 rc = PTR_ERR(f->fr_mr);
1066 dprintk("RPC: %s: ib_alloc_fast_reg_mr "
1067 "failed %i\n", __func__, rc);
1068 goto out_free;
1069 }
1070
1071 f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
1072 ia->ri_max_frmr_depth);
1073 if (IS_ERR(f->fr_pgl)) {
1074 rc = PTR_ERR(f->fr_pgl);
1075 dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
1076 "failed %i\n", __func__, rc);
1077
1078 ib_dereg_mr(f->fr_mr);
1079 goto out_free;
1080 }
1081
1082 list_add(&r->mw_list, &buf->rb_mws);
1083 list_add(&r->mw_all, &buf->rb_all);
1084 }
1085
1086 return 0;
1087
1088out_free:
1089 kfree(r);
1090 return rc;
1091}
1092
1093int
1094rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1095 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
1096{
1097 char *p;
1098 size_t len, rlen, wlen;
1099 int i, rc;
1100
1101 buf->rb_max_requests = cdata->max_requests;
1102 spin_lock_init(&buf->rb_lock);
1103 atomic_set(&buf->rb_credits, 1);
1104
1105 /* Need to allocate:
1106 * 1. arrays for send and recv pointers
1107 * 2. arrays of struct rpcrdma_req to fill in pointers
1108 * 3. array of struct rpcrdma_rep for replies
1109 * 4. padding, if any
1110 * Send/recv buffers in req/rep need to be registered
1111 */
1112 len = buf->rb_max_requests *
1113 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1114 len += cdata->padding;
1115
1116 p = kzalloc(len, GFP_KERNEL);
1117 if (p == NULL) {
1118 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1119 __func__, len);
1120 rc = -ENOMEM;
1121 goto out;
1122 }
1123 buf->rb_pool = p; /* for freeing it later */
1124
1125 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1126 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1127 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1128 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1129
1130 /*
1131 * Register the zeroed pad buffer, if any.
1132 */
1133 if (cdata->padding) {
1134 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1135 &ep->rep_pad_mr, &ep->rep_pad);
1136 if (rc)
1137 goto out;
1138 }
1139 p += cdata->padding;
1140
1141 INIT_LIST_HEAD(&buf->rb_mws);
1142 INIT_LIST_HEAD(&buf->rb_all);
1143 switch (ia->ri_memreg_strategy) {
1144 case RPCRDMA_FRMR:
1145 rc = rpcrdma_init_frmrs(ia, buf);
1146 if (rc)
1147 goto out;
1148 break;
1149 case RPCRDMA_MTHCAFMR:
1150 rc = rpcrdma_init_fmrs(ia, buf);
1151 if (rc)
1152 goto out;
1153 break;
1154 default:
1155 break;
1156 }
1157
1158 /*
1159 * Allocate/init the request/reply buffers. Doing this
1160 * using kmalloc for now -- one for each buf.
1161 */
1162 wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
1163 rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
1164 dprintk("RPC: %s: wlen = %zu, rlen = %zu\n",
1165 __func__, wlen, rlen);
1166
1167 for (i = 0; i < buf->rb_max_requests; i++) {
1168 struct rpcrdma_req *req;
1169 struct rpcrdma_rep *rep;
1170
1171 req = kmalloc(wlen, GFP_KERNEL);
1172 if (req == NULL) {
1173 dprintk("RPC: %s: request buffer %d alloc"
1174 " failed\n", __func__, i);
1175 rc = -ENOMEM;
1176 goto out;
1177 }
1178 memset(req, 0, sizeof(struct rpcrdma_req));
1179 buf->rb_send_bufs[i] = req;
1180 buf->rb_send_bufs[i]->rl_buffer = buf;
1181
1182 rc = rpcrdma_register_internal(ia, req->rl_base,
1183 wlen - offsetof(struct rpcrdma_req, rl_base),
1184 &buf->rb_send_bufs[i]->rl_handle,
1185 &buf->rb_send_bufs[i]->rl_iov);
1186 if (rc)
1187 goto out;
1188
1189 buf->rb_send_bufs[i]->rl_size = wlen -
1190 sizeof(struct rpcrdma_req);
1191
1192 rep = kmalloc(rlen, GFP_KERNEL);
1193 if (rep == NULL) {
1194 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1195 __func__, i);
1196 rc = -ENOMEM;
1197 goto out;
1198 }
1199 memset(rep, 0, sizeof(struct rpcrdma_rep));
1200 buf->rb_recv_bufs[i] = rep;
1201 buf->rb_recv_bufs[i]->rr_buffer = buf;
1202
1203 rc = rpcrdma_register_internal(ia, rep->rr_base,
1204 rlen - offsetof(struct rpcrdma_rep, rr_base),
1205 &buf->rb_recv_bufs[i]->rr_handle,
1206 &buf->rb_recv_bufs[i]->rr_iov);
1207 if (rc)
1208 goto out;
1209
1210 }
1211 dprintk("RPC: %s: max_requests %d\n",
1212 __func__, buf->rb_max_requests);
1213 /* done */
1214 return 0;
1215out:
1216 rpcrdma_buffer_destroy(buf);
1217 return rc;
1218}
1219
1220static void
1221rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1222{
1223 struct rpcrdma_mw *r;
1224 int rc;
1225
1226 while (!list_empty(&buf->rb_all)) {
1227 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1228 list_del(&r->mw_all);
1229 list_del(&r->mw_list);
1230
1231 rc = ib_dealloc_fmr(r->r.fmr);
1232 if (rc)
1233 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
1234 __func__, rc);
1235
1236 kfree(r);
1237 }
1238}
1239
1240static void
1241rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
1242{
1243 struct rpcrdma_mw *r;
1244 int rc;
1245
1246 while (!list_empty(&buf->rb_all)) {
1247 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1248 list_del(&r->mw_all);
1249 list_del(&r->mw_list);
1250
1251 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1252 if (rc)
1253 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1254 __func__, rc);
1255 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1256
1257 kfree(r);
1258 }
1259}
1260
1261void
1262rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1263{
1264 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1265 int i;
1266
1267 /* clean up in reverse order from create
1268 * 1. recv mr memory (mr free, then kfree)
1269 * 2. send mr memory (mr free, then kfree)
1270 * 3. MWs
1271 */
1272 dprintk("RPC: %s: entering\n", __func__);
1273
1274 for (i = 0; i < buf->rb_max_requests; i++) {
1275 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1276 rpcrdma_deregister_internal(ia,
1277 buf->rb_recv_bufs[i]->rr_handle,
1278 &buf->rb_recv_bufs[i]->rr_iov);
1279 kfree(buf->rb_recv_bufs[i]);
1280 }
1281 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1282 rpcrdma_deregister_internal(ia,
1283 buf->rb_send_bufs[i]->rl_handle,
1284 &buf->rb_send_bufs[i]->rl_iov);
1285 kfree(buf->rb_send_bufs[i]);
1286 }
1287 }
1288
1289 switch (ia->ri_memreg_strategy) {
1290 case RPCRDMA_FRMR:
1291 rpcrdma_destroy_frmrs(buf);
1292 break;
1293 case RPCRDMA_MTHCAFMR:
1294 rpcrdma_destroy_fmrs(buf);
1295 break;
1296 default:
1297 break;
1298 }
1299
1300 kfree(buf->rb_pool);
1301}
1302
1303/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
1304 * an unusable state. Find FRMRs in this state and dereg / reg
1305 * each. FRMRs that are VALID and attached to an rpcrdma_req are
1306 * also torn down.
1307 *
1308 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
1309 *
1310 * This is invoked only in the transport connect worker in order
1311 * to serialize with rpcrdma_register_frmr_external().
1312 */
1313static void
1314rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
1315{
1316 struct rpcrdma_xprt *r_xprt =
1317 container_of(ia, struct rpcrdma_xprt, rx_ia);
1318 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1319 struct list_head *pos;
1320 struct rpcrdma_mw *r;
1321 int rc;
1322
1323 list_for_each(pos, &buf->rb_all) {
1324 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1325
1326 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
1327 continue;
1328
1329 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1330 if (rc)
1331 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1332 __func__, rc);
1333 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1334
1335 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1336 ia->ri_max_frmr_depth);
1337 if (IS_ERR(r->r.frmr.fr_mr)) {
1338 rc = PTR_ERR(r->r.frmr.fr_mr);
1339 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1340 " failed %i\n", __func__, rc);
1341 continue;
1342 }
1343 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1344 ia->ri_id->device,
1345 ia->ri_max_frmr_depth);
1346 if (IS_ERR(r->r.frmr.fr_pgl)) {
1347 rc = PTR_ERR(r->r.frmr.fr_pgl);
1348 dprintk("RPC: %s: "
1349 "ib_alloc_fast_reg_page_list "
1350 "failed %i\n", __func__, rc);
1351
1352 ib_dereg_mr(r->r.frmr.fr_mr);
1353 continue;
1354 }
1355 r->r.frmr.fr_state = FRMR_IS_INVALID;
1356 }
1357}
1358
1359/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1360 * some req segments uninitialized.
1361 */
1362static void
1363rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
1364{
1365 if (*mw) {
1366 list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
1367 *mw = NULL;
1368 }
1369}
1370
1371/* Cycle mw's back in reverse order, and "spin" them.
1372 * This delays and scrambles reuse as much as possible.
1373 */
1374static void
1375rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1376{
1377 struct rpcrdma_mr_seg *seg = req->rl_segments;
1378 struct rpcrdma_mr_seg *seg1 = seg;
1379 int i;
1380
1381 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
1382 rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf);
1383 rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf);
1384}
1385
1386static void
1387rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1388{
1389 buf->rb_send_bufs[--buf->rb_send_index] = req;
1390 req->rl_niovs = 0;
1391 if (req->rl_reply) {
1392 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
1393 req->rl_reply->rr_func = NULL;
1394 req->rl_reply = NULL;
1395 }
1396}
1397
ddb6bebc
CL
1398/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
1399 * Redo only the ib_post_send().
1400 */
1401static void
1402rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
1403{
1404 struct rpcrdma_xprt *r_xprt =
1405 container_of(ia, struct rpcrdma_xprt, rx_ia);
1406 struct ib_send_wr invalidate_wr, *bad_wr;
1407 int rc;
1408
1409 dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
1410
1411 /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
1412 r->r.frmr.fr_state = FRMR_IS_INVALID;
1413
1414 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1415 invalidate_wr.wr_id = (unsigned long)(void *)r;
1416 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1417 invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
1418 DECR_CQCOUNT(&r_xprt->rx_ep);
1419
1420 dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
1421 __func__, r, r->r.frmr.fr_mr->rkey);
1422
1423 read_lock(&ia->ri_qplock);
1424 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1425 read_unlock(&ia->ri_qplock);
1426 if (rc) {
1427 /* Force rpcrdma_buffer_get() to retry */
1428 r->r.frmr.fr_state = FRMR_IS_STALE;
1429 dprintk("RPC: %s: ib_post_send failed, %i\n",
1430 __func__, rc);
1431 }
1432}
1433
1434static void
1435rpcrdma_retry_flushed_linv(struct list_head *stale,
1436 struct rpcrdma_buffer *buf)
1437{
1438 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1439 struct list_head *pos;
1440 struct rpcrdma_mw *r;
1441 unsigned long flags;
1442
1443 list_for_each(pos, stale) {
1444 r = list_entry(pos, struct rpcrdma_mw, mw_list);
1445 rpcrdma_retry_local_inv(r, ia);
1446 }
1447
1448 spin_lock_irqsave(&buf->rb_lock, flags);
1449 list_splice_tail(stale, &buf->rb_mws);
1450 spin_unlock_irqrestore(&buf->rb_lock, flags);
1451}
1452
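/* Refill req->rl_segments with FRMRs taken from the rb_mws free list.
 * MWs found in the STALE state are moved to the caller's "stale" list
 * so they can be re-invalidated before reuse; if the free list runs
 * out, the req and its MWs are returned to the pool and NULL results.
 */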
1453static struct rpcrdma_req *
1454rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1455 struct list_head *stale)
1456{
1457 struct rpcrdma_mw *r;
1458 int i;
1459
1460 i = RPCRDMA_MAX_SEGS - 1;
1461 while (!list_empty(&buf->rb_mws)) {
1462 r = list_entry(buf->rb_mws.next,
1463 struct rpcrdma_mw, mw_list);
1464 list_del(&r->mw_list);
1465 if (r->r.frmr.fr_state == FRMR_IS_STALE) {
1466 list_add(&r->mw_list, stale);
1467 continue;
1468 }
1469 req->rl_segments[i].mr_chunk.rl_mw = r;
1470 if (unlikely(i-- == 0))
1471 return req; /* Success */
1472 }
1473
1474 /* Not enough entries on rb_mws for this req */
1475 rpcrdma_buffer_put_sendbuf(req, buf);
1476 rpcrdma_buffer_put_mrs(req, buf);
1477 return NULL;
1478}
1479
1480static struct rpcrdma_req *
1481rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1482{
1483 struct rpcrdma_mw *r;
1484 int i;
1485
1486 i = RPCRDMA_MAX_SEGS - 1;
1487 while (!list_empty(&buf->rb_mws)) {
1488 r = list_entry(buf->rb_mws.next,
1489 struct rpcrdma_mw, mw_list);
1490 list_del(&r->mw_list);
1491 req->rl_segments[i].mr_chunk.rl_mw = r;
1492 if (unlikely(i-- == 0))
1493 return req; /* Success */
1494 }
1495
1496 /* Not enough entries on rb_mws for this req */
1497 rpcrdma_buffer_put_sendbuf(req, buf);
1498 rpcrdma_buffer_put_mrs(req, buf);
1499 return NULL;
1500}
1501
1502/*
1503 * Get a set of request/reply buffers.
1504 *
1505 * Reply buffer (if needed) is attached to send buffer upon return.
1506 * Rule:
1507 * rb_send_index and rb_recv_index MUST always be pointing to the
1508 * *next* available buffer (non-NULL). They are incremented after
1509 * removing buffers, and decremented *before* returning them.
1510 */
1511struct rpcrdma_req *
1512rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1513{
1514 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1515 struct list_head stale;
1516 struct rpcrdma_req *req;
1517 unsigned long flags;
1518
1519 spin_lock_irqsave(&buffers->rb_lock, flags);
1520 if (buffers->rb_send_index == buffers->rb_max_requests) {
1521 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1522 dprintk("RPC: %s: out of request buffers\n", __func__);
1523 return ((struct rpcrdma_req *)NULL);
1524 }
1525
1526 req = buffers->rb_send_bufs[buffers->rb_send_index];
1527 if (buffers->rb_send_index < buffers->rb_recv_index) {
1528 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1529 __func__,
1530 buffers->rb_recv_index - buffers->rb_send_index);
1531 req->rl_reply = NULL;
1532 } else {
1533 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1534 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1535 }
1536 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1537
1538 INIT_LIST_HEAD(&stale);
1539 switch (ia->ri_memreg_strategy) {
1540 case RPCRDMA_FRMR:
1541 req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
1542 break;
1543 case RPCRDMA_MTHCAFMR:
1544 req = rpcrdma_buffer_get_fmrs(req, buffers);
1545 break;
1546 default:
1547 break;
1548 }
1549 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1550 if (!list_empty(&stale))
1551 rpcrdma_retry_flushed_linv(&stale, buffers);
1552 return req;
1553}
1554
1555/*
1556 * Put request/reply buffers back into pool.
1557 * Pre-decrement counter/array index.
1558 */
1559void
1560rpcrdma_buffer_put(struct rpcrdma_req *req)
1561{
1562 struct rpcrdma_buffer *buffers = req->rl_buffer;
1563 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1564 unsigned long flags;
1565
1566 spin_lock_irqsave(&buffers->rb_lock, flags);
1567 rpcrdma_buffer_put_sendbuf(req, buffers);
1568 switch (ia->ri_memreg_strategy) {
1569 case RPCRDMA_FRMR:
1570 case RPCRDMA_MTHCAFMR:
1571 rpcrdma_buffer_put_mrs(req, buffers);
1572 break;
1573 default:
1574 break;
1575 }
1576 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1577}
1578
1579/*
1580 * Recover reply buffers from pool.
1581 * This happens when recovering from error conditions.
1582 * Post-increment counter/array index.
1583 */
1584void
1585rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1586{
1587 struct rpcrdma_buffer *buffers = req->rl_buffer;
1588 unsigned long flags;
1589
1590 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1591 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1592 spin_lock_irqsave(&buffers->rb_lock, flags);
1593 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1594 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1595 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1596 }
1597 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1598}
1599
1600/*
1601 * Put reply buffers back into pool when not attached to
1602 * request. This happens in error conditions.
1603 */
1604void
1605rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1606{
1607 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1608 unsigned long flags;
1609
1610 rep->rr_func = NULL;
1611 spin_lock_irqsave(&buffers->rb_lock, flags);
1612 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1613 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1614}
1615
1616/*
1617 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1618 */
1619
1620int
1621rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1622 struct ib_mr **mrp, struct ib_sge *iov)
1623{
1624 struct ib_phys_buf ipb;
1625 struct ib_mr *mr;
1626 int rc;
1627
1628 /*
1629 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1630 */
1631 iov->addr = ib_dma_map_single(ia->ri_id->device,
1632 va, len, DMA_BIDIRECTIONAL);
1633 if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
1634 return -ENOMEM;
1635
1636 iov->length = len;
1637
1638 if (ia->ri_have_dma_lkey) {
1639 *mrp = NULL;
1640 iov->lkey = ia->ri_dma_lkey;
1641 return 0;
1642 } else if (ia->ri_bind_mem != NULL) {
c56c65fb
TT
1643 *mrp = NULL;
1644 iov->lkey = ia->ri_bind_mem->lkey;
1645 return 0;
1646 }
1647
1648 ipb.addr = iov->addr;
1649 ipb.size = iov->length;
1650 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1651 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1652
1653 dprintk("RPC: %s: phys convert: 0x%llx "
1654 "registered 0x%llx length %d\n",
1655 __func__, (unsigned long long)ipb.addr,
1656 (unsigned long long)iov->addr, len);
1657
1658 if (IS_ERR(mr)) {
1659 *mrp = NULL;
1660 rc = PTR_ERR(mr);
1661 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1662 } else {
1663 *mrp = mr;
1664 iov->lkey = mr->lkey;
1665 rc = 0;
1666 }
1667
1668 return rc;
1669}
1670
1671int
1672rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1673 struct ib_mr *mr, struct ib_sge *iov)
1674{
1675 int rc;
1676
1677 ib_dma_unmap_single(ia->ri_id->device,
1678 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1679
1680 if (NULL == mr)
1681 return 0;
1682
1683 rc = ib_dereg_mr(mr);
1684 if (rc)
1685 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1686 return rc;
1687}
1688
1689/*
1690 * Wrappers for chunk registration, shared by read/write chunk code.
1691 */
1692
1693static void
1694rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1695{
1696 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1697 seg->mr_dmalen = seg->mr_len;
1698 if (seg->mr_page)
1699 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1700 seg->mr_page, offset_in_page(seg->mr_offset),
1701 seg->mr_dmalen, seg->mr_dir);
1702 else
1703 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1704 seg->mr_offset,
1705 seg->mr_dmalen, seg->mr_dir);
1706 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1707 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1708 __func__,
1709 (unsigned long long)seg->mr_dma,
1710 seg->mr_offset, seg->mr_dmalen);
1711 }
1712}
1713
1714static void
1715rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1716{
1717 if (seg->mr_page)
1718 ib_dma_unmap_page(ia->ri_id->device,
1719 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1720 else
1721 ib_dma_unmap_single(ia->ri_id->device,
1722 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1723}
1724
1725static int
1726rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1727 int *nsegs, int writing, struct rpcrdma_ia *ia,
1728 struct rpcrdma_xprt *r_xprt)
1729{
1730 struct rpcrdma_mr_seg *seg1 = seg;
1731 struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw;
1732 struct rpcrdma_frmr *frmr = &mw->r.frmr;
1733 struct ib_mr *mr = frmr->fr_mr;
1734 struct ib_send_wr fastreg_wr, *bad_wr;
1735 u8 key;
1736 int len, pageoff;
1737 int i, rc;
1738 int seg_len;
1739 u64 pa;
1740 int page_no;
1741
1742 pageoff = offset_in_page(seg1->mr_offset);
1743 seg1->mr_offset -= pageoff; /* start of page */
1744 seg1->mr_len += pageoff;
1745 len = -pageoff;
1746 if (*nsegs > ia->ri_max_frmr_depth)
1747 *nsegs = ia->ri_max_frmr_depth;
1748 for (page_no = i = 0; i < *nsegs;) {
1749 rpcrdma_map_one(ia, seg, writing);
1750 pa = seg->mr_dma;
1751 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1752 frmr->fr_pgl->page_list[page_no++] = pa;
1753 pa += PAGE_SIZE;
1754 }
1755 len += seg->mr_len;
1756 ++seg;
1757 ++i;
1758 /* Check for holes */
1759 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1760 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1761 break;
1762 }
1763 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1764 __func__, mw, i);
1765
1766 frmr->fr_state = FRMR_IS_VALID;
1767
1768 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
1769 fastreg_wr.wr_id = (unsigned long)(void *)mw;
1770 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1771 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1772 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
1773 fastreg_wr.wr.fast_reg.page_list_len = page_no;
1774 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1775 fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1776 if (fastreg_wr.wr.fast_reg.length < len) {
1777 rc = -EIO;
1778 goto out_err;
1779 }
1780
1781 /* Bump the key */
1782 key = (u8)(mr->rkey & 0x000000FF);
1783 ib_update_fast_reg_key(mr, ++key);
1784
1785 fastreg_wr.wr.fast_reg.access_flags = (writing ?
1786 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1787 IB_ACCESS_REMOTE_READ);
1788 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
1789 DECR_CQCOUNT(&r_xprt->rx_ep);
1790
1791 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
1792 if (rc) {
1793 dprintk("RPC: %s: failed ib_post_send for register,"
1794 " status %i\n", __func__, rc);
1795 ib_update_fast_reg_key(mr, --key);
1796 goto out_err;
1797 } else {
1798 seg1->mr_rkey = mr->rkey;
1799 seg1->mr_base = seg1->mr_dma + pageoff;
1800 seg1->mr_nsegs = i;
1801 seg1->mr_len = len;
1802 }
1803 *nsegs = i;
1804 return 0;
1805out_err:
1806 frmr->fr_state = FRMR_IS_INVALID;
1807 while (i--)
1808 rpcrdma_unmap_one(ia, --seg);
1809 return rc;
1810}
1811
1812static int
1813rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1814 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1815{
1816 struct rpcrdma_mr_seg *seg1 = seg;
1817 struct ib_send_wr invalidate_wr, *bad_wr;
1818 int rc;
1819
1820 seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
1821
1822 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1823 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1824 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1825 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1826 DECR_CQCOUNT(&r_xprt->rx_ep);
1827
1828 read_lock(&ia->ri_qplock);
1829 while (seg1->mr_nsegs--)
1830 rpcrdma_unmap_one(ia, seg++);
1831 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1832 read_unlock(&ia->ri_qplock);
1833 if (rc) {
1834 /* Force rpcrdma_buffer_get() to retry */
1835 seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
1836 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1837 " status %i\n", __func__, rc);
1838 }
1839 return rc;
1840}
1841
1842static int
1843rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1844 int *nsegs, int writing, struct rpcrdma_ia *ia)
1845{
1846 struct rpcrdma_mr_seg *seg1 = seg;
1847 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1848 int len, pageoff, i, rc;
1849
1850 pageoff = offset_in_page(seg1->mr_offset);
1851 seg1->mr_offset -= pageoff; /* start of page */
1852 seg1->mr_len += pageoff;
1853 len = -pageoff;
1854 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1855 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1856 for (i = 0; i < *nsegs;) {
1857 rpcrdma_map_one(ia, seg, writing);
1858 physaddrs[i] = seg->mr_dma;
1859 len += seg->mr_len;
1860 ++seg;
1861 ++i;
1862 /* Check for holes */
1863 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1864 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1865 break;
1866 }
1867 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1868 physaddrs, i, seg1->mr_dma);
1869 if (rc) {
1870 dprintk("RPC: %s: failed ib_map_phys_fmr "
1871 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1872 len, (unsigned long long)seg1->mr_dma,
1873 pageoff, i, rc);
1874 while (i--)
1875 rpcrdma_unmap_one(ia, --seg);
1876 } else {
1877 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1878 seg1->mr_base = seg1->mr_dma + pageoff;
1879 seg1->mr_nsegs = i;
1880 seg1->mr_len = len;
1881 }
1882 *nsegs = i;
1883 return rc;
1884}
1885
1886static int
1887rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1888 struct rpcrdma_ia *ia)
1889{
1890 struct rpcrdma_mr_seg *seg1 = seg;
1891 LIST_HEAD(l);
1892 int rc;
1893
1894 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1895 rc = ib_unmap_fmr(&l);
1896 read_lock(&ia->ri_qplock);
1897 while (seg1->mr_nsegs--)
1898 rpcrdma_unmap_one(ia, seg++);
1899 read_unlock(&ia->ri_qplock);
1900 if (rc)
1901 dprintk("RPC: %s: failed ib_unmap_fmr,"
1902 " status %i\n", __func__, rc);
1903 return rc;
1904}
1905
1906int
1907rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1908 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1909{
1910 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1911 int rc = 0;
1912
1913 switch (ia->ri_memreg_strategy) {
1914
1915#if RPCRDMA_PERSISTENT_REGISTRATION
1916 case RPCRDMA_ALLPHYSICAL:
1917 rpcrdma_map_one(ia, seg, writing);
1918 seg->mr_rkey = ia->ri_bind_mem->rkey;
1919 seg->mr_base = seg->mr_dma;
1920 seg->mr_nsegs = 1;
1921 nsegs = 1;
1922 break;
1923#endif
1924
1925 /* Registration using frmr registration */
1926 case RPCRDMA_FRMR:
1927 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1928 break;
1929
1930 /* Registration using fmr memory registration */
1931 case RPCRDMA_MTHCAFMR:
1932 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1933 break;
1934
1935 default:
1936 return -1;
1937 }
1938 if (rc)
1939 return -1;
1940
1941 return nsegs;
1942}
1943
1944int
1945rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1946 struct rpcrdma_xprt *r_xprt)
1947{
1948 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1949 int nsegs = seg->mr_nsegs, rc;
1950
1951 switch (ia->ri_memreg_strategy) {
1952
1953#if RPCRDMA_PERSISTENT_REGISTRATION
1954 case RPCRDMA_ALLPHYSICAL:
1955 read_lock(&ia->ri_qplock);
1956 rpcrdma_unmap_one(ia, seg);
1957 read_unlock(&ia->ri_qplock);
1958 break;
1959#endif
1960
1961 case RPCRDMA_FRMR:
1962 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1963 break;
1964
1965 case RPCRDMA_MTHCAFMR:
1966 rc = rpcrdma_deregister_fmr_external(seg, ia);
1967 break;
1968
1969 default:
1970 break;
1971 }
1972 return nsegs;
1973}
1974
1975/*
1976 * Prepost any receive buffer, then post send.
1977 *
1978 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1979 */
1980int
1981rpcrdma_ep_post(struct rpcrdma_ia *ia,
1982 struct rpcrdma_ep *ep,
1983 struct rpcrdma_req *req)
1984{
1985 struct ib_send_wr send_wr, *send_wr_fail;
1986 struct rpcrdma_rep *rep = req->rl_reply;
1987 int rc;
1988
1989 if (rep) {
1990 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1991 if (rc)
1992 goto out;
1993 req->rl_reply = NULL;
1994 }
1995
1996 send_wr.next = NULL;
1997 send_wr.wr_id = 0ULL; /* no send cookie */
1998 send_wr.sg_list = req->rl_send_iov;
1999 send_wr.num_sge = req->rl_niovs;
2000 send_wr.opcode = IB_WR_SEND;
2001 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
2002 ib_dma_sync_single_for_device(ia->ri_id->device,
2003 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
2004 DMA_TO_DEVICE);
2005 ib_dma_sync_single_for_device(ia->ri_id->device,
2006 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
2007 DMA_TO_DEVICE);
2008 ib_dma_sync_single_for_device(ia->ri_id->device,
2009 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
2010 DMA_TO_DEVICE);
2011
2012 if (DECR_CQCOUNT(ep) > 0)
2013 send_wr.send_flags = 0;
2014 else { /* Provider must take a send completion every now and then */
2015 INIT_CQCOUNT(ep);
2016 send_wr.send_flags = IB_SEND_SIGNALED;
2017 }
2018
2019 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
2020 if (rc)
2021 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
2022 rc);
2023out:
2024 return rc;
2025}
2026
2027/*
2028 * (Re)post a receive buffer.
2029 */
2030int
2031rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
2032 struct rpcrdma_ep *ep,
2033 struct rpcrdma_rep *rep)
2034{
2035 struct ib_recv_wr recv_wr, *recv_wr_fail;
2036 int rc;
2037
2038 recv_wr.next = NULL;
2039 recv_wr.wr_id = (u64) (unsigned long) rep;
2040 recv_wr.sg_list = &rep->rr_iov;
2041 recv_wr.num_sge = 1;
2042
2043 ib_dma_sync_single_for_cpu(ia->ri_id->device,
2044 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
2045
2046 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
2047
2048 if (rc)
2049 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
2050 rc);
2051 return rc;
2052}
2053
2054/* Physical mapping means one Read/Write list entry per-page.
2055 * All list entries must fit within an inline buffer
2056 *
2057 * NB: The server must return a Write list for NFS READ,
2058 * which has the same constraint. Factor in the inline
2059 * rsize as well.
2060 */
2061static size_t
2062rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
2063{
2064 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
2065 unsigned int inline_size, pages;
2066
2067 inline_size = min_t(unsigned int,
2068 cdata->inline_wsize, cdata->inline_rsize);
2069 inline_size -= RPCRDMA_HDRLEN_MIN;
2070 pages = inline_size / sizeof(struct rpcrdma_segment);
2071 return pages << PAGE_SHIFT;
2072}
2073
2074static size_t
2075rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
2076{
2077 return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
2078}
2079
2080size_t
2081rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
2082{
2083 size_t result;
2084
2085 switch (r_xprt->rx_ia.ri_memreg_strategy) {
2086 case RPCRDMA_ALLPHYSICAL:
2087 result = rpcrdma_physical_max_payload(r_xprt);
2088 break;
2089 default:
2090 result = rpcrdma_mr_max_payload(r_xprt);
2091 }
2092 return result;
2093}