xprtrdma: Remove rpcrdma_ep::rep_func and ::rep_xprt
linux-2.6-block.git: net/sunrpc/xprtrdma/verbs.c
f58851e6 1/*
c56c65fb
TT
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
f58851e6
TT
38 */
39
c56c65fb
TT
40/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
a6b7a407 50#include <linux/interrupt.h>
5a0e3ad6 51#include <linux/slab.h>
eba8ff66 52#include <linux/prefetch.h>
65866f82 53#include <asm/bitops.h>
c56c65fb 54
f58851e6
TT
55#include "xprt_rdma.h"
56
c56c65fb
TT
57/*
58 * Globals/Macros
59 */
60
f895b252 61#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
c56c65fb
TT
62# define RPCDBG_FACILITY RPCDBG_TRANS
63#endif
64
9f9d802a 65static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
467c9674 66static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
9f9d802a 67
c56c65fb
TT
68/*
69 * internal functions
70 */
71
72/*
73 * handle replies in tasklet context, using a single, global list
74 * rdma tasklet function -- just turn around and call the func
75 * for all replies on the list
76 */
77
78static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
79static LIST_HEAD(rpcrdma_tasklets_g);
80
81static void
82rpcrdma_run_tasklet(unsigned long data)
83{
84 struct rpcrdma_rep *rep;
85 void (*func)(struct rpcrdma_rep *);
86 unsigned long flags;
87
88 data = data;
89 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
90 while (!list_empty(&rpcrdma_tasklets_g)) {
91 rep = list_entry(rpcrdma_tasklets_g.next,
92 struct rpcrdma_rep, rr_list);
93 list_del(&rep->rr_list);
94 func = rep->rr_func;
95 rep->rr_func = NULL;
96 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
97
98 if (func)
99 func(rep);
100 else
101 rpcrdma_recv_buffer_put(rep);
102
103 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
104 }
105 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
106}
107
108static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
109
7ff11de1
CL
110static const char * const async_event[] = {
111 "CQ error",
112 "QP fatal error",
113 "QP request error",
114 "QP access error",
115 "communication established",
116 "send queue drained",
117 "path migration successful",
118 "path mig error",
119 "device fatal error",
120 "port active",
121 "port error",
122 "LID change",
123 "P_key change",
124 "SM change",
125 "SRQ error",
126 "SRQ limit reached",
127 "last WQE reached",
128 "client reregister",
129 "GID change",
130};
131
132#define ASYNC_MSG(status) \
133 ((status) < ARRAY_SIZE(async_event) ? \
134 async_event[(status)] : "unknown async error")
135
f1a03b76
CL
136static void
137rpcrdma_schedule_tasklet(struct list_head *sched_list)
138{
139 unsigned long flags;
140
141 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
142 list_splice_tail(sched_list, &rpcrdma_tasklets_g);
143 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
144 tasklet_schedule(&rpcrdma_tasklet_g);
145}
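/*
 * Editor's note: illustrative sketch, not part of the original file.
 * Reply handling is split in two stages: the hard-irq completion
 * upcall only batches rpcrdma_rep structures onto a local list and
 * splices that list onto the single global rpcrdma_tasklets_g, while
 * the tasklet later runs each rep's rr_func (or recycles the receive
 * buffer when no handler is set).  A producer therefore looks
 * roughly like this:
 */
static void __maybe_unused
rpcrdma_example_dispatch_rep(struct rpcrdma_rep *rep)
{
	LIST_HEAD(sched_list);

	/* If rr_func is still NULL, rpcrdma_run_tasklet() above just
	 * returns the buffer via rpcrdma_recv_buffer_put(). */
	list_add_tail(&rep->rr_list, &sched_list);
	rpcrdma_schedule_tasklet(&sched_list);
}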
146
c56c65fb
TT
147static void
148rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
149{
150 struct rpcrdma_ep *ep = context;
151
7ff11de1
CL
152 pr_err("RPC: %s: %s on device %s ep %p\n",
153 __func__, ASYNC_MSG(event->event),
154 event->device->name, context);
c56c65fb
TT
155 if (ep->rep_connected == 1) {
156 ep->rep_connected = -EIO;
afadc468 157 rpcrdma_conn_func(ep);
c56c65fb
TT
158 wake_up_all(&ep->rep_connect_wait);
159 }
160}
161
162static void
163rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
164{
165 struct rpcrdma_ep *ep = context;
166
7ff11de1
CL
167 pr_err("RPC: %s: %s on device %s ep %p\n",
168 __func__, ASYNC_MSG(event->event),
169 event->device->name, context);
c56c65fb
TT
170 if (ep->rep_connected == 1) {
171 ep->rep_connected = -EIO;
afadc468 172 rpcrdma_conn_func(ep);
c56c65fb
TT
173 wake_up_all(&ep->rep_connect_wait);
174 }
175}
176
8502427c
CL
177static const char * const wc_status[] = {
178 "success",
179 "local length error",
180 "local QP operation error",
181 "local EE context operation error",
182 "local protection error",
183 "WR flushed",
184 "memory management operation error",
185 "bad response error",
186 "local access error",
187 "remote invalid request error",
188 "remote access error",
189 "remote operation error",
190 "transport retry counter exceeded",
191 "RNR retrycounter exceeded",
192 "local RDD violation error",
193 "remove invalid RD request",
194 "operation aborted",
195 "invalid EE context number",
196 "invalid EE context state",
197 "fatal error",
198 "response timeout error",
199 "general error",
200};
201
202#define COMPLETION_MSG(status) \
203 ((status) < ARRAY_SIZE(wc_status) ? \
204 wc_status[(status)] : "unexpected completion error")
205
fc664485
CL
206static void
207rpcrdma_sendcq_process_wc(struct ib_wc *wc)
c56c65fb 208{
8502427c 209 if (likely(wc->status == IB_WC_SUCCESS))
c56c65fb 210 return;
8502427c
CL
211
212 /* WARNING: Only wr_id and status are reliable at this point */
213 if (wc->wr_id == 0ULL) {
214 if (wc->status != IB_WC_WR_FLUSH_ERR)
215 pr_err("RPC: %s: SEND: %s\n",
216 __func__, COMPLETION_MSG(wc->status));
217 } else {
218 struct rpcrdma_mw *r;
219
220 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
221 r->r.frmr.fr_state = FRMR_IS_STALE;
222 pr_err("RPC: %s: frmr %p (stale): %s\n",
223 __func__, r, COMPLETION_MSG(wc->status));
224 }
c56c65fb
TT
225}
226
fc664485 227static int
1c00dd07 228rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
c56c65fb 229{
1c00dd07 230 struct ib_wc *wcs;
8301a2c0 231 int budget, count, rc;
c56c65fb 232
8301a2c0 233 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
1c00dd07
CL
234 do {
235 wcs = ep->rep_send_wcs;
236
237 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
238 if (rc <= 0)
239 return rc;
240
241 count = rc;
242 while (count-- > 0)
243 rpcrdma_sendcq_process_wc(wcs++);
8301a2c0 244 } while (rc == RPCRDMA_POLLSIZE && --budget);
1c00dd07 245 return 0;
fc664485 246}
c56c65fb 247
fc664485
CL
248/*
249 * Handle send, fast_reg_mr, and local_inv completions.
250 *
251 * Send events are typically suppressed and thus do not result
252 * in an upcall. Occasionally one is signaled, however. This
253 * prevents the provider's completion queue from wrapping and
254 * losing a completion.
255 */
256static void
257rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
258{
1c00dd07 259 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
fc664485
CL
260 int rc;
261
1c00dd07 262 rc = rpcrdma_sendcq_poll(cq, ep);
fc664485
CL
263 if (rc) {
264 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
265 __func__, rc);
266 return;
c56c65fb
TT
267 }
268
7f23f6f6
CL
269 rc = ib_req_notify_cq(cq,
270 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
271 if (rc == 0)
272 return;
273 if (rc < 0) {
fc664485
CL
274 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
275 __func__, rc);
276 return;
277 }
278
1c00dd07 279 rpcrdma_sendcq_poll(cq, ep);
fc664485
CL
280}
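/*
 * Editor's note: illustrative sketch, not part of the original file.
 * Both CQ upcalls in this file follow the usual "poll, re-arm,
 * re-poll" discipline: drain the CQ, re-request notification with
 * IB_CQ_REPORT_MISSED_EVENTS, and if that call returns a positive
 * value (completions arrived between the final poll and the re-arm),
 * drain once more so nothing sits in the CQ until the next
 * interrupt.  Reduced to its skeleton:
 */
static void __maybe_unused
rpcrdma_example_poll_and_rearm(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
	if (rpcrdma_sendcq_poll(cq, ep))
		return;		/* poll error; leave the CQ un-armed */
	if (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
				 IB_CQ_REPORT_MISSED_EVENTS) > 0)
		rpcrdma_sendcq_poll(cq, ep);	/* catch the stragglers */
}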
281
282static void
bb96193d 283rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
fc664485
CL
284{
285 struct rpcrdma_rep *rep =
286 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
287
8502427c
CL
288 /* WARNING: Only wr_id and status are reliable at this point */
289 if (wc->status != IB_WC_SUCCESS)
290 goto out_fail;
fc664485 291
8502427c 292 /* status == SUCCESS means all fields in wc are trustworthy */
fc664485
CL
293 if (wc->opcode != IB_WC_RECV)
294 return;
295
8502427c
CL
296 dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
297 __func__, rep, wc->byte_len);
298
fc664485
CL
299 rep->rr_len = wc->byte_len;
300 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
301 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
eba8ff66 302 prefetch(rep->rr_base);
fc664485
CL
303
304out_schedule:
bb96193d 305 list_add_tail(&rep->rr_list, sched_list);
8502427c
CL
306 return;
307out_fail:
308 if (wc->status != IB_WC_WR_FLUSH_ERR)
309 pr_err("RPC: %s: rep %p: %s\n",
310 __func__, rep, COMPLETION_MSG(wc->status));
311 rep->rr_len = ~0U;
312 goto out_schedule;
fc664485
CL
313}
314
315static int
1c00dd07 316rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
fc664485 317{
bb96193d 318 struct list_head sched_list;
1c00dd07 319 struct ib_wc *wcs;
8301a2c0 320 int budget, count, rc;
fc664485 321
bb96193d 322 INIT_LIST_HEAD(&sched_list);
8301a2c0 323 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
1c00dd07
CL
324 do {
325 wcs = ep->rep_recv_wcs;
326
327 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
328 if (rc <= 0)
bb96193d 329 goto out_schedule;
1c00dd07
CL
330
331 count = rc;
332 while (count-- > 0)
bb96193d 333 rpcrdma_recvcq_process_wc(wcs++, &sched_list);
8301a2c0 334 } while (rc == RPCRDMA_POLLSIZE && --budget);
bb96193d
CL
335 rc = 0;
336
337out_schedule:
f1a03b76 338 rpcrdma_schedule_tasklet(&sched_list);
bb96193d 339 return rc;
c56c65fb
TT
340}
341
342/*
fc664485 343 * Handle receive completions.
c56c65fb 344 *
c56c65fb
TT
345 * It is reentrant but processes single events in order to maintain
346 * ordering of receives to keep server credits.
347 *
348 * It is the responsibility of the scheduled tasklet to return
349 * recv buffers to the pool. NOTE: this affects synchronization of
350 * connection shutdown. That is, the structures required for
351 * the completion of the reply handler must remain intact until
352 * all memory has been reclaimed.
c56c65fb
TT
353 */
354static void
fc664485 355rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
c56c65fb 356{
1c00dd07 357 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
c56c65fb
TT
358 int rc;
359
1c00dd07 360 rc = rpcrdma_recvcq_poll(cq, ep);
fc664485
CL
361 if (rc) {
362 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
363 __func__, rc);
c56c65fb 364 return;
fc664485 365 }
c56c65fb 366
7f23f6f6
CL
367 rc = ib_req_notify_cq(cq,
368 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
369 if (rc == 0)
370 return;
371 if (rc < 0) {
fc664485 372 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
c56c65fb
TT
373 __func__, rc);
374 return;
375 }
376
1c00dd07 377 rpcrdma_recvcq_poll(cq, ep);
c56c65fb
TT
378}
379
a7bc211a
CL
380static void
381rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
382{
5c166bef
CL
383 struct ib_wc wc;
384 LIST_HEAD(sched_list);
385
386 while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
387 rpcrdma_recvcq_process_wc(&wc, &sched_list);
388 if (!list_empty(&sched_list))
389 rpcrdma_schedule_tasklet(&sched_list);
390 while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
391 rpcrdma_sendcq_process_wc(&wc);
a7bc211a
CL
392}
393
f895b252 394#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
c56c65fb
TT
395static const char * const conn[] = {
396 "address resolved",
397 "address error",
398 "route resolved",
399 "route error",
400 "connect request",
401 "connect response",
402 "connect error",
403 "unreachable",
404 "rejected",
405 "established",
406 "disconnected",
8079fb78
CL
407 "device removal",
408 "multicast join",
409 "multicast error",
410 "address change",
411 "timewait exit",
c56c65fb 412};
8079fb78
CL
413
414#define CONNECTION_MSG(status) \
415 ((status) < ARRAY_SIZE(conn) ? \
416 conn[(status)] : "unrecognized connection error")
c56c65fb
TT
417#endif
418
419static int
420rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
421{
422 struct rpcrdma_xprt *xprt = id->context;
423 struct rpcrdma_ia *ia = &xprt->rx_ia;
424 struct rpcrdma_ep *ep = &xprt->rx_ep;
f895b252 425#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
c56c65fb 426 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
ff0db049 427#endif
c56c65fb
TT
428 struct ib_qp_attr attr;
429 struct ib_qp_init_attr iattr;
430 int connstate = 0;
431
432 switch (event->event) {
433 case RDMA_CM_EVENT_ADDR_RESOLVED:
434 case RDMA_CM_EVENT_ROUTE_RESOLVED:
5675add3 435 ia->ri_async_rc = 0;
c56c65fb
TT
436 complete(&ia->ri_done);
437 break;
438 case RDMA_CM_EVENT_ADDR_ERROR:
439 ia->ri_async_rc = -EHOSTUNREACH;
440 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
441 __func__, ep);
442 complete(&ia->ri_done);
443 break;
444 case RDMA_CM_EVENT_ROUTE_ERROR:
445 ia->ri_async_rc = -ENETUNREACH;
446 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
447 __func__, ep);
448 complete(&ia->ri_done);
449 break;
450 case RDMA_CM_EVENT_ESTABLISHED:
451 connstate = 1;
452 ib_query_qp(ia->ri_id->qp, &attr,
453 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
454 &iattr);
455 dprintk("RPC: %s: %d responder resources"
456 " (%d initiator)\n",
457 __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
458 goto connected;
459 case RDMA_CM_EVENT_CONNECT_ERROR:
460 connstate = -ENOTCONN;
461 goto connected;
462 case RDMA_CM_EVENT_UNREACHABLE:
463 connstate = -ENETDOWN;
464 goto connected;
465 case RDMA_CM_EVENT_REJECTED:
466 connstate = -ECONNREFUSED;
467 goto connected;
468 case RDMA_CM_EVENT_DISCONNECTED:
469 connstate = -ECONNABORTED;
470 goto connected;
471 case RDMA_CM_EVENT_DEVICE_REMOVAL:
472 connstate = -ENODEV;
473connected:
c56c65fb
TT
474 dprintk("RPC: %s: %sconnected\n",
475 __func__, connstate > 0 ? "" : "dis");
476 ep->rep_connected = connstate;
afadc468 477 rpcrdma_conn_func(ep);
c56c65fb 478 wake_up_all(&ep->rep_connect_wait);
8079fb78 479 /*FALLTHROUGH*/
c56c65fb 480 default:
8079fb78
CL
481 dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n",
482 __func__, &addr->sin_addr.s_addr,
483 ntohs(addr->sin_port), ep,
484 CONNECTION_MSG(event->event));
c56c65fb
TT
485 break;
486 }
487
f895b252 488#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
b3cd8d45
TT
489 if (connstate == 1) {
490 int ird = attr.max_dest_rd_atomic;
491 int tird = ep->rep_remote_cma.responder_resources;
21454aaa 492 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
b3cd8d45 493 "on %s, memreg %d slots %d ird %d%s\n",
21454aaa 494 &addr->sin_addr.s_addr,
b3cd8d45
TT
495 ntohs(addr->sin_port),
496 ia->ri_id->device->name,
497 ia->ri_memreg_strategy,
498 xprt->rx_buf.rb_max_requests,
499 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
500 } else if (connstate < 0) {
21454aaa
HH
501 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
502 &addr->sin_addr.s_addr,
b3cd8d45
TT
503 ntohs(addr->sin_port),
504 connstate);
505 }
506#endif
507
c56c65fb
TT
508 return 0;
509}
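/*
 * Editor's note (summary, not original source text): the CM upcall
 * above folds each event into ep->rep_connected -- 1 for
 * ESTABLISHED, and a negative errno for the failure events
 * (CONNECT_ERROR -> -ENOTCONN, UNREACHABLE -> -ENETDOWN,
 * REJECTED -> -ECONNREFUSED, DISCONNECTED -> -ECONNABORTED,
 * DEVICE_REMOVAL -> -ENODEV) -- and then wakes rep_connect_wait.
 * Waiters such as rpcrdma_ep_connect() below sleep on that field,
 * roughly:
 *
 *	ep->rep_connected = 0;
 *	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
 *	...
 *	wait_event_interruptible(ep->rep_connect_wait,
 *				 ep->rep_connected != 0);
 */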
510
511static struct rdma_cm_id *
512rpcrdma_create_id(struct rpcrdma_xprt *xprt,
513 struct rpcrdma_ia *ia, struct sockaddr *addr)
514{
515 struct rdma_cm_id *id;
516 int rc;
517
1a954051
TT
518 init_completion(&ia->ri_done);
519
b26f9b99 520 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
c56c65fb
TT
521 if (IS_ERR(id)) {
522 rc = PTR_ERR(id);
523 dprintk("RPC: %s: rdma_create_id() failed %i\n",
524 __func__, rc);
525 return id;
526 }
527
5675add3 528 ia->ri_async_rc = -ETIMEDOUT;
c56c65fb
TT
529 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
530 if (rc) {
531 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
532 __func__, rc);
533 goto out;
534 }
5675add3
TT
535 wait_for_completion_interruptible_timeout(&ia->ri_done,
536 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
c56c65fb
TT
537 rc = ia->ri_async_rc;
538 if (rc)
539 goto out;
540
5675add3 541 ia->ri_async_rc = -ETIMEDOUT;
c56c65fb
TT
542 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
543 if (rc) {
544 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
545 __func__, rc);
546 goto out;
547 }
5675add3
TT
548 wait_for_completion_interruptible_timeout(&ia->ri_done,
549 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
c56c65fb
TT
550 rc = ia->ri_async_rc;
551 if (rc)
552 goto out;
553
554 return id;
555
556out:
557 rdma_destroy_id(id);
558 return ERR_PTR(rc);
559}
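/*
 * Editor's note: illustrative restatement, not part of the original
 * source.  Both resolution steps above prime ia->ri_async_rc with
 * -ETIMEDOUT before starting the asynchronous rdma_cm call; the CM
 * upcall overwrites it (0 on success, a specific errno on failure)
 * and completes ri_done.  If the wait times out or is interrupted,
 * the pre-loaded -ETIMEDOUT is what the caller sees.  The pattern,
 * reduced to one step:
 */
static int __maybe_unused
rpcrdma_example_resolve_step(struct rpcrdma_ia *ia, struct rdma_cm_id *id,
			     struct sockaddr *addr)
{
	int rc;

	ia->ri_async_rc = -ETIMEDOUT;	/* assume the worst */
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc)
		return rc;		/* synchronous failure */
	wait_for_completion_interruptible_timeout(&ia->ri_done,
			msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	return ia->ri_async_rc;		/* filled in by rpcrdma_conn_upcall */
}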
560
561/*
562 * Drain any cq, prior to teardown.
563 */
564static void
565rpcrdma_clean_cq(struct ib_cq *cq)
566{
567 struct ib_wc wc;
568 int count = 0;
569
570 while (1 == ib_poll_cq(cq, 1, &wc))
571 ++count;
572
573 if (count)
574 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
575 __func__, count, wc.opcode);
576}
577
578/*
579 * Exported functions.
580 */
581
582/*
583 * Open and initialize an Interface Adapter.
584 * o initializes fields of struct rpcrdma_ia, including
585 * interface and provider attributes and protection zone.
586 */
587int
588rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
589{
bd7ed1d1
TT
590 int rc, mem_priv;
591 struct ib_device_attr devattr;
c56c65fb
TT
592 struct rpcrdma_ia *ia = &xprt->rx_ia;
593
c56c65fb
TT
594 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
595 if (IS_ERR(ia->ri_id)) {
596 rc = PTR_ERR(ia->ri_id);
597 goto out1;
598 }
599
600 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
601 if (IS_ERR(ia->ri_pd)) {
602 rc = PTR_ERR(ia->ri_pd);
603 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
604 __func__, rc);
605 goto out2;
606 }
607
bd7ed1d1
TT
608 /*
609 * Query the device to determine if the requested memory
610 * registration strategy is supported. If it isn't, set the
611 * strategy to a globally supported model.
612 */
613 rc = ib_query_device(ia->ri_id->device, &devattr);
614 if (rc) {
615 dprintk("RPC: %s: ib_query_device failed %d\n",
616 __func__, rc);
617 goto out2;
618 }
619
620 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
621 ia->ri_have_dma_lkey = 1;
622 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
623 }
624
f10eafd3 625 if (memreg == RPCRDMA_FRMR) {
3197d309
TT
626 /* Requires both frmr reg and local dma lkey */
627 if ((devattr.device_cap_flags &
628 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
629 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
3197d309 630 dprintk("RPC: %s: FRMR registration "
f10eafd3
CL
631 "not supported by HCA\n", __func__);
632 memreg = RPCRDMA_MTHCAFMR;
0fc6c4e7
SW
633 } else {
634 /* Mind the ia limit on FRMR page list depth */
635 ia->ri_max_frmr_depth = min_t(unsigned int,
636 RPCRDMA_MAX_DATA_SEGS,
637 devattr.max_fast_reg_page_list_len);
bd7ed1d1 638 }
f10eafd3
CL
639 }
640 if (memreg == RPCRDMA_MTHCAFMR) {
641 if (!ia->ri_id->device->alloc_fmr) {
642 dprintk("RPC: %s: MTHCAFMR registration "
643 "not supported by HCA\n", __func__);
f10eafd3 644 memreg = RPCRDMA_ALLPHYSICAL;
f10eafd3 645 }
bd7ed1d1
TT
646 }
647
c56c65fb
TT
648 /*
649 * Optionally obtain an underlying physical identity mapping in
650 * order to do a memory window-based bind. This base registration
651 * is protected from remote access - that is enabled only by binding
652 * for the specific bytes targeted during each RPC operation, and
653 * revoked after the corresponding completion similar to a storage
654 * adapter.
655 */
bd7ed1d1 656 switch (memreg) {
3197d309 657 case RPCRDMA_FRMR:
bd7ed1d1 658 break;
bd7ed1d1
TT
659 case RPCRDMA_ALLPHYSICAL:
660 mem_priv = IB_ACCESS_LOCAL_WRITE |
661 IB_ACCESS_REMOTE_WRITE |
662 IB_ACCESS_REMOTE_READ;
663 goto register_setup;
bd7ed1d1
TT
664 case RPCRDMA_MTHCAFMR:
665 if (ia->ri_have_dma_lkey)
c56c65fb 666 break;
bd7ed1d1
TT
667 mem_priv = IB_ACCESS_LOCAL_WRITE;
668 register_setup:
c56c65fb
TT
669 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
670 if (IS_ERR(ia->ri_bind_mem)) {
671 printk(KERN_ALERT "%s: ib_get_dma_mr for "
0ac531c1 672 "phys register failed with %lX\n",
c56c65fb 673 __func__, PTR_ERR(ia->ri_bind_mem));
0ac531c1
CL
674 rc = -ENOMEM;
675 goto out2;
c56c65fb 676 }
bd7ed1d1
TT
677 break;
678 default:
cdd9ade7
CL
679 printk(KERN_ERR "RPC: Unsupported memory "
680 "registration mode: %d\n", memreg);
681 rc = -ENOMEM;
bd7ed1d1 682 goto out2;
c56c65fb 683 }
bd7ed1d1
TT
684 dprintk("RPC: %s: memory registration strategy is %d\n",
685 __func__, memreg);
c56c65fb
TT
686
687 /* Else will do memory reg/dereg for each chunk */
688 ia->ri_memreg_strategy = memreg;
689
73806c88 690 rwlock_init(&ia->ri_qplock);
c56c65fb
TT
691 return 0;
692out2:
693 rdma_destroy_id(ia->ri_id);
fee08caf 694 ia->ri_id = NULL;
c56c65fb
TT
695out1:
696 return rc;
697}
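/*
 * Editor's note: illustrative summary, not original source.  The
 * strategy negotiation in rpcrdma_ia_open() degrades gracefully:
 * RPCRDMA_FRMR needs both IB_DEVICE_MEM_MGT_EXTENSIONS and
 * IB_DEVICE_LOCAL_DMA_LKEY, otherwise it falls back to
 * RPCRDMA_MTHCAFMR; FMR in turn needs the device to implement
 * ->alloc_fmr, otherwise RPCRDMA_ALLPHYSICAL is used.  Expressed as
 * a stand-alone helper (hypothetical, for clarity only):
 */
static int __maybe_unused
rpcrdma_example_pick_memreg(struct ib_device_attr *devattr,
			    struct ib_device *device, int memreg)
{
	const int frmr_caps = IB_DEVICE_MEM_MGT_EXTENSIONS |
			      IB_DEVICE_LOCAL_DMA_LKEY;

	if (memreg == RPCRDMA_FRMR &&
	    (devattr->device_cap_flags & frmr_caps) != frmr_caps)
		memreg = RPCRDMA_MTHCAFMR;
	if (memreg == RPCRDMA_MTHCAFMR && !device->alloc_fmr)
		memreg = RPCRDMA_ALLPHYSICAL;
	return memreg;
}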
698
699/*
700 * Clean up/close an IA.
701 * o if event handles and PD have been initialized, free them.
702 * o close the IA
703 */
704void
705rpcrdma_ia_close(struct rpcrdma_ia *ia)
706{
707 int rc;
708
709 dprintk("RPC: %s: entering\n", __func__);
710 if (ia->ri_bind_mem != NULL) {
711 rc = ib_dereg_mr(ia->ri_bind_mem);
712 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
713 __func__, rc);
714 }
fee08caf
TT
715 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
716 if (ia->ri_id->qp)
717 rdma_destroy_qp(ia->ri_id);
718 rdma_destroy_id(ia->ri_id);
719 ia->ri_id = NULL;
720 }
c56c65fb
TT
721 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
722 rc = ib_dealloc_pd(ia->ri_pd);
723 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
724 __func__, rc);
725 }
c56c65fb
TT
726}
727
728/*
729 * Create unconnected endpoint.
730 */
731int
732rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
733 struct rpcrdma_create_data_internal *cdata)
734{
735 struct ib_device_attr devattr;
fc664485 736 struct ib_cq *sendcq, *recvcq;
5d40a8a5 737 int rc, err;
c56c65fb
TT
738
739 rc = ib_query_device(ia->ri_id->device, &devattr);
740 if (rc) {
741 dprintk("RPC: %s: ib_query_device failed %d\n",
742 __func__, rc);
743 return rc;
744 }
745
746 /* check provider's send/recv wr limits */
747 if (cdata->max_requests > devattr.max_qp_wr)
748 cdata->max_requests = devattr.max_qp_wr;
749
750 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
751 ep->rep_attr.qp_context = ep;
752 /* send_cq and recv_cq initialized below */
753 ep->rep_attr.srq = NULL;
754 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
755 switch (ia->ri_memreg_strategy) {
0fc6c4e7
SW
756 case RPCRDMA_FRMR: {
757 int depth = 7;
758
15cdc644
TT
759 /* Add room for frmr register and invalidate WRs.
760 * 1. FRMR reg WR for head
761 * 2. FRMR invalidate WR for head
0fc6c4e7
SW
762 * 3. N FRMR reg WRs for pagelist
763 * 4. N FRMR invalidate WRs for pagelist
15cdc644
TT
764 * 5. FRMR reg WR for tail
765 * 6. FRMR invalidate WR for tail
766 * 7. The RDMA_SEND WR
767 */
0fc6c4e7
SW
768
769 /* Calculate N if the device max FRMR depth is smaller than
770 * RPCRDMA_MAX_DATA_SEGS.
771 */
772 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
773 int delta = RPCRDMA_MAX_DATA_SEGS -
774 ia->ri_max_frmr_depth;
775
776 do {
777 depth += 2; /* FRMR reg + invalidate */
778 delta -= ia->ri_max_frmr_depth;
779 } while (delta > 0);
780
781 }
782 ep->rep_attr.cap.max_send_wr *= depth;
15cdc644 783 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
0fc6c4e7 784 cdata->max_requests = devattr.max_qp_wr / depth;
15cdc644
TT
785 if (!cdata->max_requests)
786 return -EINVAL;
0fc6c4e7
SW
787 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
788 depth;
15cdc644 789 }
3197d309 790 break;
0fc6c4e7 791 }
c56c65fb
TT
792 default:
793 break;
794 }
795 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
796 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
797 ep->rep_attr.cap.max_recv_sge = 1;
798 ep->rep_attr.cap.max_inline_data = 0;
799 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
800 ep->rep_attr.qp_type = IB_QPT_RC;
801 ep->rep_attr.port_num = ~0;
802
803 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
804 "iovs: send %d recv %d\n",
805 __func__,
806 ep->rep_attr.cap.max_send_wr,
807 ep->rep_attr.cap.max_recv_wr,
808 ep->rep_attr.cap.max_send_sge,
809 ep->rep_attr.cap.max_recv_sge);
810
811 /* set trigger for requesting send completion */
fc664485 812 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
e7104a2a
CL
813 if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
814 ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
815 else if (ep->rep_cqinit <= 2)
c56c65fb
TT
816 ep->rep_cqinit = 0;
817 INIT_CQCOUNT(ep);
c56c65fb 818 init_waitqueue_head(&ep->rep_connect_wait);
254f91e2 819 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
c56c65fb 820
fc664485 821 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
1c00dd07 822 rpcrdma_cq_async_error_upcall, ep,
c56c65fb 823 ep->rep_attr.cap.max_send_wr + 1, 0);
fc664485
CL
824 if (IS_ERR(sendcq)) {
825 rc = PTR_ERR(sendcq);
826 dprintk("RPC: %s: failed to create send CQ: %i\n",
c56c65fb
TT
827 __func__, rc);
828 goto out1;
829 }
830
fc664485 831 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
c56c65fb
TT
832 if (rc) {
833 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
834 __func__, rc);
835 goto out2;
836 }
837
fc664485 838 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
1c00dd07 839 rpcrdma_cq_async_error_upcall, ep,
fc664485
CL
840 ep->rep_attr.cap.max_recv_wr + 1, 0);
841 if (IS_ERR(recvcq)) {
842 rc = PTR_ERR(recvcq);
843 dprintk("RPC: %s: failed to create recv CQ: %i\n",
844 __func__, rc);
845 goto out2;
846 }
847
848 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
849 if (rc) {
850 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
851 __func__, rc);
852 ib_destroy_cq(recvcq);
853 goto out2;
854 }
855
856 ep->rep_attr.send_cq = sendcq;
857 ep->rep_attr.recv_cq = recvcq;
c56c65fb
TT
858
859 /* Initialize cma parameters */
860
861 /* RPC/RDMA does not use private data */
862 ep->rep_remote_cma.private_data = NULL;
863 ep->rep_remote_cma.private_data_len = 0;
864
865 /* Client offers RDMA Read but does not initiate */
b334eaab 866 ep->rep_remote_cma.initiator_depth = 0;
03ff8821 867 if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
b334eaab
TT
868 ep->rep_remote_cma.responder_resources = 32;
869 else
c56c65fb 870 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
c56c65fb
TT
871
872 ep->rep_remote_cma.retry_count = 7;
873 ep->rep_remote_cma.flow_control = 0;
874 ep->rep_remote_cma.rnr_retry_count = 0;
875
876 return 0;
877
878out2:
fc664485 879 err = ib_destroy_cq(sendcq);
5d40a8a5
CL
880 if (err)
881 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
882 __func__, err);
c56c65fb
TT
883out1:
884 return rc;
885}
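/*
 * Editor's note: worked example, not part of the original source.
 * For the FRMR case in rpcrdma_ep_create(), each RPC may need up to
 * "depth" send WRs: a base of 7 (reg + invalidate for head, pagelist
 * and tail, plus the RDMA SEND itself), plus 2 more for every
 * additional FRMR required when the device's
 * max_fast_reg_page_list_len is smaller than RPCRDMA_MAX_DATA_SEGS.
 * Assuming, purely for illustration, RPCRDMA_MAX_DATA_SEGS == 64 and
 * ri_max_frmr_depth == 16:
 *
 *	delta = 64 - 16 = 48
 *	depth: 7 -> 9 (delta 32) -> 11 (delta 16) -> 13 (delta 0)
 *
 * so max_send_wr becomes max_requests * 13, and if that exceeds the
 * device's max_qp_wr, max_requests is scaled down to fit.  The send
 * CQ trigger (rep_cqinit) set near the end of the function then asks
 * for a signalled completion roughly every half-queue's worth of
 * sends so the provider CQ cannot wrap while unsignalled WRs are
 * outstanding.  The depth arithmetic, as a helper:
 */
static unsigned int __maybe_unused
rpcrdma_example_frmr_depth(unsigned int max_frmr_depth)
{
	unsigned int depth = 7;
	int delta = RPCRDMA_MAX_DATA_SEGS - max_frmr_depth;

	while (delta > 0) {
		depth += 2;		/* one more FRMR reg + invalidate */
		delta -= max_frmr_depth;
	}
	return depth;
}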
886
887/*
888 * rpcrdma_ep_destroy
889 *
890 * Disconnect and destroy endpoint. After this, the only
891 * valid operations on the ep are to free it (if dynamically
892 * allocated) or re-create it.
c56c65fb 893 */
7f1d5419 894void
c56c65fb
TT
895rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
896{
897 int rc;
898
899 dprintk("RPC: %s: entering, connected is %d\n",
900 __func__, ep->rep_connected);
901
254f91e2
CL
902 cancel_delayed_work_sync(&ep->rep_connect_worker);
903
c56c65fb 904 if (ia->ri_id->qp) {
282191cb 905 rpcrdma_ep_disconnect(ep, ia);
fee08caf
TT
906 rdma_destroy_qp(ia->ri_id);
907 ia->ri_id->qp = NULL;
c56c65fb
TT
908 }
909
c56c65fb
TT
910 /* padding - could be done in rpcrdma_buffer_destroy... */
911 if (ep->rep_pad_mr) {
912 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
913 ep->rep_pad_mr = NULL;
914 }
915
fc664485
CL
916 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
917 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
918 if (rc)
919 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
920 __func__, rc);
921
922 rpcrdma_clean_cq(ep->rep_attr.send_cq);
923 rc = ib_destroy_cq(ep->rep_attr.send_cq);
c56c65fb
TT
924 if (rc)
925 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
926 __func__, rc);
c56c65fb
TT
927}
928
929/*
930 * Connect unconnected endpoint.
931 */
932int
933rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
934{
73806c88 935 struct rdma_cm_id *id, *old;
c56c65fb
TT
936 int rc = 0;
937 int retry_count = 0;
c56c65fb 938
c055551e 939 if (ep->rep_connected != 0) {
c56c65fb
TT
940 struct rpcrdma_xprt *xprt;
941retry:
ec62f40d 942 dprintk("RPC: %s: reconnecting...\n", __func__);
282191cb
CL
943
944 rpcrdma_ep_disconnect(ep, ia);
a7bc211a 945 rpcrdma_flush_cqs(ep);
c56c65fb 946
467c9674
CL
947 switch (ia->ri_memreg_strategy) {
948 case RPCRDMA_FRMR:
9f9d802a 949 rpcrdma_reset_frmrs(ia);
467c9674
CL
950 break;
951 case RPCRDMA_MTHCAFMR:
952 rpcrdma_reset_fmrs(ia);
953 break;
954 case RPCRDMA_ALLPHYSICAL:
955 break;
956 default:
957 rc = -EIO;
958 goto out;
959 }
9f9d802a 960
c56c65fb
TT
961 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
962 id = rpcrdma_create_id(xprt, ia,
963 (struct sockaddr *)&xprt->rx_data.addr);
964 if (IS_ERR(id)) {
ec62f40d 965 rc = -EHOSTUNREACH;
c56c65fb
TT
966 goto out;
967 }
968 /* TEMP TEMP TEMP - fail if new device:
969 * Deregister/remarshal *all* requests!
970 * Close and recreate adapter, pd, etc!
971 * Re-determine all attributes still sane!
972 * More stuff I haven't thought of!
973 * Rrrgh!
974 */
975 if (ia->ri_id->device != id->device) {
976 printk("RPC: %s: can't reconnect on "
977 "different device!\n", __func__);
978 rdma_destroy_id(id);
ec62f40d 979 rc = -ENETUNREACH;
c56c65fb
TT
980 goto out;
981 }
982 /* END TEMP */
ec62f40d
CL
983 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
984 if (rc) {
985 dprintk("RPC: %s: rdma_create_qp failed %i\n",
986 __func__, rc);
987 rdma_destroy_id(id);
988 rc = -ENETUNREACH;
989 goto out;
990 }
73806c88
CL
991
992 write_lock(&ia->ri_qplock);
993 old = ia->ri_id;
c56c65fb 994 ia->ri_id = id;
73806c88
CL
995 write_unlock(&ia->ri_qplock);
996
997 rdma_destroy_qp(old);
998 rdma_destroy_id(old);
ec62f40d
CL
999 } else {
1000 dprintk("RPC: %s: connecting...\n", __func__);
1001 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
1002 if (rc) {
1003 dprintk("RPC: %s: rdma_create_qp failed %i\n",
1004 __func__, rc);
1005 /* do not update ep->rep_connected */
1006 return -ENETUNREACH;
1007 }
c56c65fb
TT
1008 }
1009
c56c65fb
TT
1010 ep->rep_connected = 0;
1011
1012 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
1013 if (rc) {
1014 dprintk("RPC: %s: rdma_connect() failed with %i\n",
1015 __func__, rc);
1016 goto out;
1017 }
1018
c56c65fb
TT
1019 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
1020
1021 /*
1022 * Check state. A non-peer reject indicates no listener
1023 * (ECONNREFUSED), which may be a transient state. All
1024 * others indicate a transport condition which has already
 1025         * undergone a best-effort connection attempt.
1026 */
f64f9e71
JP
1027 if (ep->rep_connected == -ECONNREFUSED &&
1028 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
c56c65fb
TT
1029 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
1030 goto retry;
1031 }
1032 if (ep->rep_connected <= 0) {
1033 /* Sometimes, the only way to reliably connect to remote
 1034                 * CMs is to use the same nonzero values for ORD and IRD. */
b334eaab
TT
1035 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
1036 (ep->rep_remote_cma.responder_resources == 0 ||
1037 ep->rep_remote_cma.initiator_depth !=
1038 ep->rep_remote_cma.responder_resources)) {
1039 if (ep->rep_remote_cma.responder_resources == 0)
1040 ep->rep_remote_cma.responder_resources = 1;
1041 ep->rep_remote_cma.initiator_depth =
1042 ep->rep_remote_cma.responder_resources;
c56c65fb 1043 goto retry;
b334eaab 1044 }
c56c65fb
TT
1045 rc = ep->rep_connected;
1046 } else {
1047 dprintk("RPC: %s: connected\n", __func__);
1048 }
1049
1050out:
1051 if (rc)
1052 ep->rep_connected = rc;
1053 return rc;
1054}
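/*
 * Editor's note: usage sketch, not original source.  These endpoint
 * calls are driven from the transport code (the xprtrdma connect
 * worker); a minimal caller sequence looks roughly like:
 *
 *	rc = rpcrdma_ia_open(xprt, addr, memreg);
 *	rc = rpcrdma_ep_create(&xprt->rx_ep, &xprt->rx_ia, &cdata);
 *	rc = rpcrdma_ep_connect(&xprt->rx_ep, &xprt->rx_ia);
 *	...
 *	rpcrdma_ep_disconnect(&xprt->rx_ep, &xprt->rx_ia);
 *	rpcrdma_ep_destroy(&xprt->rx_ep, &xprt->rx_ia);
 *
 * A reconnect re-enters rpcrdma_ep_connect() with rep_connected != 0,
 * which resets the MRs for the active registration mode and swaps in
 * a fresh rdma_cm_id under ri_qplock before retrying rdma_connect().
 */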
1055
1056/*
1057 * rpcrdma_ep_disconnect
1058 *
1059 * This is separate from destroy to facilitate the ability
1060 * to reconnect without recreating the endpoint.
1061 *
1062 * This call is not reentrant, and must not be made in parallel
1063 * on the same endpoint.
1064 */
282191cb 1065void
c56c65fb
TT
1066rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
1067{
1068 int rc;
1069
a7bc211a 1070 rpcrdma_flush_cqs(ep);
c56c65fb
TT
1071 rc = rdma_disconnect(ia->ri_id);
1072 if (!rc) {
1073 /* returns without wait if not connected */
1074 wait_event_interruptible(ep->rep_connect_wait,
1075 ep->rep_connected != 1);
1076 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
1077 (ep->rep_connected == 1) ? "still " : "dis");
1078 } else {
1079 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
1080 ep->rep_connected = rc;
1081 }
c56c65fb
TT
1082}
1083
2e84522c
CL
1084static int
1085rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1086{
1087 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
1088 struct ib_fmr_attr fmr_attr = {
1089 .max_pages = RPCRDMA_MAX_DATA_SEGS,
1090 .max_maps = 1,
1091 .page_shift = PAGE_SHIFT
1092 };
1093 struct rpcrdma_mw *r;
1094 int i, rc;
1095
1096 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
 1097        dprintk("RPC:       %s: initializing %d FMRs\n", __func__, i);
1098
1099 while (i--) {
1100 r = kzalloc(sizeof(*r), GFP_KERNEL);
1101 if (r == NULL)
1102 return -ENOMEM;
1103
1104 r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
1105 if (IS_ERR(r->r.fmr)) {
1106 rc = PTR_ERR(r->r.fmr);
1107 dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
1108 __func__, rc);
1109 goto out_free;
1110 }
1111
1112 list_add(&r->mw_list, &buf->rb_mws);
1113 list_add(&r->mw_all, &buf->rb_all);
1114 }
1115 return 0;
1116
1117out_free:
1118 kfree(r);
1119 return rc;
1120}
1121
1122static int
1123rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1124{
1125 struct rpcrdma_frmr *f;
1126 struct rpcrdma_mw *r;
1127 int i, rc;
1128
1129 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
 1130        dprintk("RPC:       %s: initializing %d FRMRs\n", __func__, i);
1131
1132 while (i--) {
1133 r = kzalloc(sizeof(*r), GFP_KERNEL);
1134 if (r == NULL)
1135 return -ENOMEM;
1136 f = &r->r.frmr;
1137
1138 f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1139 ia->ri_max_frmr_depth);
1140 if (IS_ERR(f->fr_mr)) {
1141 rc = PTR_ERR(f->fr_mr);
1142 dprintk("RPC: %s: ib_alloc_fast_reg_mr "
1143 "failed %i\n", __func__, rc);
1144 goto out_free;
1145 }
1146
1147 f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
1148 ia->ri_max_frmr_depth);
1149 if (IS_ERR(f->fr_pgl)) {
1150 rc = PTR_ERR(f->fr_pgl);
1151 dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
1152 "failed %i\n", __func__, rc);
1153
1154 ib_dereg_mr(f->fr_mr);
1155 goto out_free;
1156 }
1157
1158 list_add(&r->mw_list, &buf->rb_mws);
1159 list_add(&r->mw_all, &buf->rb_all);
1160 }
1161
1162 return 0;
1163
1164out_free:
1165 kfree(r);
1166 return rc;
1167}
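/*
 * Editor's note, not original source text: both setup paths above
 * put every MW on two lists -- rb_mws, the free list that
 * rpcrdma_buffer_get()/put() consume and replenish, and rb_all,
 * which is never pruned until teardown and is what the reset and
 * destroy helpers walk so that even MWs currently attached to a
 * request are found.  The pool holds
 * (rb_max_requests + 1) * RPCRDMA_MAX_SEGS entries.  Consuming one,
 * in sketch form (mirrors rpcrdma_buffer_get_fmrs() below):
 */
static struct rpcrdma_mw * __maybe_unused
rpcrdma_example_pop_mw(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_mw *r = NULL;

	if (!list_empty(&buf->rb_mws)) {
		r = list_entry(buf->rb_mws.next, struct rpcrdma_mw, mw_list);
		list_del(&r->mw_list);	/* off the free list only */
		/* r stays on buf->rb_all until the buffer is destroyed */
	}
	return r;
}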
1168
c56c65fb
TT
1169int
1170rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1171 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
1172{
1173 char *p;
65866f82 1174 size_t len, rlen, wlen;
c56c65fb
TT
1175 int i, rc;
1176
1177 buf->rb_max_requests = cdata->max_requests;
1178 spin_lock_init(&buf->rb_lock);
c56c65fb
TT
1179
1180 /* Need to allocate:
1181 * 1. arrays for send and recv pointers
1182 * 2. arrays of struct rpcrdma_req to fill in pointers
1183 * 3. array of struct rpcrdma_rep for replies
1184 * 4. padding, if any
c56c65fb
TT
1185 * Send/recv buffers in req/rep need to be registered
1186 */
c56c65fb
TT
1187 len = buf->rb_max_requests *
1188 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1189 len += cdata->padding;
c56c65fb 1190
c56c65fb
TT
1191 p = kzalloc(len, GFP_KERNEL);
1192 if (p == NULL) {
1193 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1194 __func__, len);
1195 rc = -ENOMEM;
1196 goto out;
1197 }
1198 buf->rb_pool = p; /* for freeing it later */
1199
1200 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1201 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1202 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1203 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1204
1205 /*
1206 * Register the zeroed pad buffer, if any.
1207 */
1208 if (cdata->padding) {
1209 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1210 &ep->rep_pad_mr, &ep->rep_pad);
1211 if (rc)
1212 goto out;
1213 }
1214 p += cdata->padding;
1215
c56c65fb 1216 INIT_LIST_HEAD(&buf->rb_mws);
3111d72c 1217 INIT_LIST_HEAD(&buf->rb_all);
c56c65fb 1218 switch (ia->ri_memreg_strategy) {
3197d309 1219 case RPCRDMA_FRMR:
2e84522c
CL
1220 rc = rpcrdma_init_frmrs(ia, buf);
1221 if (rc)
1222 goto out;
3197d309 1223 break;
c56c65fb 1224 case RPCRDMA_MTHCAFMR:
2e84522c
CL
1225 rc = rpcrdma_init_fmrs(ia, buf);
1226 if (rc)
1227 goto out;
c56c65fb 1228 break;
c56c65fb
TT
1229 default:
1230 break;
1231 }
1232
1233 /*
1234 * Allocate/init the request/reply buffers. Doing this
1235 * using kmalloc for now -- one for each buf.
1236 */
65866f82
CL
1237 wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
1238 rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
1239 dprintk("RPC: %s: wlen = %zu, rlen = %zu\n",
1240 __func__, wlen, rlen);
1241
c56c65fb
TT
1242 for (i = 0; i < buf->rb_max_requests; i++) {
1243 struct rpcrdma_req *req;
1244 struct rpcrdma_rep *rep;
1245
65866f82 1246 req = kmalloc(wlen, GFP_KERNEL);
c56c65fb
TT
1247 if (req == NULL) {
1248 dprintk("RPC: %s: request buffer %d alloc"
1249 " failed\n", __func__, i);
1250 rc = -ENOMEM;
1251 goto out;
1252 }
1253 memset(req, 0, sizeof(struct rpcrdma_req));
1254 buf->rb_send_bufs[i] = req;
1255 buf->rb_send_bufs[i]->rl_buffer = buf;
1256
1257 rc = rpcrdma_register_internal(ia, req->rl_base,
65866f82 1258 wlen - offsetof(struct rpcrdma_req, rl_base),
c56c65fb
TT
1259 &buf->rb_send_bufs[i]->rl_handle,
1260 &buf->rb_send_bufs[i]->rl_iov);
1261 if (rc)
1262 goto out;
1263
65866f82
CL
1264 buf->rb_send_bufs[i]->rl_size = wlen -
1265 sizeof(struct rpcrdma_req);
c56c65fb 1266
65866f82 1267 rep = kmalloc(rlen, GFP_KERNEL);
c56c65fb
TT
1268 if (rep == NULL) {
1269 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1270 __func__, i);
1271 rc = -ENOMEM;
1272 goto out;
1273 }
1274 memset(rep, 0, sizeof(struct rpcrdma_rep));
1275 buf->rb_recv_bufs[i] = rep;
1276 buf->rb_recv_bufs[i]->rr_buffer = buf;
c56c65fb
TT
1277
1278 rc = rpcrdma_register_internal(ia, rep->rr_base,
65866f82 1279 rlen - offsetof(struct rpcrdma_rep, rr_base),
c56c65fb
TT
1280 &buf->rb_recv_bufs[i]->rr_handle,
1281 &buf->rb_recv_bufs[i]->rr_iov);
1282 if (rc)
1283 goto out;
1284
1285 }
1286 dprintk("RPC: %s: max_requests %d\n",
1287 __func__, buf->rb_max_requests);
1288 /* done */
1289 return 0;
1290out:
1291 rpcrdma_buffer_destroy(buf);
1292 return rc;
1293}
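/*
 * Editor's note: layout sketch, not original source.  rb_pool is a
 * single kzalloc'd region carved up in the order the code above
 * walks it:
 *
 *	[ rb_max_requests x struct rpcrdma_req * ]  rb_send_bufs
 *	[ rb_max_requests x struct rpcrdma_rep * ]  rb_recv_bufs
 *	[ cdata->padding bytes                   ]  zeroed pad buffer
 *
 * The req/rep structures themselves are allocated separately, one
 * kmalloc each, rounded up to a power of two that covers the inline
 * write/read size plus the struct header, and their rl_base/rr_base
 * areas are registered with rpcrdma_register_internal().
 */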
1294
2e84522c
CL
1295static void
1296rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1297{
1298 struct rpcrdma_mw *r;
1299 int rc;
1300
1301 while (!list_empty(&buf->rb_all)) {
1302 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1303 list_del(&r->mw_all);
1304 list_del(&r->mw_list);
1305
1306 rc = ib_dealloc_fmr(r->r.fmr);
1307 if (rc)
1308 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
1309 __func__, rc);
1310
1311 kfree(r);
1312 }
1313}
1314
1315static void
1316rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
1317{
1318 struct rpcrdma_mw *r;
1319 int rc;
1320
1321 while (!list_empty(&buf->rb_all)) {
1322 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1323 list_del(&r->mw_all);
1324 list_del(&r->mw_list);
1325
1326 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1327 if (rc)
1328 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1329 __func__, rc);
1330 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1331
1332 kfree(r);
1333 }
1334}
1335
c56c65fb
TT
1336void
1337rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1338{
c56c65fb 1339 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
2e84522c 1340 int i;
c56c65fb
TT
1341
1342 /* clean up in reverse order from create
1343 * 1. recv mr memory (mr free, then kfree)
c56c65fb 1344 * 2. send mr memory (mr free, then kfree)
2e84522c 1345 * 3. MWs
c56c65fb
TT
1346 */
1347 dprintk("RPC: %s: entering\n", __func__);
1348
1349 for (i = 0; i < buf->rb_max_requests; i++) {
1350 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1351 rpcrdma_deregister_internal(ia,
1352 buf->rb_recv_bufs[i]->rr_handle,
1353 &buf->rb_recv_bufs[i]->rr_iov);
1354 kfree(buf->rb_recv_bufs[i]);
1355 }
1356 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
c56c65fb
TT
1357 rpcrdma_deregister_internal(ia,
1358 buf->rb_send_bufs[i]->rl_handle,
1359 &buf->rb_send_bufs[i]->rl_iov);
1360 kfree(buf->rb_send_bufs[i]);
1361 }
1362 }
1363
2e84522c
CL
1364 switch (ia->ri_memreg_strategy) {
1365 case RPCRDMA_FRMR:
1366 rpcrdma_destroy_frmrs(buf);
1367 break;
1368 case RPCRDMA_MTHCAFMR:
1369 rpcrdma_destroy_fmrs(buf);
1370 break;
1371 default:
1372 break;
4034ba04
AA
1373 }
1374
c56c65fb
TT
1375 kfree(buf->rb_pool);
1376}
1377
467c9674
CL
1378/* After a disconnect, unmap all FMRs.
1379 *
1380 * This is invoked only in the transport connect worker in order
1381 * to serialize with rpcrdma_register_fmr_external().
1382 */
1383static void
1384rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
1385{
1386 struct rpcrdma_xprt *r_xprt =
1387 container_of(ia, struct rpcrdma_xprt, rx_ia);
1388 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1389 struct list_head *pos;
1390 struct rpcrdma_mw *r;
1391 LIST_HEAD(l);
1392 int rc;
1393
1394 list_for_each(pos, &buf->rb_all) {
1395 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1396
1397 INIT_LIST_HEAD(&l);
1398 list_add(&r->r.fmr->list, &l);
1399 rc = ib_unmap_fmr(&l);
1400 if (rc)
1401 dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
1402 __func__, rc);
1403 }
1404}
1405
9f9d802a
CL
1406/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
1407 * an unusable state. Find FRMRs in this state and dereg / reg
1408 * each. FRMRs that are VALID and attached to an rpcrdma_req are
1409 * also torn down.
1410 *
1411 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
1412 *
1413 * This is invoked only in the transport connect worker in order
1414 * to serialize with rpcrdma_register_frmr_external().
1415 */
1416static void
1417rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
1418{
1419 struct rpcrdma_xprt *r_xprt =
1420 container_of(ia, struct rpcrdma_xprt, rx_ia);
1421 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1422 struct list_head *pos;
1423 struct rpcrdma_mw *r;
1424 int rc;
1425
1426 list_for_each(pos, &buf->rb_all) {
1427 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1428
1429 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
1430 continue;
1431
1432 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1433 if (rc)
1434 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1435 __func__, rc);
1436 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1437
1438 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1439 ia->ri_max_frmr_depth);
1440 if (IS_ERR(r->r.frmr.fr_mr)) {
1441 rc = PTR_ERR(r->r.frmr.fr_mr);
1442 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1443 " failed %i\n", __func__, rc);
1444 continue;
1445 }
1446 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1447 ia->ri_id->device,
1448 ia->ri_max_frmr_depth);
1449 if (IS_ERR(r->r.frmr.fr_pgl)) {
1450 rc = PTR_ERR(r->r.frmr.fr_pgl);
1451 dprintk("RPC: %s: "
1452 "ib_alloc_fast_reg_page_list "
1453 "failed %i\n", __func__, rc);
1454
1455 ib_dereg_mr(r->r.frmr.fr_mr);
1456 continue;
1457 }
1458 r->r.frmr.fr_state = FRMR_IS_INVALID;
1459 }
1460}
1461
c2922c02
CL
1462/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1463 * some req segments uninitialized.
1464 */
1465static void
1466rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
1467{
1468 if (*mw) {
1469 list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
1470 *mw = NULL;
1471 }
1472}
1473
1474/* Cycle mw's back in reverse order, and "spin" them.
1475 * This delays and scrambles reuse as much as possible.
1476 */
1477static void
1478rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1479{
1480 struct rpcrdma_mr_seg *seg = req->rl_segments;
1481 struct rpcrdma_mr_seg *seg1 = seg;
1482 int i;
1483
1484 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
3eb35810
CL
1485 rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
1486 rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
c2922c02
CL
1487}
1488
1489static void
1490rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1491{
1492 buf->rb_send_bufs[--buf->rb_send_index] = req;
1493 req->rl_niovs = 0;
1494 if (req->rl_reply) {
1495 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
1496 req->rl_reply->rr_func = NULL;
1497 req->rl_reply = NULL;
1498 }
1499}
1500
ddb6bebc
CL
1501/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
1502 * Redo only the ib_post_send().
1503 */
1504static void
1505rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
1506{
1507 struct rpcrdma_xprt *r_xprt =
1508 container_of(ia, struct rpcrdma_xprt, rx_ia);
1509 struct ib_send_wr invalidate_wr, *bad_wr;
1510 int rc;
1511
1512 dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
1513
1514 /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
dab7e3b8 1515 r->r.frmr.fr_state = FRMR_IS_INVALID;
ddb6bebc
CL
1516
1517 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1518 invalidate_wr.wr_id = (unsigned long)(void *)r;
1519 invalidate_wr.opcode = IB_WR_LOCAL_INV;
ddb6bebc
CL
1520 invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
1521 DECR_CQCOUNT(&r_xprt->rx_ep);
1522
1523 dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
1524 __func__, r, r->r.frmr.fr_mr->rkey);
1525
1526 read_lock(&ia->ri_qplock);
1527 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1528 read_unlock(&ia->ri_qplock);
1529 if (rc) {
1530 /* Force rpcrdma_buffer_get() to retry */
1531 r->r.frmr.fr_state = FRMR_IS_STALE;
1532 dprintk("RPC: %s: ib_post_send failed, %i\n",
1533 __func__, rc);
1534 }
1535}
1536
1537static void
1538rpcrdma_retry_flushed_linv(struct list_head *stale,
1539 struct rpcrdma_buffer *buf)
1540{
1541 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1542 struct list_head *pos;
1543 struct rpcrdma_mw *r;
1544 unsigned long flags;
1545
1546 list_for_each(pos, stale) {
1547 r = list_entry(pos, struct rpcrdma_mw, mw_list);
1548 rpcrdma_retry_local_inv(r, ia);
1549 }
1550
1551 spin_lock_irqsave(&buf->rb_lock, flags);
1552 list_splice_tail(stale, &buf->rb_mws);
1553 spin_unlock_irqrestore(&buf->rb_lock, flags);
1554}
1555
1556static struct rpcrdma_req *
1557rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1558 struct list_head *stale)
1559{
1560 struct rpcrdma_mw *r;
1561 int i;
1562
1563 i = RPCRDMA_MAX_SEGS - 1;
1564 while (!list_empty(&buf->rb_mws)) {
1565 r = list_entry(buf->rb_mws.next,
1566 struct rpcrdma_mw, mw_list);
1567 list_del(&r->mw_list);
1568 if (r->r.frmr.fr_state == FRMR_IS_STALE) {
1569 list_add(&r->mw_list, stale);
1570 continue;
1571 }
3eb35810 1572 req->rl_segments[i].rl_mw = r;
ddb6bebc
CL
1573 if (unlikely(i-- == 0))
1574 return req; /* Success */
1575 }
1576
1577 /* Not enough entries on rb_mws for this req */
1578 rpcrdma_buffer_put_sendbuf(req, buf);
1579 rpcrdma_buffer_put_mrs(req, buf);
1580 return NULL;
1581}
1582
c2922c02 1583static struct rpcrdma_req *
ddb6bebc 1584rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
c2922c02
CL
1585{
1586 struct rpcrdma_mw *r;
1587 int i;
1588
1589 i = RPCRDMA_MAX_SEGS - 1;
1590 while (!list_empty(&buf->rb_mws)) {
1591 r = list_entry(buf->rb_mws.next,
1592 struct rpcrdma_mw, mw_list);
1593 list_del(&r->mw_list);
3eb35810 1594 req->rl_segments[i].rl_mw = r;
c2922c02
CL
1595 if (unlikely(i-- == 0))
1596 return req; /* Success */
1597 }
1598
1599 /* Not enough entries on rb_mws for this req */
1600 rpcrdma_buffer_put_sendbuf(req, buf);
1601 rpcrdma_buffer_put_mrs(req, buf);
1602 return NULL;
1603}
1604
c56c65fb
TT
1605/*
1606 * Get a set of request/reply buffers.
1607 *
1608 * Reply buffer (if needed) is attached to send buffer upon return.
1609 * Rule:
1610 * rb_send_index and rb_recv_index MUST always be pointing to the
1611 * *next* available buffer (non-NULL). They are incremented after
1612 * removing buffers, and decremented *before* returning them.
1613 */
1614struct rpcrdma_req *
1615rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1616{
c2922c02 1617 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
ddb6bebc 1618 struct list_head stale;
c56c65fb
TT
1619 struct rpcrdma_req *req;
1620 unsigned long flags;
1621
1622 spin_lock_irqsave(&buffers->rb_lock, flags);
1623 if (buffers->rb_send_index == buffers->rb_max_requests) {
1624 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1625 dprintk("RPC: %s: out of request buffers\n", __func__);
1626 return ((struct rpcrdma_req *)NULL);
1627 }
1628
1629 req = buffers->rb_send_bufs[buffers->rb_send_index];
1630 if (buffers->rb_send_index < buffers->rb_recv_index) {
1631 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1632 __func__,
1633 buffers->rb_recv_index - buffers->rb_send_index);
1634 req->rl_reply = NULL;
1635 } else {
1636 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1637 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1638 }
1639 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
ddb6bebc
CL
1640
1641 INIT_LIST_HEAD(&stale);
c2922c02
CL
1642 switch (ia->ri_memreg_strategy) {
1643 case RPCRDMA_FRMR:
ddb6bebc
CL
1644 req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
1645 break;
c2922c02 1646 case RPCRDMA_MTHCAFMR:
ddb6bebc 1647 req = rpcrdma_buffer_get_fmrs(req, buffers);
c2922c02
CL
1648 break;
1649 default:
1650 break;
c56c65fb
TT
1651 }
1652 spin_unlock_irqrestore(&buffers->rb_lock, flags);
ddb6bebc
CL
1653 if (!list_empty(&stale))
1654 rpcrdma_retry_flushed_linv(&stale, buffers);
c56c65fb
TT
1655 return req;
1656}
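/*
 * Editor's note: worked example, not original source.  The index
 * rule documented above makes the two arrays behave like stacks:
 * rb_send_index/rb_recv_index always point at the next available
 * slot, get() hands out bufs[index] and then increments, and put()
 * decrements first and stores back.  With rb_max_requests == 4 and
 * two requests outstanding, for instance, rb_send_index == 2 and
 * slots 0 and 1 are NULL; returning one request refills slot 1 and
 * drops the index back to 1.  A reply buffer is attached on get()
 * whenever one is available, i.e. when rb_send_index is not behind
 * rb_recv_index.
 */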
1657
1658/*
1659 * Put request/reply buffers back into pool.
1660 * Pre-decrement counter/array index.
1661 */
1662void
1663rpcrdma_buffer_put(struct rpcrdma_req *req)
1664{
1665 struct rpcrdma_buffer *buffers = req->rl_buffer;
1666 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
c56c65fb
TT
1667 unsigned long flags;
1668
c56c65fb 1669 spin_lock_irqsave(&buffers->rb_lock, flags);
c2922c02 1670 rpcrdma_buffer_put_sendbuf(req, buffers);
c56c65fb 1671 switch (ia->ri_memreg_strategy) {
3197d309 1672 case RPCRDMA_FRMR:
c56c65fb 1673 case RPCRDMA_MTHCAFMR:
c2922c02 1674 rpcrdma_buffer_put_mrs(req, buffers);
c56c65fb
TT
1675 break;
1676 default:
1677 break;
1678 }
1679 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1680}
1681
1682/*
1683 * Recover reply buffers from pool.
1684 * This happens when recovering from error conditions.
1685 * Post-increment counter/array index.
1686 */
1687void
1688rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1689{
1690 struct rpcrdma_buffer *buffers = req->rl_buffer;
1691 unsigned long flags;
1692
1693 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1694 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1695 spin_lock_irqsave(&buffers->rb_lock, flags);
1696 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1697 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1698 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1699 }
1700 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1701}
1702
1703/*
1704 * Put reply buffers back into pool when not attached to
b45ccfd2 1705 * request. This happens in error conditions.
c56c65fb
TT
1706 */
1707void
1708rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1709{
1710 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1711 unsigned long flags;
1712
1713 rep->rr_func = NULL;
1714 spin_lock_irqsave(&buffers->rb_lock, flags);
1715 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1716 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1717}
1718
1719/*
1720 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1721 */
1722
1723int
1724rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1725 struct ib_mr **mrp, struct ib_sge *iov)
1726{
1727 struct ib_phys_buf ipb;
1728 struct ib_mr *mr;
1729 int rc;
1730
1731 /*
1732 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1733 */
1734 iov->addr = ib_dma_map_single(ia->ri_id->device,
1735 va, len, DMA_BIDIRECTIONAL);
bf858ab0
YB
1736 if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
1737 return -ENOMEM;
1738
c56c65fb
TT
1739 iov->length = len;
1740
bd7ed1d1
TT
1741 if (ia->ri_have_dma_lkey) {
1742 *mrp = NULL;
1743 iov->lkey = ia->ri_dma_lkey;
1744 return 0;
1745 } else if (ia->ri_bind_mem != NULL) {
c56c65fb
TT
1746 *mrp = NULL;
1747 iov->lkey = ia->ri_bind_mem->lkey;
1748 return 0;
1749 }
1750
1751 ipb.addr = iov->addr;
1752 ipb.size = iov->length;
1753 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1754 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1755
1756 dprintk("RPC: %s: phys convert: 0x%llx "
1757 "registered 0x%llx length %d\n",
a56daeb7
AM
1758 __func__, (unsigned long long)ipb.addr,
1759 (unsigned long long)iov->addr, len);
c56c65fb
TT
1760
1761 if (IS_ERR(mr)) {
1762 *mrp = NULL;
1763 rc = PTR_ERR(mr);
1764 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1765 } else {
1766 *mrp = mr;
1767 iov->lkey = mr->lkey;
1768 rc = 0;
1769 }
1770
1771 return rc;
1772}
1773
1774int
1775rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1776 struct ib_mr *mr, struct ib_sge *iov)
1777{
1778 int rc;
1779
1780 ib_dma_unmap_single(ia->ri_id->device,
1781 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1782
1783 if (NULL == mr)
1784 return 0;
1785
1786 rc = ib_dereg_mr(mr);
1787 if (rc)
1788 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1789 return rc;
1790}
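/*
 * Editor's note: usage sketch, not original source.  These wrappers
 * are used for the transport's own kmalloc'd buffers (the req/rep
 * headers and the pad buffer).  A caller registers once, keeps the
 * mr/iov pair for the life of the buffer, and deregisters at
 * teardown:
 */
static int __maybe_unused
rpcrdma_example_register(struct rpcrdma_ia *ia, void *buf, int len,
			 struct ib_mr **mr, struct ib_sge *iov)
{
	int rc;

	rc = rpcrdma_register_internal(ia, buf, len, mr, iov);
	if (rc)
		return rc;
	/* ... use iov->addr, iov->length and iov->lkey in WR sge's ... */
	return rpcrdma_deregister_internal(ia, *mr, iov);
}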
1791
1792/*
1793 * Wrappers for chunk registration, shared by read/write chunk code.
1794 */
1795
1796static void
1797rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1798{
1799 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1800 seg->mr_dmalen = seg->mr_len;
1801 if (seg->mr_page)
1802 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1803 seg->mr_page, offset_in_page(seg->mr_offset),
1804 seg->mr_dmalen, seg->mr_dir);
1805 else
1806 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1807 seg->mr_offset,
1808 seg->mr_dmalen, seg->mr_dir);
5c635e09
TT
1809 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1810 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1811 __func__,
986d4abb
RD
1812 (unsigned long long)seg->mr_dma,
1813 seg->mr_offset, seg->mr_dmalen);
5c635e09 1814 }
c56c65fb
TT
1815}
1816
1817static void
1818rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1819{
1820 if (seg->mr_page)
1821 ib_dma_unmap_page(ia->ri_id->device,
1822 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1823 else
1824 ib_dma_unmap_single(ia->ri_id->device,
1825 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1826}
1827
3197d309
TT
1828static int
1829rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1830 int *nsegs, int writing, struct rpcrdma_ia *ia,
1831 struct rpcrdma_xprt *r_xprt)
1832{
1833 struct rpcrdma_mr_seg *seg1 = seg;
3eb35810 1834 struct rpcrdma_mw *mw = seg1->rl_mw;
0dbb4108
CL
1835 struct rpcrdma_frmr *frmr = &mw->r.frmr;
1836 struct ib_mr *mr = frmr->fr_mr;
f590e878 1837 struct ib_send_wr fastreg_wr, *bad_wr;
3197d309
TT
1838 u8 key;
1839 int len, pageoff;
1840 int i, rc;
9b78145c
TT
1841 int seg_len;
1842 u64 pa;
1843 int page_no;
3197d309
TT
1844
1845 pageoff = offset_in_page(seg1->mr_offset);
1846 seg1->mr_offset -= pageoff; /* start of page */
1847 seg1->mr_len += pageoff;
1848 len = -pageoff;
0fc6c4e7
SW
1849 if (*nsegs > ia->ri_max_frmr_depth)
1850 *nsegs = ia->ri_max_frmr_depth;
9b78145c 1851 for (page_no = i = 0; i < *nsegs;) {
3197d309 1852 rpcrdma_map_one(ia, seg, writing);
9b78145c
TT
1853 pa = seg->mr_dma;
1854 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
0dbb4108 1855 frmr->fr_pgl->page_list[page_no++] = pa;
9b78145c
TT
1856 pa += PAGE_SIZE;
1857 }
3197d309
TT
1858 len += seg->mr_len;
1859 ++seg;
1860 ++i;
1861 /* Check for holes */
1862 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1863 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1864 break;
1865 }
1866 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
0dbb4108 1867 __func__, mw, i);
3197d309 1868
1869 frmr->fr_state = FRMR_IS_VALID;
1870
1871 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
1872 fastreg_wr.wr_id = (unsigned long)(void *)mw;
1873 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1874 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1875 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
1876 fastreg_wr.wr.fast_reg.page_list_len = page_no;
1877 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1878 fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1879 if (fastreg_wr.wr.fast_reg.length < len) {
1880 rc = -EIO;
1881 goto out_err;
1882 }
1883
1884 /* Bump the key */
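	/* ib_update_fast_reg_key() folds the incremented byte into the
	 * low-order bits of mr->rkey and mr->lkey, so a delayed access
	 * that still uses the previous key is refused once this
	 * registration is live.
	 */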
1885 key = (u8)(mr->rkey & 0x000000FF);
1886 ib_update_fast_reg_key(mr, ++key);
c977dea2 1887
f590e878 1888 fastreg_wr.wr.fast_reg.access_flags = (writing ?
1889 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1890 IB_ACCESS_REMOTE_READ);
f590e878 1891 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
1892 DECR_CQCOUNT(&r_xprt->rx_ep);
1893
f590e878 1894 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
1895 if (rc) {
1896 dprintk("RPC: %s: failed ib_post_send for register,"
1897 " status %i\n", __func__, rc);
c93e986a 1898 ib_update_fast_reg_key(mr, --key);
5fc83f47 1899 goto out_err;
3197d309 1900 } else {
0dbb4108 1901 seg1->mr_rkey = mr->rkey;
1902 seg1->mr_base = seg1->mr_dma + pageoff;
1903 seg1->mr_nsegs = i;
1904 seg1->mr_len = len;
1905 }
1906 *nsegs = i;
1907 return 0;
1908out_err:
05055722 1909 frmr->fr_state = FRMR_IS_INVALID;
1910 while (i--)
1911 rpcrdma_unmap_one(ia, --seg);
1912 return rc;
1913}
1914
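/*
 * Invalidate an FRMR registration: the segments are unmapped and an
 * IB_WR_LOCAL_INV WR is posted under ri_qplock so ri_id->qp cannot
 * change during the teardown. If the post fails, the MW is marked
 * FRMR_IS_STALE to force rpcrdma_buffer_get() to retry.
 */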
1915static int
1916rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1917 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1918{
1919 struct rpcrdma_mr_seg *seg1 = seg;
1920 struct ib_send_wr invalidate_wr, *bad_wr;
1921 int rc;
1922
3eb35810 1923 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
dab7e3b8 1924
3197d309 1925 memset(&invalidate_wr, 0, sizeof invalidate_wr);
3eb35810 1926 invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
3197d309 1927 invalidate_wr.opcode = IB_WR_LOCAL_INV;
3eb35810 1928 invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
1929 DECR_CQCOUNT(&r_xprt->rx_ep);
1930
1931 read_lock(&ia->ri_qplock);
1932 while (seg1->mr_nsegs--)
1933 rpcrdma_unmap_one(ia, seg++);
3197d309 1934 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
73806c88 1935 read_unlock(&ia->ri_qplock);
1936 if (rc) {
1937 /* Force rpcrdma_buffer_get() to retry */
3eb35810 1938 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
1939 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1940 " status %i\n", __func__, rc);
dab7e3b8 1941 }
1942 return rc;
1943}
1944
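/*
 * Register an external chunk with an already-allocated FMR: gather
 * the DMA addresses of up to RPCRDMA_MAX_DATA_SEGS contiguous
 * segments and hand them to ib_map_phys_fmr() in a single call.
 */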
1945static int
1946rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1947 int *nsegs, int writing, struct rpcrdma_ia *ia)
1948{
1949 struct rpcrdma_mr_seg *seg1 = seg;
1950 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1951 int len, pageoff, i, rc;
1952
1953 pageoff = offset_in_page(seg1->mr_offset);
1954 seg1->mr_offset -= pageoff; /* start of page */
1955 seg1->mr_len += pageoff;
1956 len = -pageoff;
1957 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1958 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1959 for (i = 0; i < *nsegs;) {
1960 rpcrdma_map_one(ia, seg, writing);
1961 physaddrs[i] = seg->mr_dma;
1962 len += seg->mr_len;
1963 ++seg;
1964 ++i;
1965 /* Check for holes */
1966 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1967 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1968 break;
1969 }
3eb35810 1970 rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma);
1971 if (rc) {
1972 dprintk("RPC: %s: failed ib_map_phys_fmr "
1973 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1974 len, (unsigned long long)seg1->mr_dma,
1975 pageoff, i, rc);
1976 while (i--)
1977 rpcrdma_unmap_one(ia, --seg);
1978 } else {
3eb35810 1979 seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey;
1980 seg1->mr_base = seg1->mr_dma + pageoff;
1981 seg1->mr_nsegs = i;
1982 seg1->mr_len = len;
1983 }
1984 *nsegs = i;
1985 return rc;
1986}
1987
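/*
 * Unmap an FMR-registered chunk: ib_unmap_fmr() takes a list, so the
 * single FMR is placed on a temporary list head, then the segments
 * are DMA-unmapped under ri_qplock.
 */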
1988static int
1989rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1990 struct rpcrdma_ia *ia)
1991{
1992 struct rpcrdma_mr_seg *seg1 = seg;
1993 LIST_HEAD(l);
1994 int rc;
1995
3eb35810 1996 list_add(&seg1->rl_mw->r.fmr->list, &l);
8d4ba034 1997 rc = ib_unmap_fmr(&l);
73806c88 1998 read_lock(&ia->ri_qplock);
1999 while (seg1->mr_nsegs--)
2000 rpcrdma_unmap_one(ia, seg++);
73806c88 2001 read_unlock(&ia->ri_qplock);
2002 if (rc)
2003 dprintk("RPC: %s: failed ib_unmap_fmr,"
2004 " status %i\n", __func__, rc);
2005 return rc;
2006}
2007
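/*
 * Register a chunk of memory so the server can RDMA Read or Write it
 * directly, dispatching on the memory registration strategy chosen
 * when the adapter was opened. Returns the number of segments
 * actually registered, or a negative errno.
 *
 * The marshaling code uses it roughly like this (a sketch, not the
 * exact call site):
 *
 *	n = rpcrdma_register_external(seg, nsegs, writing, r_xprt);
 *	if (n <= 0)
 *		return n;
 *	seg += n;
 *	nsegs -= n;
 */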
2008int
2009rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
2010 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
2011{
2012 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2013 int rc = 0;
2014
2015 switch (ia->ri_memreg_strategy) {
2016
2017 case RPCRDMA_ALLPHYSICAL:
2018 rpcrdma_map_one(ia, seg, writing);
2019 seg->mr_rkey = ia->ri_bind_mem->rkey;
2020 seg->mr_base = seg->mr_dma;
2021 seg->mr_nsegs = 1;
2022 nsegs = 1;
2023 break;
c56c65fb 2024
2025 /* Registration using frmr registration */
2026 case RPCRDMA_FRMR:
2027 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
2028 break;
2029
8d4ba034 2030 /* Registration using fmr memory registration */
c56c65fb 2031 case RPCRDMA_MTHCAFMR:
8d4ba034 2032 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
2033 break;
2034
c56c65fb 2035 default:
92b98361 2036 return -EIO;
2037 }
2038 if (rc)
92b98361 2039 return rc;
2040
2041 return nsegs;
2042}
2043
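/*
 * Release a registration made by rpcrdma_register_external(). Returns
 * the number of segments the registration covered so the caller can
 * advance past them.
 */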
2044int
2045rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
13c9ff8f 2046 struct rpcrdma_xprt *r_xprt)
2047{
2048 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2049 int nsegs = seg->mr_nsegs, rc;
2050
2051 switch (ia->ri_memreg_strategy) {
2052
c56c65fb 2053 case RPCRDMA_ALLPHYSICAL:
73806c88 2054 read_lock(&ia->ri_qplock);
c56c65fb 2055 rpcrdma_unmap_one(ia, seg);
73806c88 2056 read_unlock(&ia->ri_qplock);
c56c65fb 2057 break;
c56c65fb 2058
2059 case RPCRDMA_FRMR:
2060 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
2061 break;
2062
c56c65fb 2063 case RPCRDMA_MTHCAFMR:
8d4ba034 2064 rc = rpcrdma_deregister_fmr_external(seg, ia);
2065 break;
2066
c56c65fb 2067 default:
2068 break;
2069 }
2070 return nsegs;
2071}
2072
2073/*
2074 * Prepost any receive buffer, then post send.
2075 *
2076 * Receive buffer is donated to hardware, reclaimed upon recv completion.
2077 */
2078int
2079rpcrdma_ep_post(struct rpcrdma_ia *ia,
2080 struct rpcrdma_ep *ep,
2081 struct rpcrdma_req *req)
2082{
2083 struct ib_send_wr send_wr, *send_wr_fail;
2084 struct rpcrdma_rep *rep = req->rl_reply;
2085 int rc;
2086
2087 if (rep) {
2088 rc = rpcrdma_ep_post_recv(ia, ep, rep);
2089 if (rc)
2090 goto out;
2091 req->rl_reply = NULL;
2092 }
2093
2094 send_wr.next = NULL;
2095 send_wr.wr_id = 0ULL; /* no send cookie */
2096 send_wr.sg_list = req->rl_send_iov;
2097 send_wr.num_sge = req->rl_niovs;
2098 send_wr.opcode = IB_WR_SEND;
2099	if (send_wr.num_sge == 4)	/* the pad is constant, no sync needed; sync the extra iov */
2100 ib_dma_sync_single_for_device(ia->ri_id->device,
2101 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
2102 DMA_TO_DEVICE);
2103 ib_dma_sync_single_for_device(ia->ri_id->device,
2104 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
2105 DMA_TO_DEVICE);
2106 ib_dma_sync_single_for_device(ia->ri_id->device,
2107 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
2108 DMA_TO_DEVICE);
2109
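	/* Sends are posted unsignaled until the completion counter
	 * runs out; then one signaled send is posted so the provider
	 * can reap completed WRs from the send queue.
	 */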
2110 if (DECR_CQCOUNT(ep) > 0)
2111 send_wr.send_flags = 0;
2112 else { /* Provider must take a send completion every now and then */
2113 INIT_CQCOUNT(ep);
2114 send_wr.send_flags = IB_SEND_SIGNALED;
2115 }
2116
2117 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
2118 if (rc)
2119 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
2120 rc);
2121out:
2122 return rc;
2123}
2124
2125/*
2126 * (Re)post a receive buffer.
2127 */
2128int
2129rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
2130 struct rpcrdma_ep *ep,
2131 struct rpcrdma_rep *rep)
2132{
2133 struct ib_recv_wr recv_wr, *recv_wr_fail;
2134 int rc;
2135
2136 recv_wr.next = NULL;
2137 recv_wr.wr_id = (u64) (unsigned long) rep;
2138 recv_wr.sg_list = &rep->rr_iov;
2139 recv_wr.num_sge = 1;
2140
2141 ib_dma_sync_single_for_cpu(ia->ri_id->device,
2142 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
2143
2144 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
2145
2146 if (rc)
2147 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
2148 rc);
2149 return rc;
2150}
2151
2152/* Physical mapping means one Read/Write list entry per page.
2153 * All list entries must fit within an inline buffer.
2154 *
2155 * NB: The server must return a Write list for NFS READ,
2156 * which has the same constraint. Factor in the inline
2157 * rsize as well.
2158 */
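/*
 * A rough example, assuming the default 1024-byte inline buffers, a
 * 28-byte minimal RPC/RDMA header, and 16-byte list segments (values
 * not re-derived here): (1024 - 28) / 16 = 62 list entries, or about
 * 248 KB of payload with 4 KB pages.
 */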
2159static size_t
2160rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
2161{
2162 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
2163 unsigned int inline_size, pages;
2164
2165 inline_size = min_t(unsigned int,
2166 cdata->inline_wsize, cdata->inline_rsize);
2167 inline_size -= RPCRDMA_HDRLEN_MIN;
2168 pages = inline_size / sizeof(struct rpcrdma_segment);
2169 return pages << PAGE_SHIFT;
2170}
2171
2172static size_t
2173rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
2174{
2175 return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
2176}
2177
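/*
 * Return the largest RPC payload this transport can move in a single
 * RPC, based on the memory registration strategy in use.
 */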
2178size_t
2179rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
2180{
2181 size_t result;
2182
2183 switch (r_xprt->rx_ia.ri_memreg_strategy) {
2184 case RPCRDMA_ALLPHYSICAL:
2185 result = rpcrdma_physical_max_payload(r_xprt);
2186 break;
2187 default:
2188 result = rpcrdma_mr_max_payload(r_xprt);
2189 }
2190 return result;
2191}