xprtrdma: Protect ia->ri_id when unmapping/invalidating MRs
net/sunrpc/xprtrdma/verbs.c (linux-2.6-block.git)
1 /*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 /*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50 #include <linux/interrupt.h>
51 #include <linux/slab.h>
52 #include <asm/bitops.h>
53
54 #include "xprt_rdma.h"
55
56 /*
57 * Globals/Macros
58 */
59
60#ifdef RPC_DEBUG
61# define RPCDBG_FACILITY RPCDBG_TRANS
62#endif
63
64/*
65 * internal functions
66 */
67
68/*
69 * handle replies in tasklet context, using a single, global list
70 * rdma tasklet function -- just turn around and call the func
71 * for all replies on the list
72 */
73
74static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
75static LIST_HEAD(rpcrdma_tasklets_g);
76
77static void
78rpcrdma_run_tasklet(unsigned long data)
79{
80 struct rpcrdma_rep *rep;
81 void (*func)(struct rpcrdma_rep *);
82 unsigned long flags;
83
84 data = data;
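 /* The tasklet argument is unused here; the self-assignment simply marks it as referenced (presumably to quiet compiler warnings). */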
85 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
86 while (!list_empty(&rpcrdma_tasklets_g)) {
87 rep = list_entry(rpcrdma_tasklets_g.next,
88 struct rpcrdma_rep, rr_list);
89 list_del(&rep->rr_list);
90 func = rep->rr_func;
91 rep->rr_func = NULL;
92 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
93
94 if (func)
95 func(rep);
96 else
97 rpcrdma_recv_buffer_put(rep);
98
99 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
100 }
101 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
102}
103
104static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
105
106static inline void
107rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
108{
109 unsigned long flags;
110
111 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
112 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
113 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
114 tasklet_schedule(&rpcrdma_tasklet_g);
115}
116
117static void
118rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
119{
120 struct rpcrdma_ep *ep = context;
121
122 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
123 __func__, event->event, event->device->name, context);
124 if (ep->rep_connected == 1) {
125 ep->rep_connected = -EIO;
126 ep->rep_func(ep);
127 wake_up_all(&ep->rep_connect_wait);
128 }
129}
130
131static void
132rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
133{
134 struct rpcrdma_ep *ep = context;
135
136 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
137 __func__, event->event, event->device->name, context);
138 if (ep->rep_connected == 1) {
139 ep->rep_connected = -EIO;
140 ep->rep_func(ep);
141 wake_up_all(&ep->rep_connect_wait);
142 }
143}
144
145static void
146rpcrdma_sendcq_process_wc(struct ib_wc *wc)
c56c65fb 147{
fc664485 148 struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
c56c65fb 149
150 dprintk("RPC: %s: frmr %p status %X opcode %d\n",
151 __func__, frmr, wc->status, wc->opcode);
c56c65fb 152
fc664485 153 if (wc->wr_id == 0ULL)
c56c65fb 154 return;
fc664485 155 if (wc->status != IB_WC_SUCCESS)
c56c65fb 156 return;
c56c65fb 157
fc664485 158 if (wc->opcode == IB_WC_FAST_REG_MR)
5c635e09 159 frmr->r.frmr.state = FRMR_IS_VALID;
fc664485 160 else if (wc->opcode == IB_WC_LOCAL_INV)
5c635e09 161 frmr->r.frmr.state = FRMR_IS_INVALID;
162}
163
fc664485 164static int
1c00dd07 165rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
c56c65fb 166{
1c00dd07 167 struct ib_wc *wcs;
8301a2c0 168 int budget, count, rc;
c56c65fb 169
8301a2c0 170 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
171 do {
172 wcs = ep->rep_send_wcs;
173
174 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
175 if (rc <= 0)
176 return rc;
177
178 count = rc;
179 while (count-- > 0)
180 rpcrdma_sendcq_process_wc(wcs++);
8301a2c0 181 } while (rc == RPCRDMA_POLLSIZE && --budget);
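 /* Stop when a partial batch comes back (the CQ is drained) or when the budget of roughly RPCRDMA_WC_BUDGET completions has been spent. */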
1c00dd07 182 return 0;
fc664485 183}
c56c65fb 184
185/*
186 * Handle send, fast_reg_mr, and local_inv completions.
187 *
188 * Send events are typically suppressed and thus do not result
189 * in an upcall. Occasionally one is signaled, however. This
190 * prevents the provider's completion queue from wrapping and
191 * losing a completion.
192 */
193static void
194rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
195{
1c00dd07 196 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
197 int rc;
198
1c00dd07 199 rc = rpcrdma_sendcq_poll(cq, ep);
200 if (rc) {
201 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
202 __func__, rc);
203 return;
204 }
205
206 rc = ib_req_notify_cq(cq,
207 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
208 if (rc == 0)
209 return;
210 if (rc < 0) {
211 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
212 __func__, rc);
213 return;
214 }
215
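 /* A positive return from ib_req_notify_cq() with IB_CQ_REPORT_MISSED_EVENTS means completions may have arrived while notification was disarmed, so poll once more before returning. */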
1c00dd07 216 rpcrdma_sendcq_poll(cq, ep);
217}
218
219static void
220rpcrdma_recvcq_process_wc(struct ib_wc *wc)
221{
222 struct rpcrdma_rep *rep =
223 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
224
225 dprintk("RPC: %s: rep %p status %X opcode %X length %u\n",
226 __func__, rep, wc->status, wc->opcode, wc->byte_len);
227
228 if (wc->status != IB_WC_SUCCESS) {
229 rep->rr_len = ~0U;
230 goto out_schedule;
231 }
232 if (wc->opcode != IB_WC_RECV)
233 return;
234
235 rep->rr_len = wc->byte_len;
236 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
237 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
238
239 if (rep->rr_len >= 16) {
240 struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
241 unsigned int credits = ntohl(p->rm_credit);
242
243 if (credits == 0)
244 credits = 1; /* don't deadlock */
245 else if (credits > rep->rr_buffer->rb_max_requests)
246 credits = rep->rr_buffer->rb_max_requests;
247 atomic_set(&rep->rr_buffer->rb_credits, credits);
248 }
249
250out_schedule:
251 rpcrdma_schedule_tasklet(rep);
252}
253
254static int
1c00dd07 255rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
fc664485 256{
1c00dd07 257 struct ib_wc *wcs;
8301a2c0 258 int budget, count, rc;
fc664485 259
8301a2c0 260 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
261 do {
262 wcs = ep->rep_recv_wcs;
263
264 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
265 if (rc <= 0)
266 return rc;
267
268 count = rc;
269 while (count-- > 0)
270 rpcrdma_recvcq_process_wc(wcs++);
8301a2c0 271 } while (rc == RPCRDMA_POLLSIZE && --budget);
1c00dd07 272 return 0;
273}
274
275/*
fc664485 276 * Handle receive completions.
c56c65fb 277 *
278 * It is reentrant but processes single events in order to maintain
279 * ordering of receives to keep server credits.
280 *
281 * It is the responsibility of the scheduled tasklet to return
282 * recv buffers to the pool. NOTE: this affects synchronization of
283 * connection shutdown. That is, the structures required for
284 * the completion of the reply handler must remain intact until
285 * all memory has been reclaimed.
286 */
287static void
fc664485 288rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
c56c65fb 289{
1c00dd07 290 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
291 int rc;
292
1c00dd07 293 rc = rpcrdma_recvcq_poll(cq, ep);
294 if (rc) {
295 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
296 __func__, rc);
c56c65fb 297 return;
fc664485 298 }
c56c65fb 299
300 rc = ib_req_notify_cq(cq,
301 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
302 if (rc == 0)
303 return;
304 if (rc < 0) {
fc664485 305 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
306 __func__, rc);
307 return;
308 }
309
1c00dd07 310 rpcrdma_recvcq_poll(cq, ep);
311}
312
313#ifdef RPC_DEBUG
314static const char * const conn[] = {
315 "address resolved",
316 "address error",
317 "route resolved",
318 "route error",
319 "connect request",
320 "connect response",
321 "connect error",
322 "unreachable",
323 "rejected",
324 "established",
325 "disconnected",
326 "device removal"
327};
328#endif
329
330static int
331rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
332{
333 struct rpcrdma_xprt *xprt = id->context;
334 struct rpcrdma_ia *ia = &xprt->rx_ia;
335 struct rpcrdma_ep *ep = &xprt->rx_ep;
ff0db049 336#ifdef RPC_DEBUG
c56c65fb 337 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
ff0db049 338#endif
339 struct ib_qp_attr attr;
340 struct ib_qp_init_attr iattr;
341 int connstate = 0;
342
343 switch (event->event) {
344 case RDMA_CM_EVENT_ADDR_RESOLVED:
345 case RDMA_CM_EVENT_ROUTE_RESOLVED:
5675add3 346 ia->ri_async_rc = 0;
347 complete(&ia->ri_done);
348 break;
349 case RDMA_CM_EVENT_ADDR_ERROR:
350 ia->ri_async_rc = -EHOSTUNREACH;
351 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
352 __func__, ep);
353 complete(&ia->ri_done);
354 break;
355 case RDMA_CM_EVENT_ROUTE_ERROR:
356 ia->ri_async_rc = -ENETUNREACH;
357 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
358 __func__, ep);
359 complete(&ia->ri_done);
360 break;
361 case RDMA_CM_EVENT_ESTABLISHED:
362 connstate = 1;
363 ib_query_qp(ia->ri_id->qp, &attr,
364 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
365 &iattr);
366 dprintk("RPC: %s: %d responder resources"
367 " (%d initiator)\n",
368 __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
369 goto connected;
370 case RDMA_CM_EVENT_CONNECT_ERROR:
371 connstate = -ENOTCONN;
372 goto connected;
373 case RDMA_CM_EVENT_UNREACHABLE:
374 connstate = -ENETDOWN;
375 goto connected;
376 case RDMA_CM_EVENT_REJECTED:
377 connstate = -ECONNREFUSED;
378 goto connected;
379 case RDMA_CM_EVENT_DISCONNECTED:
380 connstate = -ECONNABORTED;
381 goto connected;
382 case RDMA_CM_EVENT_DEVICE_REMOVAL:
383 connstate = -ENODEV;
384connected:
21454aaa 385 dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
386 __func__,
387 (event->event <= 11) ? conn[event->event] :
388 "unknown connection error",
21454aaa 389 &addr->sin_addr.s_addr,
390 ntohs(addr->sin_port),
391 ep, event->event);
392 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
393 dprintk("RPC: %s: %sconnected\n",
394 __func__, connstate > 0 ? "" : "dis");
395 ep->rep_connected = connstate;
396 ep->rep_func(ep);
397 wake_up_all(&ep->rep_connect_wait);
398 break;
399 default:
1a954051 400 dprintk("RPC: %s: unexpected CM event %d\n",
c56c65fb 401 __func__, event->event);
402 break;
403 }
404
405#ifdef RPC_DEBUG
406 if (connstate == 1) {
407 int ird = attr.max_dest_rd_atomic;
408 int tird = ep->rep_remote_cma.responder_resources;
21454aaa 409 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
b3cd8d45 410 "on %s, memreg %d slots %d ird %d%s\n",
21454aaa 411 &addr->sin_addr.s_addr,
412 ntohs(addr->sin_port),
413 ia->ri_id->device->name,
414 ia->ri_memreg_strategy,
415 xprt->rx_buf.rb_max_requests,
416 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
417 } else if (connstate < 0) {
418 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
419 &addr->sin_addr.s_addr,
420 ntohs(addr->sin_port),
421 connstate);
422 }
423#endif
424
425 return 0;
426}
427
428static struct rdma_cm_id *
429rpcrdma_create_id(struct rpcrdma_xprt *xprt,
430 struct rpcrdma_ia *ia, struct sockaddr *addr)
431{
432 struct rdma_cm_id *id;
433 int rc;
434
435 init_completion(&ia->ri_done);
436
b26f9b99 437 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
438 if (IS_ERR(id)) {
439 rc = PTR_ERR(id);
440 dprintk("RPC: %s: rdma_create_id() failed %i\n",
441 __func__, rc);
442 return id;
443 }
444
5675add3 445 ia->ri_async_rc = -ETIMEDOUT;
446 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
447 if (rc) {
448 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
449 __func__, rc);
450 goto out;
451 }
452 wait_for_completion_interruptible_timeout(&ia->ri_done,
453 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
454 rc = ia->ri_async_rc;
455 if (rc)
456 goto out;
457
5675add3 458 ia->ri_async_rc = -ETIMEDOUT;
459 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
460 if (rc) {
461 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
462 __func__, rc);
463 goto out;
464 }
465 wait_for_completion_interruptible_timeout(&ia->ri_done,
466 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
467 rc = ia->ri_async_rc;
468 if (rc)
469 goto out;
470
471 return id;
472
473out:
474 rdma_destroy_id(id);
475 return ERR_PTR(rc);
476}
477
478/*
479 * Drain any cq, prior to teardown.
480 */
481static void
482rpcrdma_clean_cq(struct ib_cq *cq)
483{
484 struct ib_wc wc;
485 int count = 0;
486
487 while (1 == ib_poll_cq(cq, 1, &wc))
488 ++count;
489
490 if (count)
491 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
492 __func__, count, wc.opcode);
493}
494
495/*
496 * Exported functions.
497 */
498
499/*
500 * Open and initialize an Interface Adapter.
501 * o initializes fields of struct rpcrdma_ia, including
502 * interface and provider attributes and protection zone.
503 */
504int
505rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
506{
507 int rc, mem_priv;
508 struct ib_device_attr devattr;
509 struct rpcrdma_ia *ia = &xprt->rx_ia;
510
511 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
512 if (IS_ERR(ia->ri_id)) {
513 rc = PTR_ERR(ia->ri_id);
514 goto out1;
515 }
516
517 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
518 if (IS_ERR(ia->ri_pd)) {
519 rc = PTR_ERR(ia->ri_pd);
520 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
521 __func__, rc);
522 goto out2;
523 }
524
525 /*
526 * Query the device to determine if the requested memory
527 * registration strategy is supported. If it isn't, set the
528 * strategy to a globally supported model.
529 */
530 rc = ib_query_device(ia->ri_id->device, &devattr);
531 if (rc) {
532 dprintk("RPC: %s: ib_query_device failed %d\n",
533 __func__, rc);
534 goto out2;
535 }
536
537 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
538 ia->ri_have_dma_lkey = 1;
539 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
540 }
541
f10eafd3 542 if (memreg == RPCRDMA_FRMR) {
543 /* Requires both frmr reg and local dma lkey */
544 if ((devattr.device_cap_flags &
545 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
546 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
3197d309 547 dprintk("RPC: %s: FRMR registration "
548 "not supported by HCA\n", __func__);
549 memreg = RPCRDMA_MTHCAFMR;
550 } else {
551 /* Mind the ia limit on FRMR page list depth */
552 ia->ri_max_frmr_depth = min_t(unsigned int,
553 RPCRDMA_MAX_DATA_SEGS,
554 devattr.max_fast_reg_page_list_len);
bd7ed1d1 555 }
556 }
557 if (memreg == RPCRDMA_MTHCAFMR) {
558 if (!ia->ri_id->device->alloc_fmr) {
559 dprintk("RPC: %s: MTHCAFMR registration "
560 "not supported by HCA\n", __func__);
561#if RPCRDMA_PERSISTENT_REGISTRATION
562 memreg = RPCRDMA_ALLPHYSICAL;
563#else
cdd9ade7 564 rc = -ENOMEM;
565 goto out2;
566#endif
567 }
568 }
569
570 /*
571 * Optionally obtain an underlying physical identity mapping in
572 * order to do a memory window-based bind. This base registration
573 * is protected from remote access - that is enabled only by binding
574 * for the specific bytes targeted during each RPC operation, and
575 * revoked after the corresponding completion similar to a storage
576 * adapter.
577 */
bd7ed1d1 578 switch (memreg) {
3197d309 579 case RPCRDMA_FRMR:
bd7ed1d1 580 break;
c56c65fb 581#if RPCRDMA_PERSISTENT_REGISTRATION
582 case RPCRDMA_ALLPHYSICAL:
583 mem_priv = IB_ACCESS_LOCAL_WRITE |
584 IB_ACCESS_REMOTE_WRITE |
585 IB_ACCESS_REMOTE_READ;
586 goto register_setup;
c56c65fb 587#endif
588 case RPCRDMA_MTHCAFMR:
589 if (ia->ri_have_dma_lkey)
c56c65fb 590 break;
bd7ed1d1 591 mem_priv = IB_ACCESS_LOCAL_WRITE;
b45ccfd2 592#if RPCRDMA_PERSISTENT_REGISTRATION
bd7ed1d1 593 register_setup:
b45ccfd2 594#endif
595 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
596 if (IS_ERR(ia->ri_bind_mem)) {
597 printk(KERN_ALERT "%s: ib_get_dma_mr for "
0ac531c1 598 "phys register failed with %lX\n",
c56c65fb 599 __func__, PTR_ERR(ia->ri_bind_mem));
600 rc = -ENOMEM;
601 goto out2;
c56c65fb 602 }
603 break;
604 default:
605 printk(KERN_ERR "RPC: Unsupported memory "
606 "registration mode: %d\n", memreg);
607 rc = -ENOMEM;
bd7ed1d1 608 goto out2;
c56c65fb 609 }
610 dprintk("RPC: %s: memory registration strategy is %d\n",
611 __func__, memreg);
612
613 /* Else will do memory reg/dereg for each chunk */
614 ia->ri_memreg_strategy = memreg;
615
73806c88 616 rwlock_init(&ia->ri_qplock);
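 /* ri_qplock orders QP replacement against users of ia->ri_id: rpcrdma_ep_connect() takes it for write when swapping in a new id/qp, and the unmap/invalidate paths take it for read (see rpcrdma_deregister_frmr_external() and rpcrdma_deregister_fmr_external() below). */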
617 return 0;
618out2:
619 rdma_destroy_id(ia->ri_id);
fee08caf 620 ia->ri_id = NULL;
621out1:
622 return rc;
623}
624
625/*
626 * Clean up/close an IA.
627 * o if event handles and PD have been initialized, free them.
628 * o close the IA
629 */
630void
631rpcrdma_ia_close(struct rpcrdma_ia *ia)
632{
633 int rc;
634
635 dprintk("RPC: %s: entering\n", __func__);
636 if (ia->ri_bind_mem != NULL) {
637 rc = ib_dereg_mr(ia->ri_bind_mem);
638 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
639 __func__, rc);
640 }
641 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
642 if (ia->ri_id->qp)
643 rdma_destroy_qp(ia->ri_id);
644 rdma_destroy_id(ia->ri_id);
645 ia->ri_id = NULL;
646 }
647 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
648 rc = ib_dealloc_pd(ia->ri_pd);
649 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
650 __func__, rc);
651 }
652}
653
654/*
655 * Create unconnected endpoint.
656 */
657int
658rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
659 struct rpcrdma_create_data_internal *cdata)
660{
661 struct ib_device_attr devattr;
fc664485 662 struct ib_cq *sendcq, *recvcq;
5d40a8a5 663 int rc, err;
664
665 rc = ib_query_device(ia->ri_id->device, &devattr);
666 if (rc) {
667 dprintk("RPC: %s: ib_query_device failed %d\n",
668 __func__, rc);
669 return rc;
670 }
671
672 /* check provider's send/recv wr limits */
673 if (cdata->max_requests > devattr.max_qp_wr)
674 cdata->max_requests = devattr.max_qp_wr;
675
676 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
677 ep->rep_attr.qp_context = ep;
678 /* send_cq and recv_cq initialized below */
679 ep->rep_attr.srq = NULL;
680 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
681 switch (ia->ri_memreg_strategy) {
682 case RPCRDMA_FRMR: {
683 int depth = 7;
684
685 /* Add room for frmr register and invalidate WRs.
686 * 1. FRMR reg WR for head
687 * 2. FRMR invalidate WR for head
688 * 3. N FRMR reg WRs for pagelist
689 * 4. N FRMR invalidate WRs for pagelist
690 * 5. FRMR reg WR for tail
691 * 6. FRMR invalidate WR for tail
692 * 7. The RDMA_SEND WR
693 */
694
695 /* Calculate N if the device max FRMR depth is smaller than
696 * RPCRDMA_MAX_DATA_SEGS.
697 */
698 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
699 int delta = RPCRDMA_MAX_DATA_SEGS -
700 ia->ri_max_frmr_depth;
701
702 do {
703 depth += 2; /* FRMR reg + invalidate */
704 delta -= ia->ri_max_frmr_depth;
705 } while (delta > 0);
706
707 }
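 /* Worked example with hypothetical numbers: if RPCRDMA_MAX_DATA_SEGS were 64 and ri_max_frmr_depth 32, delta would start at 32, the loop would run once, and depth would grow from 7 to 9. */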
708 ep->rep_attr.cap.max_send_wr *= depth;
15cdc644 709 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
0fc6c4e7 710 cdata->max_requests = devattr.max_qp_wr / depth;
711 if (!cdata->max_requests)
712 return -EINVAL;
713 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
714 depth;
15cdc644 715 }
3197d309 716 break;
0fc6c4e7 717 }
718 default:
719 break;
720 }
721 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
722 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
723 ep->rep_attr.cap.max_recv_sge = 1;
724 ep->rep_attr.cap.max_inline_data = 0;
725 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
726 ep->rep_attr.qp_type = IB_QPT_RC;
727 ep->rep_attr.port_num = ~0;
728
729 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
730 "iovs: send %d recv %d\n",
731 __func__,
732 ep->rep_attr.cap.max_send_wr,
733 ep->rep_attr.cap.max_recv_wr,
734 ep->rep_attr.cap.max_send_sge,
735 ep->rep_attr.cap.max_recv_sge);
736
737 /* set trigger for requesting send completion */
fc664485 738 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
739 if (ep->rep_cqinit <= 2)
740 ep->rep_cqinit = 0;
741 INIT_CQCOUNT(ep);
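 /* rpcrdma_ep_post() decrements this count for each send and sets IB_SEND_SIGNALED only when it reaches zero, so roughly one send in every rep_cqinit is signaled -- enough to keep the send CQ from wrapping. */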
742 ep->rep_ia = ia;
743 init_waitqueue_head(&ep->rep_connect_wait);
254f91e2 744 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
c56c65fb 745
fc664485 746 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
1c00dd07 747 rpcrdma_cq_async_error_upcall, ep,
c56c65fb 748 ep->rep_attr.cap.max_send_wr + 1, 0);
749 if (IS_ERR(sendcq)) {
750 rc = PTR_ERR(sendcq);
751 dprintk("RPC: %s: failed to create send CQ: %i\n",
752 __func__, rc);
753 goto out1;
754 }
755
fc664485 756 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
757 if (rc) {
758 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
759 __func__, rc);
760 goto out2;
761 }
762
fc664485 763 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
1c00dd07 764 rpcrdma_cq_async_error_upcall, ep,
765 ep->rep_attr.cap.max_recv_wr + 1, 0);
766 if (IS_ERR(recvcq)) {
767 rc = PTR_ERR(recvcq);
768 dprintk("RPC: %s: failed to create recv CQ: %i\n",
769 __func__, rc);
770 goto out2;
771 }
772
773 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
774 if (rc) {
775 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
776 __func__, rc);
777 ib_destroy_cq(recvcq);
778 goto out2;
779 }
780
781 ep->rep_attr.send_cq = sendcq;
782 ep->rep_attr.recv_cq = recvcq;
783
784 /* Initialize cma parameters */
785
786 /* RPC/RDMA does not use private data */
787 ep->rep_remote_cma.private_data = NULL;
788 ep->rep_remote_cma.private_data_len = 0;
789
790 /* Client offers RDMA Read but does not initiate */
b334eaab 791 ep->rep_remote_cma.initiator_depth = 0;
03ff8821 792 if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
793 ep->rep_remote_cma.responder_resources = 32;
794 else
c56c65fb 795 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
796
797 ep->rep_remote_cma.retry_count = 7;
798 ep->rep_remote_cma.flow_control = 0;
799 ep->rep_remote_cma.rnr_retry_count = 0;
800
801 return 0;
802
803out2:
fc664485 804 err = ib_destroy_cq(sendcq);
805 if (err)
806 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
807 __func__, err);
808out1:
809 return rc;
810}
811
812/*
813 * rpcrdma_ep_destroy
814 *
815 * Disconnect and destroy endpoint. After this, the only
816 * valid operations on the ep are to free it (if dynamically
817 * allocated) or re-create it.
c56c65fb 818 */
7f1d5419 819void
820rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
821{
822 int rc;
823
824 dprintk("RPC: %s: entering, connected is %d\n",
825 __func__, ep->rep_connected);
826
827 cancel_delayed_work_sync(&ep->rep_connect_worker);
828
829 if (ia->ri_id->qp) {
830 rc = rpcrdma_ep_disconnect(ep, ia);
831 if (rc)
832 dprintk("RPC: %s: rpcrdma_ep_disconnect"
833 " returned %i\n", __func__, rc);
834 rdma_destroy_qp(ia->ri_id);
835 ia->ri_id->qp = NULL;
836 }
837
838 /* padding - could be done in rpcrdma_buffer_destroy... */
839 if (ep->rep_pad_mr) {
840 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
841 ep->rep_pad_mr = NULL;
842 }
843
844 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
845 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
846 if (rc)
847 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
848 __func__, rc);
849
850 rpcrdma_clean_cq(ep->rep_attr.send_cq);
851 rc = ib_destroy_cq(ep->rep_attr.send_cq);
852 if (rc)
853 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
854 __func__, rc);
855}
856
857/*
858 * Connect unconnected endpoint.
859 */
860int
861rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
862{
73806c88 863 struct rdma_cm_id *id, *old;
864 int rc = 0;
865 int retry_count = 0;
c56c65fb 866
c055551e 867 if (ep->rep_connected != 0) {
868 struct rpcrdma_xprt *xprt;
869retry:
ec62f40d 870 dprintk("RPC: %s: reconnecting...\n", __func__);
871 rc = rpcrdma_ep_disconnect(ep, ia);
872 if (rc && rc != -ENOTCONN)
873 dprintk("RPC: %s: rpcrdma_ep_disconnect"
874 " status %i\n", __func__, rc);
875
876 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
877 rpcrdma_clean_cq(ep->rep_attr.send_cq);
878
879 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
880 id = rpcrdma_create_id(xprt, ia,
881 (struct sockaddr *)&xprt->rx_data.addr);
882 if (IS_ERR(id)) {
ec62f40d 883 rc = -EHOSTUNREACH;
884 goto out;
885 }
886 /* TEMP TEMP TEMP - fail if new device:
887 * Deregister/remarshal *all* requests!
888 * Close and recreate adapter, pd, etc!
889 * Re-determine all attributes still sane!
890 * More stuff I haven't thought of!
891 * Rrrgh!
892 */
893 if (ia->ri_id->device != id->device) {
894 printk("RPC: %s: can't reconnect on "
895 "different device!\n", __func__);
896 rdma_destroy_id(id);
ec62f40d 897 rc = -ENETUNREACH;
898 goto out;
899 }
900 /* END TEMP */
901 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
902 if (rc) {
903 dprintk("RPC: %s: rdma_create_qp failed %i\n",
904 __func__, rc);
905 rdma_destroy_id(id);
906 rc = -ENETUNREACH;
907 goto out;
908 }
909
910 write_lock(&ia->ri_qplock);
911 old = ia->ri_id;
c56c65fb 912 ia->ri_id = id;
913 write_unlock(&ia->ri_qplock);
914
915 rdma_destroy_qp(old);
916 rdma_destroy_id(old);
917 } else {
918 dprintk("RPC: %s: connecting...\n", __func__);
919 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
920 if (rc) {
921 dprintk("RPC: %s: rdma_create_qp failed %i\n",
922 __func__, rc);
923 /* do not update ep->rep_connected */
924 return -ENETUNREACH;
925 }
926 }
927
928 ep->rep_connected = 0;
929
930 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
931 if (rc) {
932 dprintk("RPC: %s: rdma_connect() failed with %i\n",
933 __func__, rc);
934 goto out;
935 }
936
937 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
938
939 /*
940 * Check state. A non-peer reject indicates no listener
941 * (ECONNREFUSED), which may be a transient state. All
942 * others indicate a transport condition which has already
943 * undergone a best-effort.
944 */
945 if (ep->rep_connected == -ECONNREFUSED &&
946 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
947 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
948 goto retry;
949 }
950 if (ep->rep_connected <= 0) {
951 /* Sometimes, the only way to reliably connect to remote
952 * CMs is to use same nonzero values for ORD and IRD. */
953 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
954 (ep->rep_remote_cma.responder_resources == 0 ||
955 ep->rep_remote_cma.initiator_depth !=
956 ep->rep_remote_cma.responder_resources)) {
957 if (ep->rep_remote_cma.responder_resources == 0)
958 ep->rep_remote_cma.responder_resources = 1;
959 ep->rep_remote_cma.initiator_depth =
960 ep->rep_remote_cma.responder_resources;
c56c65fb 961 goto retry;
b334eaab 962 }
963 rc = ep->rep_connected;
964 } else {
965 dprintk("RPC: %s: connected\n", __func__);
966 }
967
968out:
969 if (rc)
970 ep->rep_connected = rc;
971 return rc;
972}
973
974/*
975 * rpcrdma_ep_disconnect
976 *
977 * This is separate from destroy to facilitate the ability
978 * to reconnect without recreating the endpoint.
979 *
980 * This call is not reentrant, and must not be made in parallel
981 * on the same endpoint.
982 */
983int
984rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
985{
986 int rc;
987
988 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
989 rpcrdma_clean_cq(ep->rep_attr.send_cq);
990 rc = rdma_disconnect(ia->ri_id);
991 if (!rc) {
992 /* returns without wait if not connected */
993 wait_event_interruptible(ep->rep_connect_wait,
994 ep->rep_connected != 1);
995 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
996 (ep->rep_connected == 1) ? "still " : "dis");
997 } else {
998 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
999 ep->rep_connected = rc;
1000 }
1001 return rc;
1002}
1003
1004/*
1005 * Initialize buffer memory
1006 */
1007int
1008rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1009 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
1010{
1011 char *p;
65866f82 1012 size_t len, rlen, wlen;
c56c65fb 1013 int i, rc;
8d4ba034 1014 struct rpcrdma_mw *r;
1015
1016 buf->rb_max_requests = cdata->max_requests;
1017 spin_lock_init(&buf->rb_lock);
1018 atomic_set(&buf->rb_credits, 1);
1019
1020 /* Need to allocate:
1021 * 1. arrays for send and recv pointers
1022 * 2. arrays of struct rpcrdma_req to fill in pointers
1023 * 3. array of struct rpcrdma_rep for replies
1024 * 4. padding, if any
3197d309 1025 * 5. mw's, fmr's or frmr's, if any
1026 * Send/recv buffers in req/rep need to be registered
1027 */
1028
1029 len = buf->rb_max_requests *
1030 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1031 len += cdata->padding;
1032 switch (ia->ri_memreg_strategy) {
1033 case RPCRDMA_FRMR:
1034 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
1035 sizeof(struct rpcrdma_mw);
1036 break;
1037 case RPCRDMA_MTHCAFMR:
1038 /* TBD we are perhaps overallocating here */
1039 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
1040 sizeof(struct rpcrdma_mw);
1041 break;
1042 default:
1043 break;
1044 }
1045
1046 /* allocate 1, 4 and 5 in one shot */
1047 p = kzalloc(len, GFP_KERNEL);
1048 if (p == NULL) {
1049 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1050 __func__, len);
1051 rc = -ENOMEM;
1052 goto out;
1053 }
1054 buf->rb_pool = p; /* for freeing it later */
1055
1056 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1057 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1058 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1059 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1060
1061 /*
1062 * Register the zeroed pad buffer, if any.
1063 */
1064 if (cdata->padding) {
1065 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1066 &ep->rep_pad_mr, &ep->rep_pad);
1067 if (rc)
1068 goto out;
1069 }
1070 p += cdata->padding;
1071
c56c65fb 1072 INIT_LIST_HEAD(&buf->rb_mws);
8d4ba034 1073 r = (struct rpcrdma_mw *)p;
c56c65fb 1074 switch (ia->ri_memreg_strategy) {
1075 case RPCRDMA_FRMR:
1076 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1077 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
0fc6c4e7 1078 ia->ri_max_frmr_depth);
1079 if (IS_ERR(r->r.frmr.fr_mr)) {
1080 rc = PTR_ERR(r->r.frmr.fr_mr);
1081 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1082 " failed %i\n", __func__, rc);
1083 goto out;
1084 }
1085 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1086 ia->ri_id->device,
1087 ia->ri_max_frmr_depth);
1088 if (IS_ERR(r->r.frmr.fr_pgl)) {
1089 rc = PTR_ERR(r->r.frmr.fr_pgl);
1090 dprintk("RPC: %s: "
1091 "ib_alloc_fast_reg_page_list "
1092 "failed %i\n", __func__, rc);
1093
1094 ib_dereg_mr(r->r.frmr.fr_mr);
1095 goto out;
1096 }
1097 list_add(&r->mw_list, &buf->rb_mws);
1098 ++r;
1099 }
1100 break;
c56c65fb 1101 case RPCRDMA_MTHCAFMR:
1102 /* TBD we are perhaps overallocating here */
1103 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1104 static struct ib_fmr_attr fa =
1105 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
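 /* ib_fmr_attr fields, in order: max_pages, max_maps, page_shift. */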
1106 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1107 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1108 &fa);
1109 if (IS_ERR(r->r.fmr)) {
1110 rc = PTR_ERR(r->r.fmr);
1111 dprintk("RPC: %s: ib_alloc_fmr"
1112 " failed %i\n", __func__, rc);
1113 goto out;
1114 }
1115 list_add(&r->mw_list, &buf->rb_mws);
1116 ++r;
1117 }
c56c65fb 1118 break;
1119 default:
1120 break;
1121 }
1122
1123 /*
1124 * Allocate/init the request/reply buffers. Doing this
1125 * using kmalloc for now -- one for each buf.
1126 */
1127 wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
1128 rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
1129 dprintk("RPC: %s: wlen = %zu, rlen = %zu\n",
1130 __func__, wlen, rlen);
1131
1132 for (i = 0; i < buf->rb_max_requests; i++) {
1133 struct rpcrdma_req *req;
1134 struct rpcrdma_rep *rep;
1135
65866f82 1136 req = kmalloc(wlen, GFP_KERNEL);
1137 if (req == NULL) {
1138 dprintk("RPC: %s: request buffer %d alloc"
1139 " failed\n", __func__, i);
1140 rc = -ENOMEM;
1141 goto out;
1142 }
1143 memset(req, 0, sizeof(struct rpcrdma_req));
1144 buf->rb_send_bufs[i] = req;
1145 buf->rb_send_bufs[i]->rl_buffer = buf;
1146
1147 rc = rpcrdma_register_internal(ia, req->rl_base,
65866f82 1148 wlen - offsetof(struct rpcrdma_req, rl_base),
1149 &buf->rb_send_bufs[i]->rl_handle,
1150 &buf->rb_send_bufs[i]->rl_iov);
1151 if (rc)
1152 goto out;
1153
1154 buf->rb_send_bufs[i]->rl_size = wlen -
1155 sizeof(struct rpcrdma_req);
c56c65fb 1156
65866f82 1157 rep = kmalloc(rlen, GFP_KERNEL);
1158 if (rep == NULL) {
1159 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1160 __func__, i);
1161 rc = -ENOMEM;
1162 goto out;
1163 }
1164 memset(rep, 0, sizeof(struct rpcrdma_rep));
1165 buf->rb_recv_bufs[i] = rep;
1166 buf->rb_recv_bufs[i]->rr_buffer = buf;
1167
1168 rc = rpcrdma_register_internal(ia, rep->rr_base,
65866f82 1169 rlen - offsetof(struct rpcrdma_rep, rr_base),
1170 &buf->rb_recv_bufs[i]->rr_handle,
1171 &buf->rb_recv_bufs[i]->rr_iov);
1172 if (rc)
1173 goto out;
1174
1175 }
1176 dprintk("RPC: %s: max_requests %d\n",
1177 __func__, buf->rb_max_requests);
1178 /* done */
1179 return 0;
1180out:
1181 rpcrdma_buffer_destroy(buf);
1182 return rc;
1183}
1184
1185/*
1186 * Unregister and destroy buffer memory. Need to deal with
1187 * partial initialization, so it's callable from failed create.
1188 * Must be called before destroying endpoint, as registrations
1189 * reference it.
1190 */
1191void
1192rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1193{
1194 int rc, i;
1195 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
8d4ba034 1196 struct rpcrdma_mw *r;
1197
1198 /* clean up in reverse order from create
1199 * 1. recv mr memory (mr free, then kfree)
1200 * 2. send mr memory (mr free, then kfree)
1201 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1202 * 4. arrays
1203 */
1204 dprintk("RPC: %s: entering\n", __func__);
1205
1206 for (i = 0; i < buf->rb_max_requests; i++) {
1207 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1208 rpcrdma_deregister_internal(ia,
1209 buf->rb_recv_bufs[i]->rr_handle,
1210 &buf->rb_recv_bufs[i]->rr_iov);
1211 kfree(buf->rb_recv_bufs[i]);
1212 }
1213 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1214 rpcrdma_deregister_internal(ia,
1215 buf->rb_send_bufs[i]->rl_handle,
1216 &buf->rb_send_bufs[i]->rl_iov);
1217 kfree(buf->rb_send_bufs[i]);
1218 }
1219 }
1220
1221 while (!list_empty(&buf->rb_mws)) {
1222 r = list_entry(buf->rb_mws.next,
1223 struct rpcrdma_mw, mw_list);
1224 list_del(&r->mw_list);
1225 switch (ia->ri_memreg_strategy) {
1226 case RPCRDMA_FRMR:
1227 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1228 if (rc)
1229 dprintk("RPC: %s:"
1230 " ib_dereg_mr"
1231 " failed %i\n",
1232 __func__, rc);
1233 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1234 break;
1235 case RPCRDMA_MTHCAFMR:
1236 rc = ib_dealloc_fmr(r->r.fmr);
1237 if (rc)
1238 dprintk("RPC: %s:"
1239 " ib_dealloc_fmr"
1240 " failed %i\n",
1241 __func__, rc);
1242 break;
1243 default:
1244 break;
1245 }
1246 }
1247
1248 kfree(buf->rb_pool);
1249}
1250
1251/*
1252 * Get a set of request/reply buffers.
1253 *
1254 * Reply buffer (if needed) is attached to send buffer upon return.
1255 * Rule:
1256 * rb_send_index and rb_recv_index MUST always be pointing to the
1257 * *next* available buffer (non-NULL). They are incremented after
1258 * removing buffers, and decremented *before* returning them.
1259 */
1260struct rpcrdma_req *
1261rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1262{
1263 struct rpcrdma_req *req;
1264 unsigned long flags;
1265 int i;
1266 struct rpcrdma_mw *r;
1267
1268 spin_lock_irqsave(&buffers->rb_lock, flags);
1269 if (buffers->rb_send_index == buffers->rb_max_requests) {
1270 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1271 dprintk("RPC: %s: out of request buffers\n", __func__);
1272 return ((struct rpcrdma_req *)NULL);
1273 }
1274
1275 req = buffers->rb_send_bufs[buffers->rb_send_index];
1276 if (buffers->rb_send_index < buffers->rb_recv_index) {
1277 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1278 __func__,
1279 buffers->rb_recv_index - buffers->rb_send_index);
1280 req->rl_reply = NULL;
1281 } else {
1282 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1283 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1284 }
1285 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1286 if (!list_empty(&buffers->rb_mws)) {
8d4ba034 1287 i = RPCRDMA_MAX_SEGS - 1;
c56c65fb 1288 do {
1289 r = list_entry(buffers->rb_mws.next,
1290 struct rpcrdma_mw, mw_list);
1291 list_del(&r->mw_list);
1292 req->rl_segments[i].mr_chunk.rl_mw = r;
1293 } while (--i >= 0);
1294 }
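 /* When MWs are in use (FRMR or FMR modes), each request leaves here with one rpcrdma_mw parked in every rl_segments slot, filled from the top index down, ready for later chunk registration. */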
1295 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1296 return req;
1297}
1298
1299/*
1300 * Put request/reply buffers back into pool.
1301 * Pre-decrement counter/array index.
1302 */
1303void
1304rpcrdma_buffer_put(struct rpcrdma_req *req)
1305{
1306 struct rpcrdma_buffer *buffers = req->rl_buffer;
1307 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1308 int i;
1309 unsigned long flags;
1310
1311 spin_lock_irqsave(&buffers->rb_lock, flags);
1312 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1313 req->rl_niovs = 0;
1314 if (req->rl_reply) {
1315 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1316 req->rl_reply->rr_func = NULL;
1317 req->rl_reply = NULL;
1318 }
1319 switch (ia->ri_memreg_strategy) {
3197d309 1320 case RPCRDMA_FRMR:
c56c65fb 1321 case RPCRDMA_MTHCAFMR:
1322 /*
1323 * Cycle mw's back in reverse order, and "spin" them.
1324 * This delays and scrambles reuse as much as possible.
1325 */
1326 i = 1;
1327 do {
1328 struct rpcrdma_mw **mw;
1329 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1330 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1331 *mw = NULL;
1332 } while (++i < RPCRDMA_MAX_SEGS);
1333 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1334 &buffers->rb_mws);
1335 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1336 break;
1337 default:
1338 break;
1339 }
1340 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1341}
1342
1343/*
1344 * Recover reply buffers from pool.
1345 * This happens when recovering from error conditions.
1346 * Post-increment counter/array index.
1347 */
1348void
1349rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1350{
1351 struct rpcrdma_buffer *buffers = req->rl_buffer;
1352 unsigned long flags;
1353
1354 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1355 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1356 spin_lock_irqsave(&buffers->rb_lock, flags);
1357 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1358 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1359 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1360 }
1361 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1362}
1363
1364/*
1365 * Put reply buffers back into pool when not attached to
b45ccfd2 1366 * request. This happens in error conditions.
1367 */
1368void
1369rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1370{
1371 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1372 unsigned long flags;
1373
1374 rep->rr_func = NULL;
1375 spin_lock_irqsave(&buffers->rb_lock, flags);
1376 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1377 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1378}
1379
1380/*
1381 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1382 */
1383
1384int
1385rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1386 struct ib_mr **mrp, struct ib_sge *iov)
1387{
1388 struct ib_phys_buf ipb;
1389 struct ib_mr *mr;
1390 int rc;
1391
1392 /*
1393 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1394 */
1395 iov->addr = ib_dma_map_single(ia->ri_id->device,
1396 va, len, DMA_BIDIRECTIONAL);
1397 if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
1398 return -ENOMEM;
1399
1400 iov->length = len;
1401
1402 if (ia->ri_have_dma_lkey) {
1403 *mrp = NULL;
1404 iov->lkey = ia->ri_dma_lkey;
1405 return 0;
1406 } else if (ia->ri_bind_mem != NULL) {
1407 *mrp = NULL;
1408 iov->lkey = ia->ri_bind_mem->lkey;
1409 return 0;
1410 }
1411
1412 ipb.addr = iov->addr;
1413 ipb.size = iov->length;
1414 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1415 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1416
1417 dprintk("RPC: %s: phys convert: 0x%llx "
1418 "registered 0x%llx length %d\n",
1419 __func__, (unsigned long long)ipb.addr,
1420 (unsigned long long)iov->addr, len);
1421
1422 if (IS_ERR(mr)) {
1423 *mrp = NULL;
1424 rc = PTR_ERR(mr);
1425 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1426 } else {
1427 *mrp = mr;
1428 iov->lkey = mr->lkey;
1429 rc = 0;
1430 }
1431
1432 return rc;
1433}
1434
1435int
1436rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1437 struct ib_mr *mr, struct ib_sge *iov)
1438{
1439 int rc;
1440
1441 ib_dma_unmap_single(ia->ri_id->device,
1442 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1443
1444 if (NULL == mr)
1445 return 0;
1446
1447 rc = ib_dereg_mr(mr);
1448 if (rc)
1449 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1450 return rc;
1451}
1452
1453/*
1454 * Wrappers for chunk registration, shared by read/write chunk code.
1455 */
1456
1457static void
1458rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1459{
1460 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1461 seg->mr_dmalen = seg->mr_len;
1462 if (seg->mr_page)
1463 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1464 seg->mr_page, offset_in_page(seg->mr_offset),
1465 seg->mr_dmalen, seg->mr_dir);
1466 else
1467 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1468 seg->mr_offset,
1469 seg->mr_dmalen, seg->mr_dir);
1470 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1471 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1472 __func__,
1473 (unsigned long long)seg->mr_dma,
1474 seg->mr_offset, seg->mr_dmalen);
5c635e09 1475 }
1476}
1477
1478static void
1479rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1480{
1481 if (seg->mr_page)
1482 ib_dma_unmap_page(ia->ri_id->device,
1483 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1484 else
1485 ib_dma_unmap_single(ia->ri_id->device,
1486 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1487}
1488
1489static int
1490rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1491 int *nsegs, int writing, struct rpcrdma_ia *ia,
1492 struct rpcrdma_xprt *r_xprt)
1493{
1494 struct rpcrdma_mr_seg *seg1 = seg;
1495 struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
1496
1497 u8 key;
1498 int len, pageoff;
1499 int i, rc;
1500 int seg_len;
1501 u64 pa;
1502 int page_no;
1503
1504 pageoff = offset_in_page(seg1->mr_offset);
1505 seg1->mr_offset -= pageoff; /* start of page */
1506 seg1->mr_len += pageoff;
1507 len = -pageoff;
1508 if (*nsegs > ia->ri_max_frmr_depth)
1509 *nsegs = ia->ri_max_frmr_depth;
9b78145c 1510 for (page_no = i = 0; i < *nsegs;) {
3197d309 1511 rpcrdma_map_one(ia, seg, writing);
1512 pa = seg->mr_dma;
1513 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1514 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->
1515 page_list[page_no++] = pa;
1516 pa += PAGE_SIZE;
1517 }
1518 len += seg->mr_len;
1519 ++seg;
1520 ++i;
1521 /* Check for holes */
1522 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1523 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1524 break;
1525 }
1526 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1527 __func__, seg1->mr_chunk.rl_mw, i);
1528
1529 if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
1530 dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
1531 __func__,
1532 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
1533 /* Invalidate before using. */
1534 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1535 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1536 invalidate_wr.next = &frmr_wr;
1537 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1538 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1539 invalidate_wr.ex.invalidate_rkey =
1540 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1541 DECR_CQCOUNT(&r_xprt->rx_ep);
1542 post_wr = &invalidate_wr;
1543 } else
1544 post_wr = &frmr_wr;
1545
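 /* post_wr now heads either a { LOCAL_INV, FAST_REG } chain or a lone FAST_REG, posted with a single ib_post_send() below. */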
1546 /* Prepare FRMR WR */
1547 memset(&frmr_wr, 0, sizeof frmr_wr);
5c635e09 1548 frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
3197d309 1549 frmr_wr.opcode = IB_WR_FAST_REG_MR;
5c635e09 1550 frmr_wr.send_flags = IB_SEND_SIGNALED;
7a8b80eb 1551 frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
3197d309 1552 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
9b78145c 1553 frmr_wr.wr.fast_reg.page_list_len = page_no;
3197d309 1554 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
9b78145c 1555 frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
c977dea2 1556 if (frmr_wr.wr.fast_reg.length < len) {
1557 rc = -EIO;
1558 goto out_err;
1559 }
1560
1561 /* Bump the key */
1562 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1563 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
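 /* Bumping the low byte of the rkey gives this FAST_REG a key value distinct from the MR's previous registration. */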
1564
3197d309 1565 frmr_wr.wr.fast_reg.access_flags = (writing ?
1566 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1567 IB_ACCESS_REMOTE_READ);
1568 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1569 DECR_CQCOUNT(&r_xprt->rx_ep);
1570
5c635e09 1571 rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
1572
1573 if (rc) {
1574 dprintk("RPC: %s: failed ib_post_send for register,"
1575 " status %i\n", __func__, rc);
5fc83f47 1576 goto out_err;
1577 } else {
1578 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1579 seg1->mr_base = seg1->mr_dma + pageoff;
1580 seg1->mr_nsegs = i;
1581 seg1->mr_len = len;
1582 }
1583 *nsegs = i;
1584 return 0;
1585out_err:
1586 while (i--)
1587 rpcrdma_unmap_one(ia, --seg);
1588 return rc;
1589}
1590
1591static int
1592rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1593 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1594{
1595 struct rpcrdma_mr_seg *seg1 = seg;
1596 struct ib_send_wr invalidate_wr, *bad_wr;
1597 int rc;
1598
3197d309 1599 memset(&invalidate_wr, 0, sizeof invalidate_wr);
5c635e09 1600 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
3197d309 1601 invalidate_wr.opcode = IB_WR_LOCAL_INV;
5c635e09 1602 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1603 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1604 DECR_CQCOUNT(&r_xprt->rx_ep);
1605
1606 read_lock(&ia->ri_qplock);
1607 while (seg1->mr_nsegs--)
1608 rpcrdma_unmap_one(ia, seg++);
3197d309 1609 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
73806c88 1610 read_unlock(&ia->ri_qplock);
1611 if (rc)
1612 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1613 " status %i\n", __func__, rc);
1614 return rc;
1615}
1616
1617static int
1618rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1619 int *nsegs, int writing, struct rpcrdma_ia *ia)
1620{
1621 struct rpcrdma_mr_seg *seg1 = seg;
1622 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1623 int len, pageoff, i, rc;
1624
1625 pageoff = offset_in_page(seg1->mr_offset);
1626 seg1->mr_offset -= pageoff; /* start of page */
1627 seg1->mr_len += pageoff;
1628 len = -pageoff;
1629 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1630 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1631 for (i = 0; i < *nsegs;) {
1632 rpcrdma_map_one(ia, seg, writing);
1633 physaddrs[i] = seg->mr_dma;
1634 len += seg->mr_len;
1635 ++seg;
1636 ++i;
1637 /* Check for holes */
1638 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1639 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1640 break;
1641 }
1642 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1643 physaddrs, i, seg1->mr_dma);
1644 if (rc) {
1645 dprintk("RPC: %s: failed ib_map_phys_fmr "
1646 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1647 len, (unsigned long long)seg1->mr_dma,
1648 pageoff, i, rc);
1649 while (i--)
1650 rpcrdma_unmap_one(ia, --seg);
1651 } else {
1652 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1653 seg1->mr_base = seg1->mr_dma + pageoff;
1654 seg1->mr_nsegs = i;
1655 seg1->mr_len = len;
1656 }
1657 *nsegs = i;
1658 return rc;
1659}
1660
1661static int
1662rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1663 struct rpcrdma_ia *ia)
1664{
1665 struct rpcrdma_mr_seg *seg1 = seg;
1666 LIST_HEAD(l);
1667 int rc;
1668
1669 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1670 rc = ib_unmap_fmr(&l);
73806c88 1671 read_lock(&ia->ri_qplock);
1672 while (seg1->mr_nsegs--)
1673 rpcrdma_unmap_one(ia, seg++);
73806c88 1674 read_unlock(&ia->ri_qplock);
1675 if (rc)
1676 dprintk("RPC: %s: failed ib_unmap_fmr,"
1677 " status %i\n", __func__, rc);
1678 return rc;
1679}
1680
1681int
1682rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1683 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1684{
1685 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1686 int rc = 0;
1687
1688 switch (ia->ri_memreg_strategy) {
1689
1690#if RPCRDMA_PERSISTENT_REGISTRATION
1691 case RPCRDMA_ALLPHYSICAL:
1692 rpcrdma_map_one(ia, seg, writing);
1693 seg->mr_rkey = ia->ri_bind_mem->rkey;
1694 seg->mr_base = seg->mr_dma;
1695 seg->mr_nsegs = 1;
1696 nsegs = 1;
1697 break;
1698#endif
1699
1700 /* Registration using frmr registration */
1701 case RPCRDMA_FRMR:
1702 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1703 break;
1704
8d4ba034 1705 /* Registration using fmr memory registration */
c56c65fb 1706 case RPCRDMA_MTHCAFMR:
8d4ba034 1707 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1708 break;
1709
c56c65fb 1710 default:
0ac531c1 1711 return -1;
1712 }
1713 if (rc)
1714 return -1;
1715
1716 return nsegs;
1717}
1718
1719int
1720rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
13c9ff8f 1721 struct rpcrdma_xprt *r_xprt)
1722{
1723 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1724 int nsegs = seg->mr_nsegs, rc;
1725
1726 switch (ia->ri_memreg_strategy) {
1727
1728#if RPCRDMA_PERSISTENT_REGISTRATION
1729 case RPCRDMA_ALLPHYSICAL:
73806c88 1730 read_lock(&ia->ri_qplock);
c56c65fb 1731 rpcrdma_unmap_one(ia, seg);
73806c88 1732 read_unlock(&ia->ri_qplock);
1733 break;
1734#endif
1735
1736 case RPCRDMA_FRMR:
1737 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1738 break;
1739
c56c65fb 1740 case RPCRDMA_MTHCAFMR:
8d4ba034 1741 rc = rpcrdma_deregister_fmr_external(seg, ia);
1742 break;
1743
c56c65fb 1744 default:
1745 break;
1746 }
1747 return nsegs;
1748}
1749
1750/*
1751 * Prepost any receive buffer, then post send.
1752 *
1753 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1754 */
1755int
1756rpcrdma_ep_post(struct rpcrdma_ia *ia,
1757 struct rpcrdma_ep *ep,
1758 struct rpcrdma_req *req)
1759{
1760 struct ib_send_wr send_wr, *send_wr_fail;
1761 struct rpcrdma_rep *rep = req->rl_reply;
1762 int rc;
1763
1764 if (rep) {
1765 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1766 if (rc)
1767 goto out;
1768 req->rl_reply = NULL;
1769 }
1770
1771 send_wr.next = NULL;
1772 send_wr.wr_id = 0ULL; /* no send cookie */
1773 send_wr.sg_list = req->rl_send_iov;
1774 send_wr.num_sge = req->rl_niovs;
1775 send_wr.opcode = IB_WR_SEND;
1776 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1777 ib_dma_sync_single_for_device(ia->ri_id->device,
1778 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1779 DMA_TO_DEVICE);
1780 ib_dma_sync_single_for_device(ia->ri_id->device,
1781 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1782 DMA_TO_DEVICE);
1783 ib_dma_sync_single_for_device(ia->ri_id->device,
1784 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1785 DMA_TO_DEVICE);
1786
1787 if (DECR_CQCOUNT(ep) > 0)
1788 send_wr.send_flags = 0;
1789 else { /* Provider must take a send completion every now and then */
1790 INIT_CQCOUNT(ep);
1791 send_wr.send_flags = IB_SEND_SIGNALED;
1792 }
1793
1794 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1795 if (rc)
1796 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1797 rc);
1798out:
1799 return rc;
1800}
1801
1802/*
1803 * (Re)post a receive buffer.
1804 */
1805int
1806rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1807 struct rpcrdma_ep *ep,
1808 struct rpcrdma_rep *rep)
1809{
1810 struct ib_recv_wr recv_wr, *recv_wr_fail;
1811 int rc;
1812
1813 recv_wr.next = NULL;
1814 recv_wr.wr_id = (u64) (unsigned long) rep;
1815 recv_wr.sg_list = &rep->rr_iov;
1816 recv_wr.num_sge = 1;
1817
1818 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1819 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1820
1821 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1822
1823 if (rc)
1824 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1825 rc);
1826 return rc;
1827}