net: remove interrupt.h inclusion from netdevice.h
[linux-2.6-block.git] / net / sunrpc / xprtrdma / verbs.c
CommitLineData
f58851e6 1/*
c56c65fb
TT
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
f58851e6
TT
38 */
39
c56c65fb
TT
40/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
a6b7a407 50#include <linux/interrupt.h>
c56c65fb 51#include <linux/pci.h> /* for Tavor hack below */
5a0e3ad6 52#include <linux/slab.h>
c56c65fb 53
f58851e6
TT
54#include "xprt_rdma.h"
55
c56c65fb
TT
56/*
57 * Globals/Macros
58 */
59
60#ifdef RPC_DEBUG
61# define RPCDBG_FACILITY RPCDBG_TRANS
62#endif
63
64/*
65 * internal functions
66 */
67
68/*
69 * handle replies in tasklet context, using a single, global list
70 * rdma tasklet function -- just turn around and call the func
71 * for all replies on the list
72 */
73
74static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
75static LIST_HEAD(rpcrdma_tasklets_g);
76
77static void
78rpcrdma_run_tasklet(unsigned long data)
79{
80 struct rpcrdma_rep *rep;
81 void (*func)(struct rpcrdma_rep *);
82 unsigned long flags;
83
84 data = data;
85 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
86 while (!list_empty(&rpcrdma_tasklets_g)) {
87 rep = list_entry(rpcrdma_tasklets_g.next,
88 struct rpcrdma_rep, rr_list);
89 list_del(&rep->rr_list);
90 func = rep->rr_func;
91 rep->rr_func = NULL;
92 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
93
94 if (func)
95 func(rep);
96 else
97 rpcrdma_recv_buffer_put(rep);
98
99 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
100 }
101 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
102}
103
104static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
105
/*
 * Queue a reply on the global tasklet list and schedule the tasklet
 * to process it.  Callable from the (interrupt-context) CQ upcall,
 * hence the irqsave locking.
 */
static inline void
rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
{
	unsigned long flags;

	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
	tasklet_schedule(&rpcrdma_tasklet_g);
}
116
117static void
118rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
119{
120 struct rpcrdma_ep *ep = context;
121
122 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
123 __func__, event->event, event->device->name, context);
124 if (ep->rep_connected == 1) {
125 ep->rep_connected = -EIO;
126 ep->rep_func(ep);
127 wake_up_all(&ep->rep_connect_wait);
128 }
129}
130
131static void
132rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
133{
134 struct rpcrdma_ep *ep = context;
135
136 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
137 __func__, event->event, event->device->name, context);
138 if (ep->rep_connected == 1) {
139 ep->rep_connected = -EIO;
140 ep->rep_func(ep);
141 wake_up_all(&ep->rep_connect_wait);
142 }
143}
144
/*
 * Process one work completion.
 *
 * wc->wr_id is overloaded: for FRMR register/invalidate completions
 * it is a struct rpcrdma_mw *, for receives it is a struct
 * rpcrdma_rep *.  A NULL wr_id is a suppressed send/bind completion.
 *
 * Flushed (non-success) completions mark the reply length invalid
 * and hand the rep to the tasklet, except for FRMR work requests
 * which carry an MW pointer, not a rep.
 */
static inline
void rpcrdma_event_process(struct ib_wc *wc)
{
	struct rpcrdma_mw *frmr;
	struct rpcrdma_rep *rep =
			(struct rpcrdma_rep *)(unsigned long) wc->wr_id;

	dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
		__func__, rep, wc->status, wc->opcode, wc->byte_len);

	if (!rep) /* send or bind completion that we don't care about */
		return;

	if (IB_WC_SUCCESS != wc->status) {
		dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
			__func__, wc->opcode, wc->status);
		rep->rr_len = ~0U;
		/* FRMR completions carry an MW pointer, not a rep */
		if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
			rpcrdma_schedule_tasklet(rep);
		return;
	}

	switch (wc->opcode) {
	case IB_WC_FAST_REG_MR:
		/* wr_id here is really the MW being registered */
		frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		frmr->r.frmr.state = FRMR_IS_VALID;
		break;
	case IB_WC_LOCAL_INV:
		frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		frmr->r.frmr.state = FRMR_IS_INVALID;
		break;
	case IB_WC_RECV:
		rep->rr_len = wc->byte_len;
		ib_dma_sync_single_for_cpu(
			rdmab_to_ia(rep->rr_buffer)->ri_id->device,
			rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
		/* Keep (only) the most recent credits, after check validity */
		if (rep->rr_len >= 16) {
			struct rpcrdma_msg *p =
					(struct rpcrdma_msg *) rep->rr_base;
			unsigned int credits = ntohl(p->rm_credit);
			if (credits == 0) {
				dprintk("RPC: %s: server"
					" dropped credits to 0!\n", __func__);
				/* don't deadlock */
				credits = 1;
			} else if (credits > rep->rr_buffer->rb_max_requests) {
				dprintk("RPC: %s: server"
					" over-crediting: %d (%d)\n",
					__func__, credits,
					rep->rr_buffer->rb_max_requests);
				credits = rep->rr_buffer->rb_max_requests;
			}
			atomic_set(&rep->rr_buffer->rb_credits, credits);
		}
		/* fall through */
	case IB_WC_BIND_MW:
		rpcrdma_schedule_tasklet(rep);
		break;
	default:
		dprintk("RPC: %s: unexpected WC event %X\n",
			__func__, wc->opcode);
		break;
	}
}
210
211static inline int
212rpcrdma_cq_poll(struct ib_cq *cq)
213{
214 struct ib_wc wc;
215 int rc;
216
217 for (;;) {
218 rc = ib_poll_cq(cq, 1, &wc);
219 if (rc < 0) {
220 dprintk("RPC: %s: ib_poll_cq failed %i\n",
221 __func__, rc);
222 return rc;
223 }
224 if (rc == 0)
225 break;
226
227 rpcrdma_event_process(&wc);
228 }
229
230 return 0;
231}
232
233/*
234 * rpcrdma_cq_event_upcall
235 *
236 * This upcall handles recv, send, bind and unbind events.
237 * It is reentrant but processes single events in order to maintain
238 * ordering of receives to keep server credits.
239 *
240 * It is the responsibility of the scheduled tasklet to return
241 * recv buffers to the pool. NOTE: this affects synchronization of
242 * connection shutdown. That is, the structures required for
243 * the completion of the reply handler must remain intact until
244 * all memory has been reclaimed.
245 *
246 * Note that send events are suppressed and do not result in an upcall.
247 */
248static void
249rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
250{
251 int rc;
252
253 rc = rpcrdma_cq_poll(cq);
254 if (rc)
255 return;
256
257 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
258 if (rc) {
259 dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
260 __func__, rc);
261 return;
262 }
263
264 rpcrdma_cq_poll(cq);
265}
266
#ifdef RPC_DEBUG
/*
 * Human-readable names for RDMA CM events, indexed by event number.
 * Used only in debug output; indices 0..11 must track the
 * RDMA_CM_EVENT_* enumeration order.
 */
static const char * const conn[] = {
	"address resolved",
	"address error",
	"route resolved",
	"route error",
	"connect request",
	"connect response",
	"connect error",
	"unreachable",
	"rejected",
	"established",
	"disconnected",
	"device removal"
};
#endif
283
/*
 * RDMA CM event handler for the client transport.
 *
 * Address/route resolution results are reported back to the waiter
 * in rpcrdma_create_id() through ia->ri_async_rc and ri_done.
 * Connection-state events funnel through the "connected:" label,
 * which records the new state in ep->rep_connected, notifies the
 * transport (rep_func), and wakes connect waiters.
 *
 * Always returns 0 (the CM ignores other values for these events).
 */
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#ifdef RPC_DEBUG
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
#endif
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		/* success: release the waiter in rpcrdma_create_id() */
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		/* read back negotiated IRD/ORD for the debug report below */
		ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event <= 11) ? conn[event->event] :
				"unknown connection error",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ep, event->event);
		/* reset the credit count; renegotiated on each connect */
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		dprintk("RPC: %s: unexpected CM event %d\n",
			__func__, event->event);
		break;
	}

#ifdef RPC_DEBUG
	if (connstate == 1) {
		int ird = attr.max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
			"on %s, memreg %d slots %d ird %d%s\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ia->ri_id->device->name,
			ia->ri_memreg_strategy,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			connstate);
	}
#endif

	return 0;
}
381
/*
 * Create a CM id and synchronously resolve the remote address and
 * route for it.
 *
 * Each resolve step is asynchronous: the result arrives via
 * rpcrdma_conn_upcall(), which stores a status in ia->ri_async_rc
 * and signals ia->ri_done.  ri_async_rc is pre-set to -ETIMEDOUT so
 * that a timed-out wait (no upcall) reads as a timeout.
 *
 * Returns the new id, or an ERR_PTR on failure (the id is destroyed
 * on the error path).
 */
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
			struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	init_completion(&ia->ri_done);

	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	/* assume timeout unless the upcall overwrites this */
	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	/* second async phase: route resolution, same protocol */
	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}
431
432/*
433 * Drain any cq, prior to teardown.
434 */
435static void
436rpcrdma_clean_cq(struct ib_cq *cq)
437{
438 struct ib_wc wc;
439 int count = 0;
440
441 while (1 == ib_poll_cq(cq, 1, &wc))
442 ++count;
443
444 if (count)
445 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
446 __func__, count, wc.opcode);
447}
448
449/*
450 * Exported functions.
451 */
452
453/*
454 * Open and initialize an Interface Adapter.
455 * o initializes fields of struct rpcrdma_ia, including
456 * interface and provider attributes and protection zone.
457 */
/*
 * Open and initialize an Interface Adapter.
 *
 * Creates and resolves the CM id, allocates the protection domain,
 * validates the requested memory registration strategy against the
 * device's capabilities (downgrading it when unsupported), and
 * optionally sets up the base DMA MR used by the window/FMR modes.
 *
 * Returns 0 on success or a negative errno; on failure the CM id is
 * destroyed and ia->ri_id cleared.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	int rc, mem_priv;
	struct ib_device_attr devattr;
	struct rpcrdma_ia *ia = &xprt->rx_ia;

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}

	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
			__func__, rc);
		goto out2;
	}

	/*
	 * Query the device to determine if the requested memory
	 * registration strategy is supported. If it isn't, set the
	 * strategy to a globally supported model.
	 */
	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		goto out2;
	}

	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
		ia->ri_have_dma_lkey = 1;
		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
	}

	/* Downgrade the strategy if the adapter cannot support it. */
	switch (memreg) {
	case RPCRDMA_MEMWINDOWS:
	case RPCRDMA_MEMWINDOWS_ASYNC:
		if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
			dprintk("RPC: %s: MEMWINDOWS registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		if (!ia->ri_id->device->alloc_fmr) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC: %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC: %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	case RPCRDMA_FRMR:
		/* Requires both frmr reg and local dma lkey */
		if ((devattr.device_cap_flags &
		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC: %s: FRMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC: %s: FRMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	}

	/*
	 * Optionally obtain an underlying physical identity mapping in
	 * order to do a memory window-based bind. This base registration
	 * is protected from remote access - that is enabled only by binding
	 * for the specific bytes targeted during each RPC operation, and
	 * revoked after the corresponding completion similar to a storage
	 * adapter.
	 */
	switch (memreg) {
	case RPCRDMA_BOUNCEBUFFERS:
	case RPCRDMA_REGISTER:
	case RPCRDMA_FRMR:
		/* these modes need no base registration */
		break;
#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_REMOTE_WRITE |
				IB_ACCESS_REMOTE_READ;
		goto register_setup;
#endif
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_MW_BIND;
		goto register_setup;
	case RPCRDMA_MTHCAFMR:
		if (ia->ri_have_dma_lkey)
			break;
		mem_priv = IB_ACCESS_LOCAL_WRITE;
	register_setup:
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
		if (IS_ERR(ia->ri_bind_mem)) {
			/* non-fatal: fall back to per-chunk registration */
			printk(KERN_ALERT "%s: ib_get_dma_mr for "
				"phys register failed with %lX\n\t"
				"Will continue with degraded performance\n",
				__func__, PTR_ERR(ia->ri_bind_mem));
			memreg = RPCRDMA_REGISTER;
			ia->ri_bind_mem = NULL;
		}
		break;
	default:
		printk(KERN_ERR "%s: invalid memory registration mode %d\n",
				__func__, memreg);
		rc = -EINVAL;
		goto out2;
	}
	dprintk("RPC: %s: memory registration strategy is %d\n",
			__func__, memreg);

	/* Else will do memory reg/dereg for each chunk */
	ia->ri_memreg_strategy = memreg;

	return 0;
out2:
	rdma_destroy_id(ia->ri_id);
	ia->ri_id = NULL;
out1:
	return rc;
}
605
606/*
607 * Clean up/close an IA.
608 * o if event handles and PD have been initialized, free them.
609 * o close the IA
610 */
/*
 * Tear down an Interface Adapter.
 *
 * Resources are released in dependency order: base MR first, then
 * QP and CM id, and finally the protection domain.  Each handle is
 * checked for NULL/ERR_PTR so a partially-initialized IA (e.g. after
 * a failed rpcrdma_ia_open) can be closed safely.
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_bind_mem != NULL) {
		rc = ib_dereg_mr(ia->ri_bind_mem);
		dprintk("RPC: %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
		/* QP must go before its CM id */
		if (ia->ri_id->qp)
			rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = NULL;
	}
	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
		rc = ib_dealloc_pd(ia->ri_pd);
		dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
			__func__, rc);
	}
}
634
635/*
636 * Create unconnected endpoint.
637 */
/*
 * Create unconnected endpoint.
 *
 * Sizes the QP attributes against the device limits (scaling send
 * WRs for the registration strategy's extra work requests), creates
 * a single CQ shared by send and receive, arms it, and fills in the
 * CM connection parameters.  Returns 0 or a negative errno.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
				struct rpcrdma_create_data_internal *cdata)
{
	struct ib_device_attr devattr;
	int rc, err;

	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		return rc;
	}

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > devattr.max_qp_wr)
		cdata->max_requests = devattr.max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	/* send_cq and recv_cq initialized below */
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		/* Add room for frmr register and invalidate WRs.
		 * 1. FRMR reg WR for head
		 * 2. FRMR invalidate WR for head
		 * 3. FRMR reg WR for pagelist
		 * 4. FRMR invalidate WR for pagelist
		 * 5. FRMR reg WR for tail
		 * 6. FRMR invalidate WR for tail
		 * 7. The RDMA_SEND WR
		 */
		ep->rep_attr.cap.max_send_wr *= 7;
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
			/* shrink the request count to fit the device */
			cdata->max_requests = devattr.max_qp_wr / 7;
			if (!cdata->max_requests)
				return -EINVAL;
			ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Add room for mw_binds+unbinds - overkill! */
		ep->rep_attr.cap.max_send_wr++;
		ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
			return -EINVAL;
		break;
	default:
		break;
	}
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
		break;
	default:
		break;
	}
	if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;
	INIT_CQCOUNT(ep);
	ep->rep_ia = ia;
	init_waitqueue_head(&ep->rep_connect_wait);

	/*
	 * Create a single cq for receive dto and mw_bind (only ever
	 * care about unbind, really). Send completions are suppressed.
	 * Use single threaded tasklet upcalls to maintain ordering.
	 */
	ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
				  rpcrdma_cq_async_error_upcall, NULL,
				  ep->rep_attr.cap.max_recv_wr +
				  ep->rep_attr.cap.max_send_wr + 1, 0);
	if (IS_ERR(ep->rep_cq)) {
		rc = PTR_ERR(ep->rep_cq);
		dprintk("RPC: %s: ib_create_cq failed: %i\n",
			__func__, rc);
		goto out1;
	}

	rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		goto out2;
	}

	ep->rep_attr.send_cq = ep->rep_cq;
	ep->rep_attr.recv_cq = ep->rep_cq;

	/* Initialize cma parameters */

	/* RPC/RDMA does not use private data */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client offers RDMA Read but does not initiate */
	ep->rep_remote_cma.initiator_depth = 0;
	if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
		ep->rep_remote_cma.responder_resources = 0;
	else if (devattr.max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
		ep->rep_remote_cma.responder_resources = 32;
	else
		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;

	ep->rep_remote_cma.retry_count = 7;
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	err = ib_destroy_cq(ep->rep_cq);
	if (err)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, err);
out1:
	return rc;
}
778
779/*
780 * rpcrdma_ep_destroy
781 *
782 * Disconnect and destroy endpoint. After this, the only
783 * valid operations on the ep are to free it (if dynamically
784 * allocated) or re-create it.
785 *
786 * The caller's error handling must be sure to not leak the endpoint
787 * if this function fails.
788 */
/*
 * Disconnect and destroy an endpoint (see the block comment above).
 * Order matters: disconnect and destroy the QP first, release the
 * padding MR, then drain and destroy the CQ.
 */
int
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	if (ia->ri_id->qp) {
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" returned %i\n", __func__, rc);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	/* padding - could be done in rpcrdma_buffer_destroy... */
	if (ep->rep_pad_mr) {
		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
		ep->rep_pad_mr = NULL;
	}

	/* flush stale completions before tearing the CQ down */
	rpcrdma_clean_cq(ep->rep_cq);
	rc = ib_destroy_cq(ep->rep_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	return rc;
}
820
821/*
822 * Connect unconnected endpoint.
823 */
/*
 * Connect unconnected endpoint.
 *
 * On reconnect (rep_connected != 0), tears down the old QP/id and
 * creates a fresh CM id on the same device.  After rdma_connect()
 * the caller sleeps until the CM upcall sets rep_connected.  Two
 * retry paths loop back to the "retry:" label: transient peer
 * rejection (-ECONNREFUSED), and a heuristic that retries with
 * matching nonzero ORD/IRD values for picky remote CMs.
 *
 * Returns 0 on success; otherwise a negative errno, also recorded
 * in ep->rep_connected.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id;
	int rc = 0;
	int retry_count = 0;

	if (ep->rep_connected != 0) {
		struct rpcrdma_xprt *xprt;
retry:
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc && rc != -ENOTCONN)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" status %i\n", __func__, rc);
		rpcrdma_clean_cq(ep->rep_cq);

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = PTR_ERR(id);
			goto out;
		}
		/* TEMP TEMP TEMP - fail if new device:
		 * Deregister/remarshal *all* requests!
		 * Close and recreate adapter, pd, etc!
		 * Re-determine all attributes still sane!
		 * More stuff I haven't thought of!
		 * Rrrgh!
		 */
		if (ia->ri_id->device != id->device) {
			printk("RPC: %s: can't reconnect on "
				"different device!\n", __func__);
			rdma_destroy_id(id);
			rc = -ENETDOWN;
			goto out;
		}
		/* END TEMP */
		rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = id;
	}

	rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
	if (rc) {
		dprintk("RPC: %s: rdma_create_qp failed %i\n",
			__func__, rc);
		goto out;
	}

/* XXX Tavor device performs badly with 2K MTU! */
if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
	struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
	if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
	    (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
	     pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
		struct ib_qp_attr attr = {
			.path_mtu = IB_MTU_1024
		};
		rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
	}
}

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
				__func__, rc);
		goto out;
	}

	/* the CM upcall sets rep_connected and wakes us */
	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/*
	 * Check state. A non-peer reject indicates no listener
	 * (ECONNREFUSED), which may be a transient state. All
	 * others indicate a transport condition which has already
	 * undergone a best-effort.
	 */
	if (ep->rep_connected == -ECONNREFUSED &&
	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Sometimes, the only way to reliably connect to remote
		 * CMs is to use same nonzero values for ORD and IRD. */
		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
		    (ep->rep_remote_cma.responder_resources == 0 ||
		     ep->rep_remote_cma.initiator_depth !=
		     ep->rep_remote_cma.responder_resources)) {
			if (ep->rep_remote_cma.responder_resources == 0)
				ep->rep_remote_cma.responder_resources = 1;
			ep->rep_remote_cma.initiator_depth =
				ep->rep_remote_cma.responder_resources;
			goto retry;
		}
		rc = ep->rep_connected;
	} else {
		dprintk("RPC: %s: connected\n", __func__);
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}
932
933/*
934 * rpcrdma_ep_disconnect
935 *
936 * This is separate from destroy to facilitate the ability
937 * to reconnect without recreating the endpoint.
938 *
939 * This call is not reentrant, and must not be made in parallel
940 * on the same endpoint.
941 */
942int
943rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
944{
945 int rc;
946
947 rpcrdma_clean_cq(ep->rep_cq);
948 rc = rdma_disconnect(ia->ri_id);
949 if (!rc) {
950 /* returns without wait if not connected */
951 wait_event_interruptible(ep->rep_connect_wait,
952 ep->rep_connected != 1);
953 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
954 (ep->rep_connected == 1) ? "still " : "dis");
955 } else {
956 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
957 ep->rep_connected = rc;
958 }
959 return rc;
960}
961
962/*
963 * Initialize buffer memory
964 */
965int
966rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
967 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
968{
969 char *p;
970 size_t len;
971 int i, rc;
8d4ba034 972 struct rpcrdma_mw *r;
c56c65fb
TT
973
974 buf->rb_max_requests = cdata->max_requests;
975 spin_lock_init(&buf->rb_lock);
976 atomic_set(&buf->rb_credits, 1);
977
978 /* Need to allocate:
979 * 1. arrays for send and recv pointers
980 * 2. arrays of struct rpcrdma_req to fill in pointers
981 * 3. array of struct rpcrdma_rep for replies
982 * 4. padding, if any
3197d309 983 * 5. mw's, fmr's or frmr's, if any
c56c65fb
TT
984 * Send/recv buffers in req/rep need to be registered
985 */
986
987 len = buf->rb_max_requests *
988 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
989 len += cdata->padding;
990 switch (ia->ri_memreg_strategy) {
3197d309
TT
991 case RPCRDMA_FRMR:
992 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
993 sizeof(struct rpcrdma_mw);
994 break;
c56c65fb
TT
995 case RPCRDMA_MTHCAFMR:
996 /* TBD we are perhaps overallocating here */
997 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
998 sizeof(struct rpcrdma_mw);
999 break;
1000 case RPCRDMA_MEMWINDOWS_ASYNC:
1001 case RPCRDMA_MEMWINDOWS:
1002 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
1003 sizeof(struct rpcrdma_mw);
1004 break;
1005 default:
1006 break;
1007 }
1008
1009 /* allocate 1, 4 and 5 in one shot */
1010 p = kzalloc(len, GFP_KERNEL);
1011 if (p == NULL) {
1012 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1013 __func__, len);
1014 rc = -ENOMEM;
1015 goto out;
1016 }
1017 buf->rb_pool = p; /* for freeing it later */
1018
1019 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1020 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1021 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1022 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1023
1024 /*
1025 * Register the zeroed pad buffer, if any.
1026 */
1027 if (cdata->padding) {
1028 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1029 &ep->rep_pad_mr, &ep->rep_pad);
1030 if (rc)
1031 goto out;
1032 }
1033 p += cdata->padding;
1034
1035 /*
1036 * Allocate the fmr's, or mw's for mw_bind chunk registration.
1037 * We "cycle" the mw's in order to minimize rkey reuse,
1038 * and also reduce unbind-to-bind collision.
1039 */
1040 INIT_LIST_HEAD(&buf->rb_mws);
8d4ba034 1041 r = (struct rpcrdma_mw *)p;
c56c65fb 1042 switch (ia->ri_memreg_strategy) {
3197d309
TT
1043 case RPCRDMA_FRMR:
1044 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1045 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1046 RPCRDMA_MAX_SEGS);
1047 if (IS_ERR(r->r.frmr.fr_mr)) {
1048 rc = PTR_ERR(r->r.frmr.fr_mr);
1049 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1050 " failed %i\n", __func__, rc);
1051 goto out;
1052 }
1053 r->r.frmr.fr_pgl =
1054 ib_alloc_fast_reg_page_list(ia->ri_id->device,
1055 RPCRDMA_MAX_SEGS);
1056 if (IS_ERR(r->r.frmr.fr_pgl)) {
1057 rc = PTR_ERR(r->r.frmr.fr_pgl);
1058 dprintk("RPC: %s: "
1059 "ib_alloc_fast_reg_page_list "
1060 "failed %i\n", __func__, rc);
1061 goto out;
1062 }
1063 list_add(&r->mw_list, &buf->rb_mws);
1064 ++r;
1065 }
1066 break;
c56c65fb 1067 case RPCRDMA_MTHCAFMR:
c56c65fb
TT
1068 /* TBD we are perhaps overallocating here */
1069 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
8d4ba034
TT
1070 static struct ib_fmr_attr fa =
1071 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
c56c65fb
TT
1072 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1073 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1074 &fa);
1075 if (IS_ERR(r->r.fmr)) {
1076 rc = PTR_ERR(r->r.fmr);
1077 dprintk("RPC: %s: ib_alloc_fmr"
1078 " failed %i\n", __func__, rc);
1079 goto out;
1080 }
1081 list_add(&r->mw_list, &buf->rb_mws);
1082 ++r;
1083 }
c56c65fb
TT
1084 break;
1085 case RPCRDMA_MEMWINDOWS_ASYNC:
1086 case RPCRDMA_MEMWINDOWS:
c56c65fb
TT
1087 /* Allocate one extra request's worth, for full cycling */
1088 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1089 r->r.mw = ib_alloc_mw(ia->ri_pd);
1090 if (IS_ERR(r->r.mw)) {
1091 rc = PTR_ERR(r->r.mw);
1092 dprintk("RPC: %s: ib_alloc_mw"
1093 " failed %i\n", __func__, rc);
1094 goto out;
1095 }
1096 list_add(&r->mw_list, &buf->rb_mws);
1097 ++r;
1098 }
c56c65fb
TT
1099 break;
1100 default:
1101 break;
1102 }
1103
1104 /*
1105 * Allocate/init the request/reply buffers. Doing this
1106 * using kmalloc for now -- one for each buf.
1107 */
1108 for (i = 0; i < buf->rb_max_requests; i++) {
1109 struct rpcrdma_req *req;
1110 struct rpcrdma_rep *rep;
1111
1112 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1113 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1114 /* Typical ~2400b, so rounding up saves work later */
1115 if (len < 4096)
1116 len = 4096;
1117 req = kmalloc(len, GFP_KERNEL);
1118 if (req == NULL) {
1119 dprintk("RPC: %s: request buffer %d alloc"
1120 " failed\n", __func__, i);
1121 rc = -ENOMEM;
1122 goto out;
1123 }
1124 memset(req, 0, sizeof(struct rpcrdma_req));
1125 buf->rb_send_bufs[i] = req;
1126 buf->rb_send_bufs[i]->rl_buffer = buf;
1127
1128 rc = rpcrdma_register_internal(ia, req->rl_base,
1129 len - offsetof(struct rpcrdma_req, rl_base),
1130 &buf->rb_send_bufs[i]->rl_handle,
1131 &buf->rb_send_bufs[i]->rl_iov);
1132 if (rc)
1133 goto out;
1134
1135 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1136
1137 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1138 rep = kmalloc(len, GFP_KERNEL);
1139 if (rep == NULL) {
1140 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1141 __func__, i);
1142 rc = -ENOMEM;
1143 goto out;
1144 }
1145 memset(rep, 0, sizeof(struct rpcrdma_rep));
1146 buf->rb_recv_bufs[i] = rep;
1147 buf->rb_recv_bufs[i]->rr_buffer = buf;
1148 init_waitqueue_head(&rep->rr_unbind);
1149
1150 rc = rpcrdma_register_internal(ia, rep->rr_base,
1151 len - offsetof(struct rpcrdma_rep, rr_base),
1152 &buf->rb_recv_bufs[i]->rr_handle,
1153 &buf->rb_recv_bufs[i]->rr_iov);
1154 if (rc)
1155 goto out;
1156
1157 }
1158 dprintk("RPC: %s: max_requests %d\n",
1159 __func__, buf->rb_max_requests);
1160 /* done */
1161 return 0;
1162out:
1163 rpcrdma_buffer_destroy(buf);
1164 return rc;
1165}
1166
1167/*
1168 * Unregister and destroy buffer memory. Need to deal with
1169 * partial initialization, so it's callable from failed create.
1170 * Must be called before destroying endpoint, as registrations
1171 * reference it.
1172 */
1173void
1174rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1175{
1176 int rc, i;
1177 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
8d4ba034 1178 struct rpcrdma_mw *r;
c56c65fb
TT
1179
1180 /* clean up in reverse order from create
1181 * 1. recv mr memory (mr free, then kfree)
1182 * 1a. bind mw memory
1183 * 2. send mr memory (mr free, then kfree)
1184 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1185 * 4. arrays
1186 */
1187 dprintk("RPC: %s: entering\n", __func__);
1188
1189 for (i = 0; i < buf->rb_max_requests; i++) {
1190 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1191 rpcrdma_deregister_internal(ia,
1192 buf->rb_recv_bufs[i]->rr_handle,
1193 &buf->rb_recv_bufs[i]->rr_iov);
1194 kfree(buf->rb_recv_bufs[i]);
1195 }
1196 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1197 while (!list_empty(&buf->rb_mws)) {
c56c65fb
TT
1198 r = list_entry(buf->rb_mws.next,
1199 struct rpcrdma_mw, mw_list);
1200 list_del(&r->mw_list);
1201 switch (ia->ri_memreg_strategy) {
3197d309
TT
1202 case RPCRDMA_FRMR:
1203 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1204 if (rc)
1205 dprintk("RPC: %s:"
1206 " ib_dereg_mr"
1207 " failed %i\n",
1208 __func__, rc);
1209 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1210 break;
c56c65fb
TT
1211 case RPCRDMA_MTHCAFMR:
1212 rc = ib_dealloc_fmr(r->r.fmr);
1213 if (rc)
1214 dprintk("RPC: %s:"
1215 " ib_dealloc_fmr"
1216 " failed %i\n",
1217 __func__, rc);
1218 break;
1219 case RPCRDMA_MEMWINDOWS_ASYNC:
1220 case RPCRDMA_MEMWINDOWS:
1221 rc = ib_dealloc_mw(r->r.mw);
1222 if (rc)
1223 dprintk("RPC: %s:"
1224 " ib_dealloc_mw"
1225 " failed %i\n",
1226 __func__, rc);
1227 break;
1228 default:
1229 break;
1230 }
1231 }
1232 rpcrdma_deregister_internal(ia,
1233 buf->rb_send_bufs[i]->rl_handle,
1234 &buf->rb_send_bufs[i]->rl_iov);
1235 kfree(buf->rb_send_bufs[i]);
1236 }
1237 }
1238
1239 kfree(buf->rb_pool);
1240}
1241
1242/*
1243 * Get a set of request/reply buffers.
1244 *
1245 * Reply buffer (if needed) is attached to send buffer upon return.
1246 * Rule:
1247 * rb_send_index and rb_recv_index MUST always be pointing to the
1248 * *next* available buffer (non-NULL). They are incremented after
1249 * removing buffers, and decremented *before* returning them.
1250 */
1251struct rpcrdma_req *
1252rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1253{
1254 struct rpcrdma_req *req;
1255 unsigned long flags;
8d4ba034
TT
1256 int i;
1257 struct rpcrdma_mw *r;
c56c65fb
TT
1258
1259 spin_lock_irqsave(&buffers->rb_lock, flags);
1260 if (buffers->rb_send_index == buffers->rb_max_requests) {
1261 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1262 dprintk("RPC: %s: out of request buffers\n", __func__);
1263 return ((struct rpcrdma_req *)NULL);
1264 }
1265
1266 req = buffers->rb_send_bufs[buffers->rb_send_index];
1267 if (buffers->rb_send_index < buffers->rb_recv_index) {
1268 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1269 __func__,
1270 buffers->rb_recv_index - buffers->rb_send_index);
1271 req->rl_reply = NULL;
1272 } else {
1273 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1274 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1275 }
1276 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1277 if (!list_empty(&buffers->rb_mws)) {
8d4ba034 1278 i = RPCRDMA_MAX_SEGS - 1;
c56c65fb 1279 do {
c56c65fb
TT
1280 r = list_entry(buffers->rb_mws.next,
1281 struct rpcrdma_mw, mw_list);
1282 list_del(&r->mw_list);
1283 req->rl_segments[i].mr_chunk.rl_mw = r;
1284 } while (--i >= 0);
1285 }
1286 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1287 return req;
1288}
1289
1290/*
1291 * Put request/reply buffers back into pool.
1292 * Pre-decrement counter/array index.
1293 */
1294void
1295rpcrdma_buffer_put(struct rpcrdma_req *req)
1296{
1297 struct rpcrdma_buffer *buffers = req->rl_buffer;
1298 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1299 int i;
1300 unsigned long flags;
1301
1302 BUG_ON(req->rl_nchunks != 0);
1303 spin_lock_irqsave(&buffers->rb_lock, flags);
1304 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1305 req->rl_niovs = 0;
1306 if (req->rl_reply) {
1307 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1308 init_waitqueue_head(&req->rl_reply->rr_unbind);
1309 req->rl_reply->rr_func = NULL;
1310 req->rl_reply = NULL;
1311 }
1312 switch (ia->ri_memreg_strategy) {
3197d309 1313 case RPCRDMA_FRMR:
c56c65fb
TT
1314 case RPCRDMA_MTHCAFMR:
1315 case RPCRDMA_MEMWINDOWS_ASYNC:
1316 case RPCRDMA_MEMWINDOWS:
1317 /*
1318 * Cycle mw's back in reverse order, and "spin" them.
1319 * This delays and scrambles reuse as much as possible.
1320 */
1321 i = 1;
1322 do {
1323 struct rpcrdma_mw **mw;
1324 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1325 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1326 *mw = NULL;
1327 } while (++i < RPCRDMA_MAX_SEGS);
1328 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1329 &buffers->rb_mws);
1330 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1331 break;
1332 default:
1333 break;
1334 }
1335 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1336}
1337
1338/*
1339 * Recover reply buffers from pool.
1340 * This happens when recovering from error conditions.
1341 * Post-increment counter/array index.
1342 */
1343void
1344rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1345{
1346 struct rpcrdma_buffer *buffers = req->rl_buffer;
1347 unsigned long flags;
1348
1349 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1350 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1351 spin_lock_irqsave(&buffers->rb_lock, flags);
1352 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1353 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1354 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1355 }
1356 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1357}
1358
1359/*
1360 * Put reply buffers back into pool when not attached to
1361 * request. This happens in error conditions, and when
1362 * aborting unbinds. Pre-decrement counter/array index.
1363 */
1364void
1365rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1366{
1367 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1368 unsigned long flags;
1369
1370 rep->rr_func = NULL;
1371 spin_lock_irqsave(&buffers->rb_lock, flags);
1372 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1373 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1374}
1375
1376/*
1377 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1378 */
1379
1380int
1381rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1382 struct ib_mr **mrp, struct ib_sge *iov)
1383{
1384 struct ib_phys_buf ipb;
1385 struct ib_mr *mr;
1386 int rc;
1387
1388 /*
1389 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1390 */
1391 iov->addr = ib_dma_map_single(ia->ri_id->device,
1392 va, len, DMA_BIDIRECTIONAL);
1393 iov->length = len;
1394
bd7ed1d1
TT
1395 if (ia->ri_have_dma_lkey) {
1396 *mrp = NULL;
1397 iov->lkey = ia->ri_dma_lkey;
1398 return 0;
1399 } else if (ia->ri_bind_mem != NULL) {
c56c65fb
TT
1400 *mrp = NULL;
1401 iov->lkey = ia->ri_bind_mem->lkey;
1402 return 0;
1403 }
1404
1405 ipb.addr = iov->addr;
1406 ipb.size = iov->length;
1407 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1408 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1409
1410 dprintk("RPC: %s: phys convert: 0x%llx "
1411 "registered 0x%llx length %d\n",
a56daeb7
AM
1412 __func__, (unsigned long long)ipb.addr,
1413 (unsigned long long)iov->addr, len);
c56c65fb
TT
1414
1415 if (IS_ERR(mr)) {
1416 *mrp = NULL;
1417 rc = PTR_ERR(mr);
1418 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1419 } else {
1420 *mrp = mr;
1421 iov->lkey = mr->lkey;
1422 rc = 0;
1423 }
1424
1425 return rc;
1426}
1427
1428int
1429rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1430 struct ib_mr *mr, struct ib_sge *iov)
1431{
1432 int rc;
1433
1434 ib_dma_unmap_single(ia->ri_id->device,
1435 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1436
1437 if (NULL == mr)
1438 return 0;
1439
1440 rc = ib_dereg_mr(mr);
1441 if (rc)
1442 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1443 return rc;
1444}
1445
1446/*
1447 * Wrappers for chunk registration, shared by read/write chunk code.
1448 */
1449
1450static void
1451rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1452{
1453 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1454 seg->mr_dmalen = seg->mr_len;
1455 if (seg->mr_page)
1456 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1457 seg->mr_page, offset_in_page(seg->mr_offset),
1458 seg->mr_dmalen, seg->mr_dir);
1459 else
1460 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1461 seg->mr_offset,
1462 seg->mr_dmalen, seg->mr_dir);
5c635e09
TT
1463 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1464 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1465 __func__,
986d4abb
RD
1466 (unsigned long long)seg->mr_dma,
1467 seg->mr_offset, seg->mr_dmalen);
5c635e09 1468 }
c56c65fb
TT
1469}
1470
1471static void
1472rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1473{
1474 if (seg->mr_page)
1475 ib_dma_unmap_page(ia->ri_id->device,
1476 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1477 else
1478 ib_dma_unmap_single(ia->ri_id->device,
1479 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1480}
1481
/*
 * rpcrdma_register_frmr_external - map a chunk with fast-register MR work
 * requests. Builds an FRMR page list from up to *nsegs physically-mapped
 * segments (stopping early at a page-alignment hole), bumps the MR key,
 * and posts a FAST_REG_MR WR — chained behind a LOCAL_INV WR when the
 * FRMR was left valid by a previous use.
 * On return, *nsegs holds the number of segments actually mapped.
 */
static int
rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;

	u8 key;
	int len, pageoff;
	int i, rc;

	/* Align the first segment to the start of its page; the offset is
	 * re-added to mr_base on success. */
	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
		len += seg->mr_len;
		BUG_ON(seg->mr_len > PAGE_SIZE);
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	dprintk("RPC: %s: Using frmr %p to map %d segments\n",
		__func__, seg1->mr_chunk.rl_mw, i);

	if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
		dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
			__func__,
			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
		/* Invalidate before using. */
		memset(&invalidate_wr, 0, sizeof invalidate_wr);
		invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
		invalidate_wr.next = &frmr_wr;
		invalidate_wr.opcode = IB_WR_LOCAL_INV;
		invalidate_wr.send_flags = IB_SEND_SIGNALED;
		invalidate_wr.ex.invalidate_rkey =
			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
		DECR_CQCOUNT(&r_xprt->rx_ep);
		post_wr = &invalidate_wr;
	} else
		post_wr = &frmr_wr;

	/* Bump the key */
	key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
	ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);

	/* Prepare FRMR WR */
	memset(&frmr_wr, 0, sizeof frmr_wr);
	frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
	frmr_wr.opcode = IB_WR_FAST_REG_MR;
	frmr_wr.send_flags = IB_SEND_SIGNALED;
	frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
	frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
	frmr_wr.wr.fast_reg.page_list_len = i;
	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
	/* Register whole pages; must cover the byte length accumulated. */
	frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
	BUG_ON(frmr_wr.wr.fast_reg.length < len);
	frmr_wr.wr.fast_reg.access_flags = (writing ?
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
				IB_ACCESS_REMOTE_READ);
	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);

	if (rc) {
		dprintk("RPC: %s: failed ib_post_send for register,"
			" status %i\n", __func__, rc);
		/* Unwind all DMA mappings made above. */
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}
1569
/*
 * rpcrdma_deregister_frmr_external - unmap a chunk's segments and post a
 * LOCAL_INV work request to invalidate the FRMR's rkey.
 * Returns the ib_post_send() status.
 */
static int
rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_send_wr invalidate_wr, *bad_wr;
	int rc;

	/* Unmap every segment first; mr_nsegs counts down to -1. */
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);

	memset(&invalidate_wr, 0, sizeof invalidate_wr);
	invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
	invalidate_wr.opcode = IB_WR_LOCAL_INV;
	invalidate_wr.send_flags = IB_SEND_SIGNALED;
	invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
	if (rc)
		dprintk("RPC: %s: failed ib_post_send for invalidate,"
			" status %i\n", __func__, rc);
	return rc;
}
1594
/*
 * rpcrdma_register_fmr_external - map a chunk using an FMR.
 * Collects up to *nsegs page-aligned physical addresses (stopping early
 * at an alignment hole) and maps them with ib_map_phys_fmr().
 * On return, *nsegs holds the number of segments actually mapped.
 */
static int
rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
	int len, pageoff, i, rc;

	/* Align the first segment to its page start; pageoff is re-added
	 * to mr_base on success. */
	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		physaddrs[i] = seg->mr_dma;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
				physaddrs, i, seg1->mr_dma);
	if (rc) {
		dprintk("RPC: %s: failed ib_map_phys_fmr "
			"%u@0x%llx+%i (%d)... status %i\n", __func__,
			len, (unsigned long long)seg1->mr_dma,
			pageoff, i, rc);
		/* Unwind all DMA mappings made above. */
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}
1638
1639static int
1640rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1641 struct rpcrdma_ia *ia)
1642{
1643 struct rpcrdma_mr_seg *seg1 = seg;
1644 LIST_HEAD(l);
1645 int rc;
1646
1647 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1648 rc = ib_unmap_fmr(&l);
1649 while (seg1->mr_nsegs--)
1650 rpcrdma_unmap_one(ia, seg++);
1651 if (rc)
1652 dprintk("RPC: %s: failed ib_unmap_fmr,"
1653 " status %i\n", __func__, rc);
1654 return rc;
1655}
1656
/*
 * rpcrdma_register_memwin_external - map a single segment by binding a
 * memory window over it. Memory windows cover exactly one segment, so
 * *nsegs is forced to 1.
 * Returns the ib_bind_mw() status.
 */
static int
rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct ib_mw_bind param;
	int rc;

	*nsegs = 1;
	rpcrdma_map_one(ia, seg, writing);
	param.mr = ia->ri_bind_mem;
	param.wr_id = 0ULL;	/* no send cookie */
	param.addr = seg->mr_dma;
	param.length = seg->mr_len;
	param.send_flags = 0;	/* unsignaled bind */
	param.mw_access_flags = mem_priv;

	DECR_CQCOUNT(&r_xprt->rx_ep);
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	if (rc) {
		dprintk("RPC: %s: failed ib_bind_mw "
			"%u@0x%llx status %i\n",
			__func__, seg->mr_len,
			(unsigned long long)seg->mr_dma, rc);
		rpcrdma_unmap_one(ia, seg);
	} else {
		seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
		seg->mr_base = param.addr;
		seg->mr_nsegs = 1;
	}
	return rc;
}
1691
/*
 * rpcrdma_deregister_memwin_external - unbind a memory window.
 * @r: in/out cookie: when *r is non-NULL the unbind is posted signaled
 *     with *r as the wr_id, and *r is cleared on success so the caller
 *     knows completion will be reported via upcall.
 */
static int
rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt, void **r)
{
	struct ib_mw_bind param;
	LIST_HEAD(l);
	int rc;

	/* Memory windows always cover exactly one segment. */
	BUG_ON(seg->mr_nsegs != 1);
	param.mr = ia->ri_bind_mem;
	param.addr = 0ULL;	/* unbind */
	param.length = 0;
	param.mw_access_flags = 0;
	if (*r) {
		/* Caller wants a completion upcall: post signaled. */
		param.wr_id = (u64) (unsigned long) *r;
		param.send_flags = IB_SEND_SIGNALED;
		INIT_CQCOUNT(&r_xprt->rx_ep);
	} else {
		param.wr_id = 0ULL;
		param.send_flags = 0;
		DECR_CQCOUNT(&r_xprt->rx_ep);
	}
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	rpcrdma_unmap_one(ia, seg);
	if (rc)
		dprintk("RPC: %s: failed ib_(un)bind_mw,"
			" status %i\n", __func__, rc);
	else
		*r = NULL;	/* will upcall on completion */
	return rc;
}
1724
/*
 * rpcrdma_register_default_external - register a chunk each time with
 * ib_reg_phys_mr(). Collects up to *nsegs segments (stopping early at a
 * page-alignment hole) into a phys-buf array.
 * On return, *nsegs holds the number of segments actually registered.
 */
static int
rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
	int len, i, rc = 0;

	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (len = 0, i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		ipb[i].addr = seg->mr_dma;
		ipb[i].size = seg->mr_len;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
			break;
	}
	/* mr_base may be adjusted by the provider via the iova_start arg. */
	seg1->mr_base = seg1->mr_dma;
	seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
				ipb, i, mem_priv, &seg1->mr_base);
	if (IS_ERR(seg1->mr_chunk.rl_mr)) {
		rc = PTR_ERR(seg1->mr_chunk.rl_mr);
		dprintk("RPC: %s: failed ib_reg_phys_mr "
			"%u@0x%llx (%d)... status %i\n",
			__func__, len,
			(unsigned long long)seg1->mr_dma, i, rc);
		/* Unwind all DMA mappings made above. */
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}
1768
1769static int
1770rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1771 struct rpcrdma_ia *ia)
1772{
1773 struct rpcrdma_mr_seg *seg1 = seg;
1774 int rc;
1775
1776 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1777 seg1->mr_chunk.rl_mr = NULL;
1778 while (seg1->mr_nsegs--)
1779 rpcrdma_unmap_one(ia, seg++);
1780 if (rc)
1781 dprintk("RPC: %s: failed ib_dereg_mr,"
1782 " status %i\n", __func__, rc);
1783 return rc;
1784}
1785
/*
 * rpcrdma_register_external - register a chunk of up to nsegs segments,
 * dispatching on the IA's configured memory-registration strategy.
 * Returns the number of segments registered, or -1 on failure.
 */
int
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int rc = 0;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		/* All memory pre-registered: just DMA-map one segment. */
		rpcrdma_map_one(ia, seg, writing);
		seg->mr_rkey = ia->ri_bind_mem->rkey;
		seg->mr_base = seg->mr_dma;
		seg->mr_nsegs = 1;
		nsegs = 1;
		break;
#endif

	/* Registration using frmr registration */
	case RPCRDMA_FRMR:
		rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
		break;

	/* Registration using fmr memory registration */
	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
		break;

	/* Registration using memory windows */
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
		break;

	/* Default registration each time */
	default:
		rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
		break;
	}
	if (rc)
		return -1;

	return nsegs;
}
1831
/*
 * rpcrdma_deregister_external - undo rpcrdma_register_external,
 * dispatching on the IA's memory-registration strategy.
 * @r: optional completion cookie (a struct rpcrdma_rep *); if still set
 *     after deregistration, its rr_func callback is invoked here.
 * Returns the number of segments that had been registered.
 */
int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
		struct rpcrdma_xprt *r_xprt, void *r)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int nsegs = seg->mr_nsegs, rc;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		BUG_ON(nsegs != 1);
		rpcrdma_unmap_one(ia, seg);
		rc = 0;
		break;
#endif

	case RPCRDMA_FRMR:
		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
		break;

	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_deregister_fmr_external(seg, ia);
		break;

	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* May clear r: completion then arrives via upcall instead. */
		rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
		break;

	default:
		rc = rpcrdma_deregister_default_external(seg, ia);
		break;
	}
	if (r) {
		/* Deregistration finished synchronously: run the reply
		 * callback now.
		 * NOTE(review): assumes rr_func is non-NULL whenever r is
		 * passed in — confirm with callers. */
		struct rpcrdma_rep *rep = r;
		void (*func)(struct rpcrdma_rep *) = rep->rr_func;
		rep->rr_func = NULL;
		func(rep);	/* dereg done, callback now */
	}
	return nsegs;
}
1874
1875/*
1876 * Prepost any receive buffer, then post send.
1877 *
1878 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1879 */
1880int
1881rpcrdma_ep_post(struct rpcrdma_ia *ia,
1882 struct rpcrdma_ep *ep,
1883 struct rpcrdma_req *req)
1884{
1885 struct ib_send_wr send_wr, *send_wr_fail;
1886 struct rpcrdma_rep *rep = req->rl_reply;
1887 int rc;
1888
1889 if (rep) {
1890 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1891 if (rc)
1892 goto out;
1893 req->rl_reply = NULL;
1894 }
1895
1896 send_wr.next = NULL;
1897 send_wr.wr_id = 0ULL; /* no send cookie */
1898 send_wr.sg_list = req->rl_send_iov;
1899 send_wr.num_sge = req->rl_niovs;
1900 send_wr.opcode = IB_WR_SEND;
c56c65fb
TT
1901 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1902 ib_dma_sync_single_for_device(ia->ri_id->device,
1903 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1904 DMA_TO_DEVICE);
1905 ib_dma_sync_single_for_device(ia->ri_id->device,
1906 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1907 DMA_TO_DEVICE);
1908 ib_dma_sync_single_for_device(ia->ri_id->device,
1909 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1910 DMA_TO_DEVICE);
1911
1912 if (DECR_CQCOUNT(ep) > 0)
1913 send_wr.send_flags = 0;
1914 else { /* Provider must take a send completion every now and then */
1915 INIT_CQCOUNT(ep);
1916 send_wr.send_flags = IB_SEND_SIGNALED;
1917 }
1918
1919 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1920 if (rc)
1921 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1922 rc);
1923out:
1924 return rc;
1925}
1926
1927/*
1928 * (Re)post a receive buffer.
1929 */
1930int
1931rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1932 struct rpcrdma_ep *ep,
1933 struct rpcrdma_rep *rep)
1934{
1935 struct ib_recv_wr recv_wr, *recv_wr_fail;
1936 int rc;
1937
1938 recv_wr.next = NULL;
1939 recv_wr.wr_id = (u64) (unsigned long) rep;
1940 recv_wr.sg_list = &rep->rr_iov;
1941 recv_wr.num_sge = 1;
1942
1943 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1944 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1945
1946 DECR_CQCOUNT(ep);
1947 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1948
1949 if (rc)
1950 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1951 rc);
1952 return rc;
1953}