xprtrdma: Reduce the number of hardway buffer allocations
net/sunrpc/xprtrdma/verbs.c
f58851e6 1/*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
a6b7a407 50#include <linux/interrupt.h>
c56c65fb 51#include <linux/pci.h> /* for Tavor hack below */
5a0e3ad6 52#include <linux/slab.h>
65866f82 53#include <asm/bitops.h>
c56c65fb 54
55#include "xprt_rdma.h"
56
57/*
58 * Globals/Macros
59 */
60
61#ifdef RPC_DEBUG
62# define RPCDBG_FACILITY RPCDBG_TRANS
63#endif
64
65/*
66 * internal functions
67 */
68
69/*
70 * handle replies in tasklet context, using a single, global list
71 * rdma tasklet function -- just turn around and call the func
72 * for all replies on the list
73 */
74
75static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
76static LIST_HEAD(rpcrdma_tasklets_g);
77
78static void
79rpcrdma_run_tasklet(unsigned long data)
80{
81 struct rpcrdma_rep *rep;
82 void (*func)(struct rpcrdma_rep *);
83 unsigned long flags;
84
85 data = data;
86 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
87 while (!list_empty(&rpcrdma_tasklets_g)) {
88 rep = list_entry(rpcrdma_tasklets_g.next,
89 struct rpcrdma_rep, rr_list);
90 list_del(&rep->rr_list);
91 func = rep->rr_func;
92 rep->rr_func = NULL;
93 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
94
95 if (func)
96 func(rep);
97 else
98 rpcrdma_recv_buffer_put(rep);
99
100 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
101 }
102 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
103}
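/* Note: the receive upcall runs in the provider's interrupt context, so
 * completed replies are queued on rpcrdma_tasklets_g and the real work
 * (rr_func, normally the RPC reply handler) runs later in softirq
 * context here.  A reply with no rr_func is simply recycled back to the
 * receive buffer pool.
 */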
104
105static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
106
107static inline void
108rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
109{
110 unsigned long flags;
111
112 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
113 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
114 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
115 tasklet_schedule(&rpcrdma_tasklet_g);
116}
117
118static void
119rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
120{
121 struct rpcrdma_ep *ep = context;
122
123 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
124 __func__, event->event, event->device->name, context);
125 if (ep->rep_connected == 1) {
126 ep->rep_connected = -EIO;
127 ep->rep_func(ep);
128 wake_up_all(&ep->rep_connect_wait);
129 }
130}
131
132static void
133rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
134{
135 struct rpcrdma_ep *ep = context;
136
137 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
138 __func__, event->event, event->device->name, context);
139 if (ep->rep_connected == 1) {
140 ep->rep_connected = -EIO;
141 ep->rep_func(ep);
142 wake_up_all(&ep->rep_connect_wait);
143 }
144}
145
146static void
147rpcrdma_sendcq_process_wc(struct ib_wc *wc)
c56c65fb 148{
fc664485 149 struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
c56c65fb 150
151 dprintk("RPC: %s: frmr %p status %X opcode %d\n",
152 __func__, frmr, wc->status, wc->opcode);
c56c65fb 153
fc664485 154 if (wc->wr_id == 0ULL)
c56c65fb 155 return;
fc664485 156 if (wc->status != IB_WC_SUCCESS)
c56c65fb 157 return;
c56c65fb 158
fc664485 159 if (wc->opcode == IB_WC_FAST_REG_MR)
5c635e09 160 frmr->r.frmr.state = FRMR_IS_VALID;
fc664485 161 else if (wc->opcode == IB_WC_LOCAL_INV)
5c635e09 162 frmr->r.frmr.state = FRMR_IS_INVALID;
163}
164
fc664485 165static int
1c00dd07 166rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
c56c65fb 167{
1c00dd07 168 struct ib_wc *wcs;
8301a2c0 169 int budget, count, rc;
c56c65fb 170
8301a2c0 171 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
172 do {
173 wcs = ep->rep_send_wcs;
174
175 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
176 if (rc <= 0)
177 return rc;
178
179 count = rc;
180 while (count-- > 0)
181 rpcrdma_sendcq_process_wc(wcs++);
8301a2c0 182 } while (rc == RPCRDMA_POLLSIZE && --budget);
1c00dd07 183 return 0;
fc664485 184}
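/* Each pass above pulls at most RPCRDMA_POLLSIZE completions off the CQ
 * with a single ib_poll_cq() call.  The loop repeats only while the CQ
 * keeps returning full batches, and the budget
 * (RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE passes) bounds how long one
 * upcall can spin before handing the CQ back to the provider.
 */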
c56c65fb 185
186/*
187 * Handle send, fast_reg_mr, and local_inv completions.
188 *
189 * Send events are typically suppressed and thus do not result
190 * in an upcall. Occasionally one is signaled, however. This
191 * prevents the provider's completion queue from wrapping and
192 * losing a completion.
193 */
194static void
195rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
196{
1c00dd07 197 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
198 int rc;
199
1c00dd07 200 rc = rpcrdma_sendcq_poll(cq, ep);
201 if (rc) {
202 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
203 __func__, rc);
204 return;
205 }
206
207 rc = ib_req_notify_cq(cq,
208 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
209 if (rc == 0)
210 return;
211 if (rc < 0) {
212 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
213 __func__, rc);
214 return;
215 }
216
1c00dd07 217 rpcrdma_sendcq_poll(cq, ep);
218}
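/* The IB_CQ_REPORT_MISSED_EVENTS flag above makes ib_req_notify_cq()
 * return a positive value if completions arrived between the last poll
 * and re-arming the CQ; in that case the CQ is polled one more time so
 * those completions are not stranded until the next signaled send.
 */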
219
220static void
221rpcrdma_recvcq_process_wc(struct ib_wc *wc)
222{
223 struct rpcrdma_rep *rep =
224 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
225
226 dprintk("RPC: %s: rep %p status %X opcode %X length %u\n",
227 __func__, rep, wc->status, wc->opcode, wc->byte_len);
228
229 if (wc->status != IB_WC_SUCCESS) {
230 rep->rr_len = ~0U;
231 goto out_schedule;
232 }
233 if (wc->opcode != IB_WC_RECV)
234 return;
235
236 rep->rr_len = wc->byte_len;
237 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
238 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
239
240 if (rep->rr_len >= 16) {
241 struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
242 unsigned int credits = ntohl(p->rm_credit);
243
244 if (credits == 0)
245 credits = 1; /* don't deadlock */
246 else if (credits > rep->rr_buffer->rb_max_requests)
247 credits = rep->rr_buffer->rb_max_requests;
248 atomic_set(&rep->rr_buffer->rb_credits, credits);
249 }
250
251out_schedule:
252 rpcrdma_schedule_tasklet(rep);
253}
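/* The first 16 bytes of every reply carry the RPC/RDMA header, so the
 * credit value granted by the server (rm_credit) can be read here,
 * before the tasklet runs, and is clamped to [1, rb_max_requests]:
 * zero would deadlock the transport, and anything larger than the
 * buffer pool could not be honored anyway.
 */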
254
255static int
1c00dd07 256rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
fc664485 257{
1c00dd07 258 struct ib_wc *wcs;
8301a2c0 259 int budget, count, rc;
fc664485 260
8301a2c0 261 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
262 do {
263 wcs = ep->rep_recv_wcs;
264
265 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
266 if (rc <= 0)
267 return rc;
268
269 count = rc;
270 while (count-- > 0)
271 rpcrdma_recvcq_process_wc(wcs++);
8301a2c0 272 } while (rc == RPCRDMA_POLLSIZE && --budget);
1c00dd07 273 return 0;
274}
275
276/*
fc664485 277 * Handle receive completions.
c56c65fb 278 *
279 * It is reentrant but processes single events in order to maintain
280 * ordering of receives to keep server credits.
281 *
282 * It is the responsibility of the scheduled tasklet to return
283 * recv buffers to the pool. NOTE: this affects synchronization of
284 * connection shutdown. That is, the structures required for
285 * the completion of the reply handler must remain intact until
286 * all memory has been reclaimed.
287 */
288static void
fc664485 289rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
c56c65fb 290{
1c00dd07 291 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
292 int rc;
293
1c00dd07 294 rc = rpcrdma_recvcq_poll(cq, ep);
295 if (rc) {
296 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
297 __func__, rc);
c56c65fb 298 return;
fc664485 299 }
c56c65fb 300
301 rc = ib_req_notify_cq(cq,
302 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
303 if (rc == 0)
304 return;
305 if (rc < 0) {
fc664485 306 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
307 __func__, rc);
308 return;
309 }
310
1c00dd07 311 rpcrdma_recvcq_poll(cq, ep);
312}
313
314#ifdef RPC_DEBUG
315static const char * const conn[] = {
316 "address resolved",
317 "address error",
318 "route resolved",
319 "route error",
320 "connect request",
321 "connect response",
322 "connect error",
323 "unreachable",
324 "rejected",
325 "established",
326 "disconnected",
327 "device removal"
328};
329#endif
330
331static int
332rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
333{
334 struct rpcrdma_xprt *xprt = id->context;
335 struct rpcrdma_ia *ia = &xprt->rx_ia;
336 struct rpcrdma_ep *ep = &xprt->rx_ep;
ff0db049 337#ifdef RPC_DEBUG
c56c65fb 338 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
ff0db049 339#endif
340 struct ib_qp_attr attr;
341 struct ib_qp_init_attr iattr;
342 int connstate = 0;
343
344 switch (event->event) {
345 case RDMA_CM_EVENT_ADDR_RESOLVED:
346 case RDMA_CM_EVENT_ROUTE_RESOLVED:
5675add3 347 ia->ri_async_rc = 0;
348 complete(&ia->ri_done);
349 break;
350 case RDMA_CM_EVENT_ADDR_ERROR:
351 ia->ri_async_rc = -EHOSTUNREACH;
352 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
353 __func__, ep);
354 complete(&ia->ri_done);
355 break;
356 case RDMA_CM_EVENT_ROUTE_ERROR:
357 ia->ri_async_rc = -ENETUNREACH;
358 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
359 __func__, ep);
360 complete(&ia->ri_done);
361 break;
362 case RDMA_CM_EVENT_ESTABLISHED:
363 connstate = 1;
364 ib_query_qp(ia->ri_id->qp, &attr,
365 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
366 &iattr);
367 dprintk("RPC: %s: %d responder resources"
368 " (%d initiator)\n",
369 __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
370 goto connected;
371 case RDMA_CM_EVENT_CONNECT_ERROR:
372 connstate = -ENOTCONN;
373 goto connected;
374 case RDMA_CM_EVENT_UNREACHABLE:
375 connstate = -ENETDOWN;
376 goto connected;
377 case RDMA_CM_EVENT_REJECTED:
378 connstate = -ECONNREFUSED;
379 goto connected;
380 case RDMA_CM_EVENT_DISCONNECTED:
381 connstate = -ECONNABORTED;
382 goto connected;
383 case RDMA_CM_EVENT_DEVICE_REMOVAL:
384 connstate = -ENODEV;
385connected:
21454aaa 386 dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
387 __func__,
388 (event->event <= 11) ? conn[event->event] :
389 "unknown connection error",
21454aaa 390 &addr->sin_addr.s_addr,
391 ntohs(addr->sin_port),
392 ep, event->event);
393 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
394 dprintk("RPC: %s: %sconnected\n",
395 __func__, connstate > 0 ? "" : "dis");
396 ep->rep_connected = connstate;
397 ep->rep_func(ep);
398 wake_up_all(&ep->rep_connect_wait);
399 break;
400 default:
1a954051 401 dprintk("RPC: %s: unexpected CM event %d\n",
c56c65fb 402 __func__, event->event);
403 break;
404 }
405
406#ifdef RPC_DEBUG
407 if (connstate == 1) {
408 int ird = attr.max_dest_rd_atomic;
409 int tird = ep->rep_remote_cma.responder_resources;
21454aaa 410 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
b3cd8d45 411 "on %s, memreg %d slots %d ird %d%s\n",
21454aaa 412 &addr->sin_addr.s_addr,
413 ntohs(addr->sin_port),
414 ia->ri_id->device->name,
415 ia->ri_memreg_strategy,
416 xprt->rx_buf.rb_max_requests,
417 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
418 } else if (connstate < 0) {
419 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
420 &addr->sin_addr.s_addr,
421 ntohs(addr->sin_port),
422 connstate);
423 }
424#endif
425
426 return 0;
427}
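/* Connection state is folded into ep->rep_connected: 1 means
 * established, 0 means a connect is in progress, and a negative errno
 * records why the connection failed or was lost.  Waiters in
 * rpcrdma_ep_connect()/_disconnect() sleep on rep_connect_wait for this
 * value to change.
 */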
428
429static struct rdma_cm_id *
430rpcrdma_create_id(struct rpcrdma_xprt *xprt,
431 struct rpcrdma_ia *ia, struct sockaddr *addr)
432{
433 struct rdma_cm_id *id;
434 int rc;
435
436 init_completion(&ia->ri_done);
437
b26f9b99 438 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
439 if (IS_ERR(id)) {
440 rc = PTR_ERR(id);
441 dprintk("RPC: %s: rdma_create_id() failed %i\n",
442 __func__, rc);
443 return id;
444 }
445
5675add3 446 ia->ri_async_rc = -ETIMEDOUT;
447 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
448 if (rc) {
449 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
450 __func__, rc);
451 goto out;
452 }
453 wait_for_completion_interruptible_timeout(&ia->ri_done,
454 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
455 rc = ia->ri_async_rc;
456 if (rc)
457 goto out;
458
5675add3 459 ia->ri_async_rc = -ETIMEDOUT;
460 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
461 if (rc) {
462 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
463 __func__, rc);
464 goto out;
465 }
466 wait_for_completion_interruptible_timeout(&ia->ri_done,
467 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
468 rc = ia->ri_async_rc;
469 if (rc)
470 goto out;
471
472 return id;
473
474out:
475 rdma_destroy_id(id);
476 return ERR_PTR(rc);
477}
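/* Address and route resolution are both asynchronous: the CM calls
 * rpcrdma_conn_upcall(), which fills in ri_async_rc and completes
 * ri_done.  ri_async_rc is preset to -ETIMEDOUT before each step so
 * that a wait which expires without an upcall reports a sensible
 * error.
 */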
478
479/*
480 * Drain any cq, prior to teardown.
481 */
482static void
483rpcrdma_clean_cq(struct ib_cq *cq)
484{
485 struct ib_wc wc;
486 int count = 0;
487
488 while (1 == ib_poll_cq(cq, 1, &wc))
489 ++count;
490
491 if (count)
492 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
493 __func__, count, wc.opcode);
494}
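/* This drain discards completions rather than processing them; it is
 * only used around connect and teardown, when the structures a
 * completion would reference are about to be torn down or reused.
 */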
495
496/*
497 * Exported functions.
498 */
499
500/*
501 * Open and initialize an Interface Adapter.
502 * o initializes fields of struct rpcrdma_ia, including
503 * interface and provider attributes and protection zone.
504 */
505int
506rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
507{
508 int rc, mem_priv;
509 struct ib_device_attr devattr;
510 struct rpcrdma_ia *ia = &xprt->rx_ia;
511
512 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
513 if (IS_ERR(ia->ri_id)) {
514 rc = PTR_ERR(ia->ri_id);
515 goto out1;
516 }
517
518 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
519 if (IS_ERR(ia->ri_pd)) {
520 rc = PTR_ERR(ia->ri_pd);
521 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
522 __func__, rc);
523 goto out2;
524 }
525
526 /*
527 * Query the device to determine if the requested memory
528 * registration strategy is supported. If it isn't, set the
529 * strategy to a globally supported model.
530 */
531 rc = ib_query_device(ia->ri_id->device, &devattr);
532 if (rc) {
533 dprintk("RPC: %s: ib_query_device failed %d\n",
534 __func__, rc);
535 goto out2;
536 }
537
538 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
539 ia->ri_have_dma_lkey = 1;
540 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
541 }
542
f10eafd3 543 if (memreg == RPCRDMA_FRMR) {
544 /* Requires both frmr reg and local dma lkey */
545 if ((devattr.device_cap_flags &
546 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
547 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
3197d309 548 dprintk("RPC: %s: FRMR registration "
549 "not supported by HCA\n", __func__);
550 memreg = RPCRDMA_MTHCAFMR;
551 } else {
552 /* Mind the ia limit on FRMR page list depth */
553 ia->ri_max_frmr_depth = min_t(unsigned int,
554 RPCRDMA_MAX_DATA_SEGS,
555 devattr.max_fast_reg_page_list_len);
bd7ed1d1 556 }
557 }
558 if (memreg == RPCRDMA_MTHCAFMR) {
559 if (!ia->ri_id->device->alloc_fmr) {
560 dprintk("RPC: %s: MTHCAFMR registration "
561 "not supported by HCA\n", __func__);
562#if RPCRDMA_PERSISTENT_REGISTRATION
563 memreg = RPCRDMA_ALLPHYSICAL;
564#else
cdd9ade7 565 rc = -ENOMEM;
566 goto out2;
567#endif
568 }
569 }
570
571 /*
572 * Optionally obtain an underlying physical identity mapping in
573 * order to do a memory window-based bind. This base registration
574 * is protected from remote access - that is enabled only by binding
575 * for the specific bytes targeted during each RPC operation, and
576 * revoked after the corresponding completion similar to a storage
577 * adapter.
578 */
bd7ed1d1 579 switch (memreg) {
3197d309 580 case RPCRDMA_FRMR:
bd7ed1d1 581 break;
c56c65fb 582#if RPCRDMA_PERSISTENT_REGISTRATION
583 case RPCRDMA_ALLPHYSICAL:
584 mem_priv = IB_ACCESS_LOCAL_WRITE |
585 IB_ACCESS_REMOTE_WRITE |
586 IB_ACCESS_REMOTE_READ;
587 goto register_setup;
c56c65fb 588#endif
589 case RPCRDMA_MTHCAFMR:
590 if (ia->ri_have_dma_lkey)
c56c65fb 591 break;
bd7ed1d1 592 mem_priv = IB_ACCESS_LOCAL_WRITE;
b45ccfd2 593#if RPCRDMA_PERSISTENT_REGISTRATION
bd7ed1d1 594 register_setup:
b45ccfd2 595#endif
596 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
597 if (IS_ERR(ia->ri_bind_mem)) {
598 printk(KERN_ALERT "%s: ib_get_dma_mr for "
0ac531c1 599 "phys register failed with %lX\n",
c56c65fb 600 __func__, PTR_ERR(ia->ri_bind_mem));
601 rc = -ENOMEM;
602 goto out2;
c56c65fb 603 }
604 break;
605 default:
606 printk(KERN_ERR "RPC: Unsupported memory "
607 "registration mode: %d\n", memreg);
608 rc = -ENOMEM;
bd7ed1d1 609 goto out2;
c56c65fb 610 }
611 dprintk("RPC: %s: memory registration strategy is %d\n",
612 __func__, memreg);
613
614 /* Else will do memory reg/dereg for each chunk */
615 ia->ri_memreg_strategy = memreg;
616
617 return 0;
618out2:
619 rdma_destroy_id(ia->ri_id);
fee08caf 620 ia->ri_id = NULL;
621out1:
622 return rc;
623}
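/* Memory registration strategy selection above degrades gracefully:
 * FRMR is preferred, but falls back to MTHCAFMR when the HCA lacks
 * fast-register support or a local DMA lkey, and MTHCAFMR in turn
 * falls back to ALLPHYSICAL (when compiled in) if the device has no
 * FMR support.  ri_memreg_strategy records the final choice for the
 * rest of the transport.
 */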
624
625/*
626 * Clean up/close an IA.
627 * o if event handles and PD have been initialized, free them.
628 * o close the IA
629 */
630void
631rpcrdma_ia_close(struct rpcrdma_ia *ia)
632{
633 int rc;
634
635 dprintk("RPC: %s: entering\n", __func__);
636 if (ia->ri_bind_mem != NULL) {
637 rc = ib_dereg_mr(ia->ri_bind_mem);
638 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
639 __func__, rc);
640 }
641 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
642 if (ia->ri_id->qp)
643 rdma_destroy_qp(ia->ri_id);
644 rdma_destroy_id(ia->ri_id);
645 ia->ri_id = NULL;
646 }
647 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
648 rc = ib_dealloc_pd(ia->ri_pd);
649 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
650 __func__, rc);
651 }
652}
653
654/*
655 * Create unconnected endpoint.
656 */
657int
658rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
659 struct rpcrdma_create_data_internal *cdata)
660{
661 struct ib_device_attr devattr;
fc664485 662 struct ib_cq *sendcq, *recvcq;
5d40a8a5 663 int rc, err;
664
665 rc = ib_query_device(ia->ri_id->device, &devattr);
666 if (rc) {
667 dprintk("RPC: %s: ib_query_device failed %d\n",
668 __func__, rc);
669 return rc;
670 }
671
672 /* check provider's send/recv wr limits */
673 if (cdata->max_requests > devattr.max_qp_wr)
674 cdata->max_requests = devattr.max_qp_wr;
675
676 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
677 ep->rep_attr.qp_context = ep;
678 /* send_cq and recv_cq initialized below */
679 ep->rep_attr.srq = NULL;
680 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
681 switch (ia->ri_memreg_strategy) {
682 case RPCRDMA_FRMR: {
683 int depth = 7;
684
685 /* Add room for frmr register and invalidate WRs.
686 * 1. FRMR reg WR for head
687 * 2. FRMR invalidate WR for head
688 * 3. N FRMR reg WRs for pagelist
689 * 4. N FRMR invalidate WRs for pagelist
690 * 5. FRMR reg WR for tail
691 * 6. FRMR invalidate WR for tail
692 * 7. The RDMA_SEND WR
693 */
694
695 /* Calculate N if the device max FRMR depth is smaller than
696 * RPCRDMA_MAX_DATA_SEGS.
697 */
698 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
699 int delta = RPCRDMA_MAX_DATA_SEGS -
700 ia->ri_max_frmr_depth;
701
702 do {
703 depth += 2; /* FRMR reg + invalidate */
704 delta -= ia->ri_max_frmr_depth;
705 } while (delta > 0);
706
707 }
708 ep->rep_attr.cap.max_send_wr *= depth;
15cdc644 709 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
0fc6c4e7 710 cdata->max_requests = devattr.max_qp_wr / depth;
711 if (!cdata->max_requests)
712 return -EINVAL;
713 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
714 depth;
15cdc644 715 }
3197d309 716 break;
0fc6c4e7 717 }
718 default:
719 break;
720 }
721 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
722 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
723 ep->rep_attr.cap.max_recv_sge = 1;
724 ep->rep_attr.cap.max_inline_data = 0;
725 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
726 ep->rep_attr.qp_type = IB_QPT_RC;
727 ep->rep_attr.port_num = ~0;
728
729 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
730 "iovs: send %d recv %d\n",
731 __func__,
732 ep->rep_attr.cap.max_send_wr,
733 ep->rep_attr.cap.max_recv_wr,
734 ep->rep_attr.cap.max_send_sge,
735 ep->rep_attr.cap.max_recv_sge);
736
737 /* set trigger for requesting send completion */
fc664485 738 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
739 if (ep->rep_cqinit <= 2)
740 ep->rep_cqinit = 0;
741 INIT_CQCOUNT(ep);
742 ep->rep_ia = ia;
743 init_waitqueue_head(&ep->rep_connect_wait);
254f91e2 744 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
c56c65fb 745
fc664485 746 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
1c00dd07 747 rpcrdma_cq_async_error_upcall, ep,
c56c65fb 748 ep->rep_attr.cap.max_send_wr + 1, 0);
749 if (IS_ERR(sendcq)) {
750 rc = PTR_ERR(sendcq);
751 dprintk("RPC: %s: failed to create send CQ: %i\n",
752 __func__, rc);
753 goto out1;
754 }
755
fc664485 756 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
757 if (rc) {
758 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
759 __func__, rc);
760 goto out2;
761 }
762
fc664485 763 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
1c00dd07 764 rpcrdma_cq_async_error_upcall, ep,
765 ep->rep_attr.cap.max_recv_wr + 1, 0);
766 if (IS_ERR(recvcq)) {
767 rc = PTR_ERR(recvcq);
768 dprintk("RPC: %s: failed to create recv CQ: %i\n",
769 __func__, rc);
770 goto out2;
771 }
772
773 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
774 if (rc) {
775 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
776 __func__, rc);
777 ib_destroy_cq(recvcq);
778 goto out2;
779 }
780
781 ep->rep_attr.send_cq = sendcq;
782 ep->rep_attr.recv_cq = recvcq;
783
784 /* Initialize cma parameters */
785
786 /* RPC/RDMA does not use private data */
787 ep->rep_remote_cma.private_data = NULL;
788 ep->rep_remote_cma.private_data_len = 0;
789
790 /* Client offers RDMA Read but does not initiate */
b334eaab 791 ep->rep_remote_cma.initiator_depth = 0;
03ff8821 792 if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
793 ep->rep_remote_cma.responder_resources = 32;
794 else
c56c65fb 795 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
796
797 ep->rep_remote_cma.retry_count = 7;
798 ep->rep_remote_cma.flow_control = 0;
799 ep->rep_remote_cma.rnr_retry_count = 0;
800
801 return 0;
802
803out2:
fc664485 804 err = ib_destroy_cq(sendcq);
805 if (err)
806 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
807 __func__, err);
808out1:
809 return rc;
810}
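/* Send and receive completions are handled on separate CQs, each with
 * its own upcall and polling budget, so reply processing is not queued
 * behind a burst of send/fast_reg/local_inv completions.  For FRMR,
 * max_send_wr is multiplied by a per-request depth: the base of 7 WRs
 * covers reg/invalidate pairs for head, pagelist and tail plus the
 * RDMA_SEND itself, and 2 more WRs are added for each extra
 * fast_reg/invalidate pair needed when max_fast_reg_page_list_len
 * cannot cover RPCRDMA_MAX_DATA_SEGS in a single FRMR.
 */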
811
812/*
813 * rpcrdma_ep_destroy
814 *
815 * Disconnect and destroy endpoint. After this, the only
816 * valid operations on the ep are to free it (if dynamically
817 * allocated) or re-create it.
c56c65fb 818 */
7f1d5419 819void
820rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
821{
822 int rc;
823
824 dprintk("RPC: %s: entering, connected is %d\n",
825 __func__, ep->rep_connected);
826
827 cancel_delayed_work_sync(&ep->rep_connect_worker);
828
829 if (ia->ri_id->qp) {
830 rc = rpcrdma_ep_disconnect(ep, ia);
831 if (rc)
832 dprintk("RPC: %s: rpcrdma_ep_disconnect"
833 " returned %i\n", __func__, rc);
834 rdma_destroy_qp(ia->ri_id);
835 ia->ri_id->qp = NULL;
836 }
837
838 /* padding - could be done in rpcrdma_buffer_destroy... */
839 if (ep->rep_pad_mr) {
840 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
841 ep->rep_pad_mr = NULL;
842 }
843
844 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
845 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
846 if (rc)
847 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
848 __func__, rc);
849
850 rpcrdma_clean_cq(ep->rep_attr.send_cq);
851 rc = ib_destroy_cq(ep->rep_attr.send_cq);
852 if (rc)
853 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
854 __func__, rc);
855}
856
857/*
858 * Connect unconnected endpoint.
859 */
860int
861rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
862{
863 struct rdma_cm_id *id;
864 int rc = 0;
865 int retry_count = 0;
c56c65fb 866
c055551e 867 if (ep->rep_connected != 0) {
868 struct rpcrdma_xprt *xprt;
869retry:
870 rc = rpcrdma_ep_disconnect(ep, ia);
871 if (rc && rc != -ENOTCONN)
872 dprintk("RPC: %s: rpcrdma_ep_disconnect"
873 " status %i\n", __func__, rc);
874
875 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
876 rpcrdma_clean_cq(ep->rep_attr.send_cq);
877
878 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
879 id = rpcrdma_create_id(xprt, ia,
880 (struct sockaddr *)&xprt->rx_data.addr);
881 if (IS_ERR(id)) {
882 rc = PTR_ERR(id);
883 goto out;
884 }
885 /* TEMP TEMP TEMP - fail if new device:
886 * Deregister/remarshal *all* requests!
887 * Close and recreate adapter, pd, etc!
888 * Re-determine all attributes still sane!
889 * More stuff I haven't thought of!
890 * Rrrgh!
891 */
892 if (ia->ri_id->device != id->device) {
893 printk("RPC: %s: can't reconnect on "
894 "different device!\n", __func__);
895 rdma_destroy_id(id);
896 rc = -ENETDOWN;
897 goto out;
898 }
899 /* END TEMP */
1a954051 900 rdma_destroy_qp(ia->ri_id);
901 rdma_destroy_id(ia->ri_id);
902 ia->ri_id = id;
903 }
904
905 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
906 if (rc) {
907 dprintk("RPC: %s: rdma_create_qp failed %i\n",
908 __func__, rc);
909 goto out;
910 }
911
912/* XXX Tavor device performs badly with 2K MTU! */
913if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
914 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
915 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
916 (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
917 pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
918 struct ib_qp_attr attr = {
919 .path_mtu = IB_MTU_1024
920 };
921 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
922 }
923}
924
925 ep->rep_connected = 0;
926
927 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
928 if (rc) {
929 dprintk("RPC: %s: rdma_connect() failed with %i\n",
930 __func__, rc);
931 goto out;
932 }
933
934 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
935
936 /*
937 * Check state. A non-peer reject indicates no listener
938 * (ECONNREFUSED), which may be a transient state. All
939 * others indicate a transport condition which has already
940 * undergone a best-effort.
941 */
942 if (ep->rep_connected == -ECONNREFUSED &&
943 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
944 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
945 goto retry;
946 }
947 if (ep->rep_connected <= 0) {
948 /* Sometimes, the only way to reliably connect to remote
949 * CMs is to use same nonzero values for ORD and IRD. */
950 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
951 (ep->rep_remote_cma.responder_resources == 0 ||
952 ep->rep_remote_cma.initiator_depth !=
953 ep->rep_remote_cma.responder_resources)) {
954 if (ep->rep_remote_cma.responder_resources == 0)
955 ep->rep_remote_cma.responder_resources = 1;
956 ep->rep_remote_cma.initiator_depth =
957 ep->rep_remote_cma.responder_resources;
c56c65fb 958 goto retry;
b334eaab 959 }
960 rc = ep->rep_connected;
961 } else {
962 dprintk("RPC: %s: connected\n", __func__);
963 }
964
965out:
966 if (rc)
967 ep->rep_connected = rc;
968 return rc;
969}
970
971/*
972 * rpcrdma_ep_disconnect
973 *
974 * This is separate from destroy to facilitate the ability
975 * to reconnect without recreating the endpoint.
976 *
977 * This call is not reentrant, and must not be made in parallel
978 * on the same endpoint.
979 */
980int
981rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
982{
983 int rc;
984
985 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
986 rpcrdma_clean_cq(ep->rep_attr.send_cq);
987 rc = rdma_disconnect(ia->ri_id);
988 if (!rc) {
989 /* returns without wait if not connected */
990 wait_event_interruptible(ep->rep_connect_wait,
991 ep->rep_connected != 1);
992 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
993 (ep->rep_connected == 1) ? "still " : "dis");
994 } else {
995 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
996 ep->rep_connected = rc;
997 }
998 return rc;
999}
1000
1001/*
1002 * Initialize buffer memory
1003 */
1004int
1005rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1006 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
1007{
1008 char *p;
65866f82 1009 size_t len, rlen, wlen;
c56c65fb 1010 int i, rc;
8d4ba034 1011 struct rpcrdma_mw *r;
1012
1013 buf->rb_max_requests = cdata->max_requests;
1014 spin_lock_init(&buf->rb_lock);
1015 atomic_set(&buf->rb_credits, 1);
1016
1017 /* Need to allocate:
1018 * 1. arrays for send and recv pointers
1019 * 2. arrays of struct rpcrdma_req to fill in pointers
1020 * 3. array of struct rpcrdma_rep for replies
1021 * 4. padding, if any
3197d309 1022 * 5. mw's, fmr's or frmr's, if any
1023 * Send/recv buffers in req/rep need to be registered
1024 */
1025
1026 len = buf->rb_max_requests *
1027 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1028 len += cdata->padding;
1029 switch (ia->ri_memreg_strategy) {
1030 case RPCRDMA_FRMR:
1031 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
1032 sizeof(struct rpcrdma_mw);
1033 break;
1034 case RPCRDMA_MTHCAFMR:
1035 /* TBD we are perhaps overallocating here */
1036 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
1037 sizeof(struct rpcrdma_mw);
1038 break;
1039 default:
1040 break;
1041 }
1042
1043 /* allocate 1, 4 and 5 in one shot */
1044 p = kzalloc(len, GFP_KERNEL);
1045 if (p == NULL) {
1046 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1047 __func__, len);
1048 rc = -ENOMEM;
1049 goto out;
1050 }
1051 buf->rb_pool = p; /* for freeing it later */
1052
1053 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1054 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1055 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1056 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1057
1058 /*
1059 * Register the zeroed pad buffer, if any.
1060 */
1061 if (cdata->padding) {
1062 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1063 &ep->rep_pad_mr, &ep->rep_pad);
1064 if (rc)
1065 goto out;
1066 }
1067 p += cdata->padding;
1068
c56c65fb 1069 INIT_LIST_HEAD(&buf->rb_mws);
8d4ba034 1070 r = (struct rpcrdma_mw *)p;
c56c65fb 1071 switch (ia->ri_memreg_strategy) {
1072 case RPCRDMA_FRMR:
1073 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1074 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
0fc6c4e7 1075 ia->ri_max_frmr_depth);
1076 if (IS_ERR(r->r.frmr.fr_mr)) {
1077 rc = PTR_ERR(r->r.frmr.fr_mr);
1078 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1079 " failed %i\n", __func__, rc);
1080 goto out;
1081 }
1082 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1083 ia->ri_id->device,
1084 ia->ri_max_frmr_depth);
1085 if (IS_ERR(r->r.frmr.fr_pgl)) {
1086 rc = PTR_ERR(r->r.frmr.fr_pgl);
1087 dprintk("RPC: %s: "
1088 "ib_alloc_fast_reg_page_list "
1089 "failed %i\n", __func__, rc);
1090
1091 ib_dereg_mr(r->r.frmr.fr_mr);
1092 goto out;
1093 }
1094 list_add(&r->mw_list, &buf->rb_mws);
1095 ++r;
1096 }
1097 break;
c56c65fb 1098 case RPCRDMA_MTHCAFMR:
1099 /* TBD we are perhaps overallocating here */
1100 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1101 static struct ib_fmr_attr fa =
1102 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
1103 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1104 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1105 &fa);
1106 if (IS_ERR(r->r.fmr)) {
1107 rc = PTR_ERR(r->r.fmr);
1108 dprintk("RPC: %s: ib_alloc_fmr"
1109 " failed %i\n", __func__, rc);
1110 goto out;
1111 }
1112 list_add(&r->mw_list, &buf->rb_mws);
1113 ++r;
1114 }
c56c65fb 1115 break;
1116 default:
1117 break;
1118 }
1119
1120 /*
1121 * Allocate/init the request/reply buffers. Doing this
1122 * using kmalloc for now -- one for each buf.
1123 */
1124 wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
1125 rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
1126 dprintk("RPC: %s: wlen = %zu, rlen = %zu\n",
1127 __func__, wlen, rlen);
1128
1129 for (i = 0; i < buf->rb_max_requests; i++) {
1130 struct rpcrdma_req *req;
1131 struct rpcrdma_rep *rep;
1132
65866f82 1133 req = kmalloc(wlen, GFP_KERNEL);
1134 if (req == NULL) {
1135 dprintk("RPC: %s: request buffer %d alloc"
1136 " failed\n", __func__, i);
1137 rc = -ENOMEM;
1138 goto out;
1139 }
1140 memset(req, 0, sizeof(struct rpcrdma_req));
1141 buf->rb_send_bufs[i] = req;
1142 buf->rb_send_bufs[i]->rl_buffer = buf;
1143
1144 rc = rpcrdma_register_internal(ia, req->rl_base,
65866f82 1145 wlen - offsetof(struct rpcrdma_req, rl_base),
1146 &buf->rb_send_bufs[i]->rl_handle,
1147 &buf->rb_send_bufs[i]->rl_iov);
1148 if (rc)
1149 goto out;
1150
1151 buf->rb_send_bufs[i]->rl_size = wlen -
1152 sizeof(struct rpcrdma_req);
c56c65fb 1153
65866f82 1154 rep = kmalloc(rlen, GFP_KERNEL);
1155 if (rep == NULL) {
1156 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1157 __func__, i);
1158 rc = -ENOMEM;
1159 goto out;
1160 }
1161 memset(rep, 0, sizeof(struct rpcrdma_rep));
1162 buf->rb_recv_bufs[i] = rep;
1163 buf->rb_recv_bufs[i]->rr_buffer = buf;
1164
1165 rc = rpcrdma_register_internal(ia, rep->rr_base,
65866f82 1166 rlen - offsetof(struct rpcrdma_rep, rr_base),
1167 &buf->rb_recv_bufs[i]->rr_handle,
1168 &buf->rb_recv_bufs[i]->rr_iov);
1169 if (rc)
1170 goto out;
1171
1172 }
1173 dprintk("RPC: %s: max_requests %d\n",
1174 __func__, buf->rb_max_requests);
1175 /* done */
1176 return 0;
1177out:
1178 rpcrdma_buffer_destroy(buf);
1179 return rc;
1180}
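/* Note on the wlen/rlen calculation above: rounding each request and
 * reply buffer up to a power of two (1 << fls(...)) makes rl_size
 * somewhat larger than the configured inline write size.  RPCs that
 * are only slightly bigger than the inline threshold can then still be
 * served from these pre-registered buffers, which is presumably the
 * "hardway" allocation reduction referred to in the commit subject.
 */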
1181
1182/*
1183 * Unregister and destroy buffer memory. Need to deal with
1184 * partial initialization, so it's callable from failed create.
1185 * Must be called before destroying endpoint, as registrations
1186 * reference it.
1187 */
1188void
1189rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1190{
1191 int rc, i;
1192 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
8d4ba034 1193 struct rpcrdma_mw *r;
1194
1195 /* clean up in reverse order from create
1196 * 1. recv mr memory (mr free, then kfree)
1197 * 2. send mr memory (mr free, then kfree)
1198 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1199 * 4. arrays
1200 */
1201 dprintk("RPC: %s: entering\n", __func__);
1202
1203 for (i = 0; i < buf->rb_max_requests; i++) {
1204 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1205 rpcrdma_deregister_internal(ia,
1206 buf->rb_recv_bufs[i]->rr_handle,
1207 &buf->rb_recv_bufs[i]->rr_iov);
1208 kfree(buf->rb_recv_bufs[i]);
1209 }
1210 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1211 rpcrdma_deregister_internal(ia,
1212 buf->rb_send_bufs[i]->rl_handle,
1213 &buf->rb_send_bufs[i]->rl_iov);
1214 kfree(buf->rb_send_bufs[i]);
1215 }
1216 }
1217
1218 while (!list_empty(&buf->rb_mws)) {
1219 r = list_entry(buf->rb_mws.next,
1220 struct rpcrdma_mw, mw_list);
1221 list_del(&r->mw_list);
1222 switch (ia->ri_memreg_strategy) {
1223 case RPCRDMA_FRMR:
1224 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1225 if (rc)
1226 dprintk("RPC: %s:"
1227 " ib_dereg_mr"
1228 " failed %i\n",
1229 __func__, rc);
1230 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1231 break;
1232 case RPCRDMA_MTHCAFMR:
1233 rc = ib_dealloc_fmr(r->r.fmr);
1234 if (rc)
1235 dprintk("RPC: %s:"
1236 " ib_dealloc_fmr"
1237 " failed %i\n",
1238 __func__, rc);
1239 break;
1240 default:
1241 break;
1242 }
1243 }
1244
1245 kfree(buf->rb_pool);
1246}
1247
1248/*
1249 * Get a set of request/reply buffers.
1250 *
1251 * Reply buffer (if needed) is attached to send buffer upon return.
1252 * Rule:
1253 * rb_send_index and rb_recv_index MUST always be pointing to the
1254 * *next* available buffer (non-NULL). They are incremented after
1255 * removing buffers, and decremented *before* returning them.
1256 */
1257struct rpcrdma_req *
1258rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1259{
1260 struct rpcrdma_req *req;
1261 unsigned long flags;
1262 int i;
1263 struct rpcrdma_mw *r;
1264
1265 spin_lock_irqsave(&buffers->rb_lock, flags);
1266 if (buffers->rb_send_index == buffers->rb_max_requests) {
1267 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1268 dprintk("RPC: %s: out of request buffers\n", __func__);
1269 return ((struct rpcrdma_req *)NULL);
1270 }
1271
1272 req = buffers->rb_send_bufs[buffers->rb_send_index];
1273 if (buffers->rb_send_index < buffers->rb_recv_index) {
1274 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1275 __func__,
1276 buffers->rb_recv_index - buffers->rb_send_index);
1277 req->rl_reply = NULL;
1278 } else {
1279 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1280 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1281 }
1282 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1283 if (!list_empty(&buffers->rb_mws)) {
8d4ba034 1284 i = RPCRDMA_MAX_SEGS - 1;
c56c65fb 1285 do {
1286 r = list_entry(buffers->rb_mws.next,
1287 struct rpcrdma_mw, mw_list);
1288 list_del(&r->mw_list);
1289 req->rl_segments[i].mr_chunk.rl_mw = r;
1290 } while (--i >= 0);
1291 }
1292 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1293 return req;
1294}
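/* Each request is handed a full set of RPCRDMA_MAX_SEGS mw structs
 * from rb_mws while rb_lock is held, so the marshaling code never has
 * to hunt for a free mw in the middle of building a chunk list.
 * rpcrdma_buffer_put() below returns them in reverse order to delay
 * and scramble reuse.
 */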
1295
1296/*
1297 * Put request/reply buffers back into pool.
1298 * Pre-decrement counter/array index.
1299 */
1300void
1301rpcrdma_buffer_put(struct rpcrdma_req *req)
1302{
1303 struct rpcrdma_buffer *buffers = req->rl_buffer;
1304 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1305 int i;
1306 unsigned long flags;
1307
1308 BUG_ON(req->rl_nchunks != 0);
1309 spin_lock_irqsave(&buffers->rb_lock, flags);
1310 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1311 req->rl_niovs = 0;
1312 if (req->rl_reply) {
1313 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1314 req->rl_reply->rr_func = NULL;
1315 req->rl_reply = NULL;
1316 }
1317 switch (ia->ri_memreg_strategy) {
3197d309 1318 case RPCRDMA_FRMR:
c56c65fb 1319 case RPCRDMA_MTHCAFMR:
1320 /*
1321 * Cycle mw's back in reverse order, and "spin" them.
1322 * This delays and scrambles reuse as much as possible.
1323 */
1324 i = 1;
1325 do {
1326 struct rpcrdma_mw **mw;
1327 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1328 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1329 *mw = NULL;
1330 } while (++i < RPCRDMA_MAX_SEGS);
1331 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1332 &buffers->rb_mws);
1333 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1334 break;
1335 default:
1336 break;
1337 }
1338 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1339}
1340
1341/*
1342 * Recover reply buffers from pool.
1343 * This happens when recovering from error conditions.
1344 * Post-increment counter/array index.
1345 */
1346void
1347rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1348{
1349 struct rpcrdma_buffer *buffers = req->rl_buffer;
1350 unsigned long flags;
1351
1352 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1353 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1354 spin_lock_irqsave(&buffers->rb_lock, flags);
1355 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1356 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1357 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1358 }
1359 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1360}
1361
1362/*
1363 * Put reply buffers back into pool when not attached to
b45ccfd2 1364 * request. This happens in error conditions.
1365 */
1366void
1367rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1368{
1369 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1370 unsigned long flags;
1371
1372 rep->rr_func = NULL;
1373 spin_lock_irqsave(&buffers->rb_lock, flags);
1374 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1375 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1376}
1377
1378/*
1379 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1380 */
1381
1382int
1383rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1384 struct ib_mr **mrp, struct ib_sge *iov)
1385{
1386 struct ib_phys_buf ipb;
1387 struct ib_mr *mr;
1388 int rc;
1389
1390 /*
1391 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1392 */
1393 iov->addr = ib_dma_map_single(ia->ri_id->device,
1394 va, len, DMA_BIDIRECTIONAL);
1395 iov->length = len;
1396
1397 if (ia->ri_have_dma_lkey) {
1398 *mrp = NULL;
1399 iov->lkey = ia->ri_dma_lkey;
1400 return 0;
1401 } else if (ia->ri_bind_mem != NULL) {
1402 *mrp = NULL;
1403 iov->lkey = ia->ri_bind_mem->lkey;
1404 return 0;
1405 }
1406
1407 ipb.addr = iov->addr;
1408 ipb.size = iov->length;
1409 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1410 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1411
1412 dprintk("RPC: %s: phys convert: 0x%llx "
1413 "registered 0x%llx length %d\n",
1414 __func__, (unsigned long long)ipb.addr,
1415 (unsigned long long)iov->addr, len);
1416
1417 if (IS_ERR(mr)) {
1418 *mrp = NULL;
1419 rc = PTR_ERR(mr);
1420 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1421 } else {
1422 *mrp = mr;
1423 iov->lkey = mr->lkey;
1424 rc = 0;
1425 }
1426
1427 return rc;
1428}
1429
1430int
1431rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1432 struct ib_mr *mr, struct ib_sge *iov)
1433{
1434 int rc;
1435
1436 ib_dma_unmap_single(ia->ri_id->device,
1437 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1438
1439 if (NULL == mr)
1440 return 0;
1441
1442 rc = ib_dereg_mr(mr);
1443 if (rc)
1444 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1445 return rc;
1446}
1447
1448/*
1449 * Wrappers for chunk registration, shared by read/write chunk code.
1450 */
1451
1452static void
1453rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1454{
1455 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1456 seg->mr_dmalen = seg->mr_len;
1457 if (seg->mr_page)
1458 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1459 seg->mr_page, offset_in_page(seg->mr_offset),
1460 seg->mr_dmalen, seg->mr_dir);
1461 else
1462 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1463 seg->mr_offset,
1464 seg->mr_dmalen, seg->mr_dir);
1465 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1466 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1467 __func__,
1468 (unsigned long long)seg->mr_dma,
1469 seg->mr_offset, seg->mr_dmalen);
5c635e09 1470 }
1471}
1472
1473static void
1474rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1475{
1476 if (seg->mr_page)
1477 ib_dma_unmap_page(ia->ri_id->device,
1478 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1479 else
1480 ib_dma_unmap_single(ia->ri_id->device,
1481 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1482}
1483
1484static int
1485rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1486 int *nsegs, int writing, struct rpcrdma_ia *ia,
1487 struct rpcrdma_xprt *r_xprt)
1488{
1489 struct rpcrdma_mr_seg *seg1 = seg;
1490 struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
1491
1492 u8 key;
1493 int len, pageoff;
1494 int i, rc;
1495 int seg_len;
1496 u64 pa;
1497 int page_no;
1498
1499 pageoff = offset_in_page(seg1->mr_offset);
1500 seg1->mr_offset -= pageoff; /* start of page */
1501 seg1->mr_len += pageoff;
1502 len = -pageoff;
1503 if (*nsegs > ia->ri_max_frmr_depth)
1504 *nsegs = ia->ri_max_frmr_depth;
9b78145c 1505 for (page_no = i = 0; i < *nsegs;) {
3197d309 1506 rpcrdma_map_one(ia, seg, writing);
1507 pa = seg->mr_dma;
1508 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1509 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->
1510 page_list[page_no++] = pa;
1511 pa += PAGE_SIZE;
1512 }
1513 len += seg->mr_len;
1514 ++seg;
1515 ++i;
1516 /* Check for holes */
1517 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1518 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1519 break;
1520 }
1521 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1522 __func__, seg1->mr_chunk.rl_mw, i);
1523
1524 if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
1525 dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
1526 __func__,
1527 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
1528 /* Invalidate before using. */
1529 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1530 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1531 invalidate_wr.next = &frmr_wr;
1532 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1533 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1534 invalidate_wr.ex.invalidate_rkey =
1535 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1536 DECR_CQCOUNT(&r_xprt->rx_ep);
1537 post_wr = &invalidate_wr;
1538 } else
1539 post_wr = &frmr_wr;
1540
1541 /* Bump the key */
1542 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1543 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1544
1545 /* Prepare FRMR WR */
1546 memset(&frmr_wr, 0, sizeof frmr_wr);
5c635e09 1547 frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
3197d309 1548 frmr_wr.opcode = IB_WR_FAST_REG_MR;
5c635e09 1549 frmr_wr.send_flags = IB_SEND_SIGNALED;
7a8b80eb 1550 frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
3197d309 1551 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
9b78145c 1552 frmr_wr.wr.fast_reg.page_list_len = page_no;
3197d309 1553 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
9b78145c 1554 frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
5c635e09 1555 BUG_ON(frmr_wr.wr.fast_reg.length < len);
3197d309 1556 frmr_wr.wr.fast_reg.access_flags = (writing ?
1557 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1558 IB_ACCESS_REMOTE_READ);
1559 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1560 DECR_CQCOUNT(&r_xprt->rx_ep);
1561
5c635e09 1562 rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
1563
1564 if (rc) {
1565 dprintk("RPC: %s: failed ib_post_send for register,"
1566 " status %i\n", __func__, rc);
1567 while (i--)
1568 rpcrdma_unmap_one(ia, --seg);
1569 } else {
1570 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1571 seg1->mr_base = seg1->mr_dma + pageoff;
1572 seg1->mr_nsegs = i;
1573 seg1->mr_len = len;
1574 }
1575 *nsegs = i;
1576 return rc;
1577}
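/* FRMR registration is asynchronous: the FAST_REG_MR (and, when the
 * frmr was left valid by a lost completion, a leading LOCAL_INV) is
 * posted on the send queue, and the send-CQ handler flips frmr.state
 * between FRMR_IS_VALID and FRMR_IS_INVALID as those work requests
 * complete.  The rkey is bumped before every registration so a stale
 * rkey from a previous use cannot match.
 */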
1578
1579static int
1580rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1581 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1582{
1583 struct rpcrdma_mr_seg *seg1 = seg;
1584 struct ib_send_wr invalidate_wr, *bad_wr;
1585 int rc;
1586
1587 while (seg1->mr_nsegs--)
1588 rpcrdma_unmap_one(ia, seg++);
1589
1590 memset(&invalidate_wr, 0, sizeof invalidate_wr);
5c635e09 1591 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
3197d309 1592 invalidate_wr.opcode = IB_WR_LOCAL_INV;
5c635e09 1593 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1594 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1595 DECR_CQCOUNT(&r_xprt->rx_ep);
1596
1597 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1598 if (rc)
1599 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1600 " status %i\n", __func__, rc);
1601 return rc;
1602}
1603
1604static int
1605rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1606 int *nsegs, int writing, struct rpcrdma_ia *ia)
1607{
1608 struct rpcrdma_mr_seg *seg1 = seg;
1609 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1610 int len, pageoff, i, rc;
1611
1612 pageoff = offset_in_page(seg1->mr_offset);
1613 seg1->mr_offset -= pageoff; /* start of page */
1614 seg1->mr_len += pageoff;
1615 len = -pageoff;
1616 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1617 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1618 for (i = 0; i < *nsegs;) {
1619 rpcrdma_map_one(ia, seg, writing);
1620 physaddrs[i] = seg->mr_dma;
1621 len += seg->mr_len;
1622 ++seg;
1623 ++i;
1624 /* Check for holes */
1625 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1626 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1627 break;
1628 }
1629 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1630 physaddrs, i, seg1->mr_dma);
1631 if (rc) {
1632 dprintk("RPC: %s: failed ib_map_phys_fmr "
1633 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1634 len, (unsigned long long)seg1->mr_dma,
1635 pageoff, i, rc);
1636 while (i--)
1637 rpcrdma_unmap_one(ia, --seg);
1638 } else {
1639 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1640 seg1->mr_base = seg1->mr_dma + pageoff;
1641 seg1->mr_nsegs = i;
1642 seg1->mr_len = len;
1643 }
1644 *nsegs = i;
1645 return rc;
1646}
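/* Unlike the FRMR path, FMR registration is synchronous:
 * ib_map_phys_fmr() maps the physical addresses directly and no work
 * request or completion is involved, so there is no state to track on
 * the send CQ.
 */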
1647
1648static int
1649rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1650 struct rpcrdma_ia *ia)
1651{
1652 struct rpcrdma_mr_seg *seg1 = seg;
1653 LIST_HEAD(l);
1654 int rc;
1655
1656 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1657 rc = ib_unmap_fmr(&l);
1658 while (seg1->mr_nsegs--)
1659 rpcrdma_unmap_one(ia, seg++);
1660 if (rc)
1661 dprintk("RPC: %s: failed ib_unmap_fmr,"
1662 " status %i\n", __func__, rc);
1663 return rc;
1664}
1665
1666int
1667rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1668 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1669{
1670 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1671 int rc = 0;
1672
1673 switch (ia->ri_memreg_strategy) {
1674
1675#if RPCRDMA_PERSISTENT_REGISTRATION
1676 case RPCRDMA_ALLPHYSICAL:
1677 rpcrdma_map_one(ia, seg, writing);
1678 seg->mr_rkey = ia->ri_bind_mem->rkey;
1679 seg->mr_base = seg->mr_dma;
1680 seg->mr_nsegs = 1;
1681 nsegs = 1;
1682 break;
1683#endif
1684
1685 /* Registration using frmr registration */
1686 case RPCRDMA_FRMR:
1687 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1688 break;
1689
8d4ba034 1690 /* Registration using fmr memory registration */
c56c65fb 1691 case RPCRDMA_MTHCAFMR:
8d4ba034 1692 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1693 break;
1694
c56c65fb 1695 default:
0ac531c1 1696 return -1;
1697 }
1698 if (rc)
1699 return -1;
1700
1701 return nsegs;
1702}
1703
1704int
1705rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
13c9ff8f 1706 struct rpcrdma_xprt *r_xprt)
1707{
1708 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1709 int nsegs = seg->mr_nsegs, rc;
1710
1711 switch (ia->ri_memreg_strategy) {
1712
1713#if RPCRDMA_PERSISTENT_REGISTRATION
1714 case RPCRDMA_ALLPHYSICAL:
1715 BUG_ON(nsegs != 1);
1716 rpcrdma_unmap_one(ia, seg);
1717 rc = 0;
1718 break;
1719#endif
1720
1721 case RPCRDMA_FRMR:
1722 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1723 break;
1724
c56c65fb 1725 case RPCRDMA_MTHCAFMR:
8d4ba034 1726 rc = rpcrdma_deregister_fmr_external(seg, ia);
1727 break;
1728
c56c65fb 1729 default:
1730 break;
1731 }
1732 return nsegs;
1733}
1734
1735/*
1736 * Prepost any receive buffer, then post send.
1737 *
1738 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1739 */
1740int
1741rpcrdma_ep_post(struct rpcrdma_ia *ia,
1742 struct rpcrdma_ep *ep,
1743 struct rpcrdma_req *req)
1744{
1745 struct ib_send_wr send_wr, *send_wr_fail;
1746 struct rpcrdma_rep *rep = req->rl_reply;
1747 int rc;
1748
1749 if (rep) {
1750 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1751 if (rc)
1752 goto out;
1753 req->rl_reply = NULL;
1754 }
1755
1756 send_wr.next = NULL;
1757 send_wr.wr_id = 0ULL; /* no send cookie */
1758 send_wr.sg_list = req->rl_send_iov;
1759 send_wr.num_sge = req->rl_niovs;
1760 send_wr.opcode = IB_WR_SEND;
1761 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1762 ib_dma_sync_single_for_device(ia->ri_id->device,
1763 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1764 DMA_TO_DEVICE);
1765 ib_dma_sync_single_for_device(ia->ri_id->device,
1766 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1767 DMA_TO_DEVICE);
1768 ib_dma_sync_single_for_device(ia->ri_id->device,
1769 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1770 DMA_TO_DEVICE);
1771
1772 if (DECR_CQCOUNT(ep) > 0)
1773 send_wr.send_flags = 0;
1774 else { /* Provider must take a send completion every now and then */
1775 INIT_CQCOUNT(ep);
1776 send_wr.send_flags = IB_SEND_SIGNALED;
1777 }
1778
1779 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1780 if (rc)
1781 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1782 rc);
1783out:
1784 return rc;
1785}
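/* Send completions are mostly unsignaled: DECR_CQCOUNT counts down
 * from rep_cqinit (about half the send queue) and only when it reaches
 * zero is IB_SEND_SIGNALED set, which is just enough to keep the
 * provider's send CQ from overflowing while avoiding an interrupt per
 * RPC.
 */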
1786
1787/*
1788 * (Re)post a receive buffer.
1789 */
1790int
1791rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1792 struct rpcrdma_ep *ep,
1793 struct rpcrdma_rep *rep)
1794{
1795 struct ib_recv_wr recv_wr, *recv_wr_fail;
1796 int rc;
1797
1798 recv_wr.next = NULL;
1799 recv_wr.wr_id = (u64) (unsigned long) rep;
1800 recv_wr.sg_list = &rep->rr_iov;
1801 recv_wr.num_sge = 1;
1802
1803 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1804 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1805
1806 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1807
1808 if (rc)
1809 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1810 rc);
1811 return rc;
1812}