xprtrdma: Schedule reply tasklet once per upcall
net/sunrpc/xprtrdma/verbs.c (linux-2.6-block.git)
1/*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50#include <linux/interrupt.h>
51#include <linux/slab.h>
52#include <asm/bitops.h>
53
54#include "xprt_rdma.h"
55
56/*
57 * Globals/Macros
58 */
59
60#ifdef RPC_DEBUG
61# define RPCDBG_FACILITY RPCDBG_TRANS
62#endif
63
64static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
65
66/*
67 * internal functions
68 */
69
70/*
71 * handle replies in tasklet context, using a single, global list
72 * rdma tasklet function -- just turn around and call the func
73 * for all replies on the list
74 */
75
76static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
77static LIST_HEAD(rpcrdma_tasklets_g);
78
79static void
80rpcrdma_run_tasklet(unsigned long data)
81{
82 struct rpcrdma_rep *rep;
83 void (*func)(struct rpcrdma_rep *);
84 unsigned long flags;
85
86 data = data;
87 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
88 while (!list_empty(&rpcrdma_tasklets_g)) {
89 rep = list_entry(rpcrdma_tasklets_g.next,
90 struct rpcrdma_rep, rr_list);
91 list_del(&rep->rr_list);
92 func = rep->rr_func;
93 rep->rr_func = NULL;
94 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
95
96 if (func)
97 func(rep);
98 else
99 rpcrdma_recv_buffer_put(rep);
100
101 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
102 }
103 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
104}
105
106static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
107
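/* Illustrative sketch (added for clarity, not part of the original file):
 * a completion handler defers reply processing to this tasklet exactly
 * the way rpcrdma_recvcq_poll() does below -- queue the rpcrdma_rep on
 * the global list under rpcrdma_tk_lock_g, then schedule the tasklet:
 *
 *	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
 *	list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
 *	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
 *	tasklet_schedule(&rpcrdma_tasklet_g);
 *
 * tasklet_schedule() is a no-op while the tasklet is already pending,
 * so one upcall that queues many replies still produces a single run.
 */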
108static void
109rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
110{
111 struct rpcrdma_ep *ep = context;
112
113 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
114 __func__, event->event, event->device->name, context);
115 if (ep->rep_connected == 1) {
116 ep->rep_connected = -EIO;
117 ep->rep_func(ep);
118 wake_up_all(&ep->rep_connect_wait);
119 }
120}
121
122static void
123rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
124{
125 struct rpcrdma_ep *ep = context;
126
127 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
128 __func__, event->event, event->device->name, context);
129 if (ep->rep_connected == 1) {
130 ep->rep_connected = -EIO;
131 ep->rep_func(ep);
132 wake_up_all(&ep->rep_connect_wait);
133 }
134}
135
136static void
137rpcrdma_sendcq_process_wc(struct ib_wc *wc)
138{
139 struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
140
141 dprintk("RPC: %s: frmr %p status %X opcode %d\n",
142 __func__, frmr, wc->status, wc->opcode);
143
144 if (wc->wr_id == 0ULL)
145 return;
146 if (wc->status != IB_WC_SUCCESS)
147 frmr->r.frmr.fr_state = FRMR_IS_STALE;
148}
149
150static int
151rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
152{
153 struct ib_wc *wcs;
154 int budget, count, rc;
155
156 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
157 do {
158 wcs = ep->rep_send_wcs;
159
160 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
161 if (rc <= 0)
162 return rc;
163
164 count = rc;
165 while (count-- > 0)
166 rpcrdma_sendcq_process_wc(wcs++);
167 } while (rc == RPCRDMA_POLLSIZE && --budget);
168 return 0;
169}
170
171/*
172 * Handle send, fast_reg_mr, and local_inv completions.
173 *
174 * Send events are typically suppressed and thus do not result
175 * in an upcall. Occasionally one is signaled, however. This
176 * prevents the provider's completion queue from wrapping and
177 * losing a completion.
178 */
179static void
180rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
181{
182 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
183 int rc;
184
185 rc = rpcrdma_sendcq_poll(cq, ep);
186 if (rc) {
187 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
188 __func__, rc);
189 return;
190 }
191
192 rc = ib_req_notify_cq(cq,
193 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
194 if (rc == 0)
195 return;
196 if (rc < 0) {
197 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
198 __func__, rc);
199 return;
200 }
201
202 rpcrdma_sendcq_poll(cq, ep);
203}
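/* Illustrative sketch (added for clarity, not part of the original file):
 * the sender arms an occasional signaled completion with the CQCOUNT
 * macros, as rpcrdma_ep_post() does further down, assuming rep_cqinit
 * was initialized in rpcrdma_ep_create():
 *
 *	if (DECR_CQCOUNT(ep) > 0)
 *		send_wr.send_flags = 0;
 *	else {
 *		INIT_CQCOUNT(ep);
 *		send_wr.send_flags = IB_SEND_SIGNALED;
 *	}
 *
 * so roughly one in every rep_cqinit sends generates the upcall that
 * this handler services.
 */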
204
205static void
206rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
207{
208 struct rpcrdma_rep *rep =
209 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
210
211 dprintk("RPC: %s: rep %p status %X opcode %X length %u\n",
212 __func__, rep, wc->status, wc->opcode, wc->byte_len);
213
214 if (wc->status != IB_WC_SUCCESS) {
215 rep->rr_len = ~0U;
216 goto out_schedule;
217 }
218 if (wc->opcode != IB_WC_RECV)
219 return;
220
221 rep->rr_len = wc->byte_len;
222 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
223 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
224
225 if (rep->rr_len >= 16) {
226 struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
227 unsigned int credits = ntohl(p->rm_credit);
228
229 if (credits == 0)
230 credits = 1; /* don't deadlock */
231 else if (credits > rep->rr_buffer->rb_max_requests)
232 credits = rep->rr_buffer->rb_max_requests;
233 atomic_set(&rep->rr_buffer->rb_credits, credits);
234 }
235
236out_schedule:
237 list_add_tail(&rep->rr_list, sched_list);
238}
239
240static int
241rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
242{
243 struct list_head sched_list;
244 struct ib_wc *wcs;
245 int budget, count, rc;
246 unsigned long flags;
247
248 INIT_LIST_HEAD(&sched_list);
249 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
250 do {
251 wcs = ep->rep_recv_wcs;
252
253 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
254 if (rc <= 0)
255 goto out_schedule;
256
257 count = rc;
258 while (count-- > 0)
259 rpcrdma_recvcq_process_wc(wcs++, &sched_list);
260 } while (rc == RPCRDMA_POLLSIZE && --budget);
261 rc = 0;
262
263out_schedule:
264 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
265 list_splice_tail(&sched_list, &rpcrdma_tasklets_g);
266 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
267 tasklet_schedule(&rpcrdma_tasklet_g);
268 return rc;
269}
270
271/*
272 * Handle receive completions.
273 *
274 * It is reentrant but processes single events in order to maintain
275 * ordering of receives to keep server credits.
276 *
277 * It is the responsibility of the scheduled tasklet to return
278 * recv buffers to the pool. NOTE: this affects synchronization of
279 * connection shutdown. That is, the structures required for
280 * the completion of the reply handler must remain intact until
281 * all memory has been reclaimed.
282 */
283static void
284rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
285{
286 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
287 int rc;
288
289 rc = rpcrdma_recvcq_poll(cq, ep);
290 if (rc) {
291 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
292 __func__, rc);
293 return;
294 }
295
296 rc = ib_req_notify_cq(cq,
297 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
298 if (rc == 0)
299 return;
300 if (rc < 0) {
301 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
302 __func__, rc);
303 return;
304 }
305
306 rpcrdma_recvcq_poll(cq, ep);
307}
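/* Note added for clarity (not in the original): both upcalls use the
 * same drain-then-rearm pattern to close the race between the final
 * ib_poll_cq() and re-enabling notification:
 *
 *	poll until the CQ is empty;
 *	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
 *				  IB_CQ_REPORT_MISSED_EVENTS);
 *	if (rc > 0)
 *		poll once more;
 *
 * A positive return from ib_req_notify_cq() means completions may have
 * arrived while the CQ was being re-armed, so the queue is polled again
 * before the handler returns.
 */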
308
309static void
310rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
311{
312 rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep);
313 rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep);
314}
315
316#ifdef RPC_DEBUG
317static const char * const conn[] = {
318 "address resolved",
319 "address error",
320 "route resolved",
321 "route error",
322 "connect request",
323 "connect response",
324 "connect error",
325 "unreachable",
326 "rejected",
327 "established",
328 "disconnected",
329 "device removal"
330};
331#endif
332
333static int
334rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
335{
336 struct rpcrdma_xprt *xprt = id->context;
337 struct rpcrdma_ia *ia = &xprt->rx_ia;
338 struct rpcrdma_ep *ep = &xprt->rx_ep;
339#ifdef RPC_DEBUG
340 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
341#endif
342 struct ib_qp_attr attr;
343 struct ib_qp_init_attr iattr;
344 int connstate = 0;
345
346 switch (event->event) {
347 case RDMA_CM_EVENT_ADDR_RESOLVED:
348 case RDMA_CM_EVENT_ROUTE_RESOLVED:
349 ia->ri_async_rc = 0;
350 complete(&ia->ri_done);
351 break;
352 case RDMA_CM_EVENT_ADDR_ERROR:
353 ia->ri_async_rc = -EHOSTUNREACH;
354 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
355 __func__, ep);
356 complete(&ia->ri_done);
357 break;
358 case RDMA_CM_EVENT_ROUTE_ERROR:
359 ia->ri_async_rc = -ENETUNREACH;
360 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
361 __func__, ep);
362 complete(&ia->ri_done);
363 break;
364 case RDMA_CM_EVENT_ESTABLISHED:
365 connstate = 1;
366 ib_query_qp(ia->ri_id->qp, &attr,
367 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
368 &iattr);
369 dprintk("RPC: %s: %d responder resources"
370 " (%d initiator)\n",
371 __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
372 goto connected;
373 case RDMA_CM_EVENT_CONNECT_ERROR:
374 connstate = -ENOTCONN;
375 goto connected;
376 case RDMA_CM_EVENT_UNREACHABLE:
377 connstate = -ENETDOWN;
378 goto connected;
379 case RDMA_CM_EVENT_REJECTED:
380 connstate = -ECONNREFUSED;
381 goto connected;
382 case RDMA_CM_EVENT_DISCONNECTED:
383 connstate = -ECONNABORTED;
384 goto connected;
385 case RDMA_CM_EVENT_DEVICE_REMOVAL:
386 connstate = -ENODEV;
387connected:
388 dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
389 __func__,
390 (event->event <= 11) ? conn[event->event] :
391 "unknown connection error",
392 &addr->sin_addr.s_addr,
393 ntohs(addr->sin_port),
394 ep, event->event);
395 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
396 dprintk("RPC: %s: %sconnected\n",
397 __func__, connstate > 0 ? "" : "dis");
398 ep->rep_connected = connstate;
399 ep->rep_func(ep);
400 wake_up_all(&ep->rep_connect_wait);
401 break;
402 default:
403 dprintk("RPC: %s: unexpected CM event %d\n",
404 __func__, event->event);
405 break;
406 }
407
408#ifdef RPC_DEBUG
409 if (connstate == 1) {
410 int ird = attr.max_dest_rd_atomic;
411 int tird = ep->rep_remote_cma.responder_resources;
412 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
413 "on %s, memreg %d slots %d ird %d%s\n",
414 &addr->sin_addr.s_addr,
415 ntohs(addr->sin_port),
416 ia->ri_id->device->name,
417 ia->ri_memreg_strategy,
418 xprt->rx_buf.rb_max_requests,
419 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
420 } else if (connstate < 0) {
421 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
422 &addr->sin_addr.s_addr,
423 ntohs(addr->sin_port),
424 connstate);
425 }
426#endif
427
428 return 0;
429}
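/* Note added for clarity (not in the original): rep_connected is a
 * tri-state shared with rpcrdma_ep_connect(), which sleeps on
 * rep_connect_wait until this upcall changes it:
 *
 *	0	still connecting
 *	1	RDMA_CM_EVENT_ESTABLISHED was received
 *	< 0	errno from a failed or torn-down connection
 *
 * ep->rep_func() then notifies the transport (typically by scheduling
 * its connect worker) so the RPC layer observes the state change.
 */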
430
431static struct rdma_cm_id *
432rpcrdma_create_id(struct rpcrdma_xprt *xprt,
433 struct rpcrdma_ia *ia, struct sockaddr *addr)
434{
435 struct rdma_cm_id *id;
436 int rc;
437
438 init_completion(&ia->ri_done);
439
440 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
441 if (IS_ERR(id)) {
442 rc = PTR_ERR(id);
443 dprintk("RPC: %s: rdma_create_id() failed %i\n",
444 __func__, rc);
445 return id;
446 }
447
448 ia->ri_async_rc = -ETIMEDOUT;
449 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
450 if (rc) {
451 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
452 __func__, rc);
453 goto out;
454 }
455 wait_for_completion_interruptible_timeout(&ia->ri_done,
456 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
457 rc = ia->ri_async_rc;
458 if (rc)
459 goto out;
460
461 ia->ri_async_rc = -ETIMEDOUT;
462 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
463 if (rc) {
464 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
465 __func__, rc);
466 goto out;
467 }
468 wait_for_completion_interruptible_timeout(&ia->ri_done,
469 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
470 rc = ia->ri_async_rc;
471 if (rc)
472 goto out;
473
474 return id;
475
476out:
477 rdma_destroy_id(id);
478 return ERR_PTR(rc);
479}
480
481/*
482 * Drain any cq, prior to teardown.
483 */
484static void
485rpcrdma_clean_cq(struct ib_cq *cq)
486{
487 struct ib_wc wc;
488 int count = 0;
489
490 while (1 == ib_poll_cq(cq, 1, &wc))
491 ++count;
492
493 if (count)
494 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
495 __func__, count, wc.opcode);
496}
497
498/*
499 * Exported functions.
500 */
501
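/* Illustrative call sequence (a sketch added for clarity, not part of
 * the original file): the RPC/RDMA transport brings an endpoint up in
 * roughly this order, error handling omitted:
 *
 *	rpcrdma_ia_open(xprt, addr, memreg);
 *	rpcrdma_ep_create(&xprt->rx_ep, &xprt->rx_ia, &xprt->rx_data);
 *	rpcrdma_buffer_create(&xprt->rx_buf, &xprt->rx_ep,
 *			      &xprt->rx_ia, &xprt->rx_data);
 *	rpcrdma_ep_connect(&xprt->rx_ep, &xprt->rx_ia);
 *
 * Teardown reverses these steps via rpcrdma_ep_destroy(),
 * rpcrdma_buffer_destroy() and rpcrdma_ia_close().
 */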
502/*
503 * Open and initialize an Interface Adapter.
504 * o initializes fields of struct rpcrdma_ia, including
505 * interface and provider attributes and protection zone.
506 */
507int
508rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
509{
510 int rc, mem_priv;
511 struct ib_device_attr devattr;
512 struct rpcrdma_ia *ia = &xprt->rx_ia;
513
514 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
515 if (IS_ERR(ia->ri_id)) {
516 rc = PTR_ERR(ia->ri_id);
517 goto out1;
518 }
519
520 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
521 if (IS_ERR(ia->ri_pd)) {
522 rc = PTR_ERR(ia->ri_pd);
523 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
524 __func__, rc);
525 goto out2;
526 }
527
528 /*
529 * Query the device to determine if the requested memory
530 * registration strategy is supported. If it isn't, set the
531 * strategy to a globally supported model.
532 */
533 rc = ib_query_device(ia->ri_id->device, &devattr);
534 if (rc) {
535 dprintk("RPC: %s: ib_query_device failed %d\n",
536 __func__, rc);
537 goto out2;
538 }
539
540 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
541 ia->ri_have_dma_lkey = 1;
542 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
543 }
544
545 if (memreg == RPCRDMA_FRMR) {
546 /* Requires both frmr reg and local dma lkey */
547 if ((devattr.device_cap_flags &
548 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
549 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
550 dprintk("RPC: %s: FRMR registration "
551 "not supported by HCA\n", __func__);
552 memreg = RPCRDMA_MTHCAFMR;
553 } else {
554 /* Mind the ia limit on FRMR page list depth */
555 ia->ri_max_frmr_depth = min_t(unsigned int,
556 RPCRDMA_MAX_DATA_SEGS,
557 devattr.max_fast_reg_page_list_len);
558 }
559 }
560 if (memreg == RPCRDMA_MTHCAFMR) {
561 if (!ia->ri_id->device->alloc_fmr) {
562 dprintk("RPC: %s: MTHCAFMR registration "
563 "not supported by HCA\n", __func__);
564#if RPCRDMA_PERSISTENT_REGISTRATION
565 memreg = RPCRDMA_ALLPHYSICAL;
566#else
567 rc = -ENOMEM;
568 goto out2;
569#endif
570 }
571 }
572
573 /*
574 * Optionally obtain an underlying physical identity mapping in
575 * order to do a memory window-based bind. This base registration
576 * is protected from remote access - that is enabled only by binding
577 * for the specific bytes targeted during each RPC operation, and
578 * revoked after the corresponding completion similar to a storage
579 * adapter.
580 */
581 switch (memreg) {
582 case RPCRDMA_FRMR:
583 break;
584#if RPCRDMA_PERSISTENT_REGISTRATION
585 case RPCRDMA_ALLPHYSICAL:
586 mem_priv = IB_ACCESS_LOCAL_WRITE |
587 IB_ACCESS_REMOTE_WRITE |
588 IB_ACCESS_REMOTE_READ;
589 goto register_setup;
590#endif
591 case RPCRDMA_MTHCAFMR:
592 if (ia->ri_have_dma_lkey)
593 break;
594 mem_priv = IB_ACCESS_LOCAL_WRITE;
595#if RPCRDMA_PERSISTENT_REGISTRATION
596 register_setup:
597#endif
598 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
599 if (IS_ERR(ia->ri_bind_mem)) {
600 printk(KERN_ALERT "%s: ib_get_dma_mr for "
601 "phys register failed with %lX\n",
602 __func__, PTR_ERR(ia->ri_bind_mem));
603 rc = -ENOMEM;
604 goto out2;
605 }
606 break;
607 default:
608 printk(KERN_ERR "RPC: Unsupported memory "
609 "registration mode: %d\n", memreg);
610 rc = -ENOMEM;
611 goto out2;
612 }
613 dprintk("RPC: %s: memory registration strategy is %d\n",
614 __func__, memreg);
615
616 /* Else will do memory reg/dereg for each chunk */
617 ia->ri_memreg_strategy = memreg;
618
619 rwlock_init(&ia->ri_qplock);
620 return 0;
621out2:
622 rdma_destroy_id(ia->ri_id);
623 ia->ri_id = NULL;
624out1:
625 return rc;
626}
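/* Summary added for clarity (not in the original): rpcrdma_ia_open()
 * degrades the requested registration mode to what the device supports:
 *
 *	RPCRDMA_FRMR	  needs IB_DEVICE_MEM_MGT_EXTENSIONS plus a local
 *			  DMA lkey, otherwise falls back to MTHCAFMR
 *	RPCRDMA_MTHCAFMR  needs the device's alloc_fmr verb, otherwise
 *			  falls back to ALLPHYSICAL when
 *			  RPCRDMA_PERSISTENT_REGISTRATION is enabled,
 *			  else the open fails with -ENOMEM
 */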
627
628/*
629 * Clean up/close an IA.
630 * o if event handles and PD have been initialized, free them.
631 * o close the IA
632 */
633void
634rpcrdma_ia_close(struct rpcrdma_ia *ia)
635{
636 int rc;
637
638 dprintk("RPC: %s: entering\n", __func__);
639 if (ia->ri_bind_mem != NULL) {
640 rc = ib_dereg_mr(ia->ri_bind_mem);
641 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
642 __func__, rc);
643 }
644 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
645 if (ia->ri_id->qp)
646 rdma_destroy_qp(ia->ri_id);
647 rdma_destroy_id(ia->ri_id);
648 ia->ri_id = NULL;
649 }
650 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
651 rc = ib_dealloc_pd(ia->ri_pd);
652 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
653 __func__, rc);
654 }
655}
656
657/*
658 * Create unconnected endpoint.
659 */
660int
661rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
662 struct rpcrdma_create_data_internal *cdata)
663{
664 struct ib_device_attr devattr;
665 struct ib_cq *sendcq, *recvcq;
666 int rc, err;
667
668 rc = ib_query_device(ia->ri_id->device, &devattr);
669 if (rc) {
670 dprintk("RPC: %s: ib_query_device failed %d\n",
671 __func__, rc);
672 return rc;
673 }
674
675 /* check provider's send/recv wr limits */
676 if (cdata->max_requests > devattr.max_qp_wr)
677 cdata->max_requests = devattr.max_qp_wr;
678
679 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
680 ep->rep_attr.qp_context = ep;
681 /* send_cq and recv_cq initialized below */
682 ep->rep_attr.srq = NULL;
683 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
684 switch (ia->ri_memreg_strategy) {
685 case RPCRDMA_FRMR: {
686 int depth = 7;
687
688 /* Add room for frmr register and invalidate WRs.
689 * 1. FRMR reg WR for head
690 * 2. FRMR invalidate WR for head
691 * 3. N FRMR reg WRs for pagelist
692 * 4. N FRMR invalidate WRs for pagelist
693 * 5. FRMR reg WR for tail
694 * 6. FRMR invalidate WR for tail
695 * 7. The RDMA_SEND WR
696 */
697
698 /* Calculate N if the device max FRMR depth is smaller than
699 * RPCRDMA_MAX_DATA_SEGS.
700 */
701 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
702 int delta = RPCRDMA_MAX_DATA_SEGS -
703 ia->ri_max_frmr_depth;
704
705 do {
706 depth += 2; /* FRMR reg + invalidate */
707 delta -= ia->ri_max_frmr_depth;
708 } while (delta > 0);
709
710 }
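 /* Worked example added for clarity (not in the original), assuming
  * RPCRDMA_MAX_DATA_SEGS is 64 and the device reports an
  * ri_max_frmr_depth of 16: delta starts at 48, the loop runs three
  * times (48 -> 32 -> 16 -> 0), and depth grows from 7 to 13, so each
  * request reserves 13 send WRs instead of 7.
  */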
711 ep->rep_attr.cap.max_send_wr *= depth;
712 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
713 cdata->max_requests = devattr.max_qp_wr / depth;
714 if (!cdata->max_requests)
715 return -EINVAL;
716 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
717 depth;
718 }
719 break;
720 }
721 default:
722 break;
723 }
724 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
725 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
726 ep->rep_attr.cap.max_recv_sge = 1;
727 ep->rep_attr.cap.max_inline_data = 0;
728 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
729 ep->rep_attr.qp_type = IB_QPT_RC;
730 ep->rep_attr.port_num = ~0;
731
732 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
733 "iovs: send %d recv %d\n",
734 __func__,
735 ep->rep_attr.cap.max_send_wr,
736 ep->rep_attr.cap.max_recv_wr,
737 ep->rep_attr.cap.max_send_sge,
738 ep->rep_attr.cap.max_recv_sge);
739
740 /* set trigger for requesting send completion */
741 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
742 if (ep->rep_cqinit <= 2)
743 ep->rep_cqinit = 0;
744 INIT_CQCOUNT(ep);
745 ep->rep_ia = ia;
746 init_waitqueue_head(&ep->rep_connect_wait);
747 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
748
749 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
750 rpcrdma_cq_async_error_upcall, ep,
751 ep->rep_attr.cap.max_send_wr + 1, 0);
752 if (IS_ERR(sendcq)) {
753 rc = PTR_ERR(sendcq);
754 dprintk("RPC: %s: failed to create send CQ: %i\n",
755 __func__, rc);
756 goto out1;
757 }
758
759 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
760 if (rc) {
761 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
762 __func__, rc);
763 goto out2;
764 }
765
766 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
767 rpcrdma_cq_async_error_upcall, ep,
768 ep->rep_attr.cap.max_recv_wr + 1, 0);
769 if (IS_ERR(recvcq)) {
770 rc = PTR_ERR(recvcq);
771 dprintk("RPC: %s: failed to create recv CQ: %i\n",
772 __func__, rc);
773 goto out2;
774 }
775
776 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
777 if (rc) {
778 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
779 __func__, rc);
780 ib_destroy_cq(recvcq);
781 goto out2;
782 }
783
784 ep->rep_attr.send_cq = sendcq;
785 ep->rep_attr.recv_cq = recvcq;
786
787 /* Initialize cma parameters */
788
789 /* RPC/RDMA does not use private data */
790 ep->rep_remote_cma.private_data = NULL;
791 ep->rep_remote_cma.private_data_len = 0;
792
793 /* Client offers RDMA Read but does not initiate */
794 ep->rep_remote_cma.initiator_depth = 0;
795 if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
796 ep->rep_remote_cma.responder_resources = 32;
797 else
798 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
799
800 ep->rep_remote_cma.retry_count = 7;
801 ep->rep_remote_cma.flow_control = 0;
802 ep->rep_remote_cma.rnr_retry_count = 0;
803
804 return 0;
805
806out2:
807 err = ib_destroy_cq(sendcq);
808 if (err)
809 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
810 __func__, err);
811out1:
812 return rc;
813}
814
815/*
816 * rpcrdma_ep_destroy
817 *
818 * Disconnect and destroy endpoint. After this, the only
819 * valid operations on the ep are to free it (if dynamically
820 * allocated) or re-create it.
821 */
822void
823rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
824{
825 int rc;
826
827 dprintk("RPC: %s: entering, connected is %d\n",
828 __func__, ep->rep_connected);
829
830 cancel_delayed_work_sync(&ep->rep_connect_worker);
831
832 if (ia->ri_id->qp) {
833 rc = rpcrdma_ep_disconnect(ep, ia);
834 if (rc)
835 dprintk("RPC: %s: rpcrdma_ep_disconnect"
836 " returned %i\n", __func__, rc);
837 rdma_destroy_qp(ia->ri_id);
838 ia->ri_id->qp = NULL;
839 }
840
841 /* padding - could be done in rpcrdma_buffer_destroy... */
842 if (ep->rep_pad_mr) {
843 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
844 ep->rep_pad_mr = NULL;
845 }
846
847 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
848 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
849 if (rc)
850 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
851 __func__, rc);
852
853 rpcrdma_clean_cq(ep->rep_attr.send_cq);
854 rc = ib_destroy_cq(ep->rep_attr.send_cq);
855 if (rc)
856 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
857 __func__, rc);
858}
859
860/*
861 * Connect unconnected endpoint.
862 */
863int
864rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
865{
866 struct rdma_cm_id *id, *old;
867 int rc = 0;
868 int retry_count = 0;
869
870 if (ep->rep_connected != 0) {
871 struct rpcrdma_xprt *xprt;
872retry:
873 dprintk("RPC: %s: reconnecting...\n", __func__);
874 rc = rpcrdma_ep_disconnect(ep, ia);
875 if (rc && rc != -ENOTCONN)
876 dprintk("RPC: %s: rpcrdma_ep_disconnect"
877 " status %i\n", __func__, rc);
878 rpcrdma_flush_cqs(ep);
879
880 if (ia->ri_memreg_strategy == RPCRDMA_FRMR)
881 rpcrdma_reset_frmrs(ia);
882
883 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
884 id = rpcrdma_create_id(xprt, ia,
885 (struct sockaddr *)&xprt->rx_data.addr);
886 if (IS_ERR(id)) {
887 rc = -EHOSTUNREACH;
888 goto out;
889 }
890 /* TEMP TEMP TEMP - fail if new device:
891 * Deregister/remarshal *all* requests!
892 * Close and recreate adapter, pd, etc!
893 * Re-determine all attributes still sane!
894 * More stuff I haven't thought of!
895 * Rrrgh!
896 */
897 if (ia->ri_id->device != id->device) {
898 printk("RPC: %s: can't reconnect on "
899 "different device!\n", __func__);
900 rdma_destroy_id(id);
901 rc = -ENETUNREACH;
902 goto out;
903 }
904 /* END TEMP */
905 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
906 if (rc) {
907 dprintk("RPC: %s: rdma_create_qp failed %i\n",
908 __func__, rc);
909 rdma_destroy_id(id);
910 rc = -ENETUNREACH;
911 goto out;
912 }
913
914 write_lock(&ia->ri_qplock);
915 old = ia->ri_id;
916 ia->ri_id = id;
917 write_unlock(&ia->ri_qplock);
918
919 rdma_destroy_qp(old);
920 rdma_destroy_id(old);
921 } else {
922 dprintk("RPC: %s: connecting...\n", __func__);
923 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
924 if (rc) {
925 dprintk("RPC: %s: rdma_create_qp failed %i\n",
926 __func__, rc);
927 /* do not update ep->rep_connected */
928 return -ENETUNREACH;
929 }
930 }
931
932 ep->rep_connected = 0;
933
934 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
935 if (rc) {
936 dprintk("RPC: %s: rdma_connect() failed with %i\n",
937 __func__, rc);
938 goto out;
939 }
940
941 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
942
943 /*
944 * Check state. A non-peer reject indicates no listener
945 * (ECONNREFUSED), which may be a transient state. All
946 * others indicate a transport condition which has already
947 * undergone a best-effort.
948 */
949 if (ep->rep_connected == -ECONNREFUSED &&
950 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
951 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
952 goto retry;
953 }
954 if (ep->rep_connected <= 0) {
955 /* Sometimes, the only way to reliably connect to remote
956 * CMs is to use same nonzero values for ORD and IRD. */
957 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
958 (ep->rep_remote_cma.responder_resources == 0 ||
959 ep->rep_remote_cma.initiator_depth !=
960 ep->rep_remote_cma.responder_resources)) {
961 if (ep->rep_remote_cma.responder_resources == 0)
962 ep->rep_remote_cma.responder_resources = 1;
963 ep->rep_remote_cma.initiator_depth =
964 ep->rep_remote_cma.responder_resources;
965 goto retry;
966 }
967 rc = ep->rep_connected;
968 } else {
969 dprintk("RPC: %s: connected\n", __func__);
970 }
971
972out:
973 if (rc)
974 ep->rep_connected = rc;
975 return rc;
976}
977
978/*
979 * rpcrdma_ep_disconnect
980 *
981 * This is separate from destroy to facilitate the ability
982 * to reconnect without recreating the endpoint.
983 *
984 * This call is not reentrant, and must not be made in parallel
985 * on the same endpoint.
986 */
987int
988rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
989{
990 int rc;
991
992 rpcrdma_flush_cqs(ep);
993 rc = rdma_disconnect(ia->ri_id);
994 if (!rc) {
995 /* returns without wait if not connected */
996 wait_event_interruptible(ep->rep_connect_wait,
997 ep->rep_connected != 1);
998 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
999 (ep->rep_connected == 1) ? "still " : "dis");
1000 } else {
1001 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
1002 ep->rep_connected = rc;
1003 }
1004 return rc;
1005}
1006
1007static int
1008rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1009{
1010 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
1011 struct ib_fmr_attr fmr_attr = {
1012 .max_pages = RPCRDMA_MAX_DATA_SEGS,
1013 .max_maps = 1,
1014 .page_shift = PAGE_SHIFT
1015 };
1016 struct rpcrdma_mw *r;
1017 int i, rc;
1018
1019 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1020 dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
1021
1022 while (i--) {
1023 r = kzalloc(sizeof(*r), GFP_KERNEL);
1024 if (r == NULL)
1025 return -ENOMEM;
1026
1027 r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
1028 if (IS_ERR(r->r.fmr)) {
1029 rc = PTR_ERR(r->r.fmr);
1030 dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
1031 __func__, rc);
1032 goto out_free;
1033 }
1034
1035 list_add(&r->mw_list, &buf->rb_mws);
1036 list_add(&r->mw_all, &buf->rb_all);
1037 }
1038 return 0;
1039
1040out_free:
1041 kfree(r);
1042 return rc;
1043}
1044
1045static int
1046rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1047{
1048 struct rpcrdma_frmr *f;
1049 struct rpcrdma_mw *r;
1050 int i, rc;
1051
1052 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1053 dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
1054
1055 while (i--) {
1056 r = kzalloc(sizeof(*r), GFP_KERNEL);
1057 if (r == NULL)
1058 return -ENOMEM;
1059 f = &r->r.frmr;
1060
1061 f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1062 ia->ri_max_frmr_depth);
1063 if (IS_ERR(f->fr_mr)) {
1064 rc = PTR_ERR(f->fr_mr);
1065 dprintk("RPC: %s: ib_alloc_fast_reg_mr "
1066 "failed %i\n", __func__, rc);
1067 goto out_free;
1068 }
1069
1070 f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
1071 ia->ri_max_frmr_depth);
1072 if (IS_ERR(f->fr_pgl)) {
1073 rc = PTR_ERR(f->fr_pgl);
1074 dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
1075 "failed %i\n", __func__, rc);
1076
1077 ib_dereg_mr(f->fr_mr);
1078 goto out_free;
1079 }
1080
1081 list_add(&r->mw_list, &buf->rb_mws);
1082 list_add(&r->mw_all, &buf->rb_all);
1083 }
1084
1085 return 0;
1086
1087out_free:
1088 kfree(r);
1089 return rc;
1090}
1091
1092int
1093rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1094 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
1095{
1096 char *p;
1097 size_t len, rlen, wlen;
1098 int i, rc;
1099
1100 buf->rb_max_requests = cdata->max_requests;
1101 spin_lock_init(&buf->rb_lock);
1102 atomic_set(&buf->rb_credits, 1);
1103
1104 /* Need to allocate:
1105 * 1. arrays for send and recv pointers
1106 * 2. arrays of struct rpcrdma_req to fill in pointers
1107 * 3. array of struct rpcrdma_rep for replies
1108 * 4. padding, if any
1109 * Send/recv buffers in req/rep need to be registered
1110 */
1111 len = buf->rb_max_requests *
1112 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1113 len += cdata->padding;
1114
1115 p = kzalloc(len, GFP_KERNEL);
1116 if (p == NULL) {
1117 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1118 __func__, len);
1119 rc = -ENOMEM;
1120 goto out;
1121 }
1122 buf->rb_pool = p; /* for freeing it later */
1123
1124 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1125 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1126 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1127 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1128
1129 /*
1130 * Register the zeroed pad buffer, if any.
1131 */
1132 if (cdata->padding) {
1133 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1134 &ep->rep_pad_mr, &ep->rep_pad);
1135 if (rc)
1136 goto out;
1137 }
1138 p += cdata->padding;
1139
1140 INIT_LIST_HEAD(&buf->rb_mws);
1141 INIT_LIST_HEAD(&buf->rb_all);
1142 switch (ia->ri_memreg_strategy) {
1143 case RPCRDMA_FRMR:
1144 rc = rpcrdma_init_frmrs(ia, buf);
1145 if (rc)
1146 goto out;
1147 break;
1148 case RPCRDMA_MTHCAFMR:
1149 rc = rpcrdma_init_fmrs(ia, buf);
1150 if (rc)
1151 goto out;
1152 break;
1153 default:
1154 break;
1155 }
1156
1157 /*
1158 * Allocate/init the request/reply buffers. Doing this
1159 * using kmalloc for now -- one for each buf.
1160 */
1161 wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
1162 rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
1163 dprintk("RPC: %s: wlen = %zu, rlen = %zu\n",
1164 __func__, wlen, rlen);
1165
1166 for (i = 0; i < buf->rb_max_requests; i++) {
1167 struct rpcrdma_req *req;
1168 struct rpcrdma_rep *rep;
1169
1170 req = kmalloc(wlen, GFP_KERNEL);
1171 if (req == NULL) {
1172 dprintk("RPC: %s: request buffer %d alloc"
1173 " failed\n", __func__, i);
1174 rc = -ENOMEM;
1175 goto out;
1176 }
1177 memset(req, 0, sizeof(struct rpcrdma_req));
1178 buf->rb_send_bufs[i] = req;
1179 buf->rb_send_bufs[i]->rl_buffer = buf;
1180
1181 rc = rpcrdma_register_internal(ia, req->rl_base,
1182 wlen - offsetof(struct rpcrdma_req, rl_base),
1183 &buf->rb_send_bufs[i]->rl_handle,
1184 &buf->rb_send_bufs[i]->rl_iov);
1185 if (rc)
1186 goto out;
1187
1188 buf->rb_send_bufs[i]->rl_size = wlen -
1189 sizeof(struct rpcrdma_req);
1190
1191 rep = kmalloc(rlen, GFP_KERNEL);
1192 if (rep == NULL) {
1193 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1194 __func__, i);
1195 rc = -ENOMEM;
1196 goto out;
1197 }
1198 memset(rep, 0, sizeof(struct rpcrdma_rep));
1199 buf->rb_recv_bufs[i] = rep;
1200 buf->rb_recv_bufs[i]->rr_buffer = buf;
1201
1202 rc = rpcrdma_register_internal(ia, rep->rr_base,
1203 rlen - offsetof(struct rpcrdma_rep, rr_base),
1204 &buf->rb_recv_bufs[i]->rr_handle,
1205 &buf->rb_recv_bufs[i]->rr_iov);
1206 if (rc)
1207 goto out;
1208
1209 }
1210 dprintk("RPC: %s: max_requests %d\n",
1211 __func__, buf->rb_max_requests);
1212 /* done */
1213 return 0;
1214out:
1215 rpcrdma_buffer_destroy(buf);
1216 return rc;
1217}
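/* Worked example added for clarity (not in the original): the wlen/rlen
 * computation above rounds each buffer up to a power of two.  Assuming a
 * 1024-byte inline_wsize and a struct rpcrdma_req of roughly 500 bytes,
 * fls(1524) is 11, so wlen becomes 1 << 11 = 2048; rlen is rounded the
 * same way for the reply buffers.
 */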
1218
1219static void
1220rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1221{
1222 struct rpcrdma_mw *r;
1223 int rc;
1224
1225 while (!list_empty(&buf->rb_all)) {
1226 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1227 list_del(&r->mw_all);
1228 list_del(&r->mw_list);
1229
1230 rc = ib_dealloc_fmr(r->r.fmr);
1231 if (rc)
1232 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
1233 __func__, rc);
1234
1235 kfree(r);
1236 }
1237}
1238
1239static void
1240rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
1241{
1242 struct rpcrdma_mw *r;
1243 int rc;
1244
1245 while (!list_empty(&buf->rb_all)) {
1246 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1247 list_del(&r->mw_all);
1248 list_del(&r->mw_list);
1249
1250 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1251 if (rc)
1252 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1253 __func__, rc);
1254 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1255
1256 kfree(r);
1257 }
1258}
1259
1260void
1261rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1262{
1263 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1264 int i;
1265
1266 /* clean up in reverse order from create
1267 * 1. recv mr memory (mr free, then kfree)
1268 * 2. send mr memory (mr free, then kfree)
1269 * 3. MWs
1270 */
1271 dprintk("RPC: %s: entering\n", __func__);
1272
1273 for (i = 0; i < buf->rb_max_requests; i++) {
1274 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1275 rpcrdma_deregister_internal(ia,
1276 buf->rb_recv_bufs[i]->rr_handle,
1277 &buf->rb_recv_bufs[i]->rr_iov);
1278 kfree(buf->rb_recv_bufs[i]);
1279 }
1280 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1281 rpcrdma_deregister_internal(ia,
1282 buf->rb_send_bufs[i]->rl_handle,
1283 &buf->rb_send_bufs[i]->rl_iov);
1284 kfree(buf->rb_send_bufs[i]);
1285 }
1286 }
1287
1288 switch (ia->ri_memreg_strategy) {
1289 case RPCRDMA_FRMR:
1290 rpcrdma_destroy_frmrs(buf);
1291 break;
1292 case RPCRDMA_MTHCAFMR:
1293 rpcrdma_destroy_fmrs(buf);
1294 break;
1295 default:
1296 break;
1297 }
1298
1299 kfree(buf->rb_pool);
1300}
1301
1302/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
1303 * an unusable state. Find FRMRs in this state and dereg / reg
1304 * each. FRMRs that are VALID and attached to an rpcrdma_req are
1305 * also torn down.
1306 *
1307 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
1308 *
1309 * This is invoked only in the transport connect worker in order
1310 * to serialize with rpcrdma_register_frmr_external().
1311 */
1312static void
1313rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
1314{
1315 struct rpcrdma_xprt *r_xprt =
1316 container_of(ia, struct rpcrdma_xprt, rx_ia);
1317 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1318 struct list_head *pos;
1319 struct rpcrdma_mw *r;
1320 int rc;
1321
1322 list_for_each(pos, &buf->rb_all) {
1323 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1324
1325 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
1326 continue;
1327
1328 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1329 if (rc)
1330 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1331 __func__, rc);
1332 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1333
1334 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1335 ia->ri_max_frmr_depth);
1336 if (IS_ERR(r->r.frmr.fr_mr)) {
1337 rc = PTR_ERR(r->r.frmr.fr_mr);
1338 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1339 " failed %i\n", __func__, rc);
1340 continue;
1341 }
1342 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1343 ia->ri_id->device,
1344 ia->ri_max_frmr_depth);
1345 if (IS_ERR(r->r.frmr.fr_pgl)) {
1346 rc = PTR_ERR(r->r.frmr.fr_pgl);
1347 dprintk("RPC: %s: "
1348 "ib_alloc_fast_reg_page_list "
1349 "failed %i\n", __func__, rc);
1350
1351 ib_dereg_mr(r->r.frmr.fr_mr);
1352 continue;
1353 }
1354 r->r.frmr.fr_state = FRMR_IS_INVALID;
1355 }
1356}
1357
1358/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1359 * some req segments uninitialized.
1360 */
1361static void
1362rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
1363{
1364 if (*mw) {
1365 list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
1366 *mw = NULL;
1367 }
1368}
1369
1370/* Cycle mw's back in reverse order, and "spin" them.
1371 * This delays and scrambles reuse as much as possible.
1372 */
1373static void
1374rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1375{
1376 struct rpcrdma_mr_seg *seg = req->rl_segments;
1377 struct rpcrdma_mr_seg *seg1 = seg;
1378 int i;
1379
1380 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
1381 rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf);
1382 rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf);
1383}
1384
1385static void
1386rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1387{
1388 buf->rb_send_bufs[--buf->rb_send_index] = req;
1389 req->rl_niovs = 0;
1390 if (req->rl_reply) {
1391 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
1392 req->rl_reply->rr_func = NULL;
1393 req->rl_reply = NULL;
1394 }
1395}
1396
1397/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
1398 * Redo only the ib_post_send().
1399 */
1400static void
1401rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
1402{
1403 struct rpcrdma_xprt *r_xprt =
1404 container_of(ia, struct rpcrdma_xprt, rx_ia);
1405 struct ib_send_wr invalidate_wr, *bad_wr;
1406 int rc;
1407
1408 dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
1409
1410 /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
1411 r->r.frmr.fr_state = FRMR_IS_INVALID;
1412
1413 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1414 invalidate_wr.wr_id = (unsigned long)(void *)r;
1415 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1416 invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
1417 DECR_CQCOUNT(&r_xprt->rx_ep);
1418
1419 dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
1420 __func__, r, r->r.frmr.fr_mr->rkey);
1421
1422 read_lock(&ia->ri_qplock);
1423 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1424 read_unlock(&ia->ri_qplock);
1425 if (rc) {
1426 /* Force rpcrdma_buffer_get() to retry */
1427 r->r.frmr.fr_state = FRMR_IS_STALE;
1428 dprintk("RPC: %s: ib_post_send failed, %i\n",
1429 __func__, rc);
1430 }
1431}
1432
1433static void
1434rpcrdma_retry_flushed_linv(struct list_head *stale,
1435 struct rpcrdma_buffer *buf)
1436{
1437 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1438 struct list_head *pos;
1439 struct rpcrdma_mw *r;
1440 unsigned long flags;
1441
1442 list_for_each(pos, stale) {
1443 r = list_entry(pos, struct rpcrdma_mw, mw_list);
1444 rpcrdma_retry_local_inv(r, ia);
1445 }
1446
1447 spin_lock_irqsave(&buf->rb_lock, flags);
1448 list_splice_tail(stale, &buf->rb_mws);
1449 spin_unlock_irqrestore(&buf->rb_lock, flags);
1450}
1451
1452static struct rpcrdma_req *
1453rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1454 struct list_head *stale)
1455{
1456 struct rpcrdma_mw *r;
1457 int i;
1458
1459 i = RPCRDMA_MAX_SEGS - 1;
1460 while (!list_empty(&buf->rb_mws)) {
1461 r = list_entry(buf->rb_mws.next,
1462 struct rpcrdma_mw, mw_list);
1463 list_del(&r->mw_list);
1464 if (r->r.frmr.fr_state == FRMR_IS_STALE) {
1465 list_add(&r->mw_list, stale);
1466 continue;
1467 }
1468 req->rl_segments[i].mr_chunk.rl_mw = r;
1469 if (unlikely(i-- == 0))
1470 return req; /* Success */
1471 }
1472
1473 /* Not enough entries on rb_mws for this req */
1474 rpcrdma_buffer_put_sendbuf(req, buf);
1475 rpcrdma_buffer_put_mrs(req, buf);
1476 return NULL;
1477}
1478
1479static struct rpcrdma_req *
1480rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1481{
1482 struct rpcrdma_mw *r;
1483 int i;
1484
1485 i = RPCRDMA_MAX_SEGS - 1;
1486 while (!list_empty(&buf->rb_mws)) {
1487 r = list_entry(buf->rb_mws.next,
1488 struct rpcrdma_mw, mw_list);
1489 list_del(&r->mw_list);
1490 req->rl_segments[i].mr_chunk.rl_mw = r;
1491 if (unlikely(i-- == 0))
1492 return req; /* Success */
1493 }
1494
1495 /* Not enough entries on rb_mws for this req */
1496 rpcrdma_buffer_put_sendbuf(req, buf);
1497 rpcrdma_buffer_put_mrs(req, buf);
1498 return NULL;
1499}
1500
1501/*
1502 * Get a set of request/reply buffers.
1503 *
1504 * Reply buffer (if needed) is attached to send buffer upon return.
1505 * Rule:
1506 * rb_send_index and rb_recv_index MUST always be pointing to the
1507 * *next* available buffer (non-NULL). They are incremented after
1508 * removing buffers, and decremented *before* returning them.
1509 */
1510struct rpcrdma_req *
1511rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1512{
1513 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1514 struct list_head stale;
1515 struct rpcrdma_req *req;
1516 unsigned long flags;
1517
1518 spin_lock_irqsave(&buffers->rb_lock, flags);
1519 if (buffers->rb_send_index == buffers->rb_max_requests) {
1520 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1521 dprintk("RPC: %s: out of request buffers\n", __func__);
1522 return ((struct rpcrdma_req *)NULL);
1523 }
1524
1525 req = buffers->rb_send_bufs[buffers->rb_send_index];
1526 if (buffers->rb_send_index < buffers->rb_recv_index) {
1527 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1528 __func__,
1529 buffers->rb_recv_index - buffers->rb_send_index);
1530 req->rl_reply = NULL;
1531 } else {
1532 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1533 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1534 }
1535 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1536
1537 INIT_LIST_HEAD(&stale);
1538 switch (ia->ri_memreg_strategy) {
1539 case RPCRDMA_FRMR:
1540 req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
1541 break;
1542 case RPCRDMA_MTHCAFMR:
1543 req = rpcrdma_buffer_get_fmrs(req, buffers);
1544 break;
1545 default:
1546 break;
1547 }
1548 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1549 if (!list_empty(&stale))
1550 rpcrdma_retry_flushed_linv(&stale, buffers);
1551 return req;
1552}
1553
1554/*
1555 * Put request/reply buffers back into pool.
1556 * Pre-decrement counter/array index.
1557 */
1558void
1559rpcrdma_buffer_put(struct rpcrdma_req *req)
1560{
1561 struct rpcrdma_buffer *buffers = req->rl_buffer;
1562 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1563 unsigned long flags;
1564
1565 spin_lock_irqsave(&buffers->rb_lock, flags);
1566 rpcrdma_buffer_put_sendbuf(req, buffers);
1567 switch (ia->ri_memreg_strategy) {
1568 case RPCRDMA_FRMR:
1569 case RPCRDMA_MTHCAFMR:
1570 rpcrdma_buffer_put_mrs(req, buffers);
1571 break;
1572 default:
1573 break;
1574 }
1575 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1576}
1577
1578/*
1579 * Recover reply buffers from pool.
1580 * This happens when recovering from error conditions.
1581 * Post-increment counter/array index.
1582 */
1583void
1584rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1585{
1586 struct rpcrdma_buffer *buffers = req->rl_buffer;
1587 unsigned long flags;
1588
1589 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1590 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1591 spin_lock_irqsave(&buffers->rb_lock, flags);
1592 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1593 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1594 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1595 }
1596 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1597}
1598
1599/*
1600 * Put reply buffers back into pool when not attached to
1601 * request. This happens in error conditions.
1602 */
1603void
1604rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1605{
1606 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1607 unsigned long flags;
1608
1609 rep->rr_func = NULL;
1610 spin_lock_irqsave(&buffers->rb_lock, flags);
1611 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1612 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1613}
1614
1615/*
1616 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1617 */
1618
1619int
1620rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1621 struct ib_mr **mrp, struct ib_sge *iov)
1622{
1623 struct ib_phys_buf ipb;
1624 struct ib_mr *mr;
1625 int rc;
1626
1627 /*
1628 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1629 */
1630 iov->addr = ib_dma_map_single(ia->ri_id->device,
1631 va, len, DMA_BIDIRECTIONAL);
1632 if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
1633 return -ENOMEM;
1634
1635 iov->length = len;
1636
1637 if (ia->ri_have_dma_lkey) {
1638 *mrp = NULL;
1639 iov->lkey = ia->ri_dma_lkey;
1640 return 0;
1641 } else if (ia->ri_bind_mem != NULL) {
c56c65fb
TT
1642 *mrp = NULL;
1643 iov->lkey = ia->ri_bind_mem->lkey;
1644 return 0;
1645 }
1646
1647 ipb.addr = iov->addr;
1648 ipb.size = iov->length;
1649 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1650 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1651
1652 dprintk("RPC: %s: phys convert: 0x%llx "
1653 "registered 0x%llx length %d\n",
1654 __func__, (unsigned long long)ipb.addr,
1655 (unsigned long long)iov->addr, len);
1656
1657 if (IS_ERR(mr)) {
1658 *mrp = NULL;
1659 rc = PTR_ERR(mr);
1660 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1661 } else {
1662 *mrp = mr;
1663 iov->lkey = mr->lkey;
1664 rc = 0;
1665 }
1666
1667 return rc;
1668}
1669
1670int
1671rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1672 struct ib_mr *mr, struct ib_sge *iov)
1673{
1674 int rc;
1675
1676 ib_dma_unmap_single(ia->ri_id->device,
1677 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1678
1679 if (NULL == mr)
1680 return 0;
1681
1682 rc = ib_dereg_mr(mr);
1683 if (rc)
1684 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1685 return rc;
1686}
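/* Illustrative use (a sketch added for clarity, not part of the original
 * file): buffer setup registers each kmalloc'ed request or reply buffer
 * once and keeps the handle/iov pair for later posting, roughly as
 * rpcrdma_buffer_create() does:
 *
 *	rc = rpcrdma_register_internal(ia, req->rl_base, len,
 *				       &req->rl_handle, &req->rl_iov);
 *	...
 *	rpcrdma_deregister_internal(ia, req->rl_handle, &req->rl_iov);
 */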
1687
1688/*
1689 * Wrappers for chunk registration, shared by read/write chunk code.
1690 */
1691
1692static void
1693rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1694{
1695 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1696 seg->mr_dmalen = seg->mr_len;
1697 if (seg->mr_page)
1698 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1699 seg->mr_page, offset_in_page(seg->mr_offset),
1700 seg->mr_dmalen, seg->mr_dir);
1701 else
1702 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1703 seg->mr_offset,
1704 seg->mr_dmalen, seg->mr_dir);
1705 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1706 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1707 __func__,
1708 (unsigned long long)seg->mr_dma,
1709 seg->mr_offset, seg->mr_dmalen);
1710 }
1711}
1712
1713static void
1714rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1715{
1716 if (seg->mr_page)
1717 ib_dma_unmap_page(ia->ri_id->device,
1718 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1719 else
1720 ib_dma_unmap_single(ia->ri_id->device,
1721 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1722}
1723
1724static int
1725rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1726 int *nsegs, int writing, struct rpcrdma_ia *ia,
1727 struct rpcrdma_xprt *r_xprt)
1728{
1729 struct rpcrdma_mr_seg *seg1 = seg;
1730 struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw;
1731 struct rpcrdma_frmr *frmr = &mw->r.frmr;
1732 struct ib_mr *mr = frmr->fr_mr;
1733 struct ib_send_wr fastreg_wr, *bad_wr;
1734 u8 key;
1735 int len, pageoff;
1736 int i, rc;
1737 int seg_len;
1738 u64 pa;
1739 int page_no;
1740
1741 pageoff = offset_in_page(seg1->mr_offset);
1742 seg1->mr_offset -= pageoff; /* start of page */
1743 seg1->mr_len += pageoff;
1744 len = -pageoff;
1745 if (*nsegs > ia->ri_max_frmr_depth)
1746 *nsegs = ia->ri_max_frmr_depth;
1747 for (page_no = i = 0; i < *nsegs;) {
1748 rpcrdma_map_one(ia, seg, writing);
1749 pa = seg->mr_dma;
1750 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1751 frmr->fr_pgl->page_list[page_no++] = pa;
1752 pa += PAGE_SIZE;
1753 }
1754 len += seg->mr_len;
1755 ++seg;
1756 ++i;
1757 /* Check for holes */
1758 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1759 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1760 break;
1761 }
1762 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1763 __func__, mw, i);
1764
1765 frmr->fr_state = FRMR_IS_VALID;
1766
1767 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
1768 fastreg_wr.wr_id = (unsigned long)(void *)mw;
1769 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1770 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1771 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
1772 fastreg_wr.wr.fast_reg.page_list_len = page_no;
1773 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1774 fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1775 if (fastreg_wr.wr.fast_reg.length < len) {
1776 rc = -EIO;
1777 goto out_err;
1778 }
1779
1780 /* Bump the key */
1781 key = (u8)(mr->rkey & 0x000000FF);
1782 ib_update_fast_reg_key(mr, ++key);
1783
1784 fastreg_wr.wr.fast_reg.access_flags = (writing ?
1785 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1786 IB_ACCESS_REMOTE_READ);
1787 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
1788 DECR_CQCOUNT(&r_xprt->rx_ep);
1789
1790 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
1791 if (rc) {
1792 dprintk("RPC: %s: failed ib_post_send for register,"
1793 " status %i\n", __func__, rc);
1794 ib_update_fast_reg_key(mr, --key);
1795 goto out_err;
1796 } else {
1797 seg1->mr_rkey = mr->rkey;
1798 seg1->mr_base = seg1->mr_dma + pageoff;
1799 seg1->mr_nsegs = i;
1800 seg1->mr_len = len;
1801 }
1802 *nsegs = i;
1803 return 0;
1804out_err:
1805 frmr->fr_state = FRMR_IS_INVALID;
1806 while (i--)
1807 rpcrdma_unmap_one(ia, --seg);
1808 return rc;
1809}
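/* Note added for clarity (not in the original): ib_update_fast_reg_key()
 * rewrites only the low-order byte of the MR's rkey.  Bumping it before
 * each FAST_REG_MR distinguishes the new registration from any earlier
 * one the peer may still hold, and the "--key" in the error path undoes
 * the bump when the work request was never posted.
 */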
1810
1811static int
1812rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1813 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1814{
1815 struct rpcrdma_mr_seg *seg1 = seg;
1816 struct ib_send_wr invalidate_wr, *bad_wr;
1817 int rc;
1818
1819 seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
1820
1821 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1822 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1823 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1824 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1825 DECR_CQCOUNT(&r_xprt->rx_ep);
1826
1827 read_lock(&ia->ri_qplock);
1828 while (seg1->mr_nsegs--)
1829 rpcrdma_unmap_one(ia, seg++);
1830 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1831 read_unlock(&ia->ri_qplock);
1832 if (rc) {
1833 /* Force rpcrdma_buffer_get() to retry */
1834 seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
1835 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1836 " status %i\n", __func__, rc);
1837 }
1838 return rc;
1839}
1840
1841static int
1842rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1843 int *nsegs, int writing, struct rpcrdma_ia *ia)
1844{
1845 struct rpcrdma_mr_seg *seg1 = seg;
1846 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1847 int len, pageoff, i, rc;
1848
1849 pageoff = offset_in_page(seg1->mr_offset);
1850 seg1->mr_offset -= pageoff; /* start of page */
1851 seg1->mr_len += pageoff;
1852 len = -pageoff;
1853 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1854 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1855 for (i = 0; i < *nsegs;) {
1856 rpcrdma_map_one(ia, seg, writing);
1857 physaddrs[i] = seg->mr_dma;
1858 len += seg->mr_len;
1859 ++seg;
1860 ++i;
1861 /* Check for holes */
1862 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1863 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1864 break;
1865 }
1866 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1867 physaddrs, i, seg1->mr_dma);
1868 if (rc) {
1869 dprintk("RPC: %s: failed ib_map_phys_fmr "
1870 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1871 len, (unsigned long long)seg1->mr_dma,
1872 pageoff, i, rc);
1873 while (i--)
1874 rpcrdma_unmap_one(ia, --seg);
1875 } else {
1876 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1877 seg1->mr_base = seg1->mr_dma + pageoff;
1878 seg1->mr_nsegs = i;
1879 seg1->mr_len = len;
1880 }
1881 *nsegs = i;
1882 return rc;
1883}
1884
1885static int
1886rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1887 struct rpcrdma_ia *ia)
1888{
1889 struct rpcrdma_mr_seg *seg1 = seg;
1890 LIST_HEAD(l);
1891 int rc;
1892
1893 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1894 rc = ib_unmap_fmr(&l);
1895 read_lock(&ia->ri_qplock);
1896 while (seg1->mr_nsegs--)
1897 rpcrdma_unmap_one(ia, seg++);
1898 read_unlock(&ia->ri_qplock);
1899 if (rc)
1900 dprintk("RPC: %s: failed ib_unmap_fmr,"
1901 " status %i\n", __func__, rc);
1902 return rc;
1903}
1904
1905int
1906rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1907 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1908{
1909 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1910 int rc = 0;
1911
1912 switch (ia->ri_memreg_strategy) {
1913
1914#if RPCRDMA_PERSISTENT_REGISTRATION
1915 case RPCRDMA_ALLPHYSICAL:
1916 rpcrdma_map_one(ia, seg, writing);
1917 seg->mr_rkey = ia->ri_bind_mem->rkey;
1918 seg->mr_base = seg->mr_dma;
1919 seg->mr_nsegs = 1;
1920 nsegs = 1;
1921 break;
1922#endif
1923
3197d309
TT
1924 /* Registration using frmr registration */
1925 case RPCRDMA_FRMR:
1926 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1927 break;
1928
1929 /* Registration using fmr memory registration */
1930 case RPCRDMA_MTHCAFMR:
1931 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1932 break;
1933
1934 default:
1935 return -1;
1936 }
1937 if (rc)
1938 return -1;
1939
1940 return nsegs;
1941}
1942
1943int
1944rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1945 struct rpcrdma_xprt *r_xprt)
1946{
1947 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1948 int nsegs = seg->mr_nsegs, rc;
1949
1950 switch (ia->ri_memreg_strategy) {
1951
1952#if RPCRDMA_PERSISTENT_REGISTRATION
1953 case RPCRDMA_ALLPHYSICAL:
1954 read_lock(&ia->ri_qplock);
1955 rpcrdma_unmap_one(ia, seg);
1956 read_unlock(&ia->ri_qplock);
1957 break;
1958#endif
1959
1960 case RPCRDMA_FRMR:
1961 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1962 break;
1963
1964 case RPCRDMA_MTHCAFMR:
1965 rc = rpcrdma_deregister_fmr_external(seg, ia);
1966 break;
1967
1968 default:
1969 break;
1970 }
1971 return nsegs;
1972}
1973
1974/*
1975 * Prepost any receive buffer, then post send.
1976 *
1977 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1978 */
1979int
1980rpcrdma_ep_post(struct rpcrdma_ia *ia,
1981 struct rpcrdma_ep *ep,
1982 struct rpcrdma_req *req)
1983{
1984 struct ib_send_wr send_wr, *send_wr_fail;
1985 struct rpcrdma_rep *rep = req->rl_reply;
1986 int rc;
1987
1988 if (rep) {
1989 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1990 if (rc)
1991 goto out;
1992 req->rl_reply = NULL;
1993 }
1994
1995 send_wr.next = NULL;
1996 send_wr.wr_id = 0ULL; /* no send cookie */
1997 send_wr.sg_list = req->rl_send_iov;
1998 send_wr.num_sge = req->rl_niovs;
1999 send_wr.opcode = IB_WR_SEND;
2000 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
2001 ib_dma_sync_single_for_device(ia->ri_id->device,
2002 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
2003 DMA_TO_DEVICE);
2004 ib_dma_sync_single_for_device(ia->ri_id->device,
2005 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
2006 DMA_TO_DEVICE);
2007 ib_dma_sync_single_for_device(ia->ri_id->device,
2008 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
2009 DMA_TO_DEVICE);
2010
2011 if (DECR_CQCOUNT(ep) > 0)
2012 send_wr.send_flags = 0;
2013 else { /* Provider must take a send completion every now and then */
2014 INIT_CQCOUNT(ep);
2015 send_wr.send_flags = IB_SEND_SIGNALED;
2016 }
2017
2018 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
2019 if (rc)
2020 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
2021 rc);
2022out:
2023 return rc;
2024}
2025
2026/*
2027 * (Re)post a receive buffer.
2028 */
2029int
2030rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
2031 struct rpcrdma_ep *ep,
2032 struct rpcrdma_rep *rep)
2033{
2034 struct ib_recv_wr recv_wr, *recv_wr_fail;
2035 int rc;
2036
2037 recv_wr.next = NULL;
2038 recv_wr.wr_id = (u64) (unsigned long) rep;
2039 recv_wr.sg_list = &rep->rr_iov;
2040 recv_wr.num_sge = 1;
2041
2042 ib_dma_sync_single_for_cpu(ia->ri_id->device,
2043 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
2044
2045 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
2046
2047 if (rc)
2048 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
2049 rc);
2050 return rc;
2051}
2052
2053/* Physical mapping means one Read/Write list entry per-page.
2054 * All list entries must fit within an inline buffer
2055 *
2056 * NB: The server must return a Write list for NFS READ,
2057 * which has the same constraint. Factor in the inline
2058 * rsize as well.
2059 */
2060static size_t
2061rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
2062{
2063 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
2064 unsigned int inline_size, pages;
2065
2066 inline_size = min_t(unsigned int,
2067 cdata->inline_wsize, cdata->inline_rsize);
2068 inline_size -= RPCRDMA_HDRLEN_MIN;
2069 pages = inline_size / sizeof(struct rpcrdma_segment);
2070 return pages << PAGE_SHIFT;
2071}
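/* Worked example added for clarity (not in the original), assuming
 * 1024-byte inline buffers, a 28-byte RPCRDMA_HDRLEN_MIN, a 16-byte
 * struct rpcrdma_segment and 4KB pages: inline_size is 996, which holds
 * 62 segments, so the ALLPHYSICAL strategy caps the payload at 62 pages
 * (253,952 bytes).
 */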
2072
2073static size_t
2074rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
2075{
2076 return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
2077}
2078
2079size_t
2080rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
2081{
2082 size_t result;
2083
2084 switch (r_xprt->rx_ia.ri_memreg_strategy) {
2085 case RPCRDMA_ALLPHYSICAL:
2086 result = rpcrdma_physical_max_payload(r_xprt);
2087 break;
2088 default:
2089 result = rpcrdma_mr_max_payload(r_xprt);
2090 }
2091 return result;
2092}