// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright (c) 2016-2018 Oracle. All rights reserved.
 * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Tom Tucker <tom@opengridcomputing.com>
 */

/* Operation
 *
 * The main entry point is svc_rdma_sendto. This is called by the
 * RPC server when an RPC Reply is ready to be transmitted to a client.
 *
 * The passed-in svc_rqst contains a struct xdr_buf which holds an
 * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA
 * transport header, post all Write WRs needed for this Reply, then post
 * a Send WR conveying the transport header and the RPC message itself to
 * the client.
 *
 * svc_rdma_sendto must fully transmit the Reply before returning, as
 * the svc_rqst will be recycled as soon as sendto returns. Remaining
 * resources referred to by the svc_rqst are also recycled at that time.
 * Therefore any resources that must remain longer must be detached
 * from the svc_rqst and released later.
 *
 * Page Management
 *
 * The I/O that performs Reply transmission is asynchronous, and may
 * complete well after sendto returns. Thus pages under I/O must be
 * removed from the svc_rqst before sendto returns.
 *
 * The logic here depends on Send Queue and completion ordering. Since
 * the Send WR is always posted last, it will always complete last. Thus
 * when it completes, it is guaranteed that all previous Write WRs have
 * also completed.
 *
 * Write WRs are constructed and posted. Each Write segment gets its own
 * svc_rdma_rw_ctxt, allowing the Write completion handler to find and
 * DMA-unmap the pages under I/O for that Write segment. The Write
 * completion handler does not release any pages.
 *
 * When the Send WR is constructed, it also gets its own svc_rdma_send_ctxt.
 * The ownership of all of the Reply's pages is transferred into that
 * ctxt, the Send WR is posted, and sendto returns.
 *
 * The svc_rdma_send_ctxt is presented when the Send WR completes. The
 * Send completion handler finally releases the Reply's pages.
 *
 * This mechanism also assumes that completions on the transport's Send
 * Completion Queue do not run in parallel. Otherwise a Write completion
 * and Send completion running at the same time could release pages that
 * are still DMA-mapped.
 *
 * Error Handling
 *
 * - If the Send WR is posted successfully, it will either complete
 *   successfully, or get flushed. Either way, the Send completion
 *   handler releases the Reply's pages.
 * - If the Send WR cannot be posted, the forward path releases
 *   the Reply's pages.
 *
 * This handles the case, without the use of page reference counting,
 * where two different Write segments send portions of the same page.
 */

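/* Illustrative timeline of the ordering described above:
 *
 *	posted:	    Write WR 1, Write WR 2, ..., Send WR
 *	completed:  Write WC 1, Write WC 2, ..., Send WC
 *
 * Each Write completion DMA-unmaps the pages of its own Write segment;
 * the final Send completion unmaps the Send's SGEs and releases all of
 * the Reply's pages.
 */
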
#include <linux/spinlock.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>

#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

#define RPCDBG_FACILITY	RPCDBG_SVCXPRT

static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc);

static inline struct svc_rdma_send_ctxt *
svc_rdma_next_send_ctxt(struct list_head *list)
{
	return list_first_entry_or_null(list, struct svc_rdma_send_ctxt,
					sc_list);
}

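/* A send_ctxt and its SGE array are carved from a single allocation,
 * sized to the device's sc_max_send_sges. The transport-header buffer
 * is mapped for DMA once here and stays mapped until the ctxt is
 * destroyed; only sc_sges[0].addr ever points at it.
 */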
static struct svc_rdma_send_ctxt *
svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
{
	struct svc_rdma_send_ctxt *ctxt;
	dma_addr_t addr;
	void *buffer;
	size_t size;
	int i;

	size = sizeof(*ctxt);
	size += rdma->sc_max_send_sges * sizeof(struct ib_sge);
	ctxt = kmalloc(size, GFP_KERNEL);
	if (!ctxt)
		goto fail0;
	buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL);
	if (!buffer)
		goto fail1;
	addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
				 rdma->sc_max_req_size, DMA_TO_DEVICE);
	if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
		goto fail2;

	ctxt->sc_send_wr.next = NULL;
	ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe;
	ctxt->sc_send_wr.sg_list = ctxt->sc_sges;
	ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED;
	ctxt->sc_cqe.done = svc_rdma_wc_send;
	ctxt->sc_xprt_buf = buffer;
	ctxt->sc_sges[0].addr = addr;

	for (i = 0; i < rdma->sc_max_send_sges; i++)
		ctxt->sc_sges[i].lkey = rdma->sc_pd->local_dma_lkey;
	return ctxt;

fail2:
	kfree(buffer);
fail1:
	kfree(ctxt);
fail0:
	return NULL;
}

/**
 * svc_rdma_send_ctxts_destroy - Release all send_ctxt's for an xprt
 * @rdma: svcxprt_rdma being torn down
 *
 */
void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma)
{
	struct svc_rdma_send_ctxt *ctxt;

	while ((ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts))) {
		list_del(&ctxt->sc_list);
		ib_dma_unmap_single(rdma->sc_pd->device,
				    ctxt->sc_sges[0].addr,
				    rdma->sc_max_req_size,
				    DMA_TO_DEVICE);
		kfree(ctxt->sc_xprt_buf);
		kfree(ctxt);
	}
}

/**
 * svc_rdma_send_ctxt_get - Get a free send_ctxt
 * @rdma: controlling svcxprt_rdma
 *
 * Returns a ready-to-use send_ctxt, or NULL if none are
 * available and a fresh one cannot be allocated.
 */
struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma)
{
	struct svc_rdma_send_ctxt *ctxt;

	spin_lock(&rdma->sc_send_lock);
	ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts);
	if (!ctxt)
		goto out_empty;
	list_del(&ctxt->sc_list);
	spin_unlock(&rdma->sc_send_lock);

out:
	ctxt->sc_send_wr.num_sge = 0;
	ctxt->sc_cur_sge_no = 0;
	ctxt->sc_page_count = 0;
	return ctxt;

out_empty:
	spin_unlock(&rdma->sc_send_lock);
	ctxt = svc_rdma_send_ctxt_alloc(rdma);
	if (!ctxt)
		return NULL;
	goto out;
}

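/* Typical caller flow, in sketch form (svc_rdma_sendto below follows
 * this pattern):
 *
 *	sctxt = svc_rdma_send_ctxt_get(rdma);
 *	if (!sctxt)
 *		return -ENOMEM;
 *	... fill in sctxt->sc_sges[] and sctxt->sc_send_wr ...
 *	ret = svc_rdma_send(rdma, &sctxt->sc_send_wr);
 *	if (ret)
 *		svc_rdma_send_ctxt_put(rdma, sctxt);
 *
 * On success, the ctxt is returned to the free list by the Send
 * completion handler rather than by the caller.
 */
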
/**
 * svc_rdma_send_ctxt_put - Return send_ctxt to free list
 * @rdma: controlling svcxprt_rdma
 * @ctxt: object to return to the free list
 *
 * Pages left in sc_pages are DMA unmapped and released.
 */
void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
			    struct svc_rdma_send_ctxt *ctxt)
{
	struct ib_device *device = rdma->sc_cm_id->device;
	unsigned int i;

	/* The first SGE contains the transport header, which
	 * remains mapped until @ctxt is destroyed.
	 */
	for (i = 1; i < ctxt->sc_send_wr.num_sge; i++)
		ib_dma_unmap_page(device,
				  ctxt->sc_sges[i].addr,
				  ctxt->sc_sges[i].length,
				  DMA_TO_DEVICE);

	for (i = 0; i < ctxt->sc_page_count; ++i)
		put_page(ctxt->sc_pages[i]);

	spin_lock(&rdma->sc_send_lock);
	list_add(&ctxt->sc_list, &rdma->sc_send_ctxts);
	spin_unlock(&rdma->sc_send_lock);
}

/**
 * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC
 * @cq: Completion Queue context
 * @wc: Work Completion object
 *
 * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that
 * the Send completion handler could be running.
 */
static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
{
	struct svcxprt_rdma *rdma = cq->cq_context;
	struct ib_cqe *cqe = wc->wr_cqe;
	struct svc_rdma_send_ctxt *ctxt;

	trace_svcrdma_wc_send(wc);

	atomic_inc(&rdma->sc_sq_avail);
	wake_up(&rdma->sc_send_wait);

	ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe);
	svc_rdma_send_ctxt_put(rdma, ctxt);

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
		svc_xprt_enqueue(&rdma->sc_xprt);
	}

	svc_xprt_put(&rdma->sc_xprt);
}

/**
 * svc_rdma_send - Post a single Send WR
 * @rdma: transport on which to post the WR
 * @wr: prepared Send WR to post
 *
 * Returns zero if the Send WR was posted successfully. Otherwise, a
 * negative errno is returned.
 */
int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr)
{
	int ret;

	might_sleep();

	/* If the SQ is full, wait until an SQ entry is available */
	while (1) {
		if (atomic_dec_return(&rdma->sc_sq_avail) < 0) {
			atomic_inc(&rdma_stat_sq_starve);
			trace_svcrdma_sq_full(rdma);
			atomic_inc(&rdma->sc_sq_avail);
			wait_event(rdma->sc_send_wait,
				   atomic_read(&rdma->sc_sq_avail) > 1);
			if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
				return -ENOTCONN;
			trace_svcrdma_sq_retry(rdma);
			continue;
		}

		svc_xprt_get(&rdma->sc_xprt);
		ret = ib_post_send(rdma->sc_qp, wr, NULL);
		trace_svcrdma_post_send(wr, ret);
		if (ret) {
			set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
			svc_xprt_put(&rdma->sc_xprt);
			wake_up(&rdma->sc_send_wait);
		}
		break;
	}
	return ret;
}

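/* XDR encodes data items in multiples of four bytes. For example,
 * xdr_padsize(5) is 3, and xdr_padsize(8) is 0.
 */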
static u32 xdr_padsize(u32 len)
{
	return (len & 3) ? (4 - (len & 3)) : 0;
}

/* Returns length of transport header, in bytes.
 */
static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp)
{
	unsigned int nsegs;
	__be32 *p;

	p = rdma_resp;

	/* RPC-over-RDMA V1 replies never have a Read list. */
	p += rpcrdma_fixed_maxsz + 1;

	/* Skip Write list. */
	while (*p++ != xdr_zero) {
		nsegs = be32_to_cpup(p++);
		p += nsegs * rpcrdma_segment_maxsz;
	}

	/* Skip Reply chunk. */
	if (*p++ != xdr_zero) {
		nsegs = be32_to_cpup(p++);
		p += nsegs * rpcrdma_segment_maxsz;
	}

	return (unsigned long)p - (unsigned long)rdma_resp;
}

/* One Write chunk is copied from the Call transport header to the
 * Reply transport header. Each segment's length field is updated to
 * reflect the number of bytes consumed in that segment.
 *
 * Returns the number of segments in this chunk.
 */
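/* Worked example with illustrative numbers: given a chunk of two
 * 4096-byte segments and remaining = 5000, the first segment's length
 * field is left at 4096, the second is rewritten to 904, and any
 * further segments would be rewritten to zero.
 */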
static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src,
					   unsigned int remaining)
{
	unsigned int i, nsegs;
	u32 seg_len;

	/* Write list discriminator */
	*dst++ = *src++;

	/* number of segments in this chunk */
	nsegs = be32_to_cpup(src);
	*dst++ = *src++;

	for (i = nsegs; i; i--) {
		/* segment's RDMA handle */
		*dst++ = *src++;

		/* bytes returned in this segment */
		seg_len = be32_to_cpu(*src);
		if (remaining >= seg_len) {
			/* entire segment was consumed */
			*dst = *src;
			remaining -= seg_len;
		} else {
			/* segment only partly filled */
			*dst = cpu_to_be32(remaining);
			remaining = 0;
		}
		dst++; src++;

		/* segment's RDMA offset */
		*dst++ = *src++;
		*dst++ = *src++;
	}

	return nsegs;
}

/* The client provided a Write list in the Call message. Fill in
 * the segments in the first Write chunk in the Reply's transport
 * header with the number of bytes consumed in each segment.
 * Remaining chunks are returned unused.
 *
 * Assumptions:
 *  - Client has provided only one Write chunk
 */
static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch,
					   unsigned int consumed)
{
	unsigned int nsegs;
	__be32 *p, *q;

	/* RPC-over-RDMA V1 replies never have a Read list. */
	p = rdma_resp + rpcrdma_fixed_maxsz + 1;

	q = wr_ch;
	while (*q != xdr_zero) {
		nsegs = xdr_encode_write_chunk(p, q, consumed);
		q += 2 + nsegs * rpcrdma_segment_maxsz;
		p += 2 + nsegs * rpcrdma_segment_maxsz;
		consumed = 0;
	}

	/* Terminate Write list */
	*p++ = xdr_zero;

	/* Reply chunk discriminator; may be replaced later */
	*p = xdr_zero;
}

/* The client provided a Reply chunk in the Call message. Fill in
 * the segments in the Reply chunk in the Reply message with the
 * number of bytes consumed in each segment.
 *
 * Assumptions:
 * - Reply can always fit in the provided Reply chunk
 */
static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch,
					    unsigned int consumed)
{
	__be32 *p;

	/* Find the Reply chunk in the Reply's xprt header.
	 * RPC-over-RDMA V1 replies never have a Read list.
	 */
	p = rdma_resp + rpcrdma_fixed_maxsz + 1;

	/* Skip past Write list */
	while (*p++ != xdr_zero)
		p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;

	xdr_encode_write_chunk(p, rp_ch, consumed);
}

/* Parse the RPC Call's transport header.
 */
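/* For reference, an RPC-over-RDMA version 1 transport header is laid
 * out in XDR words roughly as follows (see RFC 8166):
 *
 *	xid, vers, credits, proc	(rpcrdma_fixed_maxsz words)
 *	Read list:	{ 1, position, handle, length, offset[2] } ... 0
 *	Write list:	{ 1, nsegs, { handle, length, offset[2] } x nsegs } ... 0
 *	Reply chunk:	0, or { 1, nsegs, { handle, length, offset[2] } x nsegs }
 */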
static void svc_rdma_get_write_arrays(__be32 *rdma_argp,
				      __be32 **write, __be32 **reply)
{
	__be32 *p;

	p = rdma_argp + rpcrdma_fixed_maxsz;

	/* Read list */
	while (*p++ != xdr_zero)
		p += 5;

	/* Write list */
	if (*p != xdr_zero) {
		*write = p;
		while (*p++ != xdr_zero)
			p += 1 + be32_to_cpu(*p) * 4;
	} else {
		*write = NULL;
		p++;
	}

	/* Reply chunk */
	if (*p != xdr_zero)
		*reply = p;
	else
		*reply = NULL;
}

static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
				 struct svc_rdma_send_ctxt *ctxt,
				 struct page *page,
				 unsigned long offset,
				 unsigned int len)
{
	struct ib_device *dev = rdma->sc_cm_id->device;
	dma_addr_t dma_addr;

	dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
	if (ib_dma_mapping_error(dev, dma_addr))
		goto out_maperr;

	ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr;
	ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len;
	ctxt->sc_send_wr.num_sge++;
	return 0;

out_maperr:
	trace_svcrdma_dma_map_page(rdma, page);
	return -EIO;
}

/* ib_dma_map_page() is used here because svc_rdma_send_ctxt_put()
 * handles DMA-unmap and it uses ib_dma_unmap_page() exclusively.
 */
static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
				struct svc_rdma_send_ctxt *ctxt,
				unsigned char *base,
				unsigned int len)
{
	return svc_rdma_dma_map_page(rdma, ctxt, virt_to_page(base),
				     offset_in_page(base), len);
}

/**
 * svc_rdma_sync_reply_hdr - DMA sync the transport header buffer
 * @rdma: controlling transport
 * @ctxt: send_ctxt for the Send WR
 * @len: length of transport header
 *
 */
void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma,
			     struct svc_rdma_send_ctxt *ctxt,
			     unsigned int len)
{
	ctxt->sc_sges[0].length = len;
	ctxt->sc_send_wr.num_sge++;
	ib_dma_sync_single_for_device(rdma->sc_pd->device,
				      ctxt->sc_sges[0].addr, len,
				      DMA_TO_DEVICE);
}

/* If the xdr_buf has more elements than the device can
 * transmit in a single RDMA Send, then the reply will
 * have to be copied into a bounce buffer.
 */
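/* For example (illustrative numbers): on a device advertising only
 * four send SGEs, a reply with a head iovec, three pages of page-list
 * data, and a non-empty tail counts five elements; with one more SGE
 * reserved for the transport header, it cannot be sent directly and
 * must be pulled up.
 */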
static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
				    struct xdr_buf *xdr,
				    __be32 *wr_lst)
{
	int elements;

	/* xdr->head */
	elements = 1;

	/* xdr->pages */
	if (!wr_lst) {
		unsigned int remaining;
		unsigned long pageoff;

		pageoff = xdr->page_base & ~PAGE_MASK;
		remaining = xdr->page_len;
		while (remaining) {
			++elements;
			remaining -= min_t(u32, PAGE_SIZE - pageoff,
					   remaining);
			pageoff = 0;
		}
	}

	/* xdr->tail */
	if (xdr->tail[0].iov_len)
		++elements;

	/* assume 1 SGE is needed for the transport header */
	return elements >= rdma->sc_max_send_sges;
}

/* The device is not capable of sending the reply directly.
 * Assemble the elements of @xdr into the transport header
 * buffer.
 */
static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
				      struct svc_rdma_send_ctxt *ctxt,
				      struct xdr_buf *xdr, __be32 *wr_lst)
{
	unsigned char *dst, *tailbase;
	unsigned int taillen;

	dst = ctxt->sc_xprt_buf;
	dst += ctxt->sc_sges[0].length;

	memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len);
	dst += xdr->head[0].iov_len;

	tailbase = xdr->tail[0].iov_base;
	taillen = xdr->tail[0].iov_len;
	if (wr_lst) {
		u32 xdrpad;

		xdrpad = xdr_padsize(xdr->page_len);
		if (taillen && xdrpad) {
			tailbase += xdrpad;
			taillen -= xdrpad;
		}
	} else {
		unsigned int len, remaining;
		unsigned long pageoff;
		struct page **ppages;

		ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
		pageoff = xdr->page_base & ~PAGE_MASK;
		remaining = xdr->page_len;
		while (remaining) {
			len = min_t(u32, PAGE_SIZE - pageoff, remaining);

			/* Copy from the current page, honoring the page
			 * offset, then advance to the next page.
			 */
			memcpy(dst, page_address(*ppages) + pageoff, len);
			remaining -= len;
			dst += len;
			pageoff = 0;
			ppages++;
		}
	}

	if (taillen)
		memcpy(dst, tailbase, taillen);

	ctxt->sc_sges[0].length += xdr->len;
	ib_dma_sync_single_for_device(rdma->sc_pd->device,
				      ctxt->sc_sges[0].addr,
				      ctxt->sc_sges[0].length,
				      DMA_TO_DEVICE);

	return 0;
}

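/* Sketch of the resulting SGE layout: sc_sges[0] always carries the
 * transport header; the head iovec, each inline page-list page, and
 * the tail iovec then take one SGE apiece.
 */
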
/**
 * svc_rdma_map_reply_msg - Map the buffer holding RPC message
 * @rdma: controlling transport
 * @ctxt: send_ctxt for the Send WR
 * @xdr: prepared xdr_buf containing RPC message
 * @wr_lst: pointer to Call header's Write list, or NULL
 *
 * Load the xdr_buf into the ctxt's sge array, and DMA map each
 * element as it is added.
 *
 * Returns zero on success, or a negative errno on failure.
 */
int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
			   struct svc_rdma_send_ctxt *ctxt,
			   struct xdr_buf *xdr, __be32 *wr_lst)
{
	unsigned int len, remaining;
	unsigned long page_off;
	struct page **ppages;
	unsigned char *base;
	u32 xdr_pad;
	int ret;

	if (svc_rdma_pull_up_needed(rdma, xdr, wr_lst))
		return svc_rdma_pull_up_reply_msg(rdma, ctxt, xdr, wr_lst);

	++ctxt->sc_cur_sge_no;
	ret = svc_rdma_dma_map_buf(rdma, ctxt,
				   xdr->head[0].iov_base,
				   xdr->head[0].iov_len);
	if (ret < 0)
		return ret;

	/* If a Write chunk is present, the xdr_buf's page list
	 * is not included inline. However the Upper Layer may
	 * have added XDR padding in the tail buffer, and that
	 * should not be included inline.
	 */
	if (wr_lst) {
		base = xdr->tail[0].iov_base;
		len = xdr->tail[0].iov_len;
		xdr_pad = xdr_padsize(xdr->page_len);

		if (len && xdr_pad) {
			base += xdr_pad;
			len -= xdr_pad;
		}

		goto tail;
	}

	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
	page_off = xdr->page_base & ~PAGE_MASK;
	remaining = xdr->page_len;
	while (remaining) {
		len = min_t(u32, PAGE_SIZE - page_off, remaining);

		++ctxt->sc_cur_sge_no;
		ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++,
					    page_off, len);
		if (ret < 0)
			return ret;

		remaining -= len;
		page_off = 0;
	}

	base = xdr->tail[0].iov_base;
	len = xdr->tail[0].iov_len;
tail:
	if (len) {
		++ctxt->sc_cur_sge_no;
		ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len);
		if (ret < 0)
			return ret;
	}

	return 0;
}

/* The svc_rqst and all resources it owns are released as soon as
 * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
 * so they are released by the Send completion handler.
 */
static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
				   struct svc_rdma_send_ctxt *ctxt)
{
	int i, pages = rqstp->rq_next_page - rqstp->rq_respages;

	ctxt->sc_page_count += pages;
	for (i = 0; i < pages; i++) {
		ctxt->sc_pages[i] = rqstp->rq_respages[i];
		rqstp->rq_respages[i] = NULL;
	}

	/* Prevent svc_xprt_release from releasing pages in rq_pages */
	rqstp->rq_next_page = rqstp->rq_respages;
}

/* Prepare the portion of the RPC Reply that will be transmitted
 * via RDMA Send. The RPC-over-RDMA transport header is prepared
 * in sc_sges[0], and the RPC xdr_buf is prepared in following sges.
 *
 * Depending on whether a Write list or Reply chunk is present,
 * the server may send all, a portion of, or none of the xdr_buf.
 * In the latter case, only the transport header (sc_sges[0]) is
 * transmitted.
 *
 * RDMA Send is the last step of transmitting an RPC reply. Pages
 * involved in the earlier RDMA Writes are here transferred out
 * of the rqstp and into the sctxt's page array. These pages are
 * DMA unmapped by each Write completion, but the subsequent Send
 * completion finally releases these pages.
 *
 * Assumptions:
 * - The Reply's transport header will never be larger than a page.
 */
static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
				   struct svc_rdma_send_ctxt *sctxt,
				   struct svc_rdma_recv_ctxt *rctxt,
				   struct svc_rqst *rqstp,
				   __be32 *wr_lst, __be32 *rp_ch)
{
	int ret;

	if (!rp_ch) {
		ret = svc_rdma_map_reply_msg(rdma, sctxt,
					     &rqstp->rq_res, wr_lst);
		if (ret < 0)
			return ret;
	}

	svc_rdma_save_io_pages(rqstp, sctxt);

	if (rctxt->rc_inv_rkey) {
		sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV;
		sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey;
	} else {
		sctxt->sc_send_wr.opcode = IB_WR_SEND;
	}
	dprintk("svcrdma: posting Send WR with %u sge(s)\n",
		sctxt->sc_send_wr.num_sge);
	return svc_rdma_send(rdma, &sctxt->sc_send_wr);
}

/* Given the client-provided Write and Reply chunks, the server was not
 * able to form a complete reply. Return an RDMA_ERROR message so the
 * client can retire this RPC transaction. As above, the Send completion
 * routine releases payload pages that were part of a previous RDMA Write.
 *
 * Remote Invalidation is skipped for simplicity.
 */
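/* The error reply reuses the first three header words (xid, vers,
 * credits) that the caller has already written into sc_xprt_buf;
 * only the proc field is overwritten with rdma_error, and the
 * err_chunk error code follows it.
 */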
static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
				   struct svc_rdma_send_ctxt *ctxt,
				   struct svc_rqst *rqstp)
{
	__be32 *p;
	int ret;

	p = ctxt->sc_xprt_buf;
	trace_svcrdma_err_chunk(*p);
	p += 3;
	*p++ = rdma_error;
	*p = err_chunk;
	svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_ERR);

	svc_rdma_save_io_pages(rqstp, ctxt);

	ctxt->sc_send_wr.opcode = IB_WR_SEND;
	ret = svc_rdma_send(rdma, &ctxt->sc_send_wr);
	if (ret) {
		svc_rdma_send_ctxt_put(rdma, ctxt);
		return ret;
	}

	return 0;
}

/**
 * svc_rdma_sendto - Transmit an RPC reply
 * @rqstp: processed RPC request, reply XDR already in ::rq_res
 *
 * Any resources still associated with @rqstp are released upon return.
 * If no reply message was possible, the connection is closed.
 *
 * Returns:
 *	%0 if an RPC reply has been successfully posted,
 *	%-ENOMEM if a resource shortage occurred (connection is lost),
 *	%-ENOTCONN if posting failed (connection is lost).
 */
int svc_rdma_sendto(struct svc_rqst *rqstp)
{
	struct svc_xprt *xprt = rqstp->rq_xprt;
	struct svcxprt_rdma *rdma =
		container_of(xprt, struct svcxprt_rdma, sc_xprt);
	struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
	__be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch;
	struct xdr_buf *xdr = &rqstp->rq_res;
	struct svc_rdma_send_ctxt *sctxt;
	int ret;

	rdma_argp = rctxt->rc_recv_buf;
	svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch);

	/* Create the RDMA response header. xprt->xpt_mutex,
	 * acquired in svc_send(), serializes RPC replies. The
	 * code path below that inserts the credit grant value
	 * into each transport header runs only inside this
	 * critical section.
	 */
	ret = -ENOMEM;
	sctxt = svc_rdma_send_ctxt_get(rdma);
	if (!sctxt)
		goto err0;
	rdma_resp = sctxt->sc_xprt_buf;

	p = rdma_resp;
	*p++ = *rdma_argp;
	*p++ = *(rdma_argp + 1);
	*p++ = rdma->sc_fc_credits;
	*p++ = rp_ch ? rdma_nomsg : rdma_msg;

	/* Start with empty chunks */
	*p++ = xdr_zero;
	*p++ = xdr_zero;
	*p   = xdr_zero;

	if (wr_lst) {
		/* XXX: Presume the client sent only one Write chunk */
		ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr);
		if (ret < 0)
			goto err2;
		svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret);
	}
	if (rp_ch) {
		ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr);
		if (ret < 0)
			goto err2;
		svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret);
	}

	svc_rdma_sync_reply_hdr(rdma, sctxt, svc_rdma_reply_hdr_len(rdma_resp));
	ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp,
				      wr_lst, rp_ch);
	if (ret < 0)
		goto err1;
	ret = 0;

out:
	rqstp->rq_xprt_ctxt = NULL;
	svc_rdma_recv_ctxt_put(rdma, rctxt);
	return ret;

err2:
	if (ret != -E2BIG && ret != -EINVAL)
		goto err1;

	ret = svc_rdma_send_error_msg(rdma, sctxt, rqstp);
	if (ret < 0)
		goto err1;
	ret = 0;
	goto out;

err1:
	svc_rdma_send_ctxt_put(rdma, sctxt);
err0:
	trace_svcrdma_send_failed(rqstp, ret);
	set_bit(XPT_CLOSE, &xprt->xpt_flags);
	ret = -ENOTCONN;
	goto out;
}