// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2015, 2017 Oracle. All rights reserved.
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 */

/* Lightweight memory registration using Fast Registration Work
 * Requests (FRWR).
 *
 * FRWR features ordered asynchronous registration and invalidation
 * of arbitrarily-sized memory regions. This is the fastest and safest
 * but most complex memory registration mode.
 */

/* Normal operation
 *
 * A Memory Region is prepared for RDMA Read or Write using a FAST_REG
 * Work Request (frwr_map). When the RDMA operation is finished, this
 * Memory Region is invalidated using a LOCAL_INV Work Request
 * (frwr_unmap_async and frwr_unmap_sync).
 *
 * Typically FAST_REG Work Requests are not signaled, and neither are
 * RDMA Send Work Requests (with the exception of signaling occasionally
 * to prevent provider work queue overflows). This greatly reduces HCA
 * interrupt workload.
 */
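
/* Illustrative lifecycle for a single RPC (a sketch of how the
 * functions below fit together, not a code path in this file):
 *
 *	frwr_map()		- build a REG_MR WR for each chunk segment
 *	frwr_send()		- chain the REG_MR WRs ahead of the Send WR
 *	  ... RPC Reply arrives ...
 *	frwr_reminv()		- handle server-side (remote) invalidation
 *	frwr_unmap_async()	- post LOCAL_INV WRs for the remaining MRs
 */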

/* Transport recovery
 *
 * frwr_map and frwr_unmap_* cannot run at the same time the transport
 * connect worker is running. The connect worker holds the transport
 * send lock, just as ->send_request does. This prevents frwr_map and
 * the connect worker from running concurrently. When a connection is
 * closed, the Receive completion queue is drained before allowing
 * the connect worker to get control. This prevents frwr_unmap and the
 * connect worker from running concurrently.
 *
 * When the underlying transport disconnects, MRs that are in flight
 * are flushed and are likely unusable. Thus all flushed MRs are
 * destroyed. New MRs are created on demand.
 */

#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

/**
 * frwr_is_supported - Check if device supports FRWR
 * @device: interface adapter to check
 *
 * Returns true if device supports FRWR, otherwise false
 */
bool frwr_is_supported(struct ib_device *device)
{
	struct ib_device_attr *attrs = &device->attrs;

	if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
		goto out_not_supported;
	if (attrs->max_fast_reg_page_list_len == 0)
		goto out_not_supported;
	return true;

out_not_supported:
	pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n",
		device->name);
	return false;
}
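
/* A transport typically performs this check once, when the RDMA CM
 * connection is first set up. Sketch of a call site (an assumption
 * about the caller, not code from this file):
 *
 *	if (!frwr_is_supported(ia->ri_id->device))
 *		return -EINVAL;
 */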

/**
 * frwr_release_mr - Destroy one MR
 * @mr: MR allocated by frwr_init_mr
 *
 */
void frwr_release_mr(struct rpcrdma_mr *mr)
{
	int rc;

	rc = ib_dereg_mr(mr->frwr.fr_mr);
	if (rc)
		trace_xprtrdma_frwr_dereg(mr, rc);
	kfree(mr->mr_sg);
	kfree(mr);
}

/* MRs are dynamically allocated, so simply clean up and release the MR.
 * A replacement MR will subsequently be allocated on demand.
 */
static void
frwr_mr_recycle_worker(struct work_struct *work)
{
	struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle);
	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;

	trace_xprtrdma_mr_recycle(mr);

	if (mr->mr_dir != DMA_NONE) {
		trace_xprtrdma_mr_unmap(mr);
		ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
				mr->mr_sg, mr->mr_nents, mr->mr_dir);
		mr->mr_dir = DMA_NONE;
	}

	spin_lock(&r_xprt->rx_buf.rb_lock);
	list_del(&mr->mr_all);
	r_xprt->rx_stats.mrs_recycled++;
	spin_unlock(&r_xprt->rx_buf.rb_lock);

	frwr_release_mr(mr);
}
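
/* Recycling is reached via rpcrdma_mr_recycle(), which queues the
 * worker above (see the INIT_WORK in frwr_init_mr below). It runs
 * whenever an MR can no longer be trusted: its Work Request was
 * flushed, or the WR never made it onto the send queue.
 */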

/* frwr_reset - Place MRs back on the free list
 * @req: request to reset
 *
 * Used after a failed marshal. For FRWR, this means the MRs
 * don't have to be fully released and recreated.
 *
 * NB: This is safe only as long as none of @req's MRs are
 * involved with an ongoing asynchronous FAST_REG or LOCAL_INV
 * Work Request.
 */
void frwr_reset(struct rpcrdma_req *req)
{
	struct rpcrdma_mr *mr;

	while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
		rpcrdma_mr_put(mr);
}

/**
 * frwr_init_mr - Initialize one MR
 * @ia: interface adapter
 * @mr: generic MR to prepare for FRWR
 *
 * Returns zero if successful. Otherwise a negative errno
 * is returned.
 */
int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
{
	unsigned int depth = ia->ri_max_frwr_depth;
	struct scatterlist *sg;
	struct ib_mr *frmr;
	int rc;

	/* NB: ib_alloc_mr and device drivers typically allocate
	 * memory with GFP_KERNEL.
	 */
	frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
	if (IS_ERR(frmr))
		goto out_mr_err;

	sg = kcalloc(depth, sizeof(*sg), GFP_NOFS);
	if (!sg)
		goto out_list_err;

	mr->frwr.fr_mr = frmr;
	mr->mr_dir = DMA_NONE;
	INIT_LIST_HEAD(&mr->mr_list);
	INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker);
	init_completion(&mr->frwr.fr_linv_done);

	sg_init_table(sg, depth);
	mr->mr_sg = sg;
	return 0;

out_mr_err:
	rc = PTR_ERR(frmr);
	trace_xprtrdma_frwr_alloc(mr, rc);
	return rc;

out_list_err:
	ib_dereg_mr(frmr);
	return -ENOMEM;
}
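
/* A note on the scatterlist allocation above (rationale inferred, not
 * stated in this file): GFP_NOFS is a conservative choice because new
 * MRs may be needed while NFS writeback is waiting on this transport,
 * and recursing into filesystem reclaim from here is undesirable. As
 * the NB comment warns, ib_alloc_mr() itself typically still
 * allocates with GFP_KERNEL.
 */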

/**
 * frwr_open - Prepare an endpoint for use with FRWR
 * @ia: interface adapter this endpoint will use
 * @ep: endpoint to prepare
 *
 * On success, sets:
 *	ep->rep_attr.cap.max_send_wr
 *	ep->rep_attr.cap.max_recv_wr
 *	ep->rep_max_requests
 *	ia->ri_max_segs
 *
 * And these FRWR-related fields:
 *	ia->ri_max_frwr_depth
 *	ia->ri_mrtype
 *
 * On failure, a negative errno is returned.
 */
int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
{
	struct ib_device_attr *attrs = &ia->ri_id->device->attrs;
	int max_qp_wr, depth, delta;

	ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
	if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
		ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;

	/* Quirk: Some devices advertise a large max_fast_reg_page_list_len
	 * capability, but perform optimally when the MRs are not larger
	 * than a page.
	 */
	if (attrs->max_sge_rd > 1)
		ia->ri_max_frwr_depth = attrs->max_sge_rd;
	else
		ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len;
	if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS)
		ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS;
	dprintk("RPC:       %s: max FR page list depth = %u\n",
		__func__, ia->ri_max_frwr_depth);

	/* Add room for frwr register and invalidate WRs.
	 * 1. FRWR reg WR for head
	 * 2. FRWR invalidate WR for head
	 * 3. N FRWR reg WRs for pagelist
	 * 4. N FRWR invalidate WRs for pagelist
	 * 5. FRWR reg WR for tail
	 * 6. FRWR invalidate WR for tail
	 * 7. The RDMA_SEND WR
	 */
	depth = 7;

	/* Calculate N if the device max FRWR depth is smaller than
	 * RPCRDMA_MAX_DATA_SEGS.
	 */
	if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) {
		delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth;
		do {
			depth += 2; /* FRWR reg + invalidate */
			delta -= ia->ri_max_frwr_depth;
		} while (delta > 0);
	}
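
	/* Worked example (illustrative numbers only): if
	 * RPCRDMA_MAX_DATA_SEGS were 256 and the device depth 100,
	 * delta would start at 156 and the loop would run twice,
	 * leaving depth = 7 + 2 * 2 = 11 WRs per RPC.
	 */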

	max_qp_wr = ia->ri_id->device->attrs.max_qp_wr;
	max_qp_wr -= RPCRDMA_BACKWARD_WRS;
	max_qp_wr -= 1;
	if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
		return -ENOMEM;
	if (ep->rep_max_requests > max_qp_wr)
		ep->rep_max_requests = max_qp_wr;
	ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
	if (ep->rep_attr.cap.max_send_wr > max_qp_wr) {
		ep->rep_max_requests = max_qp_wr / depth;
		if (!ep->rep_max_requests)
			return -EINVAL;
		ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
	}
	ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
	ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
	ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests;
	ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
	ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */

	ia->ri_max_segs =
		DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth);
	/* Reply chunks require segments for head and tail buffers */
	ia->ri_max_segs += 2;
	if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS)
		ia->ri_max_segs = RPCRDMA_MAX_HDR_SEGS;
	return 0;
}

/**
 * frwr_maxpages - Compute size of largest payload
 * @r_xprt: transport
 *
 * Returns maximum size of an RPC message, in pages.
 *
 * FRWR mode conveys a list of pages per chunk segment. The
 * maximum length of that list is the FRWR page list depth.
 */
size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;

	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
		     (ia->ri_max_segs - 2) * ia->ri_max_frwr_depth);
}
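
/* For example (illustrative numbers only): with ri_max_segs = 8 and
 * ri_max_frwr_depth = 16, the two segments reserved for head and tail
 * leave (8 - 2) * 16 = 96 pages, further capped by
 * RPCRDMA_MAX_DATA_SEGS.
 */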

/**
 * frwr_map - Register a memory region
 * @r_xprt: controlling transport
 * @seg: memory region co-ordinates
 * @nsegs: number of segments remaining
 * @writing: true when RDMA Write will be used
 * @xid: XID of RPC using the registered memory
 * @mr: MR to fill in
 *
 * Prepare a REG_MR Work Request to register a memory region
 * for remote access via RDMA READ or RDMA WRITE.
 *
 * Returns the next segment or a negative errno pointer.
 * On success, @mr is filled in.
 */
struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
				struct rpcrdma_mr_seg *seg,
				int nsegs, bool writing, __be32 xid,
				struct rpcrdma_mr *mr)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	struct ib_reg_wr *reg_wr;
	struct ib_mr *ibmr;
	int i, n;
	u8 key;

	if (nsegs > ia->ri_max_frwr_depth)
		nsegs = ia->ri_max_frwr_depth;
	for (i = 0; i < nsegs;) {
		if (seg->mr_page)
			sg_set_page(&mr->mr_sg[i],
				    seg->mr_page,
				    seg->mr_len,
				    offset_in_page(seg->mr_offset));
		else
			sg_set_buf(&mr->mr_sg[i], seg->mr_offset,
				   seg->mr_len);

		++seg;
		++i;
		if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS)
			continue;
		if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	mr->mr_dir = rpcrdma_data_dir(writing);

	mr->mr_nents =
		ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir);
	if (!mr->mr_nents)
		goto out_dmamap_err;

	ibmr = mr->frwr.fr_mr;
	n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE);
	if (unlikely(n != mr->mr_nents))
		goto out_mapmr_err;
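
	/* The statements below place the RPC's XID in the upper 32
	 * bits of the MR's iova; among other things, this makes it
	 * easy to correlate a registration with its RPC in a wire
	 * capture. The key update that follows bumps the low-order
	 * "key" byte of the rkey, so a stale rkey left over from a
	 * previous registration of this MR no longer matches.
	 */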
	ibmr->iova &= 0x00000000ffffffff;
	ibmr->iova |= ((u64)be32_to_cpu(xid)) << 32;
	key = (u8)(ibmr->rkey & 0x000000FF);
	ib_update_fast_reg_key(ibmr, ++key);

	reg_wr = &mr->frwr.fr_regwr;
	reg_wr->mr = ibmr;
	reg_wr->key = ibmr->rkey;
	reg_wr->access = writing ?
			 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
			 IB_ACCESS_REMOTE_READ;

	mr->mr_handle = ibmr->rkey;
	mr->mr_length = ibmr->length;
	mr->mr_offset = ibmr->iova;
	trace_xprtrdma_mr_map(mr);

	return seg;

out_dmamap_err:
	mr->mr_dir = DMA_NONE;
	trace_xprtrdma_frwr_sgerr(mr, i);
	return ERR_PTR(-EIO);

out_mapmr_err:
	trace_xprtrdma_frwr_maperr(mr, n);
	return ERR_PTR(-EIO);
}

/**
 * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
 * @cq: completion queue (ignored)
 * @wc: completed WR
 *
 */
static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_frwr *frwr =
		container_of(cqe, struct rpcrdma_frwr, fr_cqe);

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_fastreg(wc, frwr);
	/* The MR will get recycled when the associated req is retransmitted */
}

/**
 * frwr_send - post Send WR containing the RPC Call message
 * @ia: interface adapter
 * @req: Prepared RPC Call
 *
 * For FRWR, chain any FastReg WRs to the Send WR. Only a
 * single ib_post_send call is needed to register memory
 * and then post the Send WR.
 *
 * Returns the result of ib_post_send.
 */
int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
{
	struct ib_send_wr *post_wr;
	struct rpcrdma_mr *mr;

	post_wr = &req->rl_sendctx->sc_wr;
	list_for_each_entry(mr, &req->rl_registered, mr_list) {
		struct rpcrdma_frwr *frwr;

		frwr = &mr->frwr;

		frwr->fr_cqe.done = frwr_wc_fastreg;
		frwr->fr_regwr.wr.next = post_wr;
		frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe;
		frwr->fr_regwr.wr.num_sge = 0;
		frwr->fr_regwr.wr.opcode = IB_WR_REG_MR;
		frwr->fr_regwr.wr.send_flags = 0;

		post_wr = &frwr->fr_regwr.wr;
	}

	/* If ib_post_send fails, the next ->send_request for
	 * @req will queue these MRs for recovery.
	 */
	return ib_post_send(ia->ri_id->qp, post_wr, NULL);
}
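
/* Sketch of the chain that frwr_send() builds for a Call with two
 * registered MRs (illustrative):
 *
 *	REG_MR -> REG_MR -> Send
 *
 * The Send WR is always last. Because a send queue completes WRs in
 * order, the REG_MR WRs can be left unsignaled: a completion for the
 * Send WR implies the FastReg WRs posted ahead of it have completed.
 */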

/**
 * frwr_reminv - handle a remotely invalidated mr on the @mrs list
 * @rep: Received reply
 * @mrs: list of MRs to check
 *
 */
void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
{
	struct rpcrdma_mr *mr;

	list_for_each_entry(mr, mrs, mr_list)
		if (mr->mr_handle == rep->rr_inv_rkey) {
			list_del_init(&mr->mr_list);
			trace_xprtrdma_mr_remoteinv(mr);
			rpcrdma_mr_put(mr);
			break;	/* only one invalidated MR per RPC */
		}
}

static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
{
	if (wc->status != IB_WC_SUCCESS)
		rpcrdma_mr_recycle(mr);
	else
		rpcrdma_mr_put(mr);
}

/**
 * frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC
 * @cq: completion queue (ignored)
 * @wc: completed WR
 *
 */
static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_frwr *frwr =
		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_li(wc, frwr);
	__frwr_release_mr(wc, mr);
}

/**
 * frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC
 * @cq: completion queue (ignored)
 * @wc: completed WR
 *
 * Awaken anyone waiting for an MR to finish being fenced.
 */
static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_frwr *frwr =
		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_li_wake(wc, frwr);
	__frwr_release_mr(wc, mr);
	complete(&frwr->fr_linv_done);
}

/**
 * frwr_unmap_sync - invalidate memory regions that were registered for @req
 * @r_xprt: controlling transport instance
 * @req: rpcrdma_req with a non-empty list of MRs to process
 *
 * Sleeps until it is safe for the host CPU to access the previously mapped
 * memory regions. This guarantees that registered MRs are properly fenced
 * from the server before the RPC consumer accesses the data in them. It
 * also ensures proper Send flow control: waking the next RPC waits until
 * this RPC has relinquished all its Send Queue entries.
 */
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
	struct ib_send_wr *first, **prev, *last;
	const struct ib_send_wr *bad_wr;
	struct rpcrdma_frwr *frwr;
	struct rpcrdma_mr *mr;
	int rc;

	/* ORDER: Invalidate all of the MRs first
	 *
	 * Chain the LOCAL_INV Work Requests and post them with
	 * a single ib_post_send() call.
	 */
	frwr = NULL;
	prev = &first;
	while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {

		trace_xprtrdma_mr_localinv(mr);
		r_xprt->rx_stats.local_inv_needed++;

		frwr = &mr->frwr;
		frwr->fr_cqe.done = frwr_wc_localinv;
		last = &frwr->fr_invwr;
		last->next = NULL;
		last->wr_cqe = &frwr->fr_cqe;
		last->sg_list = NULL;
		last->num_sge = 0;
		last->opcode = IB_WR_LOCAL_INV;
		last->send_flags = IB_SEND_SIGNALED;
		last->ex.invalidate_rkey = mr->mr_handle;

		*prev = last;
		prev = &last->next;
	}

	/* Strong send queue ordering guarantees that when the
	 * last WR in the chain completes, all WRs in the chain
	 * are complete.
	 */
	frwr->fr_cqe.done = frwr_wc_localinv_wake;
	reinit_completion(&frwr->fr_linv_done);

	/* Transport disconnect drains the receive CQ before it
	 * replaces the QP. The RPC reply handler won't call us
	 * unless ri_id->qp is a valid pointer.
	 */
	bad_wr = NULL;
	rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
	trace_xprtrdma_post_send(req, rc);

	/* The final LOCAL_INV WR in the chain is supposed to
	 * do the wake. If it was never posted, the wake will
	 * not happen, so don't wait in that case.
	 */
	if (bad_wr != first)
		wait_for_completion(&frwr->fr_linv_done);
	if (!rc)
		return;

	/* Recycle MRs in the LOCAL_INV chain that did not get posted.
	 */
	while (bad_wr) {
		frwr = container_of(bad_wr, struct rpcrdma_frwr,
				    fr_invwr);
		mr = container_of(frwr, struct rpcrdma_mr, frwr);
		bad_wr = bad_wr->next;

		list_del_init(&mr->mr_list);
		rpcrdma_mr_recycle(mr);
	}
}
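
/* Sketch of the chain frwr_unmap_sync() posts for three registered
 * MRs (illustrative):
 *
 *	LOCAL_INV -> LOCAL_INV -> LOCAL_INV
 *	                           ^ fr_cqe.done = frwr_wc_localinv_wake
 *
 * All of the WRs are signaled; only the completion handler of the
 * final one differs, and it is what signals fr_linv_done above.
 */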

/**
 * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC
 * @cq: completion queue (ignored)
 * @wc: completed WR
 *
 */
static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_frwr *frwr =
		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
	struct rpcrdma_rep *rep = mr->mr_req->rl_reply;

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_li_done(wc, frwr);
	__frwr_release_mr(wc, mr);

	/* Ensure @rep is generated before __frwr_release_mr */
	smp_rmb();
	rpcrdma_complete_rqst(rep);
}

/**
 * frwr_unmap_async - invalidate memory regions that were registered for @req
 * @r_xprt: controlling transport instance
 * @req: rpcrdma_req with a non-empty list of MRs to process
 *
 * This guarantees that registered MRs are properly fenced from the
 * server before the RPC consumer accesses the data in them. It also
 * ensures proper Send flow control: waking the next RPC waits until
 * this RPC has relinquished all its Send Queue entries.
 */
void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
	struct ib_send_wr *first, *last, **prev;
	const struct ib_send_wr *bad_wr;
	struct rpcrdma_frwr *frwr;
	struct rpcrdma_mr *mr;
	int rc;

	/* Chain the LOCAL_INV Work Requests and post them with
	 * a single ib_post_send() call.
	 */
	frwr = NULL;
	prev = &first;
	while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {

		trace_xprtrdma_mr_localinv(mr);
		r_xprt->rx_stats.local_inv_needed++;

		frwr = &mr->frwr;
		frwr->fr_cqe.done = frwr_wc_localinv;
		last = &frwr->fr_invwr;
		last->next = NULL;
		last->wr_cqe = &frwr->fr_cqe;
		last->sg_list = NULL;
		last->num_sge = 0;
		last->opcode = IB_WR_LOCAL_INV;
		last->send_flags = IB_SEND_SIGNALED;
		last->ex.invalidate_rkey = mr->mr_handle;

		*prev = last;
		prev = &last->next;
	}

	/* Strong send queue ordering guarantees that when the
	 * last WR in the chain completes, all WRs in the chain
	 * are complete. The last completion will wake up the
	 * RPC waiter.
	 */
	frwr->fr_cqe.done = frwr_wc_localinv_done;

	/* Transport disconnect drains the receive CQ before it
	 * replaces the QP. The RPC reply handler won't call us
	 * unless ri_id->qp is a valid pointer.
	 */
	bad_wr = NULL;
	rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
	trace_xprtrdma_post_send(req, rc);
	if (!rc)
		return;

	/* Recycle MRs in the LOCAL_INV chain that did not get posted.
	 */
	while (bad_wr) {
		frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
		mr = container_of(frwr, struct rpcrdma_mr, frwr);
		bad_wr = bad_wr->next;

		rpcrdma_mr_recycle(mr);
	}

	/* The final LOCAL_INV WR in the chain is supposed to
	 * do the wake. If it was never posted, the wake will
	 * not happen, so wake here in that case.
	 */
	rpcrdma_complete_rqst(req->rl_reply);
}
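
/* Summary of the two unmap paths above: frwr_unmap_sync() sleeps on
 * fr_linv_done and is used when the invalidating context may block,
 * while frwr_unmap_async() instead completes the RPC from the final
 * LOCAL_INV completion handler (frwr_wc_localinv_done), so no context
 * sleeps waiting for invalidation to finish.
 */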