Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf | |
19 | * | |
20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
21 | * CA 95054 USA or visit www.sun.com if you need additional information or | |
22 | * have any questions. | |
23 | * | |
24 | * GPL HEADER END | |
25 | */ | |
26 | /* | |
27 | * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. | |
28 | * Use is subject to license terms. | |
29 | * | |
1dc563a6 | 30 | * Copyright (c) 2011, 2015, Intel Corporation. |
d7e09d03 PT |
31 | */ |
32 | /* | |
33 | * This file is part of Lustre, http://www.lustre.org/ | |
34 | * Lustre is a trademark of Sun Microsystems, Inc. | |
35 | */ | |
36 | ||
37 | #define DEBUG_SUBSYSTEM S_RPC | |
e27db149 GKH |
38 | #include "../include/obd_support.h" |
39 | #include "../include/lustre_net.h" | |
40 | #include "../include/lustre_lib.h" | |
41 | #include "../include/obd.h" | |
42 | #include "../include/obd_class.h" | |
d7e09d03 PT |
43 | #include "ptlrpc_internal.h" |
44 | ||
45 | /** | |
46 | * Helper function. Sends \a len bytes from \a base at offset \a offset | |
47 | * over \a conn connection to portal \a portal. | |
48 | * Returns 0 on success or error code. | |
49 | */ | |
3949015e KM |
50 | static int ptl_send_buf(lnet_handle_md_t *mdh, void *base, int len, |
51 | lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid, | |
52 | struct ptlrpc_connection *conn, int portal, __u64 xid, | |
53 | unsigned int offset) | |
d7e09d03 | 54 | { |
d0bfef31 CH |
55 | int rc; |
56 | lnet_md_t md; | |
d7e09d03 | 57 | |
3949015e KM |
58 | LASSERT(portal != 0); |
59 | LASSERT(conn != NULL); | |
60 | CDEBUG(D_INFO, "conn=%p id %s\n", conn, libcfs_id2str(conn->c_peer)); | |
d0bfef31 CH |
61 | md.start = base; |
62 | md.length = len; | |
d7e09d03 | 63 | md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1; |
d0bfef31 CH |
64 | md.options = PTLRPC_MD_OPTIONS; |
65 | md.user_ptr = cbid; | |
d7e09d03 PT |
66 | md.eq_handle = ptlrpc_eq_h; |
67 | ||
68 | if (unlikely(ack == LNET_ACK_REQ && | |
cb68dd2d KM |
69 | OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, |
70 | OBD_FAIL_ONCE))) { | |
d7e09d03 PT |
71 | /* don't ask for the ack to simulate failing client */ |
72 | ack = LNET_NOACK_REQ; | |
73 | } | |
74 | ||
3949015e | 75 | rc = LNetMDBind(md, LNET_UNLINK, mdh); |
d7e09d03 | 76 | if (unlikely(rc != 0)) { |
3949015e KM |
77 | CERROR("LNetMDBind failed: %d\n", rc); |
78 | LASSERT(rc == -ENOMEM); | |
0a3bdb00 | 79 | return -ENOMEM; |
d7e09d03 PT |
80 | } |
81 | ||
f537dd2c | 82 | CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n", |
d7e09d03 PT |
83 | len, portal, xid, offset); |
84 | ||
3949015e KM |
85 | rc = LNetPut(conn->c_self, *mdh, ack, |
86 | conn->c_peer, portal, xid, offset, 0); | |
d7e09d03 PT |
87 | if (unlikely(rc != 0)) { |
88 | int rc2; | |
89 | /* We're going to get an UNLINK event when I unlink below, | |
90 | * which will complete just like any other failed send, so | |
91 | * I fall through and return success here! */ | |
f537dd2c | 92 | CERROR("LNetPut(%s, %d, %lld) failed: %d\n", |
d7e09d03 PT |
93 | libcfs_id2str(conn->c_peer), portal, xid, rc); |
94 | rc2 = LNetMDUnlink(*mdh); | |
95 | LASSERTF(rc2 == 0, "rc2 = %d\n", rc2); | |
96 | } | |
97 | ||
0a3bdb00 | 98 | return 0; |
d7e09d03 PT |
99 | } |
100 | ||
101 | static void mdunlink_iterate_helper(lnet_handle_md_t *bd_mds, int count) | |
102 | { | |
103 | int i; | |
104 | ||
105 | for (i = 0; i < count; i++) | |
106 | LNetMDUnlink(bd_mds[i]); | |
107 | } | |
108 | ||
d7e09d03 PT |
109 | /** |
110 | * Register bulk at the sender for later transfer. | |
111 | * Returns 0 on success or error code. | |
112 | */ | |
12d0be62 | 113 | static int ptlrpc_register_bulk(struct ptlrpc_request *req) |
d7e09d03 PT |
114 | { |
115 | struct ptlrpc_bulk_desc *desc = req->rq_bulk; | |
116 | lnet_process_id_t peer; | |
117 | int rc = 0; | |
118 | int rc2; | |
119 | int posted_md; | |
120 | int total_md; | |
121 | __u64 xid; | |
d0bfef31 CH |
122 | lnet_handle_me_t me_h; |
123 | lnet_md_t md; | |
d7e09d03 PT |
124 | |
125 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET)) | |
0a3bdb00 | 126 | return 0; |
d7e09d03 PT |
127 | |
128 | /* NB no locking required until desc is on the network */ | |
129 | LASSERT(desc->bd_nob > 0); | |
130 | LASSERT(desc->bd_md_count == 0); | |
131 | LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT); | |
132 | LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); | |
133 | LASSERT(desc->bd_req != NULL); | |
134 | LASSERT(desc->bd_type == BULK_PUT_SINK || | |
135 | desc->bd_type == BULK_GET_SOURCE); | |
136 | ||
137 | /* cleanup the state of the bulk for it will be reused */ | |
138 | if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY) | |
139 | desc->bd_nob_transferred = 0; | |
140 | else | |
141 | LASSERT(desc->bd_nob_transferred == 0); | |
142 | ||
143 | desc->bd_failure = 0; | |
144 | ||
145 | peer = desc->bd_import->imp_connection->c_peer; | |
146 | ||
147 | LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback); | |
148 | LASSERT(desc->bd_cbid.cbid_arg == desc); | |
149 | ||
150 | /* An XID is only used for a single request from the client. | |
151 | * For retried bulk transfers, a new XID will be allocated in | |
152 | * in ptlrpc_check_set() if it needs to be resent, so it is not | |
153 | * using the same RDMA match bits after an error. | |
154 | * | |
155 | * For multi-bulk RPCs, rq_xid is the last XID needed for bulks. The | |
156 | * first bulk XID is power-of-two aligned before rq_xid. LU-1431 */ | |
157 | xid = req->rq_xid & ~((__u64)desc->bd_md_max_brw - 1); | |
158 | LASSERTF(!(desc->bd_registered && | |
159 | req->rq_send_state != LUSTRE_IMP_REPLAY) || | |
160 | xid != desc->bd_last_xid, | |
b0f5aad5 | 161 | "registered: %d rq_xid: %llu bd_last_xid: %llu\n", |
d7e09d03 PT |
162 | desc->bd_registered, xid, desc->bd_last_xid); |
163 | ||
164 | total_md = (desc->bd_iov_count + LNET_MAX_IOV - 1) / LNET_MAX_IOV; | |
165 | desc->bd_registered = 1; | |
166 | desc->bd_last_xid = xid; | |
167 | desc->bd_md_count = total_md; | |
168 | md.user_ptr = &desc->bd_cbid; | |
169 | md.eq_handle = ptlrpc_eq_h; | |
170 | md.threshold = 1; /* PUT or GET */ | |
171 | ||
172 | for (posted_md = 0; posted_md < total_md; posted_md++, xid++) { | |
173 | md.options = PTLRPC_MD_OPTIONS | | |
174 | ((desc->bd_type == BULK_GET_SOURCE) ? | |
175 | LNET_MD_OP_GET : LNET_MD_OP_PUT); | |
176 | ptlrpc_fill_bulk_md(&md, desc, posted_md); | |
177 | ||
178 | rc = LNetMEAttach(desc->bd_portal, peer, xid, 0, | |
179 | LNET_UNLINK, LNET_INS_AFTER, &me_h); | |
180 | if (rc != 0) { | |
b0f5aad5 | 181 | CERROR("%s: LNetMEAttach failed x%llu/%d: rc = %d\n", |
3c92a0bf | 182 | desc->bd_import->imp_obd->obd_name, xid, |
d7e09d03 PT |
183 | posted_md, rc); |
184 | break; | |
185 | } | |
186 | ||
187 | /* About to let the network at it... */ | |
188 | rc = LNetMDAttach(me_h, md, LNET_UNLINK, | |
189 | &desc->bd_mds[posted_md]); | |
190 | if (rc != 0) { | |
b0f5aad5 | 191 | CERROR("%s: LNetMDAttach failed x%llu/%d: rc = %d\n", |
3c92a0bf | 192 | desc->bd_import->imp_obd->obd_name, xid, |
d7e09d03 PT |
193 | posted_md, rc); |
194 | rc2 = LNetMEUnlink(me_h); | |
195 | LASSERT(rc2 == 0); | |
196 | break; | |
197 | } | |
198 | } | |
199 | ||
200 | if (rc != 0) { | |
201 | LASSERT(rc == -ENOMEM); | |
202 | spin_lock(&desc->bd_lock); | |
203 | desc->bd_md_count -= total_md - posted_md; | |
204 | spin_unlock(&desc->bd_lock); | |
205 | LASSERT(desc->bd_md_count >= 0); | |
206 | mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); | |
207 | req->rq_status = -ENOMEM; | |
0a3bdb00 | 208 | return -ENOMEM; |
d7e09d03 PT |
209 | } |
210 | ||
211 | /* Set rq_xid to matchbits of the final bulk so that server can | |
212 | * infer the number of bulks that were prepared */ | |
213 | req->rq_xid = --xid; | |
214 | LASSERTF(desc->bd_last_xid == (req->rq_xid & PTLRPC_BULK_OPS_MASK), | |
b0f5aad5 | 215 | "bd_last_xid = x%llu, rq_xid = x%llu\n", |
d7e09d03 PT |
216 | desc->bd_last_xid, req->rq_xid); |
217 | ||
218 | spin_lock(&desc->bd_lock); | |
219 | /* Holler if peer manages to touch buffers before he knows the xid */ | |
220 | if (desc->bd_md_count != total_md) | |
221 | CWARN("%s: Peer %s touched %d buffers while I registered\n", | |
3c92a0bf | 222 | desc->bd_import->imp_obd->obd_name, libcfs_id2str(peer), |
d7e09d03 PT |
223 | total_md - desc->bd_md_count); |
224 | spin_unlock(&desc->bd_lock); | |
225 | ||
2d00bd17 JP |
226 | CDEBUG(D_NET, "Setup %u bulk %s buffers: %u pages %u bytes, xid x%#llx-%#llx, portal %u\n", |
227 | desc->bd_md_count, | |
d7e09d03 PT |
228 | desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink", |
229 | desc->bd_iov_count, desc->bd_nob, | |
230 | desc->bd_last_xid, req->rq_xid, desc->bd_portal); | |
231 | ||
0a3bdb00 | 232 | return 0; |
d7e09d03 | 233 | } |
d7e09d03 PT |
234 | |
235 | /** | |
236 | * Disconnect a bulk desc from the network. Idempotent. Not | |
237 | * thread-safe (i.e. only interlocks with completion callback). | |
238 | * Returns 1 on success or 0 if network unregistration failed for whatever | |
239 | * reason. | |
240 | */ | |
241 | int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async) | |
242 | { | |
243 | struct ptlrpc_bulk_desc *desc = req->rq_bulk; | |
d0bfef31 CH |
244 | wait_queue_head_t *wq; |
245 | struct l_wait_info lwi; | |
246 | int rc; | |
d7e09d03 PT |
247 | |
248 | LASSERT(!in_interrupt()); /* might sleep */ | |
249 | ||
250 | /* Let's setup deadline for reply unlink. */ | |
251 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && | |
252 | async && req->rq_bulk_deadline == 0) | |
219e6de6 | 253 | req->rq_bulk_deadline = ktime_get_real_seconds() + LONG_UNLINK; |
d7e09d03 PT |
254 | |
255 | if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ | |
0a3bdb00 | 256 | return 1; /* never registered */ |
d7e09d03 PT |
257 | |
258 | LASSERT(desc->bd_req == req); /* bd_req NULL until registered */ | |
259 | ||
260 | /* the unlink ensures the callback happens ASAP and is the last | |
261 | * one. If it fails, it must be because completion just happened, | |
262 | * but we must still l_wait_event() in this case to give liblustre | |
263 | * a chance to run client_bulk_callback() */ | |
264 | mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); | |
265 | ||
266 | if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ | |
0a3bdb00 | 267 | return 1; /* never registered */ |
d7e09d03 PT |
268 | |
269 | /* Move to "Unregistering" phase as bulk was not unlinked yet. */ | |
270 | ptlrpc_rqphase_move(req, RQ_PHASE_UNREGISTERING); | |
271 | ||
272 | /* Do not wait for unlink to finish. */ | |
273 | if (async) | |
0a3bdb00 | 274 | return 0; |
d7e09d03 PT |
275 | |
276 | if (req->rq_set != NULL) | |
277 | wq = &req->rq_set->set_waitq; | |
278 | else | |
279 | wq = &req->rq_reply_waitq; | |
280 | ||
281 | for (;;) { | |
282 | /* Network access will complete in finite time but the HUGE | |
283 | * timeout lets us CWARN for visibility of sluggish NALs */ | |
284 | lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), | |
285 | cfs_time_seconds(1), NULL, NULL); | |
286 | rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi); | |
287 | if (rc == 0) { | |
288 | ptlrpc_rqphase_move(req, req->rq_next_phase); | |
0a3bdb00 | 289 | return 1; |
d7e09d03 PT |
290 | } |
291 | ||
292 | LASSERT(rc == -ETIMEDOUT); | |
293 | DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p", | |
294 | desc); | |
295 | } | |
0a3bdb00 | 296 | return 0; |
d7e09d03 PT |
297 | } |
298 | EXPORT_SYMBOL(ptlrpc_unregister_bulk); | |
299 | ||
300 | static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) | |
301 | { | |
d0bfef31 CH |
302 | struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; |
303 | struct ptlrpc_service *svc = svcpt->scp_service; | |
219e6de6 | 304 | int service_time = max_t(int, ktime_get_real_seconds() - |
d7e09d03 PT |
305 | req->rq_arrival_time.tv_sec, 1); |
306 | ||
307 | if (!(flags & PTLRPC_REPLY_EARLY) && | |
308 | (req->rq_type != PTL_RPC_MSG_ERR) && | |
309 | (req->rq_reqmsg != NULL) && | |
310 | !(lustre_msg_get_flags(req->rq_reqmsg) & | |
311 | (MSG_RESENT | MSG_REPLAY | | |
312 | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) { | |
313 | /* early replies, errors and recovery requests don't count | |
314 | * toward our service time estimate */ | |
315 | int oldse = at_measured(&svcpt->scp_at_estimate, service_time); | |
316 | ||
317 | if (oldse != 0) { | |
318 | DEBUG_REQ(D_ADAPTTO, req, | |
319 | "svc %s changed estimate from %d to %d", | |
320 | svc->srv_name, oldse, | |
321 | at_get(&svcpt->scp_at_estimate)); | |
322 | } | |
323 | } | |
324 | /* Report actual service time for client latency calc */ | |
325 | lustre_msg_set_service_time(req->rq_repmsg, service_time); | |
326 | /* Report service time estimate for future client reqs, but report 0 | |
327 | * (to be ignored by client) if it's a error reply during recovery. | |
328 | * (bz15815) */ | |
af3ec53b | 329 | if (req->rq_type == PTL_RPC_MSG_ERR && !req->rq_export) |
d7e09d03 PT |
330 | lustre_msg_set_timeout(req->rq_repmsg, 0); |
331 | else | |
332 | lustre_msg_set_timeout(req->rq_repmsg, | |
333 | at_get(&svcpt->scp_at_estimate)); | |
334 | ||
335 | if (req->rq_reqmsg && | |
336 | !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { | |
2e4fe2bd | 337 | CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x req_flags=%#x magic=%x/%x len=%d\n", |
d7e09d03 | 338 | flags, lustre_msg_get_flags(req->rq_reqmsg), |
d7e09d03 PT |
339 | lustre_msg_get_magic(req->rq_reqmsg), |
340 | lustre_msg_get_magic(req->rq_repmsg), req->rq_replen); | |
341 | } | |
342 | } | |
343 | ||
344 | /** | |
345 | * Send request reply from request \a req reply buffer. | |
346 | * \a flags defines reply types | |
b6da17f3 | 347 | * Returns 0 on success or error code |
d7e09d03 PT |
348 | */ |
349 | int ptlrpc_send_reply(struct ptlrpc_request *req, int flags) | |
350 | { | |
351 | struct ptlrpc_reply_state *rs = req->rq_reply_state; | |
d0bfef31 CH |
352 | struct ptlrpc_connection *conn; |
353 | int rc; | |
d7e09d03 PT |
354 | |
355 | /* We must already have a reply buffer (only ptlrpc_error() may be | |
356 | * called without one). The reply generated by sptlrpc layer (e.g. | |
357 | * error notify, etc.) might have NULL rq->reqmsg; Otherwise we must | |
358 | * have a request buffer which is either the actual (swabbed) incoming | |
359 | * request, or a saved copy if this is a req saved in | |
360 | * target_queue_final_reply(). | |
361 | */ | |
3949015e KM |
362 | LASSERT(req->rq_no_reply == 0); |
363 | LASSERT(req->rq_reqbuf != NULL); | |
364 | LASSERT(rs != NULL); | |
365 | LASSERT((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult); | |
366 | LASSERT(req->rq_repmsg != NULL); | |
367 | LASSERT(req->rq_repmsg == rs->rs_msg); | |
368 | LASSERT(rs->rs_cb_id.cbid_fn == reply_out_callback); | |
369 | LASSERT(rs->rs_cb_id.cbid_arg == rs); | |
d7e09d03 PT |
370 | |
371 | /* There may be no rq_export during failover */ | |
372 | ||
373 | if (unlikely(req->rq_export && req->rq_export->exp_obd && | |
374 | req->rq_export->exp_obd->obd_fail)) { | |
375 | /* Failed obd's only send ENODEV */ | |
376 | req->rq_type = PTL_RPC_MSG_ERR; | |
377 | req->rq_status = -ENODEV; | |
378 | CDEBUG(D_HA, "sending ENODEV from failed obd %d\n", | |
379 | req->rq_export->exp_obd->obd_minor); | |
380 | } | |
381 | ||
dfc16973 | 382 | /* In order to keep interoperability with the client (< 2.3) which |
d7e09d03 PT |
383 | * doesn't have pb_jobid in ptlrpc_body, We have to shrink the |
384 | * ptlrpc_body in reply buffer to ptlrpc_body_v2, otherwise, the | |
385 | * reply buffer on client will be overflow. | |
386 | * | |
71474ccb JSO |
387 | * XXX Remove this whenever we drop the interoperability with |
388 | * such client. | |
d7e09d03 PT |
389 | */ |
390 | req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 0, | |
391 | sizeof(struct ptlrpc_body_v2), 1); | |
392 | ||
393 | if (req->rq_type != PTL_RPC_MSG_ERR) | |
394 | req->rq_type = PTL_RPC_MSG_REPLY; | |
395 | ||
396 | lustre_msg_set_type(req->rq_repmsg, req->rq_type); | |
2d58de78 LW |
397 | lustre_msg_set_status(req->rq_repmsg, |
398 | ptlrpc_status_hton(req->rq_status)); | |
d7e09d03 PT |
399 | lustre_msg_set_opc(req->rq_repmsg, |
400 | req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0); | |
401 | ||
402 | target_pack_pool_reply(req); | |
403 | ||
404 | ptlrpc_at_set_reply(req, flags); | |
405 | ||
406 | if (req->rq_export == NULL || req->rq_export->exp_connection == NULL) | |
407 | conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL); | |
408 | else | |
409 | conn = ptlrpc_connection_addref(req->rq_export->exp_connection); | |
410 | ||
411 | if (unlikely(conn == NULL)) { | |
412 | CERROR("not replying on NULL connection\n"); /* bug 9635 */ | |
413 | return -ENOTCONN; | |
414 | } | |
415 | ptlrpc_rs_addref(rs); /* +1 ref for the network */ | |
416 | ||
417 | rc = sptlrpc_svc_wrap_reply(req); | |
418 | if (unlikely(rc)) | |
419 | goto out; | |
420 | ||
219e6de6 | 421 | req->rq_sent = ktime_get_real_seconds(); |
d7e09d03 | 422 | |
3949015e KM |
423 | rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len, |
424 | (rs->rs_difficult && !rs->rs_no_ack) ? | |
425 | LNET_ACK_REQ : LNET_NOACK_REQ, | |
426 | &rs->rs_cb_id, conn, | |
427 | ptlrpc_req2svc(req)->srv_rep_portal, | |
428 | req->rq_xid, req->rq_reply_off); | |
d7e09d03 PT |
429 | out: |
430 | if (unlikely(rc != 0)) | |
431 | ptlrpc_req_drop_rs(req); | |
432 | ptlrpc_connection_put(conn); | |
433 | return rc; | |
434 | } | |
435 | EXPORT_SYMBOL(ptlrpc_send_reply); | |
436 | ||
3949015e | 437 | int ptlrpc_reply(struct ptlrpc_request *req) |
d7e09d03 PT |
438 | { |
439 | if (req->rq_no_reply) | |
440 | return 0; | |
5ce91a9e | 441 | return ptlrpc_send_reply(req, 0); |
d7e09d03 PT |
442 | } |
443 | EXPORT_SYMBOL(ptlrpc_reply); | |
444 | ||
445 | /** | |
446 | * For request \a req send an error reply back. Create empty | |
447 | * reply buffers if necessary. | |
448 | */ | |
449 | int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult) | |
450 | { | |
451 | int rc; | |
d7e09d03 PT |
452 | |
453 | if (req->rq_no_reply) | |
0a3bdb00 | 454 | return 0; |
d7e09d03 PT |
455 | |
456 | if (!req->rq_repmsg) { | |
457 | rc = lustre_pack_reply(req, 1, NULL, NULL); | |
458 | if (rc) | |
0a3bdb00 | 459 | return rc; |
d7e09d03 PT |
460 | } |
461 | ||
462 | if (req->rq_status != -ENOSPC && req->rq_status != -EACCES && | |
463 | req->rq_status != -EPERM && req->rq_status != -ENOENT && | |
464 | req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT) | |
465 | req->rq_type = PTL_RPC_MSG_ERR; | |
466 | ||
467 | rc = ptlrpc_send_reply(req, may_be_difficult); | |
0a3bdb00 | 468 | return rc; |
d7e09d03 PT |
469 | } |
470 | EXPORT_SYMBOL(ptlrpc_send_error); | |
471 | ||
/*
 * Send an error reply for \a req via ptlrpc_send_error() with
 * may_be_difficult set to 0, and return its result.
 */
int ptlrpc_error(struct ptlrpc_request *req)
{
	int rc;

	rc = ptlrpc_send_error(req, 0);
	return rc;
}
476 | EXPORT_SYMBOL(ptlrpc_error); | |
477 | ||
478 | /** | |
479 | * Send request \a request. | |
480 | * if \a noreply is set, don't expect any reply back and don't set up | |
481 | * reply buffers. | |
482 | * Returns 0 on success or error code. | |
483 | */ | |
484 | int ptl_send_rpc(struct ptlrpc_request *request, int noreply) | |
485 | { | |
486 | int rc; | |
487 | int rc2; | |
488 | int mpflag = 0; | |
489 | struct ptlrpc_connection *connection; | |
d0bfef31 CH |
490 | lnet_handle_me_t reply_me_h; |
491 | lnet_md_t reply_md; | |
d7e09d03 | 492 | struct obd_device *obd = request->rq_import->imp_obd; |
d7e09d03 PT |
493 | |
494 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC)) | |
0a3bdb00 | 495 | return 0; |
d7e09d03 PT |
496 | |
497 | LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST); | |
498 | LASSERT(request->rq_wait_ctx == 0); | |
499 | ||
500 | /* If this is a re-transmit, we're required to have disengaged | |
501 | * cleanly from the previous attempt */ | |
502 | LASSERT(!request->rq_receiving_reply); | |
5c689e68 AB |
503 | LASSERT(!((lustre_msg_get_flags(request->rq_reqmsg) & MSG_REPLAY) && |
504 | (request->rq_import->imp_state == LUSTRE_IMP_FULL))); | |
d7e09d03 | 505 | |
f60d7c39 | 506 | if (unlikely(obd != NULL && obd->obd_fail)) { |
d7e09d03 | 507 | CDEBUG(D_HA, "muting rpc for failed imp obd %s\n", |
f60d7c39 | 508 | obd->obd_name); |
d7e09d03 | 509 | /* this prevents us from waiting in ptlrpc_queue_wait */ |
15c50ccc | 510 | spin_lock(&request->rq_lock); |
d7e09d03 | 511 | request->rq_err = 1; |
15c50ccc | 512 | spin_unlock(&request->rq_lock); |
d7e09d03 | 513 | request->rq_status = -ENODEV; |
0a3bdb00 | 514 | return -ENODEV; |
d7e09d03 PT |
515 | } |
516 | ||
517 | connection = request->rq_import->imp_connection; | |
518 | ||
519 | lustre_msg_set_handle(request->rq_reqmsg, | |
520 | &request->rq_import->imp_remote_handle); | |
521 | lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST); | |
522 | lustre_msg_set_conn_cnt(request->rq_reqmsg, | |
523 | request->rq_import->imp_conn_cnt); | |
524 | lustre_msghdr_set_flags(request->rq_reqmsg, | |
525 | request->rq_import->imp_msghdr_flags); | |
526 | ||
527 | if (request->rq_resend) | |
528 | lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); | |
529 | ||
530 | if (request->rq_memalloc) | |
531 | mpflag = cfs_memory_pressure_get_and_set(); | |
532 | ||
533 | rc = sptlrpc_cli_wrap_request(request); | |
534 | if (rc) | |
a9b3e8f3 | 535 | goto out; |
d7e09d03 PT |
536 | |
537 | /* bulk register should be done after wrap_request() */ | |
538 | if (request->rq_bulk != NULL) { | |
3949015e | 539 | rc = ptlrpc_register_bulk(request); |
d7e09d03 | 540 | if (rc != 0) |
a9b3e8f3 | 541 | goto out; |
d7e09d03 PT |
542 | } |
543 | ||
544 | if (!noreply) { | |
3949015e | 545 | LASSERT(request->rq_replen != 0); |
d7e09d03 PT |
546 | if (request->rq_repbuf == NULL) { |
547 | LASSERT(request->rq_repdata == NULL); | |
548 | LASSERT(request->rq_repmsg == NULL); | |
549 | rc = sptlrpc_cli_alloc_repbuf(request, | |
550 | request->rq_replen); | |
551 | if (rc) { | |
552 | /* this prevents us from looping in | |
553 | * ptlrpc_queue_wait */ | |
15c50ccc | 554 | spin_lock(&request->rq_lock); |
d7e09d03 | 555 | request->rq_err = 1; |
15c50ccc | 556 | spin_unlock(&request->rq_lock); |
d7e09d03 | 557 | request->rq_status = rc; |
a9b3e8f3 | 558 | goto cleanup_bulk; |
d7e09d03 PT |
559 | } |
560 | } else { | |
561 | request->rq_repdata = NULL; | |
562 | request->rq_repmsg = NULL; | |
563 | } | |
564 | ||
565 | rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/ | |
566 | connection->c_peer, request->rq_xid, 0, | |
567 | LNET_UNLINK, LNET_INS_AFTER, &reply_me_h); | |
568 | if (rc != 0) { | |
569 | CERROR("LNetMEAttach failed: %d\n", rc); | |
3949015e | 570 | LASSERT(rc == -ENOMEM); |
a9b3e8f3 JL |
571 | rc = -ENOMEM; |
572 | goto cleanup_bulk; | |
d7e09d03 PT |
573 | } |
574 | } | |
575 | ||
576 | spin_lock(&request->rq_lock); | |
577 | /* If the MD attach succeeds, there _will_ be a reply_in callback */ | |
578 | request->rq_receiving_reply = !noreply; | |
cf378ff7 | 579 | request->rq_req_unlink = 1; |
d7e09d03 | 580 | /* We are responsible for unlinking the reply buffer */ |
cf378ff7 | 581 | request->rq_reply_unlink = !noreply; |
d7e09d03 PT |
582 | /* Clear any flags that may be present from previous sends. */ |
583 | request->rq_replied = 0; | |
584 | request->rq_err = 0; | |
585 | request->rq_timedout = 0; | |
586 | request->rq_net_err = 0; | |
587 | request->rq_resend = 0; | |
588 | request->rq_restart = 0; | |
589 | request->rq_reply_truncate = 0; | |
590 | spin_unlock(&request->rq_lock); | |
591 | ||
592 | if (!noreply) { | |
d0bfef31 CH |
593 | reply_md.start = request->rq_repbuf; |
594 | reply_md.length = request->rq_repbuf_len; | |
d7e09d03 PT |
595 | /* Allow multiple early replies */ |
596 | reply_md.threshold = LNET_MD_THRESH_INF; | |
597 | /* Manage remote for early replies */ | |
d0bfef31 | 598 | reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | |
d7e09d03 | 599 | LNET_MD_MANAGE_REMOTE | |
7fb7027c | 600 | LNET_MD_TRUNCATE; /* allow to make EOVERFLOW error */ |
d0bfef31 | 601 | reply_md.user_ptr = &request->rq_reply_cbid; |
d7e09d03 PT |
602 | reply_md.eq_handle = ptlrpc_eq_h; |
603 | ||
cf378ff7 | 604 | /* We must see the unlink callback to unset rq_reply_unlink, |
d7e09d03 PT |
605 | so we can't auto-unlink */ |
606 | rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN, | |
607 | &request->rq_reply_md_h); | |
608 | if (rc != 0) { | |
609 | CERROR("LNetMDAttach failed: %d\n", rc); | |
3949015e | 610 | LASSERT(rc == -ENOMEM); |
d7e09d03 PT |
611 | spin_lock(&request->rq_lock); |
612 | /* ...but the MD attach didn't succeed... */ | |
613 | request->rq_receiving_reply = 0; | |
614 | spin_unlock(&request->rq_lock); | |
a9b3e8f3 JL |
615 | rc = -ENOMEM; |
616 | goto cleanup_me; | |
d7e09d03 PT |
617 | } |
618 | ||
b0f5aad5 | 619 | CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid %llu, portal %u\n", |
d7e09d03 PT |
620 | request->rq_repbuf_len, request->rq_xid, |
621 | request->rq_reply_portal); | |
622 | } | |
623 | ||
624 | /* add references on request for request_out_callback */ | |
625 | ptlrpc_request_addref(request); | |
f60d7c39 | 626 | if (obd != NULL && obd->obd_svc_stats != NULL) |
d7e09d03 PT |
627 | lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR, |
628 | atomic_read(&request->rq_import->imp_inflight)); | |
629 | ||
630 | OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5); | |
631 | ||
219e6de6 AB |
632 | ktime_get_real_ts64(&request->rq_arrival_time); |
633 | request->rq_sent = ktime_get_real_seconds(); | |
d7e09d03 PT |
634 | /* We give the server rq_timeout secs to process the req, and |
635 | add the network latency for our local timeout. */ | |
636 | request->rq_deadline = request->rq_sent + request->rq_timeout + | |
637 | ptlrpc_at_get_net_latency(request); | |
638 | ||
639 | ptlrpc_pinger_sending_on_import(request->rq_import); | |
640 | ||
641 | DEBUG_REQ(D_INFO, request, "send flg=%x", | |
642 | lustre_msg_get_flags(request->rq_reqmsg)); | |
643 | rc = ptl_send_buf(&request->rq_req_md_h, | |
644 | request->rq_reqbuf, request->rq_reqdata_len, | |
645 | LNET_NOACK_REQ, &request->rq_req_cbid, | |
646 | connection, | |
647 | request->rq_request_portal, | |
648 | request->rq_xid, 0); | |
649 | if (rc == 0) | |
a9b3e8f3 | 650 | goto out; |
d7e09d03 PT |
651 | |
652 | ptlrpc_req_finished(request); | |
653 | if (noreply) | |
a9b3e8f3 | 654 | goto out; |
d7e09d03 PT |
655 | |
656 | cleanup_me: | |
657 | /* MEUnlink is safe; the PUT didn't even get off the ground, and | |
658 | * nobody apart from the PUT's target has the right nid+XID to | |
659 | * access the reply buffer. */ | |
660 | rc2 = LNetMEUnlink(reply_me_h); | |
3949015e | 661 | LASSERT(rc2 == 0); |
d7e09d03 PT |
662 | /* UNLINKED callback called synchronously */ |
663 | LASSERT(!request->rq_receiving_reply); | |
664 | ||
665 | cleanup_bulk: | |
666 | /* We do sync unlink here as there was no real transfer here so | |
667 | * the chance to have long unlink to sluggish net is smaller here. */ | |
668 | ptlrpc_unregister_bulk(request, 0); | |
669 | out: | |
670 | if (request->rq_memalloc) | |
671 | cfs_memory_pressure_restore(mpflag); | |
672 | return rc; | |
673 | } | |
674 | EXPORT_SYMBOL(ptl_send_rpc); | |
675 | ||
676 | /** | |
677 | * Register request buffer descriptor for request receiving. | |
678 | */ | |
679 | int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd) | |
680 | { | |
d0bfef31 CH |
681 | struct ptlrpc_service *service = rqbd->rqbd_svcpt->scp_service; |
682 | static lnet_process_id_t match_id = {LNET_NID_ANY, LNET_PID_ANY}; | |
683 | int rc; | |
684 | lnet_md_t md; | |
685 | lnet_handle_me_t me_h; | |
d7e09d03 PT |
686 | |
687 | CDEBUG(D_NET, "LNetMEAttach: portal %d\n", | |
688 | service->srv_req_portal); | |
689 | ||
690 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD)) | |
fbe7c6c7 | 691 | return -ENOMEM; |
d7e09d03 PT |
692 | |
693 | /* NB: CPT affinity service should use new LNet flag LNET_INS_LOCAL, | |
694 | * which means buffer can only be attached on local CPT, and LND | |
695 | * threads can find it by grabbing a local lock */ | |
696 | rc = LNetMEAttach(service->srv_req_portal, | |
697 | match_id, 0, ~0, LNET_UNLINK, | |
698 | rqbd->rqbd_svcpt->scp_cpt >= 0 ? | |
699 | LNET_INS_LOCAL : LNET_INS_AFTER, &me_h); | |
700 | if (rc != 0) { | |
701 | CERROR("LNetMEAttach failed: %d\n", rc); | |
fbe7c6c7 | 702 | return -ENOMEM; |
d7e09d03 PT |
703 | } |
704 | ||
705 | LASSERT(rqbd->rqbd_refcount == 0); | |
706 | rqbd->rqbd_refcount = 1; | |
707 | ||
d0bfef31 CH |
708 | md.start = rqbd->rqbd_buffer; |
709 | md.length = service->srv_buf_size; | |
710 | md.max_size = service->srv_max_req_size; | |
d7e09d03 | 711 | md.threshold = LNET_MD_THRESH_INF; |
d0bfef31 CH |
712 | md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE; |
713 | md.user_ptr = &rqbd->rqbd_cbid; | |
d7e09d03 PT |
714 | md.eq_handle = ptlrpc_eq_h; |
715 | ||
716 | rc = LNetMDAttach(me_h, md, LNET_UNLINK, &rqbd->rqbd_md_h); | |
717 | if (rc == 0) | |
fbe7c6c7 | 718 | return 0; |
d7e09d03 | 719 | |
998d2766 | 720 | CERROR("LNetMDAttach failed: %d;\n", rc); |
3949015e KM |
721 | LASSERT(rc == -ENOMEM); |
722 | rc = LNetMEUnlink(me_h); | |
723 | LASSERT(rc == 0); | |
d7e09d03 PT |
724 | rqbd->rqbd_refcount = 0; |
725 | ||
fbe7c6c7 | 726 | return -ENOMEM; |
d7e09d03 | 727 | } |