| 1 | /* |
| 2 | * GPL HEADER START |
| 3 | * |
| 4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 5 | * |
| 6 | * This program is free software; you can redistribute it and/or modify |
| 7 | * it under the terms of the GNU General Public License version 2 only, |
| 8 | * as published by the Free Software Foundation. |
| 9 | * |
| 10 | * This program is distributed in the hope that it will be useful, but |
| 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 13 | * General Public License version 2 for more details (a copy is included |
| 14 | * in the LICENSE file that accompanied this code). |
| 15 | * |
| 16 | * You should have received a copy of the GNU General Public License |
| 17 | * version 2 along with this program; If not, see |
| 18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf |
| 19 | * |
| 20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, |
| 21 | * CA 95054 USA or visit www.sun.com if you need additional information or |
| 22 | * have any questions. |
| 23 | * |
| 24 | * GPL HEADER END |
| 25 | */ |
| 26 | /* |
| 27 | * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. |
| 28 | * Use is subject to license terms. |
| 29 | * |
| 30 | * Copyright (c) 2011, 2015, Intel Corporation. |
| 31 | */ |
| 32 | /* |
| 33 | * This file is part of Lustre, http://www.lustre.org/ |
| 34 | * Lustre is a trademark of Sun Microsystems, Inc. |
| 35 | */ |
| 36 | |
| 37 | #define DEBUG_SUBSYSTEM S_RPC |
| 38 | #include "../include/obd_support.h" |
| 39 | #include "../include/lustre_net.h" |
| 40 | #include "../include/lustre_lib.h" |
| 41 | #include "../include/obd.h" |
| 42 | #include "../include/obd_class.h" |
| 43 | #include "ptlrpc_internal.h" |
| 44 | |
| 45 | /** |
| 46 | * Helper function. Sends \a len bytes from \a base at offset \a offset |
| 47 | * over \a conn connection to portal \a portal. |
| 48 | * Returns 0 on success or error code. |
| 49 | */ |
| 50 | static int ptl_send_buf(lnet_handle_md_t *mdh, void *base, int len, |
| 51 | lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid, |
| 52 | struct ptlrpc_connection *conn, int portal, __u64 xid, |
| 53 | unsigned int offset) |
| 54 | { |
| 55 | int rc; |
| 56 | lnet_md_t md; |
| 57 | |
| 58 | LASSERT(portal != 0); |
| 59 | LASSERT(conn != NULL); |
| 60 | CDEBUG(D_INFO, "conn=%p id %s\n", conn, libcfs_id2str(conn->c_peer)); |
| 61 | md.start = base; |
| 62 | md.length = len; |
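	/* one event for the SEND, plus one for the ACK when requested */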
| 63 | md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1; |
| 64 | md.options = PTLRPC_MD_OPTIONS; |
| 65 | md.user_ptr = cbid; |
| 66 | md.eq_handle = ptlrpc_eq_h; |
| 67 | |
| 68 | if (unlikely(ack == LNET_ACK_REQ && |
| 69 | OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, |
| 70 | OBD_FAIL_ONCE))) { |
		/* don't ask for an ack, to simulate a failing client */
| 72 | ack = LNET_NOACK_REQ; |
| 73 | } |
| 74 | |
| 75 | rc = LNetMDBind(md, LNET_UNLINK, mdh); |
| 76 | if (unlikely(rc != 0)) { |
| 77 | CERROR("LNetMDBind failed: %d\n", rc); |
| 78 | LASSERT(rc == -ENOMEM); |
| 79 | return -ENOMEM; |
| 80 | } |
| 81 | |
| 82 | CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n", |
| 83 | len, portal, xid, offset); |
| 84 | |
| 85 | rc = LNetPut(conn->c_self, *mdh, ack, |
| 86 | conn->c_peer, portal, xid, offset, 0); |
| 87 | if (unlikely(rc != 0)) { |
| 88 | int rc2; |
		/* We're going to get an UNLINK event when the MD is
		 * unlinked below, which will complete just like any other
		 * failed send, so we fall through and return success here! */
| 92 | CERROR("LNetPut(%s, %d, %lld) failed: %d\n", |
| 93 | libcfs_id2str(conn->c_peer), portal, xid, rc); |
| 94 | rc2 = LNetMDUnlink(*mdh); |
| 95 | LASSERTF(rc2 == 0, "rc2 = %d\n", rc2); |
| 96 | } |
| 97 | |
| 98 | return 0; |
| 99 | } |
| 100 | |
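/**
 * Helper: unlink the first \a count MD handles in \a bd_mds. Unposted
 * or already-unlinked handles are harmless; LNetMDUnlink() simply
 * fails on them and the return value is ignored.
 */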
| 101 | static void mdunlink_iterate_helper(lnet_handle_md_t *bd_mds, int count) |
| 102 | { |
| 103 | int i; |
| 104 | |
| 105 | for (i = 0; i < count; i++) |
| 106 | LNetMDUnlink(bd_mds[i]); |
| 107 | } |
| 108 | |
| 109 | /** |
| 110 | * Register bulk at the sender for later transfer. |
| 111 | * Returns 0 on success or error code. |
| 112 | */ |
| 113 | static int ptlrpc_register_bulk(struct ptlrpc_request *req) |
| 114 | { |
| 115 | struct ptlrpc_bulk_desc *desc = req->rq_bulk; |
| 116 | lnet_process_id_t peer; |
| 117 | int rc = 0; |
| 118 | int rc2; |
| 119 | int posted_md; |
| 120 | int total_md; |
| 121 | __u64 xid; |
| 122 | lnet_handle_me_t me_h; |
| 123 | lnet_md_t md; |
| 124 | |
| 125 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET)) |
| 126 | return 0; |
| 127 | |
| 128 | /* NB no locking required until desc is on the network */ |
| 129 | LASSERT(desc->bd_nob > 0); |
| 130 | LASSERT(desc->bd_md_count == 0); |
| 131 | LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT); |
| 132 | LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); |
| 133 | LASSERT(desc->bd_req != NULL); |
| 134 | LASSERT(desc->bd_type == BULK_PUT_SINK || |
| 135 | desc->bd_type == BULK_GET_SOURCE); |
| 136 | |
	/* clean up the state of the bulk since it will be reused */
| 138 | if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY) |
| 139 | desc->bd_nob_transferred = 0; |
| 140 | else |
| 141 | LASSERT(desc->bd_nob_transferred == 0); |
| 142 | |
| 143 | desc->bd_failure = 0; |
| 144 | |
| 145 | peer = desc->bd_import->imp_connection->c_peer; |
| 146 | |
| 147 | LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback); |
| 148 | LASSERT(desc->bd_cbid.cbid_arg == desc); |
| 149 | |
	/* An XID is only used for a single request from the client.
	 * For retried bulk transfers, a new XID will be allocated in
	 * ptlrpc_check_set() if it needs to be resent, so it is not
	 * using the same RDMA match bits after an error.
	 *
	 * For multi-bulk RPCs, rq_xid is the last XID needed for bulks. The
	 * first bulk XID is power-of-two aligned before rq_xid. LU-1431 */
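	/* e.g. with bd_md_max_brw == 4 and rq_xid == 0x4a13, the mask
	 * below yields 0x4a10, so the bulks match on xids from 0x4a10 up */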
| 157 | xid = req->rq_xid & ~((__u64)desc->bd_md_max_brw - 1); |
| 158 | LASSERTF(!(desc->bd_registered && |
| 159 | req->rq_send_state != LUSTRE_IMP_REPLAY) || |
| 160 | xid != desc->bd_last_xid, |
| 161 | "registered: %d rq_xid: %llu bd_last_xid: %llu\n", |
| 162 | desc->bd_registered, xid, desc->bd_last_xid); |
| 163 | |
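	/* one MD is needed per LNET_MAX_IOV pages, rounded up */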
| 164 | total_md = (desc->bd_iov_count + LNET_MAX_IOV - 1) / LNET_MAX_IOV; |
| 165 | desc->bd_registered = 1; |
| 166 | desc->bd_last_xid = xid; |
| 167 | desc->bd_md_count = total_md; |
| 168 | md.user_ptr = &desc->bd_cbid; |
| 169 | md.eq_handle = ptlrpc_eq_h; |
| 170 | md.threshold = 1; /* PUT or GET */ |
| 171 | |
| 172 | for (posted_md = 0; posted_md < total_md; posted_md++, xid++) { |
| 173 | md.options = PTLRPC_MD_OPTIONS | |
| 174 | ((desc->bd_type == BULK_GET_SOURCE) ? |
| 175 | LNET_MD_OP_GET : LNET_MD_OP_PUT); |
| 176 | ptlrpc_fill_bulk_md(&md, desc, posted_md); |
| 177 | |
| 178 | rc = LNetMEAttach(desc->bd_portal, peer, xid, 0, |
| 179 | LNET_UNLINK, LNET_INS_AFTER, &me_h); |
| 180 | if (rc != 0) { |
| 181 | CERROR("%s: LNetMEAttach failed x%llu/%d: rc = %d\n", |
| 182 | desc->bd_import->imp_obd->obd_name, xid, |
| 183 | posted_md, rc); |
| 184 | break; |
| 185 | } |
| 186 | |
| 187 | /* About to let the network at it... */ |
| 188 | rc = LNetMDAttach(me_h, md, LNET_UNLINK, |
| 189 | &desc->bd_mds[posted_md]); |
| 190 | if (rc != 0) { |
| 191 | CERROR("%s: LNetMDAttach failed x%llu/%d: rc = %d\n", |
| 192 | desc->bd_import->imp_obd->obd_name, xid, |
| 193 | posted_md, rc); |
| 194 | rc2 = LNetMEUnlink(me_h); |
| 195 | LASSERT(rc2 == 0); |
| 196 | break; |
| 197 | } |
| 198 | } |
| 199 | |
| 200 | if (rc != 0) { |
| 201 | LASSERT(rc == -ENOMEM); |
| 202 | spin_lock(&desc->bd_lock); |
| 203 | desc->bd_md_count -= total_md - posted_md; |
| 204 | spin_unlock(&desc->bd_lock); |
| 205 | LASSERT(desc->bd_md_count >= 0); |
| 206 | mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); |
| 207 | req->rq_status = -ENOMEM; |
| 208 | return -ENOMEM; |
| 209 | } |
| 210 | |
	/* Set rq_xid to the matchbits of the final bulk so that the server
	 * can infer the number of bulks that were prepared */
| 213 | req->rq_xid = --xid; |
| 214 | LASSERTF(desc->bd_last_xid == (req->rq_xid & PTLRPC_BULK_OPS_MASK), |
| 215 | "bd_last_xid = x%llu, rq_xid = x%llu\n", |
| 216 | desc->bd_last_xid, req->rq_xid); |
| 217 | |
| 218 | spin_lock(&desc->bd_lock); |
	/* Holler if the peer touches buffers before it knows the xid */
| 220 | if (desc->bd_md_count != total_md) |
| 221 | CWARN("%s: Peer %s touched %d buffers while I registered\n", |
| 222 | desc->bd_import->imp_obd->obd_name, libcfs_id2str(peer), |
| 223 | total_md - desc->bd_md_count); |
| 224 | spin_unlock(&desc->bd_lock); |
| 225 | |
| 226 | CDEBUG(D_NET, "Setup %u bulk %s buffers: %u pages %u bytes, xid x%#llx-%#llx, portal %u\n", |
| 227 | desc->bd_md_count, |
| 228 | desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink", |
| 229 | desc->bd_iov_count, desc->bd_nob, |
| 230 | desc->bd_last_xid, req->rq_xid, desc->bd_portal); |
| 231 | |
| 232 | return 0; |
| 233 | } |
| 234 | |
| 235 | /** |
| 236 | * Disconnect a bulk desc from the network. Idempotent. Not |
| 237 | * thread-safe (i.e. only interlocks with completion callback). |
| 238 | * Returns 1 on success or 0 if network unregistration failed for whatever |
| 239 | * reason. |
| 240 | */ |
| 241 | int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async) |
| 242 | { |
| 243 | struct ptlrpc_bulk_desc *desc = req->rq_bulk; |
| 244 | wait_queue_head_t *wq; |
| 245 | struct l_wait_info lwi; |
| 246 | int rc; |
| 247 | |
| 248 | LASSERT(!in_interrupt()); /* might sleep */ |
| 249 | |
	/* Let's set up the deadline for bulk unlink. */
| 251 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && |
| 252 | async && req->rq_bulk_deadline == 0) |
| 253 | req->rq_bulk_deadline = ktime_get_real_seconds() + LONG_UNLINK; |
| 254 | |
| 255 | if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ |
| 256 | return 1; /* never registered */ |
| 257 | |
| 258 | LASSERT(desc->bd_req == req); /* bd_req NULL until registered */ |
| 259 | |
| 260 | /* the unlink ensures the callback happens ASAP and is the last |
| 261 | * one. If it fails, it must be because completion just happened, |
| 262 | * but we must still l_wait_event() in this case to give liblustre |
| 263 | * a chance to run client_bulk_callback() */ |
| 264 | mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); |
| 265 | |
| 266 | if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ |
| 267 | return 1; /* never registered */ |
| 268 | |
| 269 | /* Move to "Unregistering" phase as bulk was not unlinked yet. */ |
| 270 | ptlrpc_rqphase_move(req, RQ_PHASE_UNREGISTERING); |
| 271 | |
| 272 | /* Do not wait for unlink to finish. */ |
| 273 | if (async) |
| 274 | return 0; |
| 275 | |
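	/* Wait on the same waitq the bulk completion callback wakes:
	 * the set's waitq if the request belongs to a set */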
| 276 | if (req->rq_set != NULL) |
| 277 | wq = &req->rq_set->set_waitq; |
| 278 | else |
| 279 | wq = &req->rq_reply_waitq; |
| 280 | |
| 281 | for (;;) { |
| 282 | /* Network access will complete in finite time but the HUGE |
| 283 | * timeout lets us CWARN for visibility of sluggish NALs */ |
| 284 | lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), |
| 285 | cfs_time_seconds(1), NULL, NULL); |
| 286 | rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi); |
| 287 | if (rc == 0) { |
| 288 | ptlrpc_rqphase_move(req, req->rq_next_phase); |
| 289 | return 1; |
| 290 | } |
| 291 | |
| 292 | LASSERT(rc == -ETIMEDOUT); |
| 293 | DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p", |
| 294 | desc); |
| 295 | } |
| 296 | return 0; |
| 297 | } |
| 298 | EXPORT_SYMBOL(ptlrpc_unregister_bulk); |
| 299 | |
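/**
 * Fill in the adaptive-timeout fields of the reply: fold this request's
 * service time into the server's estimate (unless it is an early reply,
 * an error or a recovery request) and report both the actual service
 * time and the current estimate back to the client.
 */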
| 300 | static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) |
| 301 | { |
| 302 | struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; |
| 303 | struct ptlrpc_service *svc = svcpt->scp_service; |
| 304 | int service_time = max_t(int, ktime_get_real_seconds() - |
| 305 | req->rq_arrival_time.tv_sec, 1); |
| 306 | |
| 307 | if (!(flags & PTLRPC_REPLY_EARLY) && |
| 308 | (req->rq_type != PTL_RPC_MSG_ERR) && |
| 309 | (req->rq_reqmsg != NULL) && |
| 310 | !(lustre_msg_get_flags(req->rq_reqmsg) & |
| 311 | (MSG_RESENT | MSG_REPLAY | |
| 312 | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) { |
| 313 | /* early replies, errors and recovery requests don't count |
| 314 | * toward our service time estimate */ |
| 315 | int oldse = at_measured(&svcpt->scp_at_estimate, service_time); |
| 316 | |
| 317 | if (oldse != 0) { |
| 318 | DEBUG_REQ(D_ADAPTTO, req, |
| 319 | "svc %s changed estimate from %d to %d", |
| 320 | svc->srv_name, oldse, |
| 321 | at_get(&svcpt->scp_at_estimate)); |
| 322 | } |
| 323 | } |
| 324 | /* Report actual service time for client latency calc */ |
| 325 | lustre_msg_set_service_time(req->rq_repmsg, service_time); |
	/* Report the service time estimate for future client reqs, but
	 * report 0 (to be ignored by the client) if it's an error reply
	 * during recovery. (bz15815) */
| 329 | if (req->rq_type == PTL_RPC_MSG_ERR && !req->rq_export) |
| 330 | lustre_msg_set_timeout(req->rq_repmsg, 0); |
| 331 | else |
| 332 | lustre_msg_set_timeout(req->rq_repmsg, |
| 333 | at_get(&svcpt->scp_at_estimate)); |
| 334 | |
| 335 | if (req->rq_reqmsg && |
| 336 | !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { |
| 337 | CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x req_flags=%#x magic=%x/%x len=%d\n", |
| 338 | flags, lustre_msg_get_flags(req->rq_reqmsg), |
| 339 | lustre_msg_get_magic(req->rq_reqmsg), |
| 340 | lustre_msg_get_magic(req->rq_repmsg), req->rq_replen); |
| 341 | } |
| 342 | } |
| 343 | |
| 344 | /** |
| 345 | * Send request reply from request \a req reply buffer. |
| 346 | * \a flags defines reply types |
| 347 | * Returns 0 on success or error code |
| 348 | */ |
| 349 | int ptlrpc_send_reply(struct ptlrpc_request *req, int flags) |
| 350 | { |
| 351 | struct ptlrpc_reply_state *rs = req->rq_reply_state; |
| 352 | struct ptlrpc_connection *conn; |
| 353 | int rc; |
| 354 | |
	/* We must already have a reply buffer (only ptlrpc_error() may be
	 * called without one). The reply generated by the sptlrpc layer
	 * (e.g. error notify, etc.) might have a NULL req->rq_reqmsg;
	 * otherwise we must have a request buffer which is either the
	 * actual (swabbed) incoming request, or a saved copy if this is
	 * a req saved in target_queue_final_reply().
	 */
| 362 | LASSERT(req->rq_no_reply == 0); |
| 363 | LASSERT(req->rq_reqbuf != NULL); |
| 364 | LASSERT(rs != NULL); |
| 365 | LASSERT((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult); |
| 366 | LASSERT(req->rq_repmsg != NULL); |
| 367 | LASSERT(req->rq_repmsg == rs->rs_msg); |
| 368 | LASSERT(rs->rs_cb_id.cbid_fn == reply_out_callback); |
| 369 | LASSERT(rs->rs_cb_id.cbid_arg == rs); |
| 370 | |
| 371 | /* There may be no rq_export during failover */ |
| 372 | |
| 373 | if (unlikely(req->rq_export && req->rq_export->exp_obd && |
| 374 | req->rq_export->exp_obd->obd_fail)) { |
| 375 | /* Failed obd's only send ENODEV */ |
| 376 | req->rq_type = PTL_RPC_MSG_ERR; |
| 377 | req->rq_status = -ENODEV; |
| 378 | CDEBUG(D_HA, "sending ENODEV from failed obd %d\n", |
| 379 | req->rq_export->exp_obd->obd_minor); |
| 380 | } |
| 381 | |
	/* In order to keep interoperability with clients (< 2.3) that
	 * don't have pb_jobid in ptlrpc_body, we have to shrink the
	 * ptlrpc_body in the reply buffer to ptlrpc_body_v2; otherwise
	 * the reply buffer on the client will overflow.
	 *
	 * XXX Remove this whenever we drop interoperability with such
	 * clients.
	 */
| 390 | req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 0, |
| 391 | sizeof(struct ptlrpc_body_v2), 1); |
| 392 | |
| 393 | if (req->rq_type != PTL_RPC_MSG_ERR) |
| 394 | req->rq_type = PTL_RPC_MSG_REPLY; |
| 395 | |
| 396 | lustre_msg_set_type(req->rq_repmsg, req->rq_type); |
| 397 | lustre_msg_set_status(req->rq_repmsg, |
| 398 | ptlrpc_status_hton(req->rq_status)); |
| 399 | lustre_msg_set_opc(req->rq_repmsg, |
| 400 | req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0); |
| 401 | |
| 402 | target_pack_pool_reply(req); |
| 403 | |
| 404 | ptlrpc_at_set_reply(req, flags); |
| 405 | |
| 406 | if (req->rq_export == NULL || req->rq_export->exp_connection == NULL) |
| 407 | conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL); |
| 408 | else |
| 409 | conn = ptlrpc_connection_addref(req->rq_export->exp_connection); |
| 410 | |
| 411 | if (unlikely(conn == NULL)) { |
| 412 | CERROR("not replying on NULL connection\n"); /* bug 9635 */ |
| 413 | return -ENOTCONN; |
| 414 | } |
| 415 | ptlrpc_rs_addref(rs); /* +1 ref for the network */ |
| 416 | |
| 417 | rc = sptlrpc_svc_wrap_reply(req); |
| 418 | if (unlikely(rc)) |
| 419 | goto out; |
| 420 | |
| 421 | req->rq_sent = ktime_get_real_seconds(); |
| 422 | |
| 423 | rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len, |
| 424 | (rs->rs_difficult && !rs->rs_no_ack) ? |
| 425 | LNET_ACK_REQ : LNET_NOACK_REQ, |
| 426 | &rs->rs_cb_id, conn, |
| 427 | ptlrpc_req2svc(req)->srv_rep_portal, |
| 428 | req->rq_xid, req->rq_reply_off); |
| 429 | out: |
| 430 | if (unlikely(rc != 0)) |
| 431 | ptlrpc_req_drop_rs(req); |
| 432 | ptlrpc_connection_put(conn); |
| 433 | return rc; |
| 434 | } |
| 435 | EXPORT_SYMBOL(ptlrpc_send_reply); |
| 436 | |
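/**
 * Send a regular (non-error) reply for request \a req, unless the
 * request was marked as needing no reply.
 */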
| 437 | int ptlrpc_reply(struct ptlrpc_request *req) |
| 438 | { |
| 439 | if (req->rq_no_reply) |
| 440 | return 0; |
| 441 | return ptlrpc_send_reply(req, 0); |
| 442 | } |
| 443 | EXPORT_SYMBOL(ptlrpc_reply); |
| 444 | |
| 445 | /** |
| 446 | * For request \a req send an error reply back. Create empty |
| 447 | * reply buffers if necessary. |
| 448 | */ |
| 449 | int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult) |
| 450 | { |
| 451 | int rc; |
| 452 | |
| 453 | if (req->rq_no_reply) |
| 454 | return 0; |
| 455 | |
| 456 | if (!req->rq_repmsg) { |
| 457 | rc = lustre_pack_reply(req, 1, NULL, NULL); |
| 458 | if (rc) |
| 459 | return rc; |
| 460 | } |
| 461 | |
| 462 | if (req->rq_status != -ENOSPC && req->rq_status != -EACCES && |
| 463 | req->rq_status != -EPERM && req->rq_status != -ENOENT && |
| 464 | req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT) |
| 465 | req->rq_type = PTL_RPC_MSG_ERR; |
| 466 | |
| 467 | rc = ptlrpc_send_reply(req, may_be_difficult); |
| 468 | return rc; |
| 469 | } |
| 470 | EXPORT_SYMBOL(ptlrpc_send_error); |
| 471 | |
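/**
 * Send a non-difficult error reply for request \a req.
 */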
| 472 | int ptlrpc_error(struct ptlrpc_request *req) |
| 473 | { |
| 474 | return ptlrpc_send_error(req, 0); |
| 475 | } |
| 476 | EXPORT_SYMBOL(ptlrpc_error); |
| 477 | |
| 478 | /** |
| 479 | * Send request \a request. |
| 480 | * if \a noreply is set, don't expect any reply back and don't set up |
| 481 | * reply buffers. |
| 482 | * Returns 0 on success or error code. |
| 483 | */ |
| 484 | int ptl_send_rpc(struct ptlrpc_request *request, int noreply) |
| 485 | { |
| 486 | int rc; |
| 487 | int rc2; |
| 488 | int mpflag = 0; |
| 489 | struct ptlrpc_connection *connection; |
| 490 | lnet_handle_me_t reply_me_h; |
| 491 | lnet_md_t reply_md; |
| 492 | struct obd_device *obd = request->rq_import->imp_obd; |
| 493 | |
| 494 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC)) |
| 495 | return 0; |
| 496 | |
| 497 | LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST); |
| 498 | LASSERT(request->rq_wait_ctx == 0); |
| 499 | |
| 500 | /* If this is a re-transmit, we're required to have disengaged |
| 501 | * cleanly from the previous attempt */ |
| 502 | LASSERT(!request->rq_receiving_reply); |
| 503 | LASSERT(!((lustre_msg_get_flags(request->rq_reqmsg) & MSG_REPLAY) && |
| 504 | (request->rq_import->imp_state == LUSTRE_IMP_FULL))); |
| 505 | |
| 506 | if (unlikely(obd != NULL && obd->obd_fail)) { |
| 507 | CDEBUG(D_HA, "muting rpc for failed imp obd %s\n", |
| 508 | obd->obd_name); |
| 509 | /* this prevents us from waiting in ptlrpc_queue_wait */ |
| 510 | spin_lock(&request->rq_lock); |
| 511 | request->rq_err = 1; |
| 512 | spin_unlock(&request->rq_lock); |
| 513 | request->rq_status = -ENODEV; |
| 514 | return -ENODEV; |
| 515 | } |
| 516 | |
| 517 | connection = request->rq_import->imp_connection; |
| 518 | |
| 519 | lustre_msg_set_handle(request->rq_reqmsg, |
| 520 | &request->rq_import->imp_remote_handle); |
| 521 | lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST); |
| 522 | lustre_msg_set_conn_cnt(request->rq_reqmsg, |
| 523 | request->rq_import->imp_conn_cnt); |
| 524 | lustre_msghdr_set_flags(request->rq_reqmsg, |
| 525 | request->rq_import->imp_msghdr_flags); |
| 526 | |
| 527 | if (request->rq_resend) |
| 528 | lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); |
| 529 | |
| 530 | if (request->rq_memalloc) |
| 531 | mpflag = cfs_memory_pressure_get_and_set(); |
| 532 | |
| 533 | rc = sptlrpc_cli_wrap_request(request); |
| 534 | if (rc) |
| 535 | goto out; |
| 536 | |
| 537 | /* bulk register should be done after wrap_request() */ |
| 538 | if (request->rq_bulk != NULL) { |
| 539 | rc = ptlrpc_register_bulk(request); |
| 540 | if (rc != 0) |
| 541 | goto out; |
| 542 | } |
| 543 | |
| 544 | if (!noreply) { |
| 545 | LASSERT(request->rq_replen != 0); |
| 546 | if (request->rq_repbuf == NULL) { |
| 547 | LASSERT(request->rq_repdata == NULL); |
| 548 | LASSERT(request->rq_repmsg == NULL); |
| 549 | rc = sptlrpc_cli_alloc_repbuf(request, |
| 550 | request->rq_replen); |
| 551 | if (rc) { |
| 552 | /* this prevents us from looping in |
| 553 | * ptlrpc_queue_wait */ |
| 554 | spin_lock(&request->rq_lock); |
| 555 | request->rq_err = 1; |
| 556 | spin_unlock(&request->rq_lock); |
| 557 | request->rq_status = rc; |
| 558 | goto cleanup_bulk; |
| 559 | } |
| 560 | } else { |
| 561 | request->rq_repdata = NULL; |
| 562 | request->rq_repmsg = NULL; |
| 563 | } |
| 564 | |
| 565 | rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/ |
| 566 | connection->c_peer, request->rq_xid, 0, |
| 567 | LNET_UNLINK, LNET_INS_AFTER, &reply_me_h); |
| 568 | if (rc != 0) { |
| 569 | CERROR("LNetMEAttach failed: %d\n", rc); |
| 570 | LASSERT(rc == -ENOMEM); |
| 571 | rc = -ENOMEM; |
| 572 | goto cleanup_bulk; |
| 573 | } |
| 574 | } |
| 575 | |
| 576 | spin_lock(&request->rq_lock); |
| 577 | /* If the MD attach succeeds, there _will_ be a reply_in callback */ |
| 578 | request->rq_receiving_reply = !noreply; |
| 579 | request->rq_req_unlink = 1; |
| 580 | /* We are responsible for unlinking the reply buffer */ |
| 581 | request->rq_reply_unlink = !noreply; |
| 582 | /* Clear any flags that may be present from previous sends. */ |
| 583 | request->rq_replied = 0; |
| 584 | request->rq_err = 0; |
| 585 | request->rq_timedout = 0; |
| 586 | request->rq_net_err = 0; |
| 587 | request->rq_resend = 0; |
| 588 | request->rq_restart = 0; |
| 589 | request->rq_reply_truncate = 0; |
| 590 | spin_unlock(&request->rq_lock); |
| 591 | |
| 592 | if (!noreply) { |
| 593 | reply_md.start = request->rq_repbuf; |
| 594 | reply_md.length = request->rq_repbuf_len; |
| 595 | /* Allow multiple early replies */ |
| 596 | reply_md.threshold = LNET_MD_THRESH_INF; |
| 597 | /* Manage remote for early replies */ |
| 598 | reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | |
| 599 | LNET_MD_MANAGE_REMOTE | |
| 600 | LNET_MD_TRUNCATE; /* allow to make EOVERFLOW error */ |
| 601 | reply_md.user_ptr = &request->rq_reply_cbid; |
| 602 | reply_md.eq_handle = ptlrpc_eq_h; |
| 603 | |
		/* We must see the unlink callback to unset rq_reply_unlink,
		 * so we can't auto-unlink */
| 606 | rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN, |
| 607 | &request->rq_reply_md_h); |
| 608 | if (rc != 0) { |
| 609 | CERROR("LNetMDAttach failed: %d\n", rc); |
| 610 | LASSERT(rc == -ENOMEM); |
| 611 | spin_lock(&request->rq_lock); |
| 612 | /* ...but the MD attach didn't succeed... */ |
| 613 | request->rq_receiving_reply = 0; |
| 614 | spin_unlock(&request->rq_lock); |
| 615 | rc = -ENOMEM; |
| 616 | goto cleanup_me; |
| 617 | } |
| 618 | |
| 619 | CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid %llu, portal %u\n", |
| 620 | request->rq_repbuf_len, request->rq_xid, |
| 621 | request->rq_reply_portal); |
| 622 | } |
| 623 | |
| 624 | /* add references on request for request_out_callback */ |
| 625 | ptlrpc_request_addref(request); |
| 626 | if (obd != NULL && obd->obd_svc_stats != NULL) |
| 627 | lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR, |
| 628 | atomic_read(&request->rq_import->imp_inflight)); |
| 629 | |
| 630 | OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5); |
| 631 | |
| 632 | ktime_get_real_ts64(&request->rq_arrival_time); |
| 633 | request->rq_sent = ktime_get_real_seconds(); |
	/* We give the server rq_timeout secs to process the req, and
	 * add the network latency for our local timeout. */
| 636 | request->rq_deadline = request->rq_sent + request->rq_timeout + |
| 637 | ptlrpc_at_get_net_latency(request); |
| 638 | |
| 639 | ptlrpc_pinger_sending_on_import(request->rq_import); |
| 640 | |
| 641 | DEBUG_REQ(D_INFO, request, "send flg=%x", |
| 642 | lustre_msg_get_flags(request->rq_reqmsg)); |
| 643 | rc = ptl_send_buf(&request->rq_req_md_h, |
| 644 | request->rq_reqbuf, request->rq_reqdata_len, |
| 645 | LNET_NOACK_REQ, &request->rq_req_cbid, |
| 646 | connection, |
| 647 | request->rq_request_portal, |
| 648 | request->rq_xid, 0); |
| 649 | if (rc == 0) |
| 650 | goto out; |
| 651 | |
| 652 | ptlrpc_req_finished(request); |
| 653 | if (noreply) |
| 654 | goto out; |
| 655 | |
| 656 | cleanup_me: |
| 657 | /* MEUnlink is safe; the PUT didn't even get off the ground, and |
| 658 | * nobody apart from the PUT's target has the right nid+XID to |
| 659 | * access the reply buffer. */ |
| 660 | rc2 = LNetMEUnlink(reply_me_h); |
| 661 | LASSERT(rc2 == 0); |
| 662 | /* UNLINKED callback called synchronously */ |
| 663 | LASSERT(!request->rq_receiving_reply); |
| 664 | |
| 665 | cleanup_bulk: |
	/* We do a sync unlink here as there was no real transfer, so the
	 * chance of a long unlink due to a sluggish net is smaller. */
| 668 | ptlrpc_unregister_bulk(request, 0); |
| 669 | out: |
| 670 | if (request->rq_memalloc) |
| 671 | cfs_memory_pressure_restore(mpflag); |
| 672 | return rc; |
| 673 | } |
| 674 | EXPORT_SYMBOL(ptl_send_rpc); |
| 675 | |
| 676 | /** |
| 677 | * Register request buffer descriptor for request receiving. |
| 678 | */ |
| 679 | int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd) |
| 680 | { |
| 681 | struct ptlrpc_service *service = rqbd->rqbd_svcpt->scp_service; |
| 682 | static lnet_process_id_t match_id = {LNET_NID_ANY, LNET_PID_ANY}; |
| 683 | int rc; |
| 684 | lnet_md_t md; |
| 685 | lnet_handle_me_t me_h; |
| 686 | |
| 687 | CDEBUG(D_NET, "LNetMEAttach: portal %d\n", |
| 688 | service->srv_req_portal); |
| 689 | |
| 690 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD)) |
| 691 | return -ENOMEM; |
| 692 | |
	/* NB: a CPT-affinity service should use the new LNet flag
	 * LNET_INS_LOCAL, which means the buffer can only be attached on
	 * the local CPT, and LND threads can find it by grabbing a local
	 * lock */
| 696 | rc = LNetMEAttach(service->srv_req_portal, |
| 697 | match_id, 0, ~0, LNET_UNLINK, |
| 698 | rqbd->rqbd_svcpt->scp_cpt >= 0 ? |
| 699 | LNET_INS_LOCAL : LNET_INS_AFTER, &me_h); |
| 700 | if (rc != 0) { |
| 701 | CERROR("LNetMEAttach failed: %d\n", rc); |
| 702 | return -ENOMEM; |
| 703 | } |
| 704 | |
| 705 | LASSERT(rqbd->rqbd_refcount == 0); |
| 706 | rqbd->rqbd_refcount = 1; |
| 707 | |
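	/* With LNET_MD_MAX_SIZE set, the MD auto-unlinks once less than
	 * max_size bytes of buffer space remain, so a single rqbd can
	 * absorb many incoming requests before being recycled */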
| 708 | md.start = rqbd->rqbd_buffer; |
| 709 | md.length = service->srv_buf_size; |
| 710 | md.max_size = service->srv_max_req_size; |
| 711 | md.threshold = LNET_MD_THRESH_INF; |
| 712 | md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE; |
| 713 | md.user_ptr = &rqbd->rqbd_cbid; |
| 714 | md.eq_handle = ptlrpc_eq_h; |
| 715 | |
| 716 | rc = LNetMDAttach(me_h, md, LNET_UNLINK, &rqbd->rqbd_md_h); |
| 717 | if (rc == 0) |
| 718 | return 0; |
| 719 | |
	CERROR("LNetMDAttach failed: %d\n", rc);
| 721 | LASSERT(rc == -ENOMEM); |
| 722 | rc = LNetMEUnlink(me_h); |
| 723 | LASSERT(rc == 0); |
| 724 | rqbd->rqbd_refcount = 0; |
| 725 | |
| 726 | return -ENOMEM; |
| 727 | } |