1 // SPDX-License-Identifier: GPL-2.0
5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 only,
9 * as published by the Free Software Foundation.
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License version 2 for more details (a copy is included
15 * in the LICENSE file that accompanied this code).
17 * You should have received a copy of the GNU General Public License
18 * version 2 along with this program; If not, see
19 * http://www.gnu.org/licenses/gpl-2.0.html
24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Use is subject to license terms.
27 * Copyright (c) 2011, 2012, Intel Corporation.
30 * This file is part of Lustre, http://www.lustre.org/
31 * Lustre is a trademark of Sun Microsystems, Inc.
/*
 * Cache the peer and local IP addresses of an established connection.
 * lnet_sock_getaddr() with 1 fetches the remote (peer) address into
 * ksnc_ipaddr; with 0 it fetches the local address into ksnc_myipaddr
 * (confirmed by the error strings below).
 * NOTE(review): this excerpt is missing intermediate source lines
 * (error-return branches, closing braces) — do not assume the visible
 * statements are contiguous.
 */
37 ksocknal_lib_get_conn_addrs(struct ksock_conn *conn)
39 	int rc = lnet_sock_getaddr(conn->ksnc_sock, 1, &conn->ksnc_ipaddr,
42 	/* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
43 	LASSERT(!conn->ksnc_closing);
46 	CERROR("Error %d getting sock peer IP\n", rc);
/* Second query: local side of the socket; port is not needed (NULL). */
50 	rc = lnet_sock_getaddr(conn->ksnc_sock, 0, &conn->ksnc_myipaddr, NULL);
52 	CERROR("Error %d getting sock local IP\n", rc);
/*
 * Return non-zero if this connection can use zero-copy sends.
 * Requires: protocol newer than v1.x (v1 path presumably returns 0 in a
 * line not visible in this excerpt), scatter/gather support (NETIF_F_SG)
 * and hardware checksum offload (NETIF_F_CSUM_MASK) on the route.
 */
60 ksocknal_lib_zc_capable(struct ksock_conn *conn)
62 	int caps = conn->ksnc_sock->sk->sk_route_caps;
64 	if (conn->ksnc_proto == &ksocknal_protocol_v1x)
68 	 * ZC if the socket supports scatter/gather and doesn't need software
71 	return ((caps & NETIF_F_SG) && (caps & NETIF_F_CSUM_MASK));
/*
 * Transmit tx's kvec fragments (tx_iov) on the connection's socket with
 * a non-blocking sendmsg.  On the first send attempt of a V2.x message
 * (tx_nob == tx_resid, checksum not yet computed) and with checksumming
 * enabled, the payload checksum is filled in first.
 * MSG_MORE is set when more data is queued behind this tx — the second
 * half of that condition is in a source line missing from this excerpt.
 * Returns the sock_sendmsg() result (bytes sent or -errno).
 */
75 ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx)
77 	struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
78 	struct socket *sock = conn->ksnc_sock;
81 	if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */
82 	    conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */
83 	    tx->tx_nob == tx->tx_resid && /* first sending */
84 	    !tx->tx_msg.ksm_csum) /* not checksummed */
85 	ksocknal_lib_csum_tx(tx);
/* Total bytes across all kvec fragments for the iov iterator below. */
87 	for (nob = i = 0; i < tx->tx_niov; i++)
88 	nob += tx->tx_iov[i].iov_len;
90 	if (!list_empty(&conn->ksnc_tx_queue) ||
92 	msg.msg_flags |= MSG_MORE;
94 	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC,
95 	tx->tx_iov, tx->tx_niov, nob);
96 	return sock_sendmsg(sock, &msg);
/*
 * Transmit tx's page (bio_vec) fragments.  Two paths:
 *  - Zero-copy: if a ZC cookie is set, hand the first page straight to
 *    the protocol's sendpage (or tcp_sendpage as fallback).  MSG_MORE is
 *    presumably OR'd into msgflg when more data follows — that statement
 *    is in a source line missing from this excerpt.
 *  - Plain: sum the fragment lengths and send them all with a bvec
 *    iov_iter through sock_sendmsg.
 * Returns bytes sent or -errno (the final return is not visible here).
 */
100 ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx)
102 	struct socket *sock = conn->ksnc_sock;
103 	struct bio_vec *kiov = tx->tx_kiov;
107 	/* Not NOOP message */
108 	LASSERT(tx->tx_lnetmsg);
110 	if (tx->tx_msg.ksm_zc_cookies[0]) {
111 	/* Zero copy is enabled */
112 	struct sock *sk = sock->sk;
113 	struct page *page = kiov->bv_page;
114 	int offset = kiov->bv_offset;
115 	int fragsize = kiov->bv_len;
116 	int msgflg = MSG_DONTWAIT;
118 	CDEBUG(D_NET, "page %p + offset %x for %d\n",
119 	page, offset, kiov->bv_len);
121 	if (!list_empty(&conn->ksnc_tx_queue) ||
122 	    fragsize < tx->tx_resid)
/* Use the protocol-specific sendpage when available, else generic TCP. */
125 	if (sk->sk_prot->sendpage) {
126 	rc = sk->sk_prot->sendpage(sk, page,
127 	offset, fragsize, msgflg);
129 	rc = tcp_sendpage(sk, page, offset, fragsize, msgflg);
/* Non-zero-copy path: one sendmsg over all page fragments. */
132 	struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
135 	for (nob = i = 0; i < tx->tx_nkiov; i++)
136 	nob += kiov[i].bv_len;
138 	if (!list_empty(&conn->ksnc_tx_queue) ||
140 	msg.msg_flags |= MSG_MORE;
142 	iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC,
143 	kiov, tx->tx_nkiov, nob);
144 	rc = sock_sendmsg(sock, &msg);
/*
 * Ask TCP to ACK immediately (TCP_QUICKACK) so the peer's zero-copy
 * sends complete without waiting for a piggy-backed ACK.
 * The opt variable's declaration/initialization is in a source line
 * missing from this excerpt; the return value of kernel_setsockopt is
 * deliberately ignored (best-effort hint).
 */
150 ksocknal_lib_eager_ack(struct ksock_conn *conn)
153 	struct socket *sock = conn->ksnc_sock;
156 	 * Remind the socket to ACK eagerly. If I don't, the socket might
157 	 * think I'm about to send something it could piggy-back the ACK
158 	 * on, introducing delay in completing zero-copy sends in my
161 	kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, (char *)&opt,
/*
 * iov_iter_for_each_range() callback: fold one kvec segment into the
 * connection's running CRC32 receive checksum (ksnc_rx_csum).
 * context is the struct ksock_conn * passed by ksocknal_lib_recv().
 * The return statement is in a source line missing from this excerpt.
 */
165 static int lustre_csum(struct kvec *v, void *context)
167 	struct ksock_conn *conn = context;
168 	conn->ksnc_rx_csum = crc32_le(conn->ksnc_rx_csum,
169 	v->iov_base, v->iov_len);
/*
 * Receive into the connection's prepared rx iov_iter with a non-blocking
 * sock_recvmsg, then accumulate the receive checksum over the data just
 * read: ksm_csum is temporarily zeroed so the header checksums to its
 * on-the-wire value, lustre_csum() folds each range into ksnc_rx_csum,
 * and the saved ksm_csum is restored.  Per the comment, V2 checksums the
 * header too while V3 checksums only bulk (bvec) data — the enclosing
 * conditional structure is partly in lines missing from this excerpt.
 */
174 ksocknal_lib_recv(struct ksock_conn *conn)
176 	struct msghdr msg = { .msg_iter = conn->ksnc_rx_to };
180 	rc = sock_recvmsg(conn->ksnc_sock, &msg, MSG_DONTWAIT);
184 	saved_csum = conn->ksnc_msg.ksm_csum;
188 	/* header is included only in V2 - V3 checksums only the bulk data */
189 	if (!(conn->ksnc_rx_to.type & ITER_BVEC) &&
190 	    conn->ksnc_proto != &ksocknal_protocol_v2x)
193 	/* accumulate checksum */
194 	conn->ksnc_msg.ksm_csum = 0;
195 	iov_iter_for_each_range(&conn->ksnc_rx_to, rc, lustre_csum, conn);
196 	conn->ksnc_msg.ksm_csum = saved_csum;
/*
 * Compute the CRC32 checksum of an outgoing V2.x message and store it in
 * tx_msg.ksm_csum.  The checksum field is zeroed first so the header
 * (tx_iov[0], asserted to alias tx_msg) checksums deterministically,
 * then the page fragments (kmap'd one at a time) and any remaining kvec
 * fragments are folded in.  Control-flow lines between the two loops are
 * missing from this excerpt — presumably kiov and iov payloads are
 * alternatives, not both; confirm against the full source.
 * ksnd_inject_csum_error is a one-shot fault-injection tunable: the
 * corrupting store itself is in a missing line; the flag is then cleared.
 */
202 ksocknal_lib_csum_tx(struct ksock_tx *tx)
208 	LASSERT(tx->tx_iov[0].iov_base == &tx->tx_msg);
209 	LASSERT(tx->tx_conn);
210 	LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x);
212 	tx->tx_msg.ksm_csum = 0;
214 	csum = crc32_le(~0, tx->tx_iov[0].iov_base,
215 	tx->tx_iov[0].iov_len);
218 	for (i = 0; i < tx->tx_nkiov; i++) {
219 	base = kmap(tx->tx_kiov[i].bv_page) +
220 	tx->tx_kiov[i].bv_offset;
222 	csum = crc32_le(csum, base, tx->tx_kiov[i].bv_len);
224 	kunmap(tx->tx_kiov[i].bv_page);
/* Fold in the remaining kvec fragments; i starts at 1 because
 * tx_iov[0] (the header) was checksummed above. */
227 	for (i = 1; i < tx->tx_niov; i++)
228 	csum = crc32_le(csum, tx->tx_iov[i].iov_base,
229 	tx->tx_iov[i].iov_len);
232 	if (*ksocknal_tunables.ksnd_inject_csum_error) {
234 	*ksocknal_tunables.ksnd_inject_csum_error = 0;
237 	tx->tx_msg.ksm_csum = csum;
/*
 * Report the connection's socket buffer sizes and Nagle setting.
 * Takes a connsock reference first; if that fails the connection is
 * closing, so all outputs are zeroed (the early return is in a missing
 * line).  Otherwise reads tx/rx buffer sizes via lnet_sock_getbuf and
 * TCP_NODELAY via kernel_getsockopt, then drops the reference.
 * NOTE(review): *nagle is presumably inverted (TCP_NODELAY set means
 * Nagle OFF) in a line not visible here — confirm against full source.
 */
241 ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem,
242 			       int *rxmem, int *nagle)
244 	struct socket *sock = conn->ksnc_sock;
248 	rc = ksocknal_connsock_addref(conn);
250 	LASSERT(conn->ksnc_closing);
251 	*txmem = *rxmem = *nagle = 0;
255 	rc = lnet_sock_getbuf(sock, txmem, rxmem);
257 	len = sizeof(*nagle);
258 	rc = kernel_getsockopt(sock, SOL_TCP, TCP_NODELAY,
259 	(char *)nagle, &len);
262 	ksocknal_connsock_decref(conn);
/* Error path: zero the outputs so callers never see stale values. */
267 	*txmem = *rxmem = *nagle = 0;
/*
 * Configure a freshly-accepted/connected socket for use by socklnd:
 *  - GFP_NOFS allocation to avoid recursing into the filesystem;
 *  - SO_LINGER / TCP_LINGER2 so closes abort active sends promptly;
 *  - optionally disable Nagle (ksnd_nagle tunable);
 *  - set tx/rx buffer sizes from tunables;
 *  - enable TCP keepalive (SO_KEEPALIVE + TCP_KEEPIDLE/KEEPINTVL/KEEPCNT)
 *    only when all three keepalive tunables are positive.
 * Each setsockopt failure is logged via CERROR; the associated error
 * returns are in source lines missing from this excerpt.
 */
273 ksocknal_lib_setup_sock(struct socket *sock)
281 	struct linger linger;
283 	sock->sk->sk_allocation = GFP_NOFS;
286 	 * Ensure this socket aborts active sends immediately when we close
292 	rc = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, (char *)&linger,
295 	CERROR("Can't set SO_LINGER: %d\n", rc);
300 	rc = kernel_setsockopt(sock, SOL_TCP, TCP_LINGER2, (char *)&option,
303 	CERROR("Can't set SO_LINGER2: %d\n", rc);
307 	if (!*ksocknal_tunables.ksnd_nagle) {
/* option presumably set to 1 in a missing line: TCP_NODELAY on. */
310 	rc = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
311 	(char *)&option, sizeof(option));
313 	CERROR("Can't disable nagle: %d\n", rc);
318 	rc = lnet_sock_setbuf(sock, *ksocknal_tunables.ksnd_tx_buffer_size,
319 	*ksocknal_tunables.ksnd_rx_buffer_size);
321 	CERROR("Can't set buffer tx %d, rx %d buffers: %d\n",
322 	*ksocknal_tunables.ksnd_tx_buffer_size,
323 	*ksocknal_tunables.ksnd_rx_buffer_size, rc);
327 	/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */
329 	/* snapshot tunables */
330 	keep_idle = *ksocknal_tunables.ksnd_keepalive_idle;
331 	keep_count = *ksocknal_tunables.ksnd_keepalive_count;
332 	keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
334 	do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
336 	option = (do_keepalive ? 1 : 0);
337 	rc = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&option,
340 	CERROR("Can't set SO_KEEPALIVE: %d\n", rc);
347 	rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, (char *)&keep_idle,
350 	CERROR("Can't set TCP_KEEPIDLE: %d\n", rc);
354 	rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
355 	(char *)&keep_intvl, sizeof(keep_intvl));
357 	CERROR("Can't set TCP_KEEPINTVL: %d\n", rc);
361 	rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, (char *)&keep_count,
364 	CERROR("Can't set TCP_KEEPCNT: %d\n", rc);
/*
 * Flush any data Nagle is holding on this connection: save tp->nonagle,
 * force TCP_NODELAY on (val is set in a missing line), then restore the
 * saved nonagle setting.  Skips silently if the connection is already
 * shutting down (connsock_addref fails).
 */
372 ksocknal_lib_push_conn(struct ksock_conn *conn)
380 	rc = ksocknal_connsock_addref(conn);
381 	if (rc) /* being shut down */
384 	sk = conn->ksnc_sock->sk;
388 	nonagle = tp->nonagle;
392 	rc = kernel_setsockopt(conn->ksnc_sock, SOL_TCP, TCP_NODELAY,
393 	(char *)&val, sizeof(val));
/* Restore the pre-push Nagle state; the push itself already flushed. */
397 	tp->nonagle = nonagle;
400 	ksocknal_connsock_decref(conn);
404 * socket call back in Linux
/*
 * sk_data_ready callback installed on socklnd sockets.  Under the global
 * read lock (to interleave safely with connection teardown): if
 * sk_user_data is NULL we raced with ksocknal_terminate_conn, which has
 * already restored the original sk_data_ready — the LASSERT checks it is
 * no longer ours before chaining to it.  Otherwise notify the scheduler
 * via ksocknal_read_callback().
 */
407 ksocknal_data_ready(struct sock *sk)
409 	struct ksock_conn *conn;
411 	/* interleave correctly with closing sockets... */
413 	read_lock(&ksocknal_data.ksnd_global_lock);
415 	conn = sk->sk_user_data;
416 	if (!conn) { /* raced with ksocknal_terminate_conn */
417 	LASSERT(sk->sk_data_ready != &ksocknal_data_ready);
418 	sk->sk_data_ready(sk);
420 	ksocknal_read_callback(conn);
423 	read_unlock(&ksocknal_data.ksnd_global_lock);
/*
 * sk_write_space callback installed on socklnd sockets.  Under the
 * global read lock: chain to the restored original callback if we raced
 * with teardown (sk_user_data NULL); otherwise, once the stream has at
 * least its low-water mark of space, wake the writer via
 * ksocknal_write_callback() and only then clear SOCK_NOSPACE — the
 * ordering keeps ksocknal_transmit's ENOMEM check race-free (see the
 * in-body comment).  "min_wpace" is a pre-existing typo for min_wspace;
 * left untouched since it is a code identifier.
 */
427 ksocknal_write_space(struct sock *sk)
429 	struct ksock_conn *conn;
433 	/* interleave correctly with closing sockets... */
435 	read_lock(&ksocknal_data.ksnd_global_lock);
437 	conn = sk->sk_user_data;
438 	wspace = sk_stream_wspace(sk);
439 	min_wpace = sk_stream_min_wspace(sk);
441 	CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
442 	sk, wspace, min_wpace, conn,
443 	!conn ? "" : (conn->ksnc_tx_ready ?
444 	" ready" : " blocked"),
445 	!conn ? "" : (conn->ksnc_tx_scheduled ?
446 	" scheduled" : " idle"),
447 	!conn ? "" : (list_empty(&conn->ksnc_tx_queue) ?
448 	" empty" : " queued"));
450 	if (!conn) { /* raced with ksocknal_terminate_conn */
451 	LASSERT(sk->sk_write_space != &ksocknal_write_space);
452 	sk->sk_write_space(sk);
454 	read_unlock(&ksocknal_data.ksnd_global_lock);
458 	if (wspace >= min_wpace) { /* got enough space */
459 	ksocknal_write_callback(conn);
462 	 * Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the
463 	 * ENOMEM check in ksocknal_transmit is race-free (think about
466 	clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
469 	read_unlock(&ksocknal_data.ksnd_global_lock);
/*
 * Remember the socket's original data_ready/write_space callbacks so
 * ksocknal_lib_reset_callback() can restore them at teardown.
 */
473 ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn)
475 	conn->ksnc_saved_data_ready = sock->sk->sk_data_ready;
476 	conn->ksnc_saved_write_space = sock->sk->sk_write_space;
/*
 * Install socklnd's callbacks on the socket and link it back to the
 * connection via sk_user_data (the callbacks use that to find conn).
 */
480 ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn)
482 	sock->sk->sk_user_data = conn;
483 	sock->sk->sk_data_ready = ksocknal_data_ready;
484 	sock->sk->sk_write_space = ksocknal_write_space;
/*
 * Undo ksocknal_lib_set_callback(): restore the saved callbacks (a noop
 * stub would dangle after module unload), then NULL sk_user_data so any
 * in-flight callback — which holds the global read lock and checks
 * sk_user_data — becomes a no-op for this connection.
 */
488 ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn)
491 	 * Remove conn's network callbacks.
492 	 * NB I _have_ to restore the callback, rather than storing a noop,
493 	 * since the socket could survive past this module being unloaded!!
495 	sock->sk->sk_data_ready = conn->ksnc_saved_data_ready;
496 	sock->sk->sk_write_space = conn->ksnc_saved_write_space;
499 	 * A callback could be in progress already; they hold a read lock
500 	 * on ksnd_global_lock (to serialise with me) and NOOP if
501 	 * sk_user_data is NULL.
503 	sock->sk->sk_user_data = NULL;
/*
 * Decide how to react when a send fails under memory pressure.  Under
 * the scheduler lock: if SOCK_NOSPACE is clear AND ksnc_tx_ready is
 * clear, the socket was not filled and write_space will not reschedule
 * us, so (per the in-body comment) the caller should get -ENOMEM and
 * retry — the actual rc assignment/return is in source lines missing
 * from this excerpt.
 */
507 ksocknal_lib_memory_pressure(struct ksock_conn *conn)
510 	struct ksock_sched *sched;
512 	sched = conn->ksnc_scheduler;
513 	spin_lock_bh(&sched->kss_lock);
515 	if (!test_bit(SOCK_NOSPACE, &conn->ksnc_sock->flags) &&
516 	    !conn->ksnc_tx_ready) {
518 	 * SOCK_NOSPACE is set when the socket fills
519 	 * and cleared in the write_space callback
520 	 * (which also sets ksnc_tx_ready). If
521 	 * SOCK_NOSPACE and ksnc_tx_ready are BOTH
522 	 * zero, I didn't fill the socket and
523 	 * write_space won't reschedule me, so I
524 	 * return -ENOMEM to get my caller to retry
530 	spin_unlock_bh(&sched->kss_lock);