tcp: Don't allocate tcp_death_row outside of struct netns_ipv4.
authorKuniyuki Iwashima <kuniyu@amazon.com>
Thu, 8 Sep 2022 01:10:18 +0000 (18:10 -0700)
committerJakub Kicinski <kuba@kernel.org>
Tue, 20 Sep 2022 17:21:49 +0000 (10:21 -0700)
We will soon introduce an optional per-netns ehash and access hash
tables via net->ipv4.tcp_death_row->hashinfo instead of &tcp_hashinfo
in most places.

It could harm the fast path because dereferences of two fields in net
and tcp_death_row might incur two extra cache line misses.  To save one
dereference, let's place tcp_death_row back in netns_ipv4 and fetch
hashinfo via net->ipv4.tcp_death_row"."hashinfo.

Note tcp_death_row was initially placed in netns_ipv4, and commit
fbb8295248e1 ("tcp: allocate tcp_death_row outside of struct netns_ipv4")
changed it to a pointer so that we can fire TIME_WAIT timers after freeing
net.  However, we don't do so after commit 04c494e68a13 ("Revert "tcp/dccp:
get rid of inet_twsk_purge()""), so we need not define tcp_death_row as a
pointer.

Also, we move refcount_dec_and_test(&tw_refcount) from tcp_sk_exit() to
tcp_sk_exit_batch() as a debug check.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
include/net/netns/ipv4.h
net/ipv4/inet_timewait_sock.c
net/ipv4/proc.c
net/ipv4/sysctl_net_ipv4.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_minisocks.c
net/ipv6/tcp_ipv6.c

index 6320a76cefdcdf6232b7a94ddd0c4a433084887d..2c7df93e340326b70eacbff06b5688a7478f6a2a 100644 (file)
@@ -34,6 +34,7 @@ struct inet_hashinfo;
 struct inet_timewait_death_row {
        refcount_t              tw_refcount;
 
+       /* Padding to avoid false sharing, tw_refcount can be often written */
        struct inet_hashinfo    *hashinfo ____cacheline_aligned_in_smp;
        int                     sysctl_max_tw_buckets;
 };
@@ -41,7 +42,7 @@ struct inet_timewait_death_row {
 struct tcp_fastopen_context;
 
 struct netns_ipv4 {
-       struct inet_timewait_death_row *tcp_death_row;
+       struct inet_timewait_death_row tcp_death_row;
 
 #ifdef CONFIG_SYSCTL
        struct ctl_table_header *forw_hdr;
index 47ccc343c9fb0a995b817d938f25a779dd8221c0..71d3bb0abf6c589bfa2cb29611544b8ab12a8a81 100644 (file)
@@ -59,9 +59,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
        inet_twsk_bind_unhash(tw, hashinfo);
        spin_unlock(&bhead->lock);
 
-       if (refcount_dec_and_test(&tw->tw_dr->tw_refcount))
-               kfree(tw->tw_dr);
-
+       refcount_dec(&tw->tw_dr->tw_refcount);
        inet_twsk_put(tw);
 }
 
index 0088a4c64d77ed89b86b736d1c40b099a817ea27..5386f460bd208c8f30902ea8b6aa613449d59ee0 100644 (file)
@@ -59,7 +59,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
        socket_seq_show(seq);
        seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
                   sock_prot_inuse_get(net, &tcp_prot), orphans,
-                  refcount_read(&net->ipv4.tcp_death_row->tw_refcount) - 1,
+                  refcount_read(&net->ipv4.tcp_death_row.tw_refcount) - 1,
                   sockets, proto_memory_allocated(&tcp_prot));
        seq_printf(seq, "UDP: inuse %d mem %ld\n",
                   sock_prot_inuse_get(net, &udp_prot),
index 5490c285668b93b5683328a2c284af66e2f19b0d..4d7c110c772f388e8b24e0441f9a8374225d586f 100644 (file)
@@ -530,10 +530,9 @@ static struct ctl_table ipv4_table[] = {
 };
 
 static struct ctl_table ipv4_net_table[] = {
-       /* tcp_max_tw_buckets must be first in this table. */
        {
                .procname       = "tcp_max_tw_buckets",
-/*             .data           = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, */
+               .data           = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec
@@ -1361,8 +1360,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
                if (!table)
                        goto err_alloc;
 
-               /* skip first entry (sysctl_max_tw_buckets) */
-               for (i = 1; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) {
+               for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) {
                        if (table[i].data) {
                                /* Update the variables to point into
                                 * the current struct net
@@ -1377,8 +1375,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
                }
        }
 
-       table[0].data = &net->ipv4.tcp_death_row->sysctl_max_tw_buckets;
-
        net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
        if (!net->ipv4.ipv4_hdr)
                goto err_reg;
index a07243f66d4c14c670405b96687733dbe30c4922..3930b6a1e0d65a50d5e69bf081be47138c11a5b4 100644 (file)
@@ -292,7 +292,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
         * complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
-       tcp_death_row = net->ipv4.tcp_death_row;
+       tcp_death_row = &net->ipv4.tcp_death_row;
        err = inet_hash_connect(tcp_death_row, sk);
        if (err)
                goto failure;
@@ -3091,13 +3091,9 @@ EXPORT_SYMBOL(tcp_prot);
 
 static void __net_exit tcp_sk_exit(struct net *net)
 {
-       struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
-
        if (net->ipv4.tcp_congestion_control)
                bpf_module_put(net->ipv4.tcp_congestion_control,
                               net->ipv4.tcp_congestion_control->owner);
-       if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
-               kfree(tcp_death_row);
 }
 
 static int __net_init tcp_sk_init(struct net *net)
@@ -3129,13 +3125,10 @@ static int __net_init tcp_sk_init(struct net *net)
        net->ipv4.sysctl_tcp_tw_reuse = 2;
        net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
 
-       net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
-       if (!net->ipv4.tcp_death_row)
-               return -ENOMEM;
-       refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
+       refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
        cnt = tcp_hashinfo.ehash_mask + 1;
-       net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
-       net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
+       net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
+       net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
 
        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
        net->ipv4.sysctl_tcp_sack = 1;
@@ -3201,8 +3194,10 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
 
        inet_twsk_purge(&tcp_hashinfo, AF_INET);
 
-       list_for_each_entry(net, net_exit_list, exit_list)
+       list_for_each_entry(net, net_exit_list, exit_list) {
+               WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
                tcp_fastopen_ctx_destroy(net);
+       }
 }
 
 static struct pernet_operations __net_initdata tcp_sk_ops = {
index 80ce27f8f77e47780335fa822de0d3fd3faba403..8bddb2a78b21c232b5c544dc2c25903cfe75a92c 100644 (file)
@@ -250,7 +250,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
        struct net *net = sock_net(sk);
        struct inet_timewait_sock *tw;
 
-       tw = inet_twsk_alloc(sk, net->ipv4.tcp_death_row, state);
+       tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state);
 
        if (tw) {
                struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
index 5c562d69fddfeffba084d8ec810e6eaf690c3c94..eb1da7a63fbb3efc0f71cce1869a4f51b7de2ddd 100644 (file)
@@ -325,7 +325,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
        inet->inet_dport = usin->sin6_port;
 
        tcp_set_state(sk, TCP_SYN_SENT);
-       tcp_death_row = net->ipv4.tcp_death_row;
+       tcp_death_row = &net->ipv4.tcp_death_row;
        err = inet6_hash_connect(tcp_death_row, sk);
        if (err)
                goto late_failure;