net: Add sysctl to toggle early demux for tcp and udp
author: subashab@codeaurora.org <subashab@codeaurora.org>
Thu, 23 Mar 2017 19:34:16 +0000 (13:34 -0600)
committer: David S. Miller <davem@davemloft.net>
Fri, 24 Mar 2017 20:17:07 +0000 (13:17 -0700)
Certain systems process significant unconnected UDP workloads.
It would be preferable to disable UDP early demux for those systems
and enable it for TCP only.

By disabling UDP demux, we see these slight gains on an ARM64 system-
782 -> 788Mbps unconnected single stream UDPv4
633 -> 654Mbps unconnected UDPv4 different sources

The performance impact can change based on CPU architecture and cache
sizes. There will not be much difference seen if the entire UDP hash
table is in cache.

Both sysctls are enabled by default to preserve existing behavior.

v1->v2: Change function pointer instead of adding conditional as
suggested by Stephen.

v2->v3: Read once in callers to avoid issues due to compiler
optimizations. Also update commit message with the tests.

v3->v4: Store and use read once result instead of querying pointer
again incorrectly.

v4->v5: Refactor to avoid errors due to compilation with IPV6={m,n}

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Suggested-by: Eric Dumazet <edumazet@google.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Tom Herbert <tom@herbertland.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
12 files changed:
Documentation/networking/ip-sysctl.txt
include/net/netns/ipv4.h
include/net/protocol.h
include/net/udp.h
net/ipv4/af_inet.c
net/ipv4/ip_input.c
net/ipv4/protocol.c
net/ipv4/sysctl_net_ipv4.c
net/ipv6/ip6_input.c
net/ipv6/protocol.c
net/ipv6/tcp_ipv6.c
net/ipv6/udp.c

index eaee2c8d4c00add74b162bbcdeb555934586cf24..b1c6500e7a8df4d7377b291e9afc09363e66cd17 100644 (file)
@@ -856,12 +856,21 @@ ip_dynaddr - BOOLEAN
 ip_early_demux - BOOLEAN
        Optimize input packet processing down to one demux for
        certain kinds of local sockets.  Currently we only do this
-       for established TCP sockets.
+       for established TCP and connected UDP sockets.
 
        It may add an additional cost for pure routing workloads that
        reduces overall throughput, in such case you should disable it.
        Default: 1
 
+tcp_early_demux - BOOLEAN
+       Enable early demux for established TCP sockets.
+       Default: 1
+
+udp_early_demux - BOOLEAN
+       Enable early demux for connected UDP sockets. Disable this if
+       your system could experience more unconnected load.
+       Default: 1
+
 icmp_echo_ignore_all - BOOLEAN
        If set non-zero, then the kernel will ignore all ICMP ECHO
        requests sent to it.
index a0e89190a3e9240a80e7c307c0d3f482ea6168f4..cd686c4fb32dc5409a08f818d48228bffa6f6778 100644 (file)
@@ -95,6 +95,8 @@ struct netns_ipv4 {
        /* Shall we try to damage output packets if routing dev changes? */
        int sysctl_ip_dynaddr;
        int sysctl_ip_early_demux;
+       int sysctl_tcp_early_demux;
+       int sysctl_udp_early_demux;
 
        int sysctl_fwmark_reflect;
        int sysctl_tcp_fwmark_accept;
index bf36ca34af7ad255b9eb821cbed0a70abad993f5..65ba335b0e7e66bb7f1b4bd279d31e616e0dd31e 100644 (file)
@@ -40,6 +40,7 @@
 /* This is used to register protocols. */
 struct net_protocol {
        void                    (*early_demux)(struct sk_buff *skb);
+       void                    (*early_demux_handler)(struct sk_buff *skb);
        int                     (*handler)(struct sk_buff *skb);
        void                    (*err_handler)(struct sk_buff *skb, u32 info);
        unsigned int            no_policy:1,
@@ -54,7 +55,7 @@ struct net_protocol {
 #if IS_ENABLED(CONFIG_IPV6)
 struct inet6_protocol {
        void    (*early_demux)(struct sk_buff *skb);
-
+       void    (*early_demux_handler)(struct sk_buff *skb);
        int     (*handler)(struct sk_buff *skb);
 
        void    (*err_handler)(struct sk_buff *skb,
@@ -92,12 +93,12 @@ struct inet_protosw {
 #define INET_PROTOSW_PERMANENT 0x02  /* Permanent protocols are unremovable. */
 #define INET_PROTOSW_ICSK      0x04  /* Is this an inet_connection_sock? */
 
-extern const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS];
+extern struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS];
 extern const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS];
 extern const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS];
 
 #if IS_ENABLED(CONFIG_IPV6)
-extern const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS];
+extern struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS];
 #endif
 
 int inet_add_protocol(const struct net_protocol *prot, unsigned char num);
index c9d8b8e848e05c2e7228f287f88ccdb57b2e10c2..3391dbd739595a76150453c28468ce8bb55530f8 100644 (file)
@@ -372,4 +372,5 @@ void udp_encap_enable(void);
 #if IS_ENABLED(CONFIG_IPV6)
 void udpv6_encap_enable(void);
 #endif
+
 #endif /* _UDP_H */
index 6b1fc6e4278ef4f1cba58412977918af31d73e62..d1a11707a12682fcd70f22f6df77087b779a5826 100644 (file)
@@ -1599,8 +1599,9 @@ static const struct net_protocol igmp_protocol = {
 };
 #endif
 
-static const struct net_protocol tcp_protocol = {
+static struct net_protocol tcp_protocol = {
        .early_demux    =       tcp_v4_early_demux,
+       .early_demux_handler =  tcp_v4_early_demux,
        .handler        =       tcp_v4_rcv,
        .err_handler    =       tcp_v4_err,
        .no_policy      =       1,
@@ -1608,8 +1609,9 @@ static const struct net_protocol tcp_protocol = {
        .icmp_strict_tag_validation = 1,
 };
 
-static const struct net_protocol udp_protocol = {
+static struct net_protocol udp_protocol = {
        .early_demux =  udp_v4_early_demux,
+       .early_demux_handler =  udp_v4_early_demux,
        .handler =      udp_rcv,
        .err_handler =  udp_err,
        .no_policy =    1,
@@ -1720,6 +1722,8 @@ static __net_init int inet_init_net(struct net *net)
        net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
        net->ipv4.sysctl_ip_dynaddr = 0;
        net->ipv4.sysctl_ip_early_demux = 1;
+       net->ipv4.sysctl_udp_early_demux = 1;
+       net->ipv4.sysctl_tcp_early_demux = 1;
 #ifdef CONFIG_SYSCTL
        net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
 #endif
index d6feabb0351607f282e1f78f159c0ccb88bcec96..fa2dc8f692c631f1ff7fe814c3ee27f0de2a41d8 100644 (file)
@@ -313,6 +313,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
        const struct iphdr *iph = ip_hdr(skb);
        struct rtable *rt;
        struct net_device *dev = skb->dev;
+       void (*edemux)(struct sk_buff *skb);
 
        /* if ingress device is enslaved to an L3 master device pass the
         * skb to its handler for processing
@@ -329,8 +330,8 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
                int protocol = iph->protocol;
 
                ipprot = rcu_dereference(inet_protos[protocol]);
-               if (ipprot && ipprot->early_demux) {
-                       ipprot->early_demux(skb);
+               if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
+                       edemux(skb);
                        /* must reload iph, skb->head might have changed */
                        iph = ip_hdr(skb);
                }
index 4b7c0ec65251ef40577a2d5e360fcbaed391a566..32a691b7ce2c7e79eab6491b52457a11e666f7d3 100644 (file)
@@ -28,7 +28,7 @@
 #include <linux/spinlock.h>
 #include <net/protocol.h>
 
-const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
+struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
 const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
 EXPORT_SYMBOL(inet_offloads);
 
index 711c3e2e17b1a46925deb99a2e22916610388033..6fb25693c00b92cbf881a13b06f2276b288853b1 100644 (file)
@@ -24,6 +24,7 @@
 #include <net/cipso_ipv4.h>
 #include <net/inet_frag.h>
 #include <net/ping.h>
+#include <net/protocol.h>
 
 static int zero;
 static int one = 1;
@@ -294,6 +295,58 @@ bad_key:
        return ret;
 }
 
+static void proc_configure_early_demux(int enabled, int protocol)
+{
+       struct net_protocol *ipprot;
+#if IS_ENABLED(CONFIG_IPV6)
+       struct inet6_protocol *ip6prot;
+#endif
+
+       ipprot = rcu_dereference(inet_protos[protocol]);
+       if (ipprot)
+               ipprot->early_demux = enabled ? ipprot->early_demux_handler :
+                                               NULL;
+
+#if IS_ENABLED(CONFIG_IPV6)
+       ip6prot = rcu_dereference(inet6_protos[protocol]);
+       if (ip6prot)
+               ip6prot->early_demux = enabled ? ip6prot->early_demux_handler :
+                                                NULL;
+#endif
+}
+
+static int proc_tcp_early_demux(struct ctl_table *table, int write,
+                               void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       int ret = 0;
+
+       ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+       if (write && !ret) {
+               int enabled = init_net.ipv4.sysctl_tcp_early_demux;
+
+               proc_configure_early_demux(enabled, IPPROTO_TCP);
+       }
+
+       return ret;
+}
+
+static int proc_udp_early_demux(struct ctl_table *table, int write,
+                               void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       int ret = 0;
+
+       ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+       if (write && !ret) {
+               int enabled = init_net.ipv4.sysctl_udp_early_demux;
+
+               proc_configure_early_demux(enabled, IPPROTO_UDP);
+       }
+
+       return ret;
+}
+
 static struct ctl_table ipv4_table[] = {
        {
                .procname       = "tcp_timestamps",
@@ -749,6 +802,20 @@ static struct ctl_table ipv4_net_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
+       {
+               .procname       = "udp_early_demux",
+               .data           = &init_net.ipv4.sysctl_udp_early_demux,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_udp_early_demux
+       },
+       {
+               .procname       = "tcp_early_demux",
+               .data           = &init_net.ipv4.sysctl_tcp_early_demux,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_tcp_early_demux
+       },
        {
                .procname       = "ip_default_ttl",
                .data           = &init_net.ipv4.sysctl_ip_default_ttl,
index aacfb4bce1533b3f3b38e1173c18cb1bb6b33099..b04539dd4629d2b71b5db27c4a64a89151b2d5d7 100644 (file)
@@ -49,6 +49,8 @@
 
 int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
+       void (*edemux)(struct sk_buff *skb);
+
        /* if ingress device is enslaved to an L3 master device pass the
         * skb to its handler for processing
         */
@@ -60,8 +62,8 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
                const struct inet6_protocol *ipprot;
 
                ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
-               if (ipprot && ipprot->early_demux)
-                       ipprot->early_demux(skb);
+               if (ipprot && (edemux = READ_ONCE(ipprot->early_demux)))
+                       edemux(skb);
        }
        if (!skb_valid_dst(skb))
                ip6_route_input(skb);
index e3770abe688a3a9059456fe9195adbfcdfb73157..b5d54d4f995c0f4bade2e3f1c4def9616252ca55 100644 (file)
@@ -26,7 +26,7 @@
 #include <net/protocol.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
-const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly;
+struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly;
 EXPORT_SYMBOL(inet6_protos);
 
 int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol)
index 0f08d718a00238b228d859d2c0a1dab10db57125..031a8c019f7a740cffe9d1703d9d87992268b028 100644 (file)
@@ -1925,8 +1925,9 @@ struct proto tcpv6_prot = {
        .diag_destroy           = tcp_abort,
 };
 
-static const struct inet6_protocol tcpv6_protocol = {
+static struct inet6_protocol tcpv6_protocol = {
        .early_demux    =       tcp_v6_early_demux,
+       .early_demux_handler =  tcp_v6_early_demux,
        .handler        =       tcp_v6_rcv,
        .err_handler    =       tcp_v6_err,
        .flags          =       INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
index b793ed1d2a36960291b60de44b3cffea20861415..fd4b1c98a47230b94641c31fe3213b3dff6ac915 100644 (file)
@@ -1436,8 +1436,9 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
 }
 #endif
 
-static const struct inet6_protocol udpv6_protocol = {
+static struct inet6_protocol udpv6_protocol = {
        .early_demux    =       udp_v6_early_demux,
+       .early_demux_handler =  udp_v6_early_demux,
        .handler        =       udpv6_rcv,
        .err_handler    =       udpv6_err,
        .flags          =       INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,