ipv4: Add sysctl knob to control early socket demux
author Alexander Duyck <alexander.h.duyck@intel.com>
Thu, 21 Jun 2012 13:58:31 +0000 (13:58 +0000)
committer David S. Miller <davem@davemloft.net>
Sat, 23 Jun 2012 00:11:13 +0000 (17:11 -0700)
This change adds a control for disabling early socket demux.
The main motivation behind this patch is to provide an option to disable
the feature, as it adds an additional cost to routing that reduces overall
throughput by up to 5%.  For example, one of my systems went from 12.1Mpps
to 11.6Mpps after early socket demux was added.  The reason for the
regression appears to be that we now have to perform two lookups: first
the one for an established socket, and then the one for the routing table.

With this patch applied and ip_early_demux set to 0, I am able to get
back to the 12.1Mpps I was previously seeing.

[ Move local variables in ip_rcv_finish() down into the basic
  block in which they are actually used.  -DaveM ]

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/linux/sysctl.h
include/net/ip.h
kernel/sysctl_binary.c
net/ipv4/ip_input.c
net/ipv4/sysctl_net_ipv4.c

index c34b4c82b0dcd6f60937cf6507da1b9988fd9d6f..20825e5f433f770dcd9323af94a674f255bef04c 100644 (file)
@@ -425,6 +425,7 @@ enum
        NET_TCP_ALLOWED_CONG_CONTROL=123,
        NET_TCP_MAX_SSTHRESH=124,
        NET_TCP_FRTO_RESPONSE=125,
+       NET_IPV4_EARLY_DEMUX=126,
 };
 
 enum {
index 83e0619f59d03d5e087c03fba83c0b402ea13502..50841bd6f10e34e8d83c766d1430b0614c8b2052 100644 (file)
@@ -210,6 +210,9 @@ extern int inet_peer_threshold;
 extern int inet_peer_minttl;
 extern int inet_peer_maxttl;
 
+/* From ip_input.c */
+extern int sysctl_ip_early_demux;
+
 /* From ip_output.c */
 extern int sysctl_ip_dynaddr;
 
index a650694883a180e93c5ec1d6414e45ba904fcff3..6a3cf8253aaec58aab3ee065350fce068aa9977f 100644 (file)
@@ -415,6 +415,8 @@ static const struct bin_table bin_net_ipv4_table[] = {
        { CTL_INT,      NET_IPV4_IPFRAG_SECRET_INTERVAL,        "ipfrag_secret_interval" },
        /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */
 
+       { CTL_INT,      NET_IPV4_EARLY_DEMUX,                   "ip_early_demux" },
+
        { CTL_INT,      2088 /* NET_IPQ_QMAX */,                "ip_queue_maxlen" },
 
        /* NET_TCP_DEFAULT_WIN_SCALE unused */
index 93b092c9a3944c59eea4b87dfdd0aed8bff2979b..bca25179cdb9354b3eb53819946a0fad7edb66b1 100644 (file)
@@ -313,6 +313,8 @@ drop:
        return true;
 }
 
+int sysctl_ip_early_demux __read_mostly = 1;
+
 static int ip_rcv_finish(struct sk_buff *skb)
 {
        const struct iphdr *iph = ip_hdr(skb);
@@ -323,16 +325,18 @@ static int ip_rcv_finish(struct sk_buff *skb)
         *      how the packet travels inside Linux networking.
         */
        if (skb_dst(skb) == NULL) {
-               const struct net_protocol *ipprot;
-               int protocol = iph->protocol;
-               int err;
+               int err = -ENOENT;
 
-               rcu_read_lock();
-               ipprot = rcu_dereference(inet_protos[protocol]);
-               err = -ENOENT;
-               if (ipprot && ipprot->early_demux)
-                       err = ipprot->early_demux(skb);
-               rcu_read_unlock();
+               if (sysctl_ip_early_demux) {
+                       const struct net_protocol *ipprot;
+                       int protocol = iph->protocol;
+
+                       rcu_read_lock();
+                       ipprot = rcu_dereference(inet_protos[protocol]);
+                       if (ipprot && ipprot->early_demux)
+                               err = ipprot->early_demux(skb);
+                       rcu_read_unlock();
+               }
 
                if (err) {
                        err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
index ef32956ed6554401274eb8a58184296762f36f93..12aa0c5867c4db489bb56c8119342d79774ecda1 100644 (file)
@@ -300,6 +300,13 @@ static struct ctl_table ipv4_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
+       {
+               .procname       = "ip_early_demux",
+               .data           = &sysctl_ip_early_demux,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec
+       },
        {
                .procname       = "ip_dynaddr",
                .data           = &sysctl_ip_dynaddr,