net: ipv4: add sysctl for nexthop api compatibility mode
author Roopa Prabhu <roopa@cumulusnetworks.com>
Mon, 27 Apr 2020 20:56:46 +0000 (13:56 -0700)
committer David S. Miller <davem@davemloft.net>
Tue, 28 Apr 2020 19:50:37 +0000 (12:50 -0700)
Current route nexthop API maintains user space compatibility
with old route API by default. Dumps and netlink notifications
support both new and old API format. In systems which have
moved to the new API, this compatibility mode cancels some
of the performance benefits provided by the new nexthop API.

This patch adds new sysctl nexthop_compat_mode which is on
by default but provides the ability to turn off compatibility
mode allowing systems to run entirely with the new routing
API. Old route API behaviour and support is not modified by this
sysctl.

Uses a single sysctl to cover both ipv4 and ipv6 following
other sysctls. Covers dumps and delete notifications as
suggested by David Ahern.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Documentation/networking/ip-sysctl.txt
include/net/netns/ipv4.h
net/ipv4/af_inet.c
net/ipv4/fib_semantics.c
net/ipv4/nexthop.c
net/ipv4/sysctl_net_ipv4.c
net/ipv6/route.c

index 9375324aa8e1630c373098842c05f8d3b38715d9..5cdc37c34830904baac4d32378003f8bc3475a3f 100644 (file)
@@ -1560,6 +1560,18 @@ skip_notify_on_dev_down - BOOLEAN
        on userspace caches to track link events and evict routes.
        Default: false (generate message)
 
+nexthop_compat_mode - BOOLEAN
+       New nexthop API provides a means for managing nexthops independent of
+       prefixes. Backwards compatibility with old route format is enabled by
+       default which means route dumps and notifications contain the new
+       nexthop attribute but also the full, expanded nexthop definition.
+       Further, updates or deletes of a nexthop configuration generate route
+       notifications for each fib entry using the nexthop. Once a system
+       understands the new API, this sysctl can be disabled to achieve full
+       performance benefits of the new API by disabling the nexthop expansion
+       and extraneous notifications.
+       Default: true (backward compat mode)
+
 IPv6 Fragmentation:
 
 ip6frag_high_thresh - INTEGER
index 154b8f01499b52d6136334db809a112df12ec682..5acdb4d414c4fe3cb96b62e6bf3500f447371247 100644 (file)
@@ -111,6 +111,8 @@ struct netns_ipv4 {
        int sysctl_tcp_early_demux;
        int sysctl_udp_early_demux;
 
+       int sysctl_nexthop_compat_mode;
+
        int sysctl_fwmark_reflect;
        int sysctl_tcp_fwmark_accept;
 #ifdef CONFIG_NET_L3_MASTER_DEV
index c618e242490faa47b7b536a0a68635225b7e36cd..6177c4ba00370871dd2636ec4df12141ced884e0 100644 (file)
@@ -1835,6 +1835,7 @@ static __net_init int inet_init_net(struct net *net)
        net->ipv4.sysctl_ip_early_demux = 1;
        net->ipv4.sysctl_udp_early_demux = 1;
        net->ipv4.sysctl_tcp_early_demux = 1;
+       net->ipv4.sysctl_nexthop_compat_mode = 1;
 #ifdef CONFIG_SYSCTL
        net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
 #endif
index 55ca2e5218280c7857c9bad9351858d38df1916d..e53871e4a097838f8eebdab828a3fd0ea23db5cb 100644 (file)
@@ -1780,6 +1780,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
                        goto nla_put_failure;
                if (nexthop_is_blackhole(fi->nh))
                        rtm->rtm_type = RTN_BLACKHOLE;
+               if (!fi->fib_net->ipv4.sysctl_nexthop_compat_mode)
+                       goto offload;
        }
 
        if (nhs == 1) {
@@ -1805,6 +1807,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
                        goto nla_put_failure;
        }
 
+offload:
        if (fri->offload)
                rtm->rtm_flags |= RTM_F_OFFLOAD;
        if (fri->trap)
index 9999687ad6dc94e40431602a26e9402677eb8742..3957364d556cd73d158af2cc25dee2545d4491f9 100644 (file)
@@ -784,7 +784,8 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
        list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
                /* __ip6_del_rt does a release, so do a hold here */
                fib6_info_hold(f6i);
-               ipv6_stub->ip6_del_rt(net, f6i, false);
+               ipv6_stub->ip6_del_rt(net, f6i,
+                                     !net->ipv4.sysctl_nexthop_compat_mode);
        }
 }
 
@@ -1041,7 +1042,7 @@ out:
        if (!rc) {
                nh_base_seq_inc(net);
                nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
-               if (replace_notify)
+               if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode)
                        nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
        }
 
index 81b267e990a1c6576bdb4055e34d95534c558256..95ad71e76cc3f62348c416e296be777decbc1e12 100644 (file)
@@ -710,6 +710,15 @@ static struct ctl_table ipv4_net_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_tcp_early_demux
        },
+       {
+               .procname       = "nexthop_compat_mode",
+               .data           = &init_net.ipv4.sysctl_nexthop_compat_mode,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = SYSCTL_ZERO,
+               .extra2         = SYSCTL_ONE,
+       },
        {
                .procname       = "ip_default_ttl",
                .data           = &init_net.ipv4.sysctl_ip_default_ttl,
index 486c36a14f2405191a747be10c517f2c13b2a94b..803212aae4caf035560284c4ed08f92391622721 100644 (file)
@@ -5557,7 +5557,8 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
                if (nexthop_is_blackhole(rt->nh))
                        rtm->rtm_type = RTN_BLACKHOLE;
 
-               if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
+               if (net->ipv4.sysctl_nexthop_compat_mode &&
+                   rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
                        goto nla_put_failure;
 
                rtm->rtm_flags |= nh_flags;