net: ipv4: add support for ECMP hash policy choice
author Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Thu, 16 Mar 2017 13:28:00 +0000 (15:28 +0200)
committer David S. Miller <davem@davemloft.net>
Tue, 21 Mar 2017 22:27:19 +0000 (15:27 -0700)
This patch adds support for ECMP hash policy choice via a new sysctl
called fib_multipath_hash_policy and also adds support for L4 hashes.
The current values for fib_multipath_hash_policy are:
 0 - layer 3 (default)
 1 - layer 4
If there's an skb hash already set and it matches the chosen policy then it
will be used instead of being calculated (currently only for L4).
In L3 mode we always calculate the hash because of the ICMP error special
case. The flow dissector's field consistentification should handle the
address order, so we can remove the address reversals.
If the skb is provided we always use it for the hash calculation;
otherwise we fall back to fl4. That is, if skb is NULL, fl4 has to be set.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Documentation/networking/ip-sysctl.txt
include/net/ip_fib.h
include/net/netns/ipv4.h
include/net/route.h
net/ipv4/fib_semantics.c
net/ipv4/icmp.c
net/ipv4/route.c
net/ipv4/sysctl_net_ipv4.c

index ed3d0791eb273e80950fdd0fc0153c1e75c39bae..b57308e76b1d3223dec0e97ae5c4496441f0f38e 100644 (file)
@@ -73,6 +73,14 @@ fib_multipath_use_neigh - BOOLEAN
        0 - disabled
        1 - enabled
 
+fib_multipath_hash_policy - INTEGER
+       Controls which hash policy to use for multipath routes. Only valid
+       for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled.
+       Default: 0 (Layer 3)
+       Possible values:
+       0 - Layer 3
+       1 - Layer 4
+
 route/max_size - INTEGER
        Maximum number of routes allowed in the kernel.  Increase
        this when using large numbers of interfaces and/or routes.
index 272e62e139e05cf5cd5cb53a1f08dcd881d1879d..6692c5758b332d468f1e0611ecc4f3e03ae03b2b 100644 (file)
@@ -395,17 +395,13 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
 int fib_sync_down_addr(struct net_device *dev, __be32 local);
 int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
 
-extern u32 fib_multipath_secret __read_mostly;
-
-static inline int fib_multipath_hash(__be32 saddr, __be32 daddr)
-{
-       return jhash_2words((__force u32)saddr, (__force u32)daddr,
-                           fib_multipath_secret) >> 1;
-}
-
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
+                      const struct sk_buff *skb);
+#endif
 void fib_select_multipath(struct fib_result *res, int hash);
 void fib_select_path(struct net *net, struct fib_result *res,
-                    struct flowi4 *fl4, int mp_hash);
+                    struct flowi4 *fl4, const struct sk_buff *skb);
 
 /* Exported by fib_trie.c */
 void fib_trie_init(void);
index 2e9d649ba1694d09882e3bff391625f819cd36d9..a0e89190a3e9240a80e7c307c0d3f482ea6168f4 100644 (file)
@@ -151,6 +151,7 @@ struct netns_ipv4 {
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
        int sysctl_fib_multipath_use_neigh;
+       int sysctl_fib_multipath_hash_policy;
 #endif
 
        unsigned int    fib_seq;        /* protected by rtnl_mutex */
index c0874c87c173717f2c13c8af06d2482a76190243..2cc0e14c63598ce3d3be88bb04d2fd433d676129 100644 (file)
@@ -113,13 +113,13 @@ struct in_device;
 int ip_rt_init(void);
 void rt_cache_flush(struct net *net);
 void rt_flush_dev(struct net_device *dev);
-struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp,
-                                         int mp_hash);
+struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *flp,
+                                         const struct sk_buff *skb);
 
 static inline struct rtable *__ip_route_output_key(struct net *net,
                                                   struct flowi4 *flp)
 {
-       return __ip_route_output_key_hash(net, flp, -1);
+       return __ip_route_output_key_hash(net, flp, NULL);
 }
 
 struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
index 317026a39cfa2b49bf06182d89a11af0fa2688af..da449ddb8cc172bd9091c00057a69a095f98b56d 100644 (file)
@@ -57,7 +57,6 @@ static unsigned int fib_info_cnt;
 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-u32 fib_multipath_secret __read_mostly;
 
 #define for_nexthops(fi) {                                             \
        int nhsel; const struct fib_nh *nh;                             \
@@ -576,9 +575,6 @@ static void fib_rebalance(struct fib_info *fi)
 
                atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
        } endfor_nexthops(fi);
-
-       net_get_random_once(&fib_multipath_secret,
-                           sizeof(fib_multipath_secret));
 }
 
 static inline void fib_add_weight(struct fib_info *fi,
@@ -1641,7 +1637,7 @@ void fib_select_multipath(struct fib_result *res, int hash)
 #endif
 
 void fib_select_path(struct net *net, struct fib_result *res,
-                    struct flowi4 *fl4, int mp_hash)
+                    struct flowi4 *fl4, const struct sk_buff *skb)
 {
        bool oif_check;
 
@@ -1650,10 +1646,9 @@ void fib_select_path(struct net *net, struct fib_result *res,
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res->fi->fib_nhs > 1 && oif_check) {
-               if (mp_hash < 0)
-                       mp_hash = get_hash_from_flowi4(fl4) >> 1;
+               int h = fib_multipath_hash(res->fi, fl4, skb);
 
-               fib_select_multipath(res, mp_hash);
+               fib_select_multipath(res, h);
        }
        else
 #endif
index fc310db2708bf6c9e96befe413e89ac931818f74..43318b5f56474bc15253e74e156962dd2c8df01f 100644 (file)
@@ -464,22 +464,6 @@ out_bh_enable:
        local_bh_enable();
 }
 
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
-/* Source and destination is swapped. See ip_multipath_icmp_hash */
-static int icmp_multipath_hash_skb(const struct sk_buff *skb)
-{
-       const struct iphdr *iph = ip_hdr(skb);
-
-       return fib_multipath_hash(iph->daddr, iph->saddr);
-}
-
-#else
-
-#define icmp_multipath_hash_skb(skb) (-1)
-
-#endif
-
 static struct rtable *icmp_route_lookup(struct net *net,
                                        struct flowi4 *fl4,
                                        struct sk_buff *skb_in,
@@ -505,8 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
        fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
 
        security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
-       rt = __ip_route_output_key_hash(net, fl4,
-                                       icmp_multipath_hash_skb(skb_in));
+       rt = __ip_route_output_key_hash(net, fl4, skb_in);
        if (IS_ERR(rt))
                return rt;
 
index 8471dd116771462d149e1da2807e446b69b74bcc..5dda1ef81c7e10bbc41e610aacf6bad15ba05b45 100644 (file)
@@ -1734,45 +1734,97 @@ out:
 }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-
 /* To make ICMP packets follow the right flow, the multipath hash is
- * calculated from the inner IP addresses in reverse order.
+ * calculated from the inner IP addresses.
  */
-static int ip_multipath_icmp_hash(struct sk_buff *skb)
+static void ip_multipath_l3_keys(const struct sk_buff *skb,
+                                struct flow_keys *hash_keys)
 {
        const struct iphdr *outer_iph = ip_hdr(skb);
-       struct icmphdr _icmph;
+       const struct iphdr *inner_iph;
        const struct icmphdr *icmph;
        struct iphdr _inner_iph;
-       const struct iphdr *inner_iph;
+       struct icmphdr _icmph;
+
+       hash_keys->addrs.v4addrs.src = outer_iph->saddr;
+       hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
+       if (likely(outer_iph->protocol != IPPROTO_ICMP))
+               return;
 
        if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
-               goto standard_hash;
+               return;
 
        icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
                                   &_icmph);
        if (!icmph)
-               goto standard_hash;
+               return;
 
        if (icmph->type != ICMP_DEST_UNREACH &&
            icmph->type != ICMP_REDIRECT &&
            icmph->type != ICMP_TIME_EXCEEDED &&
-           icmph->type != ICMP_PARAMETERPROB) {
-               goto standard_hash;
-       }
+           icmph->type != ICMP_PARAMETERPROB)
+               return;
 
        inner_iph = skb_header_pointer(skb,
                                       outer_iph->ihl * 4 + sizeof(_icmph),
                                       sizeof(_inner_iph), &_inner_iph);
        if (!inner_iph)
-               goto standard_hash;
+               return;
+       hash_keys->addrs.v4addrs.src = inner_iph->saddr;
+       hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
+}
 
-       return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
+/* if skb is set it will be used and fl4 can be NULL */
+int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
+                      const struct sk_buff *skb)
+{
+       struct net *net = fi->fib_net;
+       struct flow_keys hash_keys;
+       u32 mhash;
 
-standard_hash:
-       return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
-}
+       switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
+       case 0:
+               memset(&hash_keys, 0, sizeof(hash_keys));
+               hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+               if (skb) {
+                       ip_multipath_l3_keys(skb, &hash_keys);
+               } else {
+                       hash_keys.addrs.v4addrs.src = fl4->saddr;
+                       hash_keys.addrs.v4addrs.dst = fl4->daddr;
+               }
+               break;
+       case 1:
+               /* skb is currently provided only when forwarding */
+               if (skb) {
+                       unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
+                       struct flow_keys keys;
+
+                       /* short-circuit if we already have L4 hash present */
+                       if (skb->l4_hash)
+                               return skb_get_hash_raw(skb) >> 1;
+                       memset(&hash_keys, 0, sizeof(hash_keys));
+                       skb_flow_dissect_flow_keys(skb, &keys, flag);
+                       hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+                       hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+                       hash_keys.ports.src = keys.ports.src;
+                       hash_keys.ports.dst = keys.ports.dst;
+                       hash_keys.basic.ip_proto = keys.basic.ip_proto;
+               } else {
+                       memset(&hash_keys, 0, sizeof(hash_keys));
+                       hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+                       hash_keys.addrs.v4addrs.src = fl4->saddr;
+                       hash_keys.addrs.v4addrs.dst = fl4->daddr;
+                       hash_keys.ports.src = fl4->fl4_sport;
+                       hash_keys.ports.dst = fl4->fl4_dport;
+                       hash_keys.basic.ip_proto = fl4->flowi4_proto;
+               }
+               break;
+       }
+       mhash = flow_hash_from_keys(&hash_keys);
 
+       return mhash >> 1;
+}
+EXPORT_SYMBOL_GPL(fib_multipath_hash);
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
 
 static int ip_mkroute_input(struct sk_buff *skb,
@@ -1782,12 +1834,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res->fi && res->fi->fib_nhs > 1) {
-               int h;
+               int h = fib_multipath_hash(res->fi, NULL, skb);
 
-               if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
-                       h = ip_multipath_icmp_hash(skb);
-               else
-                       h = fib_multipath_hash(saddr, daddr);
                fib_select_multipath(res, h);
        }
 #endif
@@ -2203,7 +2251,7 @@ add:
  */
 
 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
-                                         int mp_hash)
+                                         const struct sk_buff *skb)
 {
        struct net_device *dev_out = NULL;
        __u8 tos = RT_FL_TOS(fl4);
@@ -2365,7 +2413,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
                goto make_route;
        }
 
-       fib_select_path(net, &res, fl4, mp_hash);
+       fib_select_path(net, &res, fl4, skb);
 
        dev_out = FIB_RES_DEV(res);
        fl4->flowi4_oif = dev_out->ifindex;
index 11aaef0939b29f164336ac17c930daccb0d5404d..711c3e2e17b1a46925deb99a2e22916610388033 100644 (file)
@@ -997,6 +997,15 @@ static struct ctl_table ipv4_net_table[] = {
                .extra1         = &zero,
                .extra2         = &one,
        },
+       {
+               .procname       = "fib_multipath_hash_policy",
+               .data           = &init_net.ipv4.sysctl_fib_multipath_hash_policy,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
 #endif
        {
                .procname       = "ip_unprivileged_port_start",