Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next

author David S. Miller <davem@davemloft.net>

Tue, 15 Mar 2016 02:10:25 +0000 (22:10 -0400)

committer David S. Miller <davem@davemloft.net>

Tue, 15 Mar 2016 02:10:25 +0000 (22:10 -0400)
author David S. Miller <davem@davemloft.net>
Tue, 15 Mar 2016 02:10:25 +0000 (22:10 -0400)
committer David S. Miller <davem@davemloft.net>
Tue, 15 Mar 2016 02:10:25 +0000 (22:10 -0400)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h

index 0816c872b68911077bd51bc41dee0e32e1c3213d..a6cc576fd467f879054c344c24dede5010c72992 100644 (file)
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1588,6 +1588,23 @@ static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
  }
  #endif /* CONFIG_IP_VS_NFCT */
  
+/* Really using conntrack? */
+static inline bool ip_vs_conn_uses_conntrack(struct ip_vs_conn *cp,
+                                            struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_VS_NFCT
+       enum ip_conntrack_info ctinfo;
+       struct nf_conn *ct;
+
+       if (!(cp->flags & IP_VS_CONN_F_NFCT))
+               return false;
+       ct = nf_ct_get(skb, &ctinfo);
+       if (ct && !nf_ct_is_untracked(ct))
+               return true;
+#endif
+       return false;
+}
+
  static inline int
  ip_vs_dest_conn_overhead(struct ip_vs_dest *dest)
  {
diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h

index 319f47128db8c117563efa26d4cd54e1c3ac2cc2..6d074d14ee274309f9e35b967e1728f26506fd6e 100644 (file)
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -20,9 +20,15 @@ enum ip_conntrack_info {
  
         IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY,
         IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY,
-       IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY,   
-       /* Number of distinct IP_CT types (no NEW in reply dirn). */
-       IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1
+       /* No NEW in reply direction. */
+
+       /* Number of distinct IP_CT types. */
+       IP_CT_NUMBER,
+
+       /* only for userspace compatibility */
+#ifndef __KERNEL__
+       IP_CT_NEW_REPLY = IP_CT_NUMBER,
+#endif
  };
  
  #define NF_CT_STATE_INVALID_BIT                        (1 << 0)
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h

index a27222d5b413a85d4c773b5b43563bf9b57831d2..616d04761730d1613a9b39977e277a6d26666e87 100644 (file)
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -454,6 +454,14 @@ struct ovs_key_ct_labels {
  #define OVS_CS_F_REPLY_DIR         0x08 /* Flow is in the reply direction. */
  #define OVS_CS_F_INVALID           0x10 /* Could not track connection. */
  #define OVS_CS_F_TRACKED           0x20 /* Conntrack has occurred. */
+#define OVS_CS_F_SRC_NAT           0x40 /* Packet's source address/port was
+                                        * mangled by NAT.
+                                        */
+#define OVS_CS_F_DST_NAT           0x80 /* Packet's destination address/port
+                                        * was mangled by NAT.
+                                        */
+
+#define OVS_CS_F_NAT_MASK (OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
  
  /**
   * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
@@ -632,6 +640,8 @@ struct ovs_action_hash {
   * mask. For each bit set in the mask, the corresponding bit in the value is
   * copied to the connection tracking label field in the connection.
   * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG.
+ * @OVS_CT_ATTR_NAT: Nested OVS_NAT_ATTR_* for performing L3 network address
+ * translation (NAT) on the packet.
   */
  enum ovs_ct_attr {
         OVS_CT_ATTR_UNSPEC,
@@ -641,11 +651,50 @@ enum ovs_ct_attr {
         OVS_CT_ATTR_LABELS,     /* labels to associate with this connection. */
         OVS_CT_ATTR_HELPER,     /* netlink helper to assist detection of
                                    related connections. */
+       OVS_CT_ATTR_NAT,        /* Nested OVS_NAT_ATTR_* */
         __OVS_CT_ATTR_MAX
  };
  
  #define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1)
  
+/**
+ * enum ovs_nat_attr - Attributes for %OVS_CT_ATTR_NAT.
+ *
+ * @OVS_NAT_ATTR_SRC: Flag for Source NAT (mangle source address/port).
+ * @OVS_NAT_ATTR_DST: Flag for Destination NAT (mangle destination
+ * address/port).  Only one of (@OVS_NAT_ATTR_SRC, @OVS_NAT_ATTR_DST) may be
+ * specified.  Effective only for packets for ct_state NEW connections.
+ * Packets of committed connections are mangled by the NAT action according to
+ * the committed NAT type regardless of the flags specified.  As a corollary, a
+ * NAT action without a NAT type flag will only mangle packets of committed
+ * connections.  The following NAT attributes only apply for NEW
+ * (non-committed) connections, and they may be included only when the CT
+ * action has the @OVS_CT_ATTR_COMMIT flag and either @OVS_NAT_ATTR_SRC or
+ * @OVS_NAT_ATTR_DST is also included.
+ * @OVS_NAT_ATTR_IP_MIN: struct in_addr or struct in6_addr
+ * @OVS_NAT_ATTR_IP_MAX: struct in_addr or struct in6_addr
+ * @OVS_NAT_ATTR_PROTO_MIN: u16 L4 protocol specific lower boundary (port)
+ * @OVS_NAT_ATTR_PROTO_MAX: u16 L4 protocol specific upper boundary (port)
+ * @OVS_NAT_ATTR_PERSISTENT: Flag for persistent IP mapping across reboots
+ * @OVS_NAT_ATTR_PROTO_HASH: Flag for pseudo random L4 port mapping (MD5)
+ * @OVS_NAT_ATTR_PROTO_RANDOM: Flag for fully randomized L4 port mapping
+ */
+enum ovs_nat_attr {
+       OVS_NAT_ATTR_UNSPEC,
+       OVS_NAT_ATTR_SRC,
+       OVS_NAT_ATTR_DST,
+       OVS_NAT_ATTR_IP_MIN,
+       OVS_NAT_ATTR_IP_MAX,
+       OVS_NAT_ATTR_PROTO_MIN,
+       OVS_NAT_ATTR_PROTO_MAX,
+       OVS_NAT_ATTR_PERSISTENT,
+       OVS_NAT_ATTR_PROTO_HASH,
+       OVS_NAT_ATTR_PROTO_RANDOM,
+       __OVS_NAT_ATTR_MAX,
+};
+
+#define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1)
+
  /**
   * enum ovs_action_attr - Action types.
   *
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c

index 61c7cc22ea684ffe289f3eb8373584dee94c182c..f8aad03d674b05008edb5b9883b3a26b2fa7461f 100644 (file)
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -127,29 +127,15 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
                                     u8 proto, void *data, __sum16 *check,
                                     int datalen, int oldlen)
  {
-       const struct iphdr *iph = ip_hdr(skb);
-       struct rtable *rt = skb_rtable(skb);
-
         if (skb->ip_summed != CHECKSUM_PARTIAL) {
-               if (!(rt->rt_flags & RTCF_LOCAL) &&
-                   (!skb->dev || skb->dev->features &
-                    (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) {
-                       skb->ip_summed = CHECKSUM_PARTIAL;
-                       skb->csum_start = skb_headroom(skb) +
-                                         skb_network_offset(skb) +
-                                         ip_hdrlen(skb);
-                       skb->csum_offset = (void *)check - data;
-                       *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
-                                                   datalen, proto, 0);
-               } else {
-                       *check = 0;
-                       *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
-                                                  datalen, proto,
-                                                  csum_partial(data, datalen,
-                                                               0));
-                       if (proto == IPPROTO_UDP && !*check)
-                               *check = CSUM_MANGLED_0;
-               }
+               const struct iphdr *iph = ip_hdr(skb);
+
+               skb->ip_summed = CHECKSUM_PARTIAL;
+               skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
+                       ip_hdrlen(skb);
+               skb->csum_offset = (void *)check - data;
+               *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen,
+                                           proto, 0);
         } else
                 inet_proto_csum_replace2(check, skb,
                                          htons(oldlen), htons(datalen), true);
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c

index 6ce3099288416b3753d6cab8ee70f1a0b97ab94f..e0be97e636a48f54c1488ca70ae97a9a13e8be61 100644 (file)
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -131,29 +131,15 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
                                     u8 proto, void *data, __sum16 *check,
                                     int datalen, int oldlen)
  {
-       const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
-       struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
-
         if (skb->ip_summed != CHECKSUM_PARTIAL) {
-               if (!(rt->rt6i_flags & RTF_LOCAL) &&
-                   (!skb->dev || skb->dev->features &
-                    (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))) {
-                       skb->ip_summed = CHECKSUM_PARTIAL;
-                       skb->csum_start = skb_headroom(skb) +
-                                         skb_network_offset(skb) +
-                                         (data - (void *)skb->data);
-                       skb->csum_offset = (void *)check - data;
-                       *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
-                                                 datalen, proto, 0);
-               } else {
-                       *check = 0;
-                       *check = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
-                                                datalen, proto,
-                                                csum_partial(data, datalen,
-                                                             0));
-                       if (proto == IPPROTO_UDP && !*check)
-                               *check = CSUM_MANGLED_0;
-               }
+               const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+
+               skb->ip_summed = CHECKSUM_PARTIAL;
+               skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
+                       (data - (void *)skb->data);
+               skb->csum_offset = (void *)check - data;
+               *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+                                         datalen, proto, 0);
         } else
                 inet_proto_csum_replace2(check, skb,
                                          htons(oldlen), htons(datalen), true);
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c

index 29dde208381dac6cdfca67094dc4fc07bc85b40c..9a065f672d3a56f33a22e38937780ab59d318c13 100644 (file)
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -267,6 +267,8 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
  
         e.id = ip_to_id(map, ip);
         if (tb[IPSET_ATTR_ETHER]) {
+               if (nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN)
+                       return -IPSET_ERR_PROTOCOL;
                 memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
                 e.add_mac = 1;
         }
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c

index 95db43fc0303a136c0829ce185228d064f4f4775..7e6568cad4941b904f1344267feaab31198a2630 100644 (file)
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -985,6 +985,9 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
         if (unlikely(protocol_failed(attr)))
                 return -IPSET_ERR_PROTOCOL;
  
+       /* Must wait for flush to be really finished in list:set */
+       rcu_barrier();
+
         /* Commands are serialized and references are
          * protected by the ip_set_ref_lock.
          * External systems (i.e. xt_set) must call
diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c

index f1e7d2c0f68531421189b38d251148bdc4d12c95..8f004edad396717e5855dafbb456412b54e9fc51 100644 (file)
--- a/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/net/netfilter/ipset/ip_set_hash_mac.c
@@ -110,7 +110,8 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[],
         if (tb[IPSET_ATTR_LINENO])
                 *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
  
-       if (unlikely(!tb[IPSET_ATTR_ETHER]))
+       if (unlikely(!tb[IPSET_ATTR_ETHER] ||
+                    nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN))
                 return -IPSET_ERR_PROTOCOL;
  
         ret = ip_set_get_extensions(set, tb, &ext);
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c

index bbede95c9f68cee1a1e0b5c285231c78d987df63..24c6c1962aea19e356c21273c5dd8bf8745554e4 100644 (file)
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -30,6 +30,7 @@ MODULE_ALIAS("ip_set_list:set");
  struct set_elem {
         struct rcu_head rcu;
         struct list_head list;
+       struct ip_set *set;     /* Sigh, in order to cleanup reference */
         ip_set_id_t id;
  } __aligned(__alignof__(u64));
  
@@ -151,30 +152,29 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
  /* Userspace interfaces: we are protected by the nfnl mutex */
  
  static void
-__list_set_del(struct ip_set *set, struct set_elem *e)
+__list_set_del_rcu(struct rcu_head * rcu)
  {
+       struct set_elem *e = container_of(rcu, struct set_elem, rcu);
+       struct ip_set *set = e->set;
         struct list_set *map = set->data;
  
         ip_set_put_byindex(map->net, e->id);
-       /* We may call it, because we don't have a to be destroyed
-        * extension which is used by the kernel.
-        */
         ip_set_ext_destroy(set, e);
-       kfree_rcu(e, rcu);
+       kfree(e);
  }
  
  static inline void
  list_set_del(struct ip_set *set, struct set_elem *e)
  {
         list_del_rcu(&e->list);
-       __list_set_del(set, e);
+       call_rcu(&e->rcu, __list_set_del_rcu);
  }
  
  static inline void
-list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old)
+list_set_replace(struct set_elem *e, struct set_elem *old)
  {
         list_replace_rcu(&old->list, &e->list);
-       __list_set_del(set, old);
+       call_rcu(&old->rcu, __list_set_del_rcu);
  }
  
  static void
@@ -244,9 +244,6 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
         struct set_elem *e, *n, *prev, *next;
         bool flag_exist = flags & IPSET_FLAG_EXIST;
  
-       if (SET_WITH_TIMEOUT(set))
-               set_cleanup_entries(set);
-
         /* Find where to add the new entry */
         n = prev = next = NULL;
         list_for_each_entry(e, &map->members, list) {
@@ -301,10 +298,11 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
         if (!e)
                 return -ENOMEM;
         e->id = d->id;
+       e->set = set;
         INIT_LIST_HEAD(&e->list);
         list_set_init_extensions(set, ext, e);
         if (n)
-               list_set_replace(set, e, n);
+               list_set_replace(e, n);
         else if (next)
                 list_add_tail_rcu(&e->list, &next->list);
         else if (prev)
@@ -431,6 +429,7 @@ list_set_destroy(struct ip_set *set)
  
         if (SET_WITH_TIMEOUT(set))
                 del_timer_sync(&map->gc);
+
         list_for_each_entry_safe(e, n, &map->members, list) {
                 list_del(&e->list);
                 ip_set_put_byindex(map->net, e->id);
@@ -450,8 +449,10 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
         struct set_elem *e;
         u32 n = 0;
  
-       list_for_each_entry(e, &map->members, list)
+       rcu_read_lock();
+       list_for_each_entry_rcu(e, &map->members, list)
                 n++;
+       rcu_read_unlock();
  
         nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
         if (!nested)
@@ -483,33 +484,25 @@ list_set_list(const struct ip_set *set,
         atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
         if (!atd)
                 return -EMSGSIZE;
-       list_for_each_entry(e, &map->members, list) {
-               if (i == first)
-                       break;
-               i++;
-       }
  
         rcu_read_lock();
-       list_for_each_entry_from(e, &map->members, list) {
-               i++;
-               if (SET_WITH_TIMEOUT(set) &&
-                   ip_set_timeout_expired(ext_timeout(e, set)))
+       list_for_each_entry_rcu(e, &map->members, list) {
+               if (i < first ||
+                   (SET_WITH_TIMEOUT(set) &&
+                    ip_set_timeout_expired(ext_timeout(e, set)))) {
+                       i++;
                         continue;
+               }
                 nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
-               if (!nested) {
-                       if (i == first) {
-                               nla_nest_cancel(skb, atd);
-                               ret = -EMSGSIZE;
-                               goto out;
-                       }
+               if (!nested)
                         goto nla_put_failure;
-               }
                 if (nla_put_string(skb, IPSET_ATTR_NAME,
                                    ip_set_name_byindex(map->net, e->id)))
                         goto nla_put_failure;
                 if (ip_set_put_extensions(skb, set, e, true))
                         goto nla_put_failure;
                 ipset_nest_end(skb, nested);
+               i++;
         }
  
         ipset_nest_end(skb, atd);
@@ -520,10 +513,12 @@ list_set_list(const struct ip_set *set,
  nla_put_failure:
         nla_nest_cancel(skb, nested);
         if (unlikely(i == first)) {
+               nla_nest_cancel(skb, atd);
                 cb->args[IPSET_CB_ARG0] = 0;
                 ret = -EMSGSIZE;
+       } else {
+               cb->args[IPSET_CB_ARG0] = i;
         }
-       cb->args[IPSET_CB_ARG0] = i - 1;
         ipset_nest_end(skb, atd);
  out:
         rcu_read_unlock();
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c

index f57b4dcdb2330e40110ce18ede71b742249edf00..b9a4082afa3abb7f2fcbf931ac6594baf474c9f0 100644 (file)
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1089,6 +1089,7 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
         switch (cp->protocol) {
         case IPPROTO_TCP:
                 return (cp->state == IP_VS_TCP_S_TIME_WAIT) ||
+                      (cp->state == IP_VS_TCP_S_CLOSE) ||
                         ((conn_reuse_mode & 2) &&
                          (cp->state == IP_VS_TCP_S_FIN_WAIT) &&
                          (cp->flags & IP_VS_CONN_F_NOOUTPUT));
@@ -1757,15 +1758,34 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
         cp = pp->conn_in_get(ipvs, af, skb, &iph);
  
         conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
-       if (conn_reuse_mode && !iph.fragoffs &&
-           is_new_conn(skb, &iph) && cp &&
-           ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
-             unlikely(!atomic_read(&cp->dest->weight))) ||
-            unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) {
-               if (!atomic_read(&cp->n_control))
-                       ip_vs_conn_expire_now(cp);
-               __ip_vs_conn_put(cp);
-               cp = NULL;
+       if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) {
+               bool uses_ct = false, resched = false;
+
+               if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
+                   unlikely(!atomic_read(&cp->dest->weight))) {
+                       resched = true;
+                       uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
+               } else if (is_new_conn_expected(cp, conn_reuse_mode)) {
+                       uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
+                       if (!atomic_read(&cp->n_control)) {
+                               resched = true;
+                       } else {
+                               /* Do not reschedule controlling connection
+                                * that uses conntrack while it is still
+                                * referenced by controlled connection(s).
+                                */
+                               resched = !uses_ct;
+                       }
+               }
+
+               if (resched) {
+                       if (!atomic_read(&cp->n_control))
+                               ip_vs_conn_expire_now(cp);
+                       __ip_vs_conn_put(cp);
+                       if (uses_ct)
+                               return NF_DROP;
+                       cp = NULL;
+               }
         }
  
         if (unlikely(!cp)) {
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c

index 1b8d594e493a32f0ea461ade2d2a85387054a78d..0a6eb5c0d9e9c0c067ef23b57684506831932e89 100644 (file)
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -70,10 +70,10 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
         const char *dptr;
         int retc;
  
-       ip_vs_fill_iph_skb(p->af, skb, false, &iph);
+       retc = ip_vs_fill_iph_skb(p->af, skb, false, &iph);
  
         /* Only useful with UDP */
-       if (iph.protocol != IPPROTO_UDP)
+       if (!retc || iph.protocol != IPPROTO_UDP)
                 return -EINVAL;
         /* todo: IPv6 fragments:
          *       I think this only should be done for the first fragment. /HS
@@ -88,7 +88,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
         dptr = skb->data + dataoff;
         datalen = skb->len - dataoff;
  
-       if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen))
+       if (get_callid(dptr, 0, datalen, &matchoff, &matchlen))
                 return -EINVAL;
  
         /* N.B: pe_data is only set on success,
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c

index f60b4fdeeb8cc4fc600506ecc58e2bdc4cad1654..afde5f5e728a320773be246ecbcfcca7f5a16617 100644 (file)
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -74,8 +74,7 @@ void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
         spin_lock(lock);
         while (unlikely(nf_conntrack_locks_all)) {
                 spin_unlock(lock);
-               spin_lock(&nf_conntrack_locks_all_lock);
-               spin_unlock(&nf_conntrack_locks_all_lock);
+               spin_unlock_wait(&nf_conntrack_locks_all_lock);
                 spin_lock(lock);
         }
  }
@@ -121,8 +120,7 @@ static void nf_conntrack_all_lock(void)
         nf_conntrack_locks_all = true;
  
         for (i = 0; i < CONNTRACK_LOCKS; i++) {
-               spin_lock(&nf_conntrack_locks[i]);
-               spin_unlock(&nf_conntrack_locks[i]);
+               spin_unlock_wait(&nf_conntrack_locks[i]);
         }
  }
  
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c

index 5274b04c42a6cccfe10ce25026b5f45a1cf48510..4c2b4c0c4d5fa4ac209ab85020e97e14ed716ab9 100644 (file)
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -242,6 +242,9 @@ nfacct_filter_alloc(const struct nlattr * const attr)
         if (err < 0)
                 return ERR_PTR(err);
  
+       if (!tb[NFACCT_FILTER_MASK] || !tb[NFACCT_FILTER_VALUE])
+               return ERR_PTR(-EINVAL);
+
         filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL);
         if (!filter)
                 return ERR_PTR(-ENOMEM);
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c

index 454841baa4d07f3312a09dd397257ad66b41c412..6228c422c766e26dc4766e01635b538e4fd4d194 100644 (file)
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -660,6 +660,9 @@ nft_match_select_ops(const struct nft_ctx *ctx,
         if (IS_ERR(match))
                 return ERR_PTR(-ENOENT);
  
+       if (match->matchsize > nla_len(tb[NFTA_MATCH_INFO]))
+               return ERR_PTR(-EINVAL);
+
         /* This is the first time we use this match, allocate operations */
         nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
         if (nft_match == NULL)
@@ -740,6 +743,9 @@ nft_target_select_ops(const struct nft_ctx *ctx,
         if (IS_ERR(target))
                 return ERR_PTR(-ENOENT);
  
+       if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO]))
+               return ERR_PTR(-EINVAL);
+
         /* This is the first time we use this target, allocate operations */
         nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
         if (nft_target == NULL)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c

index d0cd2b9bf84463af1028d00ce87d2f5aab1dc736..582c9cfd6567ce4c7d5b3f86c15732b33a63e1b4 100644 (file)
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -659,6 +659,9 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
         struct xt_table_info *info = NULL;
         size_t sz = sizeof(*info) + size;
  
+       if (sz < sizeof(*info))
+               return NULL;
+
         /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
         if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
                 return NULL;
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig

index cd5fd9d728a7cd00e676dda2a90a2d5403b0dfeb..234a73344c6e2043bd7042247d70fbab2c5ae782 100644 (file)
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -6,7 +6,8 @@ config OPENVSWITCH
         tristate "Open vSwitch"
         depends on INET
         depends on !NF_CONNTRACK || \
-                  (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6))
+                  (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
+                                    (!NF_NAT || NF_NAT)))
         select LIBCRC32C
         select MPLS
         select NET_MPLS_GSO
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c

index ee6ff8ffc12d924655f111307233d7a8bb88e638..dc5eb29fe7d6754acebffb7ecf2867bf5716c134 100644 (file)
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -13,21 +13,31 @@
  
  #include <linux/module.h>
  #include <linux/openvswitch.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
  #include <net/ip.h>
  #include <net/netfilter/nf_conntrack_core.h>
  #include <net/netfilter/nf_conntrack_helper.h>
  #include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
  #include <net/netfilter/nf_conntrack_zones.h>
  #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
  
+#ifdef CONFIG_NF_NAT_NEEDED
+#include <linux/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#endif
+
  #include "datapath.h"
  #include "conntrack.h"
  #include "flow.h"
  #include "flow_netlink.h"
  
  struct ovs_ct_len_tbl {
-       size_t maxlen;
-       size_t minlen;
+       int maxlen;
+       int minlen;
  };
  
  /* Metadata mark for masked write to conntrack mark */
@@ -42,15 +52,25 @@ struct md_labels {
         struct ovs_key_ct_labels mask;
  };
  
+enum ovs_ct_nat {
+       OVS_CT_NAT = 1 << 0,     /* NAT for committed connections only. */
+       OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */
+       OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */
+};
+
  /* Conntrack action context for execution. */
  struct ovs_conntrack_info {
         struct nf_conntrack_helper *helper;
         struct nf_conntrack_zone zone;
         struct nf_conn *ct;
         u8 commit : 1;
+       u8 nat : 3;                 /* enum ovs_ct_nat */
         u16 family;
         struct md_mark mark;
         struct md_labels labels;
+#ifdef CONFIG_NF_NAT_NEEDED
+       struct nf_nat_range range;  /* Only present for SRC NAT and DST NAT. */
+#endif
  };
  
  static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
@@ -75,7 +95,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
         switch (ctinfo) {
         case IP_CT_ESTABLISHED_REPLY:
         case IP_CT_RELATED_REPLY:
-       case IP_CT_NEW_REPLY:
                 ct_state |= OVS_CS_F_REPLY_DIR;
                 break;
         default:
@@ -92,7 +111,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
                 ct_state |= OVS_CS_F_RELATED;
                 break;
         case IP_CT_NEW:
-       case IP_CT_NEW_REPLY:
                 ct_state |= OVS_CS_F_NEW;
                 break;
         default:
@@ -139,12 +157,15 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
         ovs_ct_get_labels(ct, &key->ct.labels);
  }
  
-/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
- * previously sent the packet to conntrack via the ct action.
+/* Update 'key' based on skb->nfct.  If 'post_ct' is true, then OVS has
+ * previously sent the packet to conntrack via the ct action.  If
+ * 'keep_nat_flags' is true, the existing NAT flags are retained, else they are
+ * initialized from the connection status.
   */
  static void ovs_ct_update_key(const struct sk_buff *skb,
                               const struct ovs_conntrack_info *info,
-                             struct sw_flow_key *key, bool post_ct)
+                             struct sw_flow_key *key, bool post_ct,
+                             bool keep_nat_flags)
  {
         const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
         enum ip_conntrack_info ctinfo;
@@ -154,10 +175,22 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
         ct = nf_ct_get(skb, &ctinfo);
         if (ct) {
                 state = ovs_ct_get_state(ctinfo);
+               /* All unconfirmed entries are NEW connections. */
                 if (!nf_ct_is_confirmed(ct))
                         state |= OVS_CS_F_NEW;
+               /* OVS persists the related flag for the duration of the
+                * connection.
+                */
                 if (ct->master)
                         state |= OVS_CS_F_RELATED;
+               if (keep_nat_flags) {
+                       state |= key->ct.state & OVS_CS_F_NAT_MASK;
+               } else {
+                       if (ct->status & IPS_SRC_NAT)
+                               state |= OVS_CS_F_SRC_NAT;
+                       if (ct->status & IPS_DST_NAT)
+                               state |= OVS_CS_F_DST_NAT;
+               }
                 zone = nf_ct_zone(ct);
         } else if (post_ct) {
                 state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
@@ -167,9 +200,12 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
         __ovs_ct_update_key(key, state, zone, ct);
  }
  
+/* This is called to initialize CT key fields possibly coming in from the local
+ * stack.
+ */
  void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
  {
-       ovs_ct_update_key(skb, NULL, key, false);
+       ovs_ct_update_key(skb, NULL, key, false, false);
  }
  
  int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
@@ -201,7 +237,6 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
         struct nf_conn *ct;
         u32 new_mark;
  
-
         /* The connection could be invalid, in which case set_mark is no-op. */
         ct = nf_ct_get(skb, &ctinfo);
         if (!ct)
@@ -259,6 +294,7 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
         enum ip_conntrack_info ctinfo;
         unsigned int protoff;
         struct nf_conn *ct;
+       int err;
  
         ct = nf_ct_get(skb, &ctinfo);
         if (!ct || ctinfo == IP_CT_RELATED_REPLY)
@@ -295,7 +331,18 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
                 return NF_DROP;
         }
  
-       return helper->help(skb, protoff, ct, ctinfo);
+       err = helper->help(skb, protoff, ct, ctinfo);
+       if (err != NF_ACCEPT)
+               return err;
+
+       /* Adjust seqs after helper.  This is needed due to some helpers (e.g.,
+        * FTP with NAT) adjusting the TCP payload size when mangling IP
+        * addresses and/or port numbers in the text-based control connection.
+        */
+       if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+           !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
+               return NF_DROP;
+       return NF_ACCEPT;
  }
  
  /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
@@ -352,14 +399,101 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
         return __nf_ct_expect_find(net, zone, &tuple);
  }
  
+/* Derive ctinfo from 'h'; replicates unexported nf_conntrack_core.c logic. */
+static enum ip_conntrack_info
+ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
+{
+       const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+       if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
+               return IP_CT_ESTABLISHED_REPLY; /* Reply-direction tuple. */
+       /* Once we've had two way comms, always ESTABLISHED. */
+       if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+               return IP_CT_ESTABLISHED;
+       if (test_bit(IPS_EXPECTED_BIT, &ct->status))
+               return IP_CT_RELATED; /* Expected, no reply seen yet. */
+       return IP_CT_NEW;
+}
+
+/* Find an existing connection which this packet belongs to without
+ * re-attributing statistics or modifying the connection state.  This allows an
+ * skb->nfct lost due to an upcall to be recovered during actions execution.
+ *
+ * Must be called with rcu_read_lock.
+ *
+ * On success, populates skb->nfct and skb->nfctinfo, and returns the
+ * connection.  Returns NULL if no entry exists or no tuple can be extracted.
+ */
+static struct nf_conn *
+ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
+                    u8 l3num, struct sk_buff *skb)
+{
+       struct nf_conntrack_l3proto *l3proto;
+       struct nf_conntrack_l4proto *l4proto;
+       struct nf_conntrack_tuple tuple;
+       struct nf_conntrack_tuple_hash *h;
+       enum ip_conntrack_info ctinfo;
+       struct nf_conn *ct;
+       unsigned int dataoff;
+       u8 protonum;
+
+       l3proto = __nf_ct_l3proto_find(l3num);
+       if (!l3proto) {
+               pr_debug("ovs_ct_find_existing: Can't get l3proto\n");
+               return NULL;
+       }
+       if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
+                                &protonum) <= 0) {
+               pr_debug("ovs_ct_find_existing: Can't get protonum\n");
+               return NULL;
+       }
+       l4proto = __nf_ct_l4proto_find(l3num, protonum);
+       if (!l4proto) {
+               pr_debug("ovs_ct_find_existing: Can't get l4proto\n");
+               return NULL;
+       }
+       if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
+                            protonum, net, &tuple, l3proto, l4proto)) {
+               pr_debug("ovs_ct_find_existing: Can't get tuple\n");
+               return NULL;
+       }
+
+       /* Look up the extracted tuple in the given zone. */
+       h = nf_conntrack_find_get(net, zone, &tuple);
+       if (!h)
+               return NULL;   /* Not found. */
+
+       ct = nf_ct_tuplehash_to_ctrack(h);
+
+       ctinfo = ovs_ct_get_info(h);
+       if (ctinfo == IP_CT_NEW) {
+               /* This should not happen; warn once but still attach 'ct'. */
+               WARN_ONCE(1, "ovs_ct_find_existing: new packet for %p\n", ct);
+       }
+       skb->nfct = &ct->ct_general;
+       skb->nfctinfo = ctinfo;
+       return ct;
+}
+
  /* Determine whether skb->nfct is equal to the result of conntrack lookup. */
-static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
-                           const struct ovs_conntrack_info *info)
+static bool skb_nfct_cached(struct net *net,
+                           const struct sw_flow_key *key,
+                           const struct ovs_conntrack_info *info,
+                           struct sk_buff *skb)
  {
         enum ip_conntrack_info ctinfo;
         struct nf_conn *ct;
  
         ct = nf_ct_get(skb, &ctinfo);
+       /* If no ct, check if we have evidence that an existing conntrack entry
+        * might be found for this skb.  This happens when we lose a skb->nfct
+        * due to an upcall.  If the connection was not confirmed, it is not
+        * cached and needs to be run through conntrack again.
+        */
+       if (!ct && key->ct.state & OVS_CS_F_TRACKED &&
+           !(key->ct.state & OVS_CS_F_INVALID) &&
+           key->ct.zone == info->zone.id)
+               ct = ovs_ct_find_existing(net, &info->zone, info->family, skb);
         if (!ct)
                 return false;
         if (!net_eq(net, read_pnet(&ct->ct_net)))
@@ -377,6 +511,206 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
         return true;
  }
  
+#ifdef CONFIG_NF_NAT_NEEDED
+/* Modelled after nf_nat_ipv[46]_fn().  Runs with the skb data pointer pulled
+ * to the network header and pushes it back before returning.  'range' is only
+ * used for new, uninitialized NAT state.  Returns either NF_ACCEPT or NF_DROP.
+ */
+static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
+                             enum ip_conntrack_info ctinfo,
+                             const struct nf_nat_range *range,
+                             enum nf_nat_manip_type maniptype)
+{
+       int hooknum, nh_off, err = NF_ACCEPT;
+
+       nh_off = skb_network_offset(skb);
+       skb_pull(skb, nh_off);
+
+       /* See HOOK2MANIP(). */
+       if (maniptype == NF_NAT_MANIP_SRC)
+               hooknum = NF_INET_LOCAL_IN; /* Source NAT */
+       else
+               hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
+
+       switch (ctinfo) {
+       case IP_CT_RELATED:
+       case IP_CT_RELATED_REPLY:
+               if (skb->protocol == htons(ETH_P_IP) &&
+                   ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+                       if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+                                                          hooknum))
+                               err = NF_DROP;
+                       goto push; /* Skip nf_nat_packet(). */
+#if IS_ENABLED(CONFIG_NF_NAT_IPV6)
+               } else if (skb->protocol == htons(ETH_P_IPV6)) {
+                       __be16 frag_off;
+                       u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+                       int hdrlen = ipv6_skip_exthdr(skb,
+                                                     sizeof(struct ipv6hdr),
+                                                     &nexthdr, &frag_off);
+
+                       if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+                               if (!nf_nat_icmpv6_reply_translation(skb, ct,
+                                                                    ctinfo,
+                                                                    hooknum,
+                                                                    hdrlen))
+                                       err = NF_DROP;
+                               goto push; /* Skip nf_nat_packet(). */
+                       }
+#endif
+               }
+               /* Non-ICMP, fall through to initialize if needed. */
+       case IP_CT_NEW:
+               /* Seen it before?  This can happen for loopback, retrans,
+                * or local packets.
+                */
+               if (!nf_nat_initialized(ct, maniptype)) {
+                       /* Initialize according to the NAT action. */
+                       err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
+                               /* Action is set up to establish a new
+                                * mapping.
+                                */
+                               ? nf_nat_setup_info(ct, range, maniptype)
+                               : nf_nat_alloc_null_binding(ct, hooknum);
+                       if (err != NF_ACCEPT)
+                               goto push;
+               }
+               break;
+
+       case IP_CT_ESTABLISHED:
+       case IP_CT_ESTABLISHED_REPLY:
+               break;
+
+       default:
+               err = NF_DROP;
+               goto push;
+       }
+
+       err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+push:
+       skb_push(skb, nh_off); /* Undo the skb_pull() done on entry. */
+
+       return err;
+}
+
+static void ovs_nat_update_key(struct sw_flow_key *key,
+                              const struct sk_buff *skb,
+                              enum nf_nat_manip_type maniptype)
+{
+       if (maniptype == NF_NAT_MANIP_SRC) { /* Re-read source addr/port. */
+               __be16 src;
+
+               key->ct.state |= OVS_CS_F_SRC_NAT; /* Mark source NAT done. */
+               if (key->eth.type == htons(ETH_P_IP))
+                       key->ipv4.addr.src = ip_hdr(skb)->saddr;
+               else if (key->eth.type == htons(ETH_P_IPV6))
+                       memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr,
+                              sizeof(key->ipv6.addr.src));
+               else
+                       return; /* Neither IPv4 nor IPv6: only the flag. */
+
+               if (key->ip.proto == IPPROTO_UDP)
+                       src = udp_hdr(skb)->source;
+               else if (key->ip.proto == IPPROTO_TCP)
+                       src = tcp_hdr(skb)->source;
+               else if (key->ip.proto == IPPROTO_SCTP)
+                       src = sctp_hdr(skb)->source;
+               else
+                       return; /* Only UDP/TCP/SCTP ports are copied. */
+
+               key->tp.src = src;
+       } else { /* Re-read destination addr/port. */
+               __be16 dst;
+
+               key->ct.state |= OVS_CS_F_DST_NAT; /* Mark dest. NAT done. */
+               if (key->eth.type == htons(ETH_P_IP))
+                       key->ipv4.addr.dst = ip_hdr(skb)->daddr;
+               else if (key->eth.type == htons(ETH_P_IPV6))
+                       memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr,
+                              sizeof(key->ipv6.addr.dst));
+               else
+                       return; /* Neither IPv4 nor IPv6: only the flag. */
+
+               if (key->ip.proto == IPPROTO_UDP)
+                       dst = udp_hdr(skb)->dest;
+               else if (key->ip.proto == IPPROTO_TCP)
+                       dst = tcp_hdr(skb)->dest;
+               else if (key->ip.proto == IPPROTO_SCTP)
+                       dst = sctp_hdr(skb)->dest;
+               else
+                       return; /* Only UDP/TCP/SCTP ports are copied. */
+
+               key->tp.dst = dst;
+       }
+}
+
+/* Apply NAT to 'skb'.  Returns NF_DROP to drop the packet, else NF_ACCEPT. */
+static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
+                     const struct ovs_conntrack_info *info,
+                     struct sk_buff *skb, struct nf_conn *ct,
+                     enum ip_conntrack_info ctinfo)
+{
+       enum nf_nat_manip_type maniptype;
+       int err;
+
+       if (nf_ct_is_untracked(ct)) {
+               /* A NAT action may only be performed on tracked packets. */
+               return NF_ACCEPT;
+       }
+
+       /* Add NAT extension if not confirmed yet. */
+       if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
+               return NF_ACCEPT;   /* Can't NAT. */
+
+       /* Determine NAT type: deduce it from the tracked connection when it
+        * is already NATted.  Packets of a new expected connection (ctinfo
+        * IP_CT_RELATED) are NATted only when committing.
+        */
+       if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW &&
+           ct->status & IPS_NAT_MASK &&
+           (ctinfo != IP_CT_RELATED || info->commit)) {
+               /* NAT an established or related connection like before. */
+               if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
+                       /* This is the REPLY direction for a connection
+                        * for which NAT was applied in the forward
+                        * direction.  Do the reverse NAT.
+                        */
+                       maniptype = ct->status & IPS_SRC_NAT
+                               ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
+               else
+                       maniptype = ct->status & IPS_SRC_NAT
+                               ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
+       } else if (info->nat & OVS_CT_SRC_NAT) {
+               maniptype = NF_NAT_MANIP_SRC;
+       } else if (info->nat & OVS_CT_DST_NAT) {
+               maniptype = NF_NAT_MANIP_DST;
+       } else {
+               return NF_ACCEPT; /* Connection is not NATed. */
+       }
+       err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
+
+       /* Mark NAT done if successful and update the flow key. */
+       if (err == NF_ACCEPT)
+               ovs_nat_update_key(key, skb, maniptype);
+
+       return err;
+}
+#else /* !CONFIG_NF_NAT_NEEDED */
+static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
+                     const struct ovs_conntrack_info *info,
+                     struct sk_buff *skb, struct nf_conn *ct,
+                     enum ip_conntrack_info ctinfo)
+{
+       return NF_ACCEPT; /* Stub without CONFIG_NF_NAT_NEEDED: no NAT. */
+}
+#endif
+
+/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
+ * not done already.  Update key with new CT state after passing the packet
+ * through conntrack.
+ * Note that if the packet is deemed invalid by conntrack, skb->nfct will be
+ * set to NULL and 0 will be returned.
+ */
  static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
                            const struct ovs_conntrack_info *info,
                            struct sk_buff *skb)
@@ -386,8 +720,13 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
          * actually run the packet through conntrack twice unless it's for a
          * different zone.
          */
-       if (!skb_nfct_cached(net, skb, info)) {
+       bool cached = skb_nfct_cached(net, key, info, skb);
+       enum ip_conntrack_info ctinfo;
+       struct nf_conn *ct;
+
+       if (!cached) {
                 struct nf_conn *tmpl = info->ct;
+               int err;
  
                 /* Associate skb with specified zone. */
                 if (tmpl) {
@@ -398,17 +737,53 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
                         skb->nfctinfo = IP_CT_NEW;
                 }
  
-               if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING,
-                                   skb) != NF_ACCEPT)
+               /* Repeat if requested, see nf_iterate(). */
+               do {
+                       err = nf_conntrack_in(net, info->family,
+                                             NF_INET_PRE_ROUTING, skb);
+               } while (err == NF_REPEAT);
+
+               if (err != NF_ACCEPT)
                         return -ENOENT;
  
-               if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
-                       WARN_ONCE(1, "helper rejected packet");
+               /* Clear CT state NAT flags to mark that we have not yet done
+                * NAT after the nf_conntrack_in() call.  We can actually clear
+                * the whole state, as it will be re-initialized below.
+                */
+               key->ct.state = 0;
+
+               /* Update the key, but keep the NAT flags. */
+               ovs_ct_update_key(skb, info, key, true, true);
+       }
+
+       ct = nf_ct_get(skb, &ctinfo);
+       if (ct) {
+               /* Packets starting a new connection must be NATted before the
+                * helper, so that the helper knows about the NAT.  We enforce
+                * this by delaying both NAT and helper calls for unconfirmed
+                * connections until the committing CT action.  For later
+                * packets NAT and Helper may be called in either order.
+                *
+                * NAT will be done only if the CT action has NAT, and only
+                * once per packet (per zone), as guarded by the NAT bits in
+                * the key->ct.state.
+                */
+               if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) &&
+                   (nf_ct_is_confirmed(ct) || info->commit) &&
+                   ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
                         return -EINVAL;
                 }
-       }
  
-       ovs_ct_update_key(skb, info, key, true);
+               /* Call the helper only if:
+                * - nf_conntrack_in() was executed above ("!cached") for a
+                *   confirmed connection, or
+                * - When committing an unconfirmed connection.
+                */
+               if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
+                   ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
+                       return -EINVAL;
+               }
+       }
  
         return 0;
  }
@@ -420,19 +795,24 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
  {
         struct nf_conntrack_expect *exp;
  
+       /* If we pass an expected packet through nf_conntrack_in() the
+        * expectation is typically removed, but the packet could still be
+        * lost in upcall processing.  To prevent this from happening we
+        * perform an explicit expectation lookup.  Expected connections are
+        * always new, and will be passed through conntrack only when they are
+        * committed, as it is OK to remove the expectation at that time.
+        */
         exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
         if (exp) {
                 u8 state;
  
+               /* NOTE: New connections are NATted and Helped only when
+                * committed, so we are not calling into NAT here.
+                */
                 state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
                 __ovs_ct_update_key(key, state, &info->zone, exp->master);
-       } else {
-               int err;
-
-               err = __ovs_ct_lookup(net, key, info, skb);
-               if (err)
-                       return err;
-       }
+       } else
+               return __ovs_ct_lookup(net, key, info, skb);
  
         return 0;
  }
@@ -442,21 +822,12 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
                          const struct ovs_conntrack_info *info,
                          struct sk_buff *skb)
  {
-       u8 state;
         int err;
  
-       state = key->ct.state;
-       if (key->ct.zone == info->zone.id &&
-           ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) {
-               /* Previous lookup has shown that this connection is already
-                * tracked and committed. Skip committing.
-                */
-               return 0;
-       }
-
         err = __ovs_ct_lookup(net, key, info, skb);
         if (err)
                 return err;
+       /* This is a no-op if the connection has already been confirmed. */
         if (nf_conntrack_confirm(skb) != NF_ACCEPT)
                 return -EINVAL;
  
@@ -541,6 +912,135 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
         return 0;
  }
  
+#ifdef CONFIG_NF_NAT_NEEDED
+/* Parse the nested NAT attributes of 'attr' into 'info'; 0 or -errno. */
+static int parse_nat(const struct nlattr *attr,
+                    struct ovs_conntrack_info *info, bool log)
+{
+       struct nlattr *a;
+       int rem;
+       bool have_ip_max = false, have_proto_max = false;
+       bool ip_vers = (info->family == NFPROTO_IPV6); /* Length column. */
+
+       nla_for_each_nested(a, attr, rem) {
+               static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
+                       [OVS_NAT_ATTR_SRC] = {0, 0},
+                       [OVS_NAT_ATTR_DST] = {0, 0},
+                       [OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
+                                                sizeof(struct in6_addr)},
+                       [OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
+                                                sizeof(struct in6_addr)},
+                       [OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)},
+                       [OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)},
+                       [OVS_NAT_ATTR_PERSISTENT] = {0, 0},
+                       [OVS_NAT_ATTR_PROTO_HASH] = {0, 0},
+                       [OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0},
+               };
+               int type = nla_type(a);
+
+               if (type > OVS_NAT_ATTR_MAX) {
+                       OVS_NLERR(log,
+                                 "Unknown NAT attribute (type=%d, max=%d)",
+                                 type, OVS_NAT_ATTR_MAX);
+                       return -EINVAL;
+               }
+
+               if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) {
+                       OVS_NLERR(log,
+                                 "NAT attribute type %d has unexpected length (%d != %d)",
+                                 type, nla_len(a),
+                                 ovs_nat_attr_lens[type][ip_vers]);
+                       return -EINVAL;
+               }
+
+               switch (type) {
+               case OVS_NAT_ATTR_SRC:
+               case OVS_NAT_ATTR_DST:
+                       if (info->nat) {
+                               OVS_NLERR(log,
+                                         "Only one type of NAT may be specified"
+                                         );
+                               return -ERANGE;
+                       }
+                       info->nat |= OVS_CT_NAT;
+                       info->nat |= ((type == OVS_NAT_ATTR_SRC)
+                                       ? OVS_CT_SRC_NAT : OVS_CT_DST_NAT);
+                       break;
+
+               case OVS_NAT_ATTR_IP_MIN:
+                       nla_memcpy(&info->range.min_addr, a, nla_len(a));
+                       info->range.flags |= NF_NAT_RANGE_MAP_IPS;
+                       break;
+
+               case OVS_NAT_ATTR_IP_MAX:
+                       have_ip_max = true;
+                       nla_memcpy(&info->range.max_addr, a,
+                                  sizeof(info->range.max_addr));
+                       info->range.flags |= NF_NAT_RANGE_MAP_IPS;
+                       break;
+
+               case OVS_NAT_ATTR_PROTO_MIN:
+                       info->range.min_proto.all = htons(nla_get_u16(a));
+                       info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+                       break;
+
+               case OVS_NAT_ATTR_PROTO_MAX:
+                       have_proto_max = true;
+                       info->range.max_proto.all = htons(nla_get_u16(a));
+                       info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+                       break;
+
+               case OVS_NAT_ATTR_PERSISTENT:
+                       info->range.flags |= NF_NAT_RANGE_PERSISTENT;
+                       break;
+
+               case OVS_NAT_ATTR_PROTO_HASH:
+                       info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM;
+                       break;
+
+               case OVS_NAT_ATTR_PROTO_RANDOM:
+                       info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY;
+                       break;
+
+               default:
+                       OVS_NLERR(log, "Unknown nat attribute (%d)", type);
+                       return -EINVAL;
+               }
+       }
+
+       if (rem > 0) {
+               OVS_NLERR(log, "NAT attribute has %d unknown bytes", rem);
+               return -EINVAL;
+       }
+       if (!info->nat) {
+               /* Do not allow flags if no type is given. */
+               if (info->range.flags) {
+                       OVS_NLERR(log,
+                                 "NAT flags may be given only when NAT range (SRC or DST) is also specified"
+                                 );
+                       return -EINVAL;
+               }
+               info->nat = OVS_CT_NAT;   /* NAT existing connections. */
+       } else if (!info->commit) {
+               OVS_NLERR(log,
+                         "NAT attributes may be specified only when CT COMMIT flag is also specified"
+                         );
+               return -EINVAL;
+       }
+       /* Allow missing IP_MAX: the range is then the single IP_MIN address. */
+       if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) {
+               memcpy(&info->range.max_addr, &info->range.min_addr,
+                      sizeof(info->range.max_addr));
+       }
+       /* Allow missing PROTO_MAX: the range is then the single PROTO_MIN. */
+       if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
+           !have_proto_max) {
+               info->range.max_proto.all = info->range.min_proto.all;
+       }
+       return 0;
+}
+#endif
+
  static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
         [OVS_CT_ATTR_COMMIT]    = { .minlen = 0, .maxlen = 0 },
         [OVS_CT_ATTR_ZONE]      = { .minlen = sizeof(u16),
@@ -550,7 +1050,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
         [OVS_CT_ATTR_LABELS]    = { .minlen = sizeof(struct md_labels),
                                     .maxlen = sizeof(struct md_labels) },
         [OVS_CT_ATTR_HELPER]    = { .minlen = 1,
-                                   .maxlen = NF_CT_HELPER_NAME_LEN }
+                                   .maxlen = NF_CT_HELPER_NAME_LEN },
+#ifdef CONFIG_NF_NAT_NEEDED
+       /* NAT length is checked when parsing the nested attributes. */
+       [OVS_CT_ATTR_NAT]       = { .minlen = 0, .maxlen = INT_MAX },
+#endif
  };
  
  static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
@@ -617,6 +1121,15 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
                                 return -EINVAL;
                         }
                         break;
+#ifdef CONFIG_NF_NAT_NEEDED
+               case OVS_CT_ATTR_NAT: {
+                       int err = parse_nat(a, info, log);
+
+                       if (err)
+                               return err;
+                       break;
+               }
+#endif
                 default:
                         OVS_NLERR(log, "Unknown conntrack attr (%d)",
                                   type);
@@ -704,6 +1217,74 @@ err_free_ct:
         return err;
  }
  
+#ifdef CONFIG_NF_NAT_NEEDED
+static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
+                              struct sk_buff *skb)
+{
+       struct nlattr *start;
+
+       start = nla_nest_start(skb, OVS_CT_ATTR_NAT); /* Open the NAT nest. */
+       if (!start)
+               return false;
+
+       if (info->nat & OVS_CT_SRC_NAT) {
+               if (nla_put_flag(skb, OVS_NAT_ATTR_SRC))
+                       return false;
+       } else if (info->nat & OVS_CT_DST_NAT) {
+               if (nla_put_flag(skb, OVS_NAT_ATTR_DST))
+                       return false;
+       } else {
+               goto out; /* No SRC/DST type: close an empty nest. */
+       }
+
+       if (info->range.flags & NF_NAT_RANGE_MAP_IPS) {
+               if (info->family == NFPROTO_IPV4) {
+                       /* IP_MAX is emitted only when it differs from IP_MIN. */
+                       if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN,
+                                           info->range.min_addr.ip) ||
+                           (info->range.max_addr.ip
+                            != info->range.min_addr.ip &&
+                            (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX,
+                                             info->range.max_addr.ip))))
+                               return false;
+#if IS_ENABLED(CONFIG_NF_NAT_IPV6)
+               } else if (info->family == NFPROTO_IPV6) {
+                       if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN,
+                                            &info->range.min_addr.in6) ||
+                           (memcmp(&info->range.max_addr.in6,
+                                   &info->range.min_addr.in6,
+                                   sizeof(info->range.max_addr.in6)) &&
+                            (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX,
+                                              &info->range.max_addr.in6))))
+                               return false;
+#endif
+               } else {
+                       return false; /* Family not handled here. */
+               }
+       }
+       /* PROTO_MAX is likewise emitted only when it differs from PROTO_MIN. */
+       if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
+           (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN,
+                        ntohs(info->range.min_proto.all)) ||
+            (info->range.max_proto.all != info->range.min_proto.all &&
+             nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX,
+                         ntohs(info->range.max_proto.all)))))
+               return false;
+
+       if (info->range.flags & NF_NAT_RANGE_PERSISTENT &&
+           nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT))
+               return false;
+       if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM &&
+           nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH))
+               return false;
+       if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY &&
+           nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM))
+               return false;
+out:
+       nla_nest_end(skb, start);
+
+       return true;
+}
+#endif
+
  int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
                           struct sk_buff *skb)
  {
@@ -732,7 +1313,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
                                    ct_info->helper->name))
                         return -EMSGSIZE;
         }
-
+#ifdef CONFIG_NF_NAT_NEEDED
+       if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb))
+               return -EMSGSIZE;
+#endif
         nla_nest_end(skb, start);
  
         return 0;
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h

index a7544f405c1626f6564075f4453832b311b1ac29..8f6230bd618333561b2c4636a3f8f52356a6bd6c 100644 (file)
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -37,7 +37,8 @@ void ovs_ct_free_action(const struct nlattr *a);
  
  #define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
                            OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \
-                          OVS_CS_F_INVALID | OVS_CS_F_TRACKED)
+                          OVS_CS_F_INVALID | OVS_CS_F_TRACKED | \
+                          OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
  #else
  #include <linux/errno.h>
author	David S. Miller <davem@davemloft.net>
	Tue, 15 Mar 2016 02:10:25 +0000 (22:10 -0400)
committer	David S. Miller <davem@davemloft.net>
	Tue, 15 Mar 2016 02:10:25 +0000 (22:10 -0400)
include/net/ip_vs.h		patch \| blob \| blame \| history
include/uapi/linux/netfilter/nf_conntrack_common.h		patch \| blob \| blame \| history
include/uapi/linux/openvswitch.h		patch \| blob \| blame \| history
net/ipv4/netfilter/nf_nat_l3proto_ipv4.c		patch \| blob \| blame \| history
net/ipv6/netfilter/nf_nat_l3proto_ipv6.c		patch \| blob \| blame \| history
net/netfilter/ipset/ip_set_bitmap_ipmac.c		patch \| blob \| blame \| history
net/netfilter/ipset/ip_set_core.c		patch \| blob \| blame \| history
net/netfilter/ipset/ip_set_hash_mac.c		patch \| blob \| blame \| history
net/netfilter/ipset/ip_set_list_set.c		patch \| blob \| blame \| history
net/netfilter/ipvs/ip_vs_core.c		patch \| blob \| blame \| history
net/netfilter/ipvs/ip_vs_pe_sip.c		patch \| blob \| blame \| history
net/netfilter/nf_conntrack_core.c		patch \| blob \| blame \| history
net/netfilter/nfnetlink_acct.c		patch \| blob \| blame \| history
net/netfilter/nft_compat.c		patch \| blob \| blame \| history
net/netfilter/x_tables.c		patch \| blob \| blame \| history
net/openvswitch/Kconfig		patch \| blob \| blame \| history
net/openvswitch/conntrack.c		patch \| blob \| blame \| history
net/openvswitch/conntrack.h		patch \| blob \| blame \| history