Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next
author David S. Miller <davem@davemloft.net>
Sat, 2 Jun 2018 13:04:21 +0000 (09:04 -0400)
committer David S. Miller <davem@davemloft.net>
Sat, 2 Jun 2018 13:04:21 +0000 (09:04 -0400)
Pablo Neira Ayuso says:

====================
Netfilter/IPVS updates for net-next

The following patchset contains Netfilter/IPVS updates for your net-next
tree, the most relevant things in this batch are:

1) Compile masquerade infrastructure into NAT module, from Florian Westphal.
   Same thing with the redirection support.

2) Abort transaction if early initialization of the commit phase fails.
   Also from Florian.

3) Get rid of synchronize_rcu() by using rule array in nf_tables, from
   Florian.

4) Abort nf_tables batch if fatal signal is pending, from Florian.

5) Use the nfnetlink .call_rcu callback from nf_tables to make dumps fully
   lockless. From Florian Westphal.

6) Support for matching transparent sockets from nf_tables, from Máté Eckl.

7) Audit support for nf_tables, from Phil Sutter.

8) Validate chain dependencies from commit phase, fall back to fine-grained
   validation only in case of errors.

9) Attach dst to skbuff from netfilter flowtable packet path, from
   Jason A. Donenfeld.

10) Use artificial maximum attribute cap to remove VLA from nfnetlink.
    Patch from Kees Cook.

11) Add extension to allow forwarding packets through the neighbour layer.

12) Add IPv6 conntrack helper support to IPVS, from Julian Anastasov.

13) Add IPv6 FTP conntrack support to IPVS, from Julian Anastasov.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
35 files changed:
include/linux/netfilter/nfnetlink.h
include/net/ip_vs.h
include/net/netfilter/nf_socket.h
include/net/netfilter/nf_tables.h
include/net/netfilter/nf_tables_core.h
include/net/netns/nftables.h
include/uapi/linux/netfilter/nf_tables.h
net/ipv4/netfilter/Kconfig
net/ipv4/netfilter/Makefile
net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
net/ipv6/netfilter/Kconfig
net/ipv6/netfilter/Makefile
net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
net/netfilter/Kconfig
net/netfilter/Makefile
net/netfilter/ipvs/ip_vs_app.c
net/netfilter/ipvs/ip_vs_ftp.c
net/netfilter/ipvs/ip_vs_nfct.c
net/netfilter/ipvs/ip_vs_proto_sctp.c
net/netfilter/ipvs/ip_vs_proto_tcp.c
net/netfilter/ipvs/ip_vs_proto_udp.c
net/netfilter/nf_flow_table_ip.c
net/netfilter/nf_nat_core.c
net/netfilter/nf_nat_redirect.c
net/netfilter/nf_tables_api.c
net/netfilter/nf_tables_core.c
net/netfilter/nfnetlink.c
net/netfilter/nft_compat.c
net/netfilter/nft_fwd_netdev.c
net/netfilter/nft_hash.c
net/netfilter/nft_immediate.c
net/netfilter/nft_log.c
net/netfilter/nft_lookup.c
net/netfilter/nft_numgen.c
net/netfilter/nft_socket.c [new file with mode: 0644]

index 34551f8aaf9d4d81bf6a6a5e30c74759ec80dabf..3ecc3050be0ec29fc9716b3ad616a43131d36be3 100644 (file)
@@ -31,6 +31,7 @@ struct nfnetlink_subsystem {
        const struct nfnl_callback *cb; /* callback for individual types */
        int (*commit)(struct net *net, struct sk_buff *skb);
        int (*abort)(struct net *net, struct sk_buff *skb);
+       void (*cleanup)(struct net *net);
        bool (*valid_genid)(struct net *net, u32 genid);
 };
 
index 0ac795b41ab80029ba4149f70933994dd2db64e2..03f567eb953610ff47b62edc9b490234605f4e17 100644 (file)
@@ -763,14 +763,14 @@ struct ip_vs_app {
         *         2=Mangled but checksum was not updated
         */
        int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *,
-                      struct sk_buff *, int *diff);
+                      struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh);
 
        /* input hook: Process packet in outin direction, diff set for TCP.
         * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok,
         *         2=Mangled but checksum was not updated
         */
        int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *,
-                     struct sk_buff *, int *diff);
+                     struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh);
 
        /* ip_vs_app initializer */
        int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *);
@@ -1328,8 +1328,10 @@ int register_ip_vs_app_inc(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16
 int ip_vs_app_inc_get(struct ip_vs_app *inc);
 void ip_vs_app_inc_put(struct ip_vs_app *inc);
 
-int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb);
-int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb);
+int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb,
+                     struct ip_vs_iphdr *ipvsh);
+int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb,
+                    struct ip_vs_iphdr *ipvsh);
 
 int register_ip_vs_pe(struct ip_vs_pe *pe);
 int unregister_ip_vs_pe(struct ip_vs_pe *pe);
index 8230fefff9f5ebd5143c72d0aac61414c4f21695..29b6313f0557e9238b794a7d7569b075042ab72a 100644 (file)
@@ -2,10 +2,8 @@
 #ifndef _NF_SOCK_H_
 #define _NF_SOCK_H_
 
-struct net_device;
-struct sk_buff;
-struct sock;
-struct net;
+#include <net/sock.h>
+#include <net/inet_timewait_sock.h>
 
 static inline bool nf_sk_is_transparent(struct sock *sk)
 {
index 603b51401deb4c19d9c85b06f2e365fc8ef41a9f..435c32d8a9959a5bb3f3be80980e1a1fee9a670f 100644 (file)
@@ -858,6 +858,8 @@ enum nft_chain_flags {
  *     @name: name of the chain
  */
 struct nft_chain {
+       struct nft_rule                 *__rcu *rules_gen_0;
+       struct nft_rule                 *__rcu *rules_gen_1;
        struct list_head                rules;
        struct list_head                list;
        struct nft_table                *table;
@@ -867,8 +869,13 @@ struct nft_chain {
        u8                              flags:6,
                                        genmask:2;
        char                            *name;
+
+       /* Only used during control plane commit phase: */
+       struct nft_rule                 **rules_next;
 };
 
+int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain);
+
 enum nft_chain_types {
        NFT_CHAIN_T_DEFAULT = 0,
        NFT_CHAIN_T_ROUTE,
index cd6915b6c054e363e8bb621068b5e18e87fa8905..e0c0c2558ec48adfb27629c2180f9b04efb67bcf 100644 (file)
@@ -2,6 +2,8 @@
 #ifndef _NET_NF_TABLES_CORE_H
 #define _NET_NF_TABLES_CORE_H
 
+#include <net/netfilter/nf_tables.h>
+
 extern struct nft_expr_type nft_imm_type;
 extern struct nft_expr_type nft_cmp_type;
 extern struct nft_expr_type nft_lookup_type;
@@ -23,6 +25,12 @@ struct nft_cmp_fast_expr {
        u8                      len;
 };
 
+struct nft_immediate_expr {
+       struct nft_data         data;
+       enum nft_registers      dreg:8;
+       u8                      dlen;
+};
+
 /* Calculate the mask for the nft_cmp_fast expression. On big endian the
  * mask needs to include the *upper* bytes when interpreting that data as
  * something smaller than the full u32, therefore a cpu_to_le32 is done.
index 29c3851b486aebddd10d5af3a441259fef644907..94767ea3a490660cfe23da891c9a3ba8255da4e1 100644 (file)
@@ -9,6 +9,7 @@ struct netns_nftables {
        struct list_head        commit_list;
        unsigned int            base_seq;
        u8                      gencursor;
+       u8                      validate_state;
 };
 
 #endif
index 9c71f024f9cc466f1fa758d5f65b1d4e80420792..a089af092a294bdad5a076cb76bb0e8c763e4bec 100644 (file)
@@ -904,6 +904,31 @@ enum nft_rt_attributes {
 };
 #define NFTA_RT_MAX            (__NFTA_RT_MAX - 1)
 
+/**
+ * enum nft_socket_attributes - nf_tables socket expression netlink attributes
+ *
+ * @NFTA_SOCKET_KEY: socket key to match
+ * @NFTA_SOCKET_DREG: destination register
+ */
+enum nft_socket_attributes {
+       NFTA_SOCKET_UNSPEC,
+       NFTA_SOCKET_KEY,
+       NFTA_SOCKET_DREG,
+       __NFTA_SOCKET_MAX
+};
+#define NFTA_SOCKET_MAX                (__NFTA_SOCKET_MAX - 1)
+
+/*
+ * enum nft_socket_keys - nf_tables socket expression keys
+ *
+ * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option
+ */
+enum nft_socket_keys {
+       NFT_SOCKET_TRANSPARENT,
+       __NFT_SOCKET_MAX
+};
+#define NFT_SOCKET_MAX (__NFT_SOCKET_MAX - 1)
+
 /**
  * enum nft_ct_keys - nf_tables ct expression keys
  *
@@ -1055,6 +1080,11 @@ enum nft_log_attributes {
 };
 #define NFTA_LOG_MAX           (__NFTA_LOG_MAX - 1)
 
+/**
+ * LOGLEVEL_AUDIT - a pseudo log level enabling audit logging
+ */
+#define LOGLEVEL_AUDIT         8
+
 /**
  * enum nft_queue_attributes - nf_tables queue expression netlink attributes
  *
@@ -1230,10 +1260,14 @@ enum nft_dup_attributes {
  * enum nft_fwd_attributes - nf_tables fwd expression netlink attributes
  *
  * @NFTA_FWD_SREG_DEV: source register of output interface (NLA_U32: nft_register)
+ * @NFTA_FWD_SREG_ADDR: source register of destination address (NLA_U32: nft_register)
+ * @NFTA_FWD_NFPROTO: layer 3 family of source register address (NLA_U32: enum nfproto)
  */
 enum nft_fwd_attributes {
        NFTA_FWD_UNSPEC,
        NFTA_FWD_SREG_DEV,
+       NFTA_FWD_SREG_ADDR,
+       NFTA_FWD_NFPROTO,
        __NFTA_FWD_MAX
 };
 #define NFTA_FWD_MAX   (__NFTA_FWD_MAX - 1)
index 280048e1e3958b94ee5a6ce23fcb8b88dee553bf..d03bc5a01a70e4594c3dbe37f6dccb6858b36d18 100644 (file)
@@ -129,10 +129,7 @@ config NFT_CHAIN_NAT_IPV4
          source and destination ports.
 
 config NF_NAT_MASQUERADE_IPV4
-       tristate "IPv4 masquerade support"
-       help
-         This is the kernel functionality to provide NAT in the masquerade
-         flavour (automatic source address selection).
+       bool
 
 config NFT_MASQ_IPV4
        tristate "IPv4 masquerading support for nf_tables"
index 0e5edd0c7926734f657246493468ce0c1624d908..c4b05b1740910fbbe640106142d22332852f5e4e 100644 (file)
@@ -10,6 +10,7 @@ nf_conntrack_ipv4-y   :=  nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
 obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
 
 nf_nat_ipv4-y          := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
+nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
 obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
 
 # defrag
@@ -32,9 +33,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o
 $(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h
 obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
 
-obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
-
-
 # NAT protocols (nf_nat)
 obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
 
index f538c500154711bd7b419b5ddc9fd544d12b6755..ad3aeff152ede37e5d39b0e5bcf05a0ad5c0904e 100644 (file)
@@ -7,7 +7,6 @@
  */
 
 #include <linux/types.h>
-#include <linux/module.h>
 #include <linux/atomic.h>
 #include <linux/inetdevice.h>
 #include <linux/ip.h>
@@ -157,6 +156,3 @@ void nf_nat_masquerade_ipv4_unregister_notifier(void)
        unregister_inetaddr_notifier(&masq_inet_notifier);
 }
 EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
index ce77bcc2490c2783604d69778887127d7f405239..9f5b00a39adfdb1cef535dcbc4da31d64205cad6 100644 (file)
@@ -136,10 +136,7 @@ config NF_NAT_IPV6
 if NF_NAT_IPV6
 
 config NF_NAT_MASQUERADE_IPV6
-       tristate "IPv6 masquerade support"
-       help
-         This is the kernel functionality to provide NAT in the masquerade
-         flavour (automatic source address selection) for IPv6.
+       bool
 
 endif # NF_NAT_IPV6
 
index 44273d6f03a57d69d34c4d20ae56793c2f3ea0d9..71518f22ae3956d7ac428222a78518500c083a47 100644 (file)
@@ -18,8 +18,8 @@ nf_conntrack_ipv6-y  :=  nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
 obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
 
 nf_nat_ipv6-y          := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o
+nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
 obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
-obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
 
 # defrag
 nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
index 9dfc2b90c3622ce2ff8ab0bef4b00c8faa96f164..e6eb7cf9b54fd5e5c81b14836b0d629d5cccfd6d 100644 (file)
@@ -10,7 +10,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/module.h>
 #include <linux/atomic.h>
 #include <linux/netdevice.h>
 #include <linux/ipv6.h>
@@ -186,6 +185,3 @@ void nf_nat_masquerade_ipv6_unregister_notifier(void)
        unregister_netdevice_notifier(&masq_dev_notifier);
 }
 EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
index a5b60e6a983e4a6be6a305b43087cf33c96dbf51..276e1e32f44ee6d0b584670644471627a6c127fe 100644 (file)
@@ -433,11 +433,7 @@ config NF_NAT_TFTP
        default NF_NAT && NF_CONNTRACK_TFTP
 
 config NF_NAT_REDIRECT
-        tristate "IPv4/IPv6 redirect support"
-       depends on NF_NAT
-        help
-          This is the kernel functionality to redirect packets to local
-          machine through NAT.
+       bool
 
 config NETFILTER_SYNPROXY
        tristate
@@ -617,6 +613,15 @@ config NFT_FIB_INET
          The lookup will be delegated to the IPv4 or IPv6 FIB depending
          on the protocol of the packet.
 
+config NFT_SOCKET
+       tristate "Netfilter nf_tables socket match support"
+       depends on IPV6 || IPV6=n
+       select NF_SOCKET_IPV4
+       select NF_SOCKET_IPV6 if IPV6
+       help
+         This option allows matching for the presence or absence of a
+         corresponding socket and its attributes.
+
 if NF_TABLES_NETDEV
 
 config NF_DUP_NETDEV
index 1aa710b5d384ecc76a49be9011d64851bb427ff6..eec169555731cdc98225b66c2fa2acbdd1966f05 100644 (file)
@@ -55,7 +55,7 @@ obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
 obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o
 
 obj-$(CONFIG_NF_NAT) += nf_nat.o
-obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
+nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
 
 # NAT helpers
 obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
@@ -102,6 +102,7 @@ obj-$(CONFIG_NFT_FIB)               += nft_fib.o
 obj-$(CONFIG_NFT_FIB_INET)     += nft_fib_inet.o
 obj-$(CONFIG_NFT_FIB_NETDEV)   += nft_fib_netdev.o
 obj-$(CONFIG_NF_OSF)           += nf_osf.o
+obj-$(CONFIG_NFT_SOCKET)       += nft_socket.o
 
 # nf_tables netdev
 obj-$(CONFIG_NFT_DUP_NETDEV)   += nft_dup_netdev.o
index 1c98c907bc6354108305e55b1be50d7e6b74ca4e..12d74896556a6e3e1dc6c75d99fe175bd53742d6 100644 (file)
@@ -355,7 +355,8 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
 }
 
 static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
-                                 struct ip_vs_app *app)
+                                 struct ip_vs_app *app,
+                                 struct ip_vs_iphdr *ipvsh)
 {
        int diff;
        const unsigned int tcp_offset = ip_hdrlen(skb);
@@ -386,7 +387,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
        if (app->pkt_out == NULL)
                return 1;
 
-       if (!app->pkt_out(app, cp, skb, &diff))
+       if (!app->pkt_out(app, cp, skb, &diff, ipvsh))
                return 0;
 
        /*
@@ -404,7 +405,8 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
  *     called by ipvs packet handler, assumes previously checked cp!=NULL
  *     returns false if it can't handle packet (oom)
  */
-int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
+int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
+                     struct ip_vs_iphdr *ipvsh)
 {
        struct ip_vs_app *app;
 
@@ -417,7 +419,7 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
 
        /* TCP is complicated */
        if (cp->protocol == IPPROTO_TCP)
-               return app_tcp_pkt_out(cp, skb, app);
+               return app_tcp_pkt_out(cp, skb, app, ipvsh);
 
        /*
         *      Call private output hook function
@@ -425,12 +427,13 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
        if (app->pkt_out == NULL)
                return 1;
 
-       return app->pkt_out(app, cp, skb, NULL);
+       return app->pkt_out(app, cp, skb, NULL, ipvsh);
 }
 
 
 static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
-                                struct ip_vs_app *app)
+                                struct ip_vs_app *app,
+                                struct ip_vs_iphdr *ipvsh)
 {
        int diff;
        const unsigned int tcp_offset = ip_hdrlen(skb);
@@ -461,7 +464,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
        if (app->pkt_in == NULL)
                return 1;
 
-       if (!app->pkt_in(app, cp, skb, &diff))
+       if (!app->pkt_in(app, cp, skb, &diff, ipvsh))
                return 0;
 
        /*
@@ -479,7 +482,8 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
  *     called by ipvs packet handler, assumes previously checked cp!=NULL.
  *     returns false if can't handle packet (oom).
  */
-int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
+int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
+                    struct ip_vs_iphdr *ipvsh)
 {
        struct ip_vs_app *app;
 
@@ -492,7 +496,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
 
        /* TCP is complicated */
        if (cp->protocol == IPPROTO_TCP)
-               return app_tcp_pkt_in(cp, skb, app);
+               return app_tcp_pkt_in(cp, skb, app, ipvsh);
 
        /*
         *      Call private input hook function
@@ -500,7 +504,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
        if (app->pkt_in == NULL)
                return 1;
 
-       return app->pkt_in(app, cp, skb, NULL);
+       return app->pkt_in(app, cp, skb, NULL, ipvsh);
 }
 
 
index 58d5d05aec24c5fcc0bb23f2ccceea887bfaa029..4398a72edec59ffc112ab49d21565853da3a92c7 100644 (file)
@@ -29,6 +29,8 @@
 #include <linux/moduleparam.h>
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
 #include <linux/in.h>
 #include <linux/ip.h>
 #include <linux/netfilter.h>
 #include <net/ip_vs.h>
 
 
-#define SERVER_STRING "227 "
-#define CLIENT_STRING "PORT"
+#define SERVER_STRING_PASV "227 "
+#define CLIENT_STRING_PORT "PORT"
+#define SERVER_STRING_EPSV "229 "
+#define CLIENT_STRING_EPRT "EPRT"
 
+enum {
+       IP_VS_FTP_ACTIVE = 0,
+       IP_VS_FTP_PORT = 0,
+       IP_VS_FTP_PASV,
+       IP_VS_FTP_EPRT,
+       IP_VS_FTP_EPSV,
+};
 
 /*
  * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
@@ -58,9 +69,15 @@ module_param_array(ports, ushort, &ports_count, 0444);
 MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
 
 
-/*     Dummy variable */
-static int ip_vs_ftp_pasv;
+static char *ip_vs_ftp_data_ptr(struct sk_buff *skb, struct ip_vs_iphdr *ipvsh)
+{
+       struct tcphdr *th = (struct tcphdr *)((char *)skb->data + ipvsh->len);
+
+       if ((th->doff << 2) < sizeof(struct tcphdr))
+               return NULL;
 
+       return (char *)th + (th->doff << 2);
+}
 
 static int
 ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
@@ -78,20 +95,20 @@ ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
 }
 
 
-/*
- * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
- * with the "pattern", ignoring before "skip" and terminated with
- * the "term" character.
- * <addr,port> is in network order.
+/* Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
+ * with the "pattern". <addr,port> is in network order.
+ * Parse extended format depending on ext. In this case addr can be pre-set.
  */
 static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
                                  const char *pattern, size_t plen,
-                                 char skip, char term,
-                                 __be32 *addr, __be16 *port,
-                                 char **start, char **end)
+                                 char skip, bool ext, int mode,
+                                 union nf_inet_addr *addr, __be16 *port,
+                                 __u16 af, char **start, char **end)
 {
        char *s, c;
        unsigned char p[6];
+       char edelim;
+       __u16 hport;
        int i = 0;
 
        if (data_limit - data < plen) {
@@ -113,6 +130,11 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
                        if (s == data_limit)
                                return -1;
                        if (!found) {
+                               /* "(" is optional for non-extended format,
+                                * so catch the start of IPv4 address
+                                */
+                               if (!ext && isdigit(*s))
+                                       break;
                                if (*s == skip)
                                        found = 1;
                        } else if (*s != skip) {
@@ -120,41 +142,102 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
                        }
                }
        }
+       /* Old IPv4-only format? */
+       if (!ext) {
+               p[0] = 0;
+               for (data = s; ; data++) {
+                       if (data == data_limit)
+                               return -1;
+                       c = *data;
+                       if (isdigit(c)) {
+                               p[i] = p[i]*10 + c - '0';
+                       } else if (c == ',' && i < 5) {
+                               i++;
+                               p[i] = 0;
+                       } else {
+                               /* unexpected character or terminator */
+                               break;
+                       }
+               }
 
-       for (data = s; ; data++) {
-               if (data == data_limit)
+               if (i != 5)
                        return -1;
-               if (*data == term)
-                       break;
+
+               *start = s;
+               *end = data;
+               addr->ip = get_unaligned((__be32 *) p);
+               *port = get_unaligned((__be16 *) (p + 4));
+               return 1;
        }
-       *end = data;
+       if (s == data_limit)
+               return -1;
+       *start = s;
+       edelim = *s++;
+       if (edelim < 33 || edelim > 126)
+               return -1;
+       if (s == data_limit)
+               return -1;
+       if (*s == edelim) {
+               /* Address family is usually missing for EPSV response */
+               if (mode != IP_VS_FTP_EPSV)
+                       return -1;
+               s++;
+               if (s == data_limit)
+                       return -1;
+               /* Then address should be missing too */
+               if (*s != edelim)
+                       return -1;
+               /* Caller can pre-set addr, if needed */
+               s++;
+       } else {
+               const char *ep;
 
-       memset(p, 0, sizeof(p));
-       for (data = s; ; data++) {
-               c = *data;
-               if (c == term)
-                       break;
-               if (c >= '0' && c <= '9') {
-                       p[i] = p[i]*10 + c - '0';
-               } else if (c == ',' && i < 5) {
-                       i++;
-               } else {
-                       /* unexpected character */
+               /* We allow address only from same family */
+               if (af == AF_INET6 && *s != '2')
                        return -1;
+               if (af == AF_INET && *s != '1')
+                       return -1;
+               s++;
+               if (s == data_limit)
+                       return -1;
+               if (*s != edelim)
+                       return -1;
+               s++;
+               if (s == data_limit)
+                       return -1;
+               if (af == AF_INET6) {
+                       if (in6_pton(s, data_limit - s, (u8 *)addr, edelim,
+                                    &ep) <= 0)
+                               return -1;
+               } else {
+                       if (in4_pton(s, data_limit - s, (u8 *)addr, edelim,
+                                    &ep) <= 0)
+                               return -1;
                }
+               s = (char *) ep;
+               if (s == data_limit)
+                       return -1;
+               if (*s != edelim)
+                       return -1;
+               s++;
        }
-
-       if (i != 5)
+       for (hport = 0; ; s++)
+       {
+               if (s == data_limit)
+                       return -1;
+               if (!isdigit(*s))
+                       break;
+               hport = hport * 10 + *s - '0';
+       }
+       if (s == data_limit || !hport || *s != edelim)
                return -1;
-
-       *start = s;
-       *addr = get_unaligned((__be32 *) p);
-       *port = get_unaligned((__be16 *) (p + 4));
+       s++;
+       *end = s;
+       *port = htons(hport);
        return 1;
 }
 
-/*
- * Look at outgoing ftp packets to catch the response to a PASV command
+/* Look at outgoing ftp packets to catch the response to a PASV/EPSV command
  * from the server (inside-to-outside).
  * When we see one, we build a connection entry with the client address,
  * client port 0 (unknown at the moment), the server address and the
@@ -165,12 +248,13 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
  * The outgoing packet should be something like
  *   "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
  * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
+ * The extended format for an EPSV response usually provides only the port:
+ *   "229 Entering Extended Passive Mode (|||ppp|)"
  */
 static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
-                        struct sk_buff *skb, int *diff)
+                        struct sk_buff *skb, int *diff,
+                        struct ip_vs_iphdr *ipvsh)
 {
-       struct iphdr *iph;
-       struct tcphdr *th;
        char *data, *data_limit;
        char *start, *end;
        union nf_inet_addr from;
@@ -184,14 +268,6 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 
        *diff = 0;
 
-#ifdef CONFIG_IP_VS_IPV6
-       /* This application helper doesn't work with IPv6 yet,
-        * so turn this into a no-op for IPv6 packets
-        */
-       if (cp->af == AF_INET6)
-               return 1;
-#endif
-
        /* Only useful for established sessions */
        if (cp->state != IP_VS_TCP_S_ESTABLISHED)
                return 1;
@@ -200,53 +276,77 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
        if (!skb_make_writable(skb, skb->len))
                return 0;
 
-       if (cp->app_data == &ip_vs_ftp_pasv) {
-               iph = ip_hdr(skb);
-               th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
-               data = (char *)th + (th->doff << 2);
+       if (cp->app_data == (void *) IP_VS_FTP_PASV) {
+               data = ip_vs_ftp_data_ptr(skb, ipvsh);
                data_limit = skb_tail_pointer(skb);
 
+               if (!data || data >= data_limit)
+                       return 1;
+
                if (ip_vs_ftp_get_addrport(data, data_limit,
-                                          SERVER_STRING,
-                                          sizeof(SERVER_STRING)-1,
-                                          '(', ')',
-                                          &from.ip, &port,
+                                          SERVER_STRING_PASV,
+                                          sizeof(SERVER_STRING_PASV)-1,
+                                          '(', false, IP_VS_FTP_PASV,
+                                          &from, &port, cp->af,
                                           &start, &end) != 1)
                        return 1;
 
-               IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n",
+               IP_VS_DBG(7, "PASV response (%pI4:%u) -> %pI4:%u detected\n",
                          &from.ip, ntohs(port), &cp->caddr.ip, 0);
+       } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) {
+               data = ip_vs_ftp_data_ptr(skb, ipvsh);
+               data_limit = skb_tail_pointer(skb);
 
-               /*
-                * Now update or create an connection entry for it
+               if (!data || data >= data_limit)
+                       return 1;
+
+               /* Usually, data address is not specified but
+                * we support different address, so pre-set it.
                 */
-               {
-                       struct ip_vs_conn_param p;
-                       ip_vs_conn_fill_param(cp->ipvs, AF_INET,
-                                             iph->protocol, &from, port,
-                                             &cp->caddr, 0, &p);
-                       n_cp = ip_vs_conn_out_get(&p);
-               }
-               if (!n_cp) {
-                       struct ip_vs_conn_param p;
-                       ip_vs_conn_fill_param(cp->ipvs,
-                                             AF_INET, IPPROTO_TCP, &cp->caddr,
-                                             0, &cp->vaddr, port, &p);
-                       /* As above, this is ipv4 only */
-                       n_cp = ip_vs_conn_new(&p, AF_INET, &from, port,
-                                             IP_VS_CONN_F_NO_CPORT |
-                                             IP_VS_CONN_F_NFCT,
-                                             cp->dest, skb->mark);
-                       if (!n_cp)
-                               return 0;
+               from = cp->daddr;
+               if (ip_vs_ftp_get_addrport(data, data_limit,
+                                          SERVER_STRING_EPSV,
+                                          sizeof(SERVER_STRING_EPSV)-1,
+                                          '(', true, IP_VS_FTP_EPSV,
+                                          &from, &port, cp->af,
+                                          &start, &end) != 1)
+                       return 1;
 
-                       /* add its controller */
-                       ip_vs_control_add(n_cp, cp);
-               }
+               IP_VS_DBG_BUF(7, "EPSV response (%s:%u) -> %s:%u detected\n",
+                             IP_VS_DBG_ADDR(cp->af, &from), ntohs(port),
+                             IP_VS_DBG_ADDR(cp->af, &cp->caddr), 0);
+       } else {
+               return 1;
+       }
 
-               /*
-                * Replace the old passive address with the new one
-                */
+       /* Now update or create a connection entry for it */
+       {
+               struct ip_vs_conn_param p;
+
+               ip_vs_conn_fill_param(cp->ipvs, cp->af,
+                                     ipvsh->protocol, &from, port,
+                                     &cp->caddr, 0, &p);
+               n_cp = ip_vs_conn_out_get(&p);
+       }
+       if (!n_cp) {
+               struct ip_vs_conn_param p;
+
+               ip_vs_conn_fill_param(cp->ipvs,
+                                     cp->af, ipvsh->protocol, &cp->caddr,
+                                     0, &cp->vaddr, port, &p);
+               n_cp = ip_vs_conn_new(&p, cp->af, &from, port,
+                                     IP_VS_CONN_F_NO_CPORT |
+                                     IP_VS_CONN_F_NFCT,
+                                     cp->dest, skb->mark);
+               if (!n_cp)
+                       return 0;
+
+               /* add its controller */
+               ip_vs_control_add(n_cp, cp);
+       }
+
+       /* Replace the old passive address with the new one */
+       if (cp->app_data == (void *) IP_VS_FTP_PASV) {
                from.ip = n_cp->vaddr.ip;
                port = n_cp->vport;
                snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u",
@@ -256,50 +356,54 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
                         ((unsigned char *)&from.ip)[3],
                         ntohs(port) >> 8,
                         ntohs(port) & 0xFF);
+       } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) {
+               from = n_cp->vaddr;
+               port = n_cp->vport;
+               /* Only port, client will use VIP for the data connection */
+               snprintf(buf, sizeof(buf), "|||%u|",
+                        ntohs(port));
+       } else {
+               *buf = 0;
+       }
+       buf_len = strlen(buf);
 
-               buf_len = strlen(buf);
-
-               ct = nf_ct_get(skb, &ctinfo);
-               if (ct) {
-                       bool mangled;
-
-                       /* If mangling fails this function will return 0
-                        * which will cause the packet to be dropped.
-                        * Mangling can only fail under memory pressure,
-                        * hopefully it will succeed on the retransmitted
-                        * packet.
-                        */
-                       mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
-                                                          iph->ihl * 4,
-                                                          start - data,
-                                                          end - start,
-                                                          buf, buf_len);
-                       if (mangled) {
-                               ip_vs_nfct_expect_related(skb, ct, n_cp,
-                                                         IPPROTO_TCP, 0, 0);
-                               if (skb->ip_summed == CHECKSUM_COMPLETE)
-                                       skb->ip_summed = CHECKSUM_UNNECESSARY;
-                               /* csum is updated */
-                               ret = 1;
-                       }
-               }
+       ct = nf_ct_get(skb, &ctinfo);
+       if (ct) {
+               bool mangled;
 
-               /*
-                * Not setting 'diff' is intentional, otherwise the sequence
-                * would be adjusted twice.
+               /* If mangling fails this function will return 0
+                * which will cause the packet to be dropped.
+                * Mangling can only fail under memory pressure,
+                * hopefully it will succeed on the retransmitted
+                * packet.
                 */
-
-               cp->app_data = NULL;
-               ip_vs_tcp_conn_listen(n_cp);
-               ip_vs_conn_put(n_cp);
-               return ret;
+               mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+                                                  ipvsh->len,
+                                                  start - data,
+                                                  end - start,
+                                                  buf, buf_len);
+               if (mangled) {
+                       ip_vs_nfct_expect_related(skb, ct, n_cp,
+                                                 ipvsh->protocol, 0, 0);
+                       if (skb->ip_summed == CHECKSUM_COMPLETE)
+                               skb->ip_summed = CHECKSUM_UNNECESSARY;
+                       /* csum is updated */
+                       ret = 1;
+               }
        }
-       return 1;
+
+       /* Not setting 'diff' is intentional, otherwise the sequence
+        * would be adjusted twice.
+        */
+
+       cp->app_data = (void *) IP_VS_FTP_ACTIVE;
+       ip_vs_tcp_conn_listen(n_cp);
+       ip_vs_conn_put(n_cp);
+       return ret;
 }
 
 
-/*
- * Look at incoming ftp packets to catch the PASV/PORT command
+/* Look at incoming ftp packets to catch the PASV/PORT/EPRT/EPSV command
  * (outside-to-inside).
  *
  * The incoming packet having the PORT command should be something like
@@ -308,12 +412,19 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
  * In this case, we create a connection entry using the client address and
  * port, so that the active ftp data connection from the server can reach
  * the client.
+ * Extended format:
+ *     "EPSV\r\n" when client requests server address from same family
+ *     "EPSV 1\r\n" when client requests IPv4 server address
+ *     "EPSV 2\r\n" when client requests IPv6 server address
+ *     "EPSV ALL\r\n" - not supported
+ *     EPRT with specified delimiter (ASCII 33..126), "|" by default:
+ *     "EPRT |1|IPv4ADDR|PORT|\r\n" when client provides IPv4 addrport
+ *     "EPRT |2|IPv6ADDR|PORT|\r\n" when client provides IPv6 addrport
  */
 static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
-                       struct sk_buff *skb, int *diff)
+                       struct sk_buff *skb, int *diff,
+                       struct ip_vs_iphdr *ipvsh)
 {
-       struct iphdr *iph;
-       struct tcphdr *th;
        char *data, *data_start, *data_limit;
        char *start, *end;
        union nf_inet_addr to;
@@ -323,14 +434,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
        /* no diff required for incoming packets */
        *diff = 0;
 
-#ifdef CONFIG_IP_VS_IPV6
-       /* This application helper doesn't work with IPv6 yet,
-        * so turn this into a no-op for IPv6 packets
-        */
-       if (cp->af == AF_INET6)
-               return 1;
-#endif
-
        /* Only useful for established sessions */
        if (cp->state != IP_VS_TCP_S_ESTABLISHED)
                return 1;
@@ -339,27 +442,48 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
        if (!skb_make_writable(skb, skb->len))
                return 0;
 
-       /*
-        * Detecting whether it is passive
-        */
-       iph = ip_hdr(skb);
-       th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
-
-       /* Since there may be OPTIONS in the TCP packet and the HLEN is
-          the length of the header in 32-bit multiples, it is accurate
-          to calculate data address by th+HLEN*4 */
-       data = data_start = (char *)th + (th->doff << 2);
+       data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh);
        data_limit = skb_tail_pointer(skb);
+       if (!data || data >= data_limit)
+               return 1;
 
        while (data <= data_limit - 6) {
-               if (strncasecmp(data, "PASV\r\n", 6) == 0) {
+               if (cp->af == AF_INET &&
+                   strncasecmp(data, "PASV\r\n", 6) == 0) {
                        /* Passive mode on */
                        IP_VS_DBG(7, "got PASV at %td of %td\n",
                                  data - data_start,
                                  data_limit - data_start);
-                       cp->app_data = &ip_vs_ftp_pasv;
+                       cp->app_data = (void *) IP_VS_FTP_PASV;
                        return 1;
                }
+
+               /* EPSV or EPSV<space><net-prt> */
+               if (strncasecmp(data, "EPSV", 4) == 0 &&
+                   (data[4] == ' ' || data[4] == '\r')) {
+                       if (data[4] == ' ') {
+                               char proto = data[5];
+
+                               if (data > data_limit - 7 || data[6] != '\r')
+                                       return 1;
+
+#ifdef CONFIG_IP_VS_IPV6
+                               if (cp->af == AF_INET6 && proto == '2') {
+                               } else
+#endif
+                               if (cp->af == AF_INET && proto == '1') {
+                               } else {
+                                       return 1;
+                               }
+                       }
+                       /* Extended Passive mode on */
+                       IP_VS_DBG(7, "got EPSV at %td of %td\n",
+                                 data - data_start,
+                                 data_limit - data_start);
+                       cp->app_data = (void *) IP_VS_FTP_EPSV;
+                       return 1;
+               }
+
                data++;
        }
 
@@ -370,33 +494,52 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
         * then create a new connection entry for the coming data
         * connection.
         */
-       if (ip_vs_ftp_get_addrport(data_start, data_limit,
-                                  CLIENT_STRING, sizeof(CLIENT_STRING)-1,
-                                  ' ', '\r', &to.ip, &port,
-                                  &start, &end) != 1)
+       if (cp->af == AF_INET &&
+           ip_vs_ftp_get_addrport(data_start, data_limit,
+                                  CLIENT_STRING_PORT,
+                                  sizeof(CLIENT_STRING_PORT)-1,
+                                  ' ', false, IP_VS_FTP_PORT,
+                                  &to, &port, cp->af,
+                                  &start, &end) == 1) {
+
+               IP_VS_DBG(7, "PORT %pI4:%u detected\n", &to.ip, ntohs(port));
+
+               /* Now update or create a connection entry for it */
+               IP_VS_DBG(7, "protocol %s %pI4:%u %pI4:%u\n",
+                         ip_vs_proto_name(ipvsh->protocol),
+                         &to.ip, ntohs(port), &cp->vaddr.ip,
+                         ntohs(cp->vport)-1);
+       } else if (ip_vs_ftp_get_addrport(data_start, data_limit,
+                                         CLIENT_STRING_EPRT,
+                                         sizeof(CLIENT_STRING_EPRT)-1,
+                                         ' ', true, IP_VS_FTP_EPRT,
+                                         &to, &port, cp->af,
+                                         &start, &end) == 1) {
+
+               IP_VS_DBG_BUF(7, "EPRT %s:%u detected\n",
+                             IP_VS_DBG_ADDR(cp->af, &to), ntohs(port));
+
+               /* Now update or create a connection entry for it */
+               IP_VS_DBG_BUF(7, "protocol %s %s:%u %s:%u\n",
+                             ip_vs_proto_name(ipvsh->protocol),
+                             IP_VS_DBG_ADDR(cp->af, &to), ntohs(port),
+                             IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+                             ntohs(cp->vport)-1);
+       } else {
                return 1;
-
-       IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port));
+       }
 
        /* Passive mode off */
-       cp->app_data = NULL;
-
-       /*
-        * Now update or create a connection entry for it
-        */
-       IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n",
-                 ip_vs_proto_name(iph->protocol),
-                 &to.ip, ntohs(port), &cp->vaddr.ip, 0);
+       cp->app_data = (void *) IP_VS_FTP_ACTIVE;
 
        {
                struct ip_vs_conn_param p;
-               ip_vs_conn_fill_param(cp->ipvs, AF_INET,
-                                     iph->protocol, &to, port, &cp->vaddr,
+               ip_vs_conn_fill_param(cp->ipvs, cp->af,
+                                     ipvsh->protocol, &to, port, &cp->vaddr,
                                      htons(ntohs(cp->vport)-1), &p);
                n_cp = ip_vs_conn_in_get(&p);
                if (!n_cp) {
-                       /* This is ipv4 only */
-                       n_cp = ip_vs_conn_new(&p, AF_INET, &cp->daddr,
+                       n_cp = ip_vs_conn_new(&p, cp->af, &cp->daddr,
                                              htons(ntohs(cp->dport)-1),
                                              IP_VS_CONN_F_NFCT, cp->dest,
                                              skb->mark);
@@ -454,7 +597,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
                ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]);
                if (ret)
                        goto err_unreg;
-               pr_info("%s: loaded support on port[%d] = %d\n",
+               pr_info("%s: loaded support on port[%d] = %u\n",
                        app->name, i, ports[i]);
        }
        return 0;
index 6cf3fd81a5eca887ccefe628c2b9d627b40cf055..eb8b9c883889c1111bca123af249ad08c2976270 100644 (file)
 #include <net/netfilter/nf_conntrack_zones.h>
 
 
-#define FMT_TUPLE      "%pI4:%u->%pI4:%u/%u"
-#define ARG_TUPLE(T)   &(T)->src.u3.ip, ntohs((T)->src.u.all), \
-                       &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+#define FMT_TUPLE      "%s:%u->%s:%u/%u"
+#define ARG_TUPLE(T)   IP_VS_DBG_ADDR((T)->src.l3num, &(T)->src.u3),   \
+                       ntohs((T)->src.u.all),                          \
+                       IP_VS_DBG_ADDR((T)->src.l3num, &(T)->dst.u3),   \
+                       ntohs((T)->dst.u.all),                          \
                        (T)->dst.protonum
 
-#define FMT_CONN       "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
-#define ARG_CONN(C)    &((C)->caddr.ip), ntohs((C)->cport), \
-                       &((C)->vaddr.ip), ntohs((C)->vport), \
-                       &((C)->daddr.ip), ntohs((C)->dport), \
+#define FMT_CONN       "%s:%u->%s:%u->%s:%u/%u:%u"
+#define ARG_CONN(C)    IP_VS_DBG_ADDR((C)->af, &((C)->caddr)),         \
+                       ntohs((C)->cport),                              \
+                       IP_VS_DBG_ADDR((C)->af, &((C)->vaddr)),         \
+                       ntohs((C)->vport),                              \
+                       IP_VS_DBG_ADDR((C)->daf, &((C)->daddr)),        \
+                       ntohs((C)->dport),                              \
                        (C)->protocol, (C)->state
 
 void
@@ -127,13 +132,17 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
                    new_tuple.dst.protonum != IPPROTO_ICMPV6)
                        new_tuple.dst.u.tcp.port = cp->vport;
        }
-       IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
-                 "ctinfo=%d, old reply=" FMT_TUPLE
-                 ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
-                 __func__, ct, ct->status, ctinfo,
-                 ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
-                 ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+       IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+                     "ctinfo=%d, old reply=" FMT_TUPLE "\n",
+                     __func__, ct, ct->status, ctinfo,
+                     ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple));
+       IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+                     "ctinfo=%d, new reply=" FMT_TUPLE "\n",
+                     __func__, ct, ct->status, ctinfo,
+                     ARG_TUPLE(&new_tuple));
        nf_conntrack_alter_reply(ct, &new_tuple);
+       IP_VS_DBG_BUF(7, "%s: Updated conntrack ct=%p for cp=" FMT_CONN "\n",
+                     __func__, ct, ARG_CONN(cp));
 }
 
 int ip_vs_confirm_conntrack(struct sk_buff *skb)
@@ -152,9 +161,6 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
        struct ip_vs_conn_param p;
        struct net *net = nf_ct_net(ct);
 
-       if (exp->tuple.src.l3num != PF_INET)
-               return;
-
        /*
         * We assume that no NF locks are held before this callback.
         * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
@@ -171,19 +177,15 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
        cp = ip_vs_conn_out_get(&p);
        if (cp) {
                /* Change reply CLIENT->RS to CLIENT->VS */
+               IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found inout cp="
+                             FMT_CONN "\n",
+                             __func__, ct, ct->status, ARG_CONN(cp));
                new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-               IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
-                         FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
-                         __func__, ct, ct->status,
-                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-                         ARG_CONN(cp));
+               IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple="
+                             FMT_TUPLE "\n",
+                             __func__, ct, ARG_TUPLE(&new_reply));
                new_reply.dst.u3 = cp->vaddr;
                new_reply.dst.u.tcp.port = cp->vport;
-               IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
-                         ", inout cp=" FMT_CONN "\n",
-                         __func__, ct,
-                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-                         ARG_CONN(cp));
                goto alter;
        }
 
@@ -191,25 +193,21 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
        cp = ip_vs_conn_in_get(&p);
        if (cp) {
                /* Change reply VS->CLIENT to RS->CLIENT */
+               IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found outin cp="
+                             FMT_CONN "\n",
+                             __func__, ct, ct->status, ARG_CONN(cp));
                new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-               IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
-                         FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
-                         __func__, ct, ct->status,
-                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-                         ARG_CONN(cp));
+               IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple="
+                             FMT_TUPLE "\n",
+                             __func__, ct, ARG_TUPLE(&new_reply));
                new_reply.src.u3 = cp->daddr;
                new_reply.src.u.tcp.port = cp->dport;
-               IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
-                         FMT_TUPLE ", outin cp=" FMT_CONN "\n",
-                         __func__, ct,
-                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-                         ARG_CONN(cp));
                goto alter;
        }
 
-       IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
-                 " - unknown expect\n",
-                 __func__, ct, ct->status, ARG_TUPLE(orig));
+       IP_VS_DBG_BUF(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+                     " - unknown expect\n",
+                     __func__, ct, ct->status, ARG_TUPLE(orig));
        return;
 
 alter:
@@ -247,8 +245,8 @@ void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
 
        exp->expectfn = ip_vs_nfct_expect_callback;
 
-       IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
-               __func__, ct, ARG_TUPLE(&exp->tuple));
+       IP_VS_DBG_BUF(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+                     __func__, ct, ARG_TUPLE(&exp->tuple));
        nf_ct_expect_related(exp);
        nf_ct_expect_put(exp);
 }
@@ -274,26 +272,25 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
        tuple.dst.u3 = cp->vaddr;
        tuple.dst.u.all = cp->vport;
 
-       IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
-               " for conn " FMT_CONN "\n",
-               __func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+       IP_VS_DBG_BUF(7, "%s: dropping conntrack for conn " FMT_CONN "\n",
+                     __func__, ARG_CONN(cp));
 
        h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple);
        if (h) {
                ct = nf_ct_tuplehash_to_ctrack(h);
                if (nf_ct_kill(ct)) {
-                       IP_VS_DBG(7, "%s: ct=%p, deleted conntrack for tuple="
-                               FMT_TUPLE "\n",
-                               __func__, ct, ARG_TUPLE(&tuple));
+                       IP_VS_DBG_BUF(7, "%s: ct=%p deleted for tuple="
+                                     FMT_TUPLE "\n",
+                                     __func__, ct, ARG_TUPLE(&tuple));
                } else {
-                       IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
-                               FMT_TUPLE "\n",
-                               __func__, ct, ARG_TUPLE(&tuple));
+                       IP_VS_DBG_BUF(7, "%s: ct=%p, no conntrack for tuple="
+                                     FMT_TUPLE "\n",
+                                     __func__, ct, ARG_TUPLE(&tuple));
                }
                nf_ct_put(ct);
        } else {
-               IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
-                       __func__, ARG_TUPLE(&tuple));
+               IP_VS_DBG_BUF(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+                             __func__, ARG_TUPLE(&tuple));
        }
 }
 
index eff7569824e5b7e568874e9ed25e9ea2310670d9..3250c4a1111e27046c797c703524e80166c80a34 100644 (file)
@@ -109,7 +109,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
                        return 0;
 
                /* Call application helper if needed */
-               ret = ip_vs_app_pkt_out(cp, skb);
+               ret = ip_vs_app_pkt_out(cp, skb, iph);
                if (ret == 0)
                        return 0;
                /* ret=2: csum update is needed after payload mangling */
@@ -156,7 +156,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
                        return 0;
 
                /* Call application helper if needed */
-               ret = ip_vs_app_pkt_in(cp, skb);
+               ret = ip_vs_app_pkt_in(cp, skb, iph);
                if (ret == 0)
                        return 0;
                /* ret=2: csum update is needed after payload mangling */
index 569631d2b2a10d32c149491b5140158020917831..80d10ad12a15f686a68e0457ac3701b605a764c9 100644 (file)
@@ -170,7 +170,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
                        return 0;
 
                /* Call application helper if needed */
-               if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+               if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
                        return 0;
                /* ret=2: csum update is needed after payload mangling */
                if (ret == 1)
@@ -251,7 +251,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
                 *      Attempt ip_vs_app call.
                 *      It will fix ip_vs_conn and iph ack_seq stuff
                 */
-               if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+               if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
                        return 0;
                /* ret=2: csum update is needed after payload mangling */
                if (ret == 1)
index c15ef7c2a1fae800189da7351729da4377789143..e0ef11c3691e49deebd7ae4204cc4ad8bce53cc7 100644 (file)
@@ -162,7 +162,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
                /*
                 *      Call application helper if needed
                 */
-               if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+               if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
                        return 0;
                /* ret=2: csum update is needed after payload mangling */
                if (ret == 1)
@@ -246,7 +246,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
                 *      Attempt ip_vs_app call.
                 *      It will fix ip_vs_conn
                 */
-               if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+               if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
                        return 0;
                /* ret=2: csum update is needed after payload mangling */
                if (ret == 1)
index 82451b7e0acb2eaae5ca26ed27d803c663b9152a..15ed91309992e85121f0eb4c3ad01d5be2bdd194 100644 (file)
@@ -220,7 +220,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
        enum flow_offload_tuple_dir dir;
        struct flow_offload *flow;
        struct net_device *outdev;
-       const struct rtable *rt;
+       struct rtable *rt;
        unsigned int thoff;
        struct iphdr *iph;
        __be32 nexthop;
@@ -241,7 +241,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 
        dir = tuplehash->tuple.dir;
        flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-       rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+       rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
 
        if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) &&
            (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0)
@@ -264,6 +264,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 
        skb->dev = outdev;
        nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+       skb_dst_set_noref(skb, &rt->dst);
        neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
 
        return NF_STOLEN;
@@ -480,6 +481,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 
        skb->dev = outdev;
        nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
+       skb_dst_set_noref(skb, &rt->dst);
        neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
 
        return NF_STOLEN;
index 821f8d835f7ad7cab6cdbcba56fd4f2832dfd47d..b7df32a56e7ed2b495e39b54a7219a59af1531e4 100644 (file)
@@ -1036,7 +1036,7 @@ static struct pernet_operations nat_net_ops = {
        .size = sizeof(struct nat_net),
 };
 
-struct nf_nat_hook nat_hook = {
+static struct nf_nat_hook nat_hook = {
        .parse_nat_setup        = nfnetlink_parse_nat_setup,
 #ifdef CONFIG_XFRM
        .decode_session         = __nf_nat_decode_session,
index 7c4bb0a773ca2237670bd24469ac18b441761c63..adee04af8d43f519402c20b4f1a8bd11929a2159 100644 (file)
@@ -15,7 +15,6 @@
 #include <linux/inetdevice.h>
 #include <linux/ip.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
 #include <linux/netdevice.h>
 #include <linux/netfilter.h>
 #include <linux/types.h>
@@ -124,6 +123,3 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
        return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
 }
 EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
index 87b2a77add654ec600195b590fab4862f263bd47..c785bc5a66f149aabf74cbb35b0af3c4db352b11 100644 (file)
@@ -28,6 +28,28 @@ static LIST_HEAD(nf_tables_objects);
 static LIST_HEAD(nf_tables_flowtables);
 static u64 table_handle;
 
+enum { /* values stored in net->nft.validate_state; changes to that field
+        * are applied by nft_validate_state_update()
+        */
+       NFT_VALIDATE_SKIP       = 0,
+       NFT_VALIDATE_NEED,
+       NFT_VALIDATE_DO,
+};
+
+static void nft_validate_state_update(struct net *net, u8 new_validate_state)
+{      /* Record new_validate_state in net->nft.validate_state, except
+        * that a current state of NFT_VALIDATE_DO is never replaced by
+        * NFT_VALIDATE_NEED.
+        */
+       switch (net->nft.validate_state) {
+       case NFT_VALIDATE_SKIP: /* SKIP -> DO triggers a one-time warning,
+                                * but the new state is still stored below
+                                */
+               WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO);
+               break;
+       case NFT_VALIDATE_NEED: /* any new state is accepted */
+               break;
+       case NFT_VALIDATE_DO:   /* keep DO when asked for NEED; any other
+                                * value leaves the switch and is stored
+                                */
+               if (new_validate_state == NFT_VALIDATE_NEED)
+                       return;
+       }
+
+       net->nft.validate_state = new_validate_state;
+}
+
 static void nft_ctx_init(struct nft_ctx *ctx,
                         struct net *net,
                         const struct sk_buff *skb,
@@ -373,7 +395,7 @@ static struct nft_table *nft_table_lookup(const struct net *net,
        if (nla == NULL)
                return ERR_PTR(-EINVAL);
 
-       list_for_each_entry(table, &net->nft.tables, list) {
+       list_for_each_entry_rcu(table, &net->nft.tables, list) {
                if (!nla_strcmp(nla, table->name) &&
                    table->family == family &&
                    nft_active_genmask(table, genmask))
@@ -546,6 +568,24 @@ done:
        return skb->len;
 }
 
+static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
+                                     const struct nlmsghdr *nlh,
+                                     struct netlink_dump_control *c)
+{      /* Start a netlink dump with the RCU read lock temporarily dropped.
+        * The unlock/lock pair below implies the caller holds
+        * rcu_read_lock(); it is held again when this function returns.
+        * Returns -EINVAL if no reference on this module could be taken,
+        * otherwise the result of netlink_dump_start().
+        */
+       int err;
+
+       if (!try_module_get(THIS_MODULE))
+               return -EINVAL;
+
+       rcu_read_unlock();      /* NOTE(review): presumably because
+                                * netlink_dump_start() must not run inside
+                                * an RCU read-side section - confirm
+                                */
+       err = netlink_dump_start(nlsk, skb, nlh, c);
+       rcu_read_lock();
+       module_put(THIS_MODULE);        /* reference only spans the unlocked window */
+
+       return err;
+}
+
+/* called with rcu_read_lock held */
 static int nf_tables_gettable(struct net *net, struct sock *nlsk,
                              struct sk_buff *skb, const struct nlmsghdr *nlh,
                              const struct nlattr * const nla[],
@@ -561,8 +601,10 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
        if (nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_tables,
+                       .module = THIS_MODULE,
                };
-               return netlink_dump_start(nlsk, skb, nlh, &c);
+
+               return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
        }
 
        table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask);
@@ -571,7 +613,7 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
                return PTR_ERR(table);
        }
 
-       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!skb2)
                return -ENOMEM;
 
@@ -933,7 +975,7 @@ static struct nft_chain *nft_chain_lookup(const struct nft_table *table,
        if (nla == NULL)
                return ERR_PTR(-EINVAL);
 
-       list_for_each_entry(chain, &table->chains, list) {
+       list_for_each_entry_rcu(chain, &table->chains, list) {
                if (!nla_strcmp(nla, chain->name) &&
                    nft_active_genmask(chain, genmask))
                        return chain;
@@ -1135,6 +1177,7 @@ done:
        return skb->len;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getchain(struct net *net, struct sock *nlsk,
                              struct sk_buff *skb, const struct nlmsghdr *nlh,
                              const struct nlattr * const nla[],
@@ -1151,8 +1194,10 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
        if (nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_chains,
+                       .module = THIS_MODULE,
                };
-               return netlink_dump_start(nlsk, skb, nlh, &c);
+
+               return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
        }
 
        table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
@@ -1167,7 +1212,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
                return PTR_ERR(chain);
        }
 
-       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!skb2)
                return -ENOMEM;
 
@@ -1237,12 +1282,29 @@ static void nft_chain_stats_replace(struct nft_base_chain *chain,
                rcu_assign_pointer(chain->stats, newstats);
 }
 
+static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
+{      /* Free the rule arrays attached to @chain: the two generation
+        * arrays and whatever chain->rules_next still points to.
+        */
+       struct nft_rule **g0 = rcu_dereference_raw(chain->rules_gen_0);
+       struct nft_rule **g1 = rcu_dereference_raw(chain->rules_gen_1);
+
+       if (g0 != g1)   /* both generations may share one array; free it once */
+               kvfree(g1);
+       kvfree(g0);
+
+       /* should be NULL either via abort or via successful commit;
+        * a leftover pointer is reported once and freed anyway
+        */
+       WARN_ON_ONCE(chain->rules_next);
+       kvfree(chain->rules_next);
+}
+
 static void nf_tables_chain_destroy(struct nft_ctx *ctx)
 {
        struct nft_chain *chain = ctx->chain;
 
        BUG_ON(chain->use > 0);
 
+       /* no concurrent access possible anymore */
+       nf_tables_chain_free_chain_rules(chain);
+
        if (nft_is_base_chain(chain)) {
                struct nft_base_chain *basechain = nft_base_chain(chain);
 
@@ -1335,6 +1397,27 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook)
        module_put(hook->type->owner);
 }
 
+/*
+ * Bookkeeping record that lives in the same allocation as a rule-pointer
+ * array, directly after the array's NULL end marker
+ * (nf_tables_chain_alloc_rules() reserves the room for it).  It is filled in
+ * only when the array is retired, so that the array can be freed from an RCU
+ * callback.
+ */
+struct nft_rules_old {
+       struct rcu_head h;      /* handed to call_rcu() */
+       struct nft_rule **start;        /* first slot of the array; passed to kvfree() */
+};
+
+/*
+ * Allocate a rule-pointer array with room for @alloc rules, one extra slot
+ * for the NULL end marker, and a trailing struct nft_rules_old.
+ *
+ * @chain is not used by the body.
+ *
+ * Returns the kvmalloc() result, or NULL when @alloc exceeds INT_MAX or the
+ * slot count multiplied by the pointer size would exceed INT_MAX.  This
+ * function does not write to the returned memory; callers store the rule
+ * pointers and the NULL end marker themselves.
+ */
+static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *chain,
+                                                    unsigned int alloc)
+{
+       if (alloc > INT_MAX)
+               return NULL;
+
+       alloc += 1;     /* NULL, ends rules */
+       /* reject sizes where the multiplication below would exceed INT_MAX */
+       if (sizeof(struct nft_rule *) > INT_MAX / alloc)
+               return NULL;
+
+       /* from here on @alloc is a byte count, not a slot count */
+       alloc *= sizeof(struct nft_rule *);
+       alloc += sizeof(struct nft_rules_old);
+
+       return kvmalloc(alloc, GFP_KERNEL);
+}
+
 static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
                              u8 policy, bool create)
 {
@@ -1344,6 +1427,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
        struct nft_stats __percpu *stats;
        struct net *net = ctx->net;
        struct nft_chain *chain;
+       struct nft_rule **rules;
        int err;
 
        if (table->use == UINT_MAX)
@@ -1406,6 +1490,16 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
                goto err1;
        }
 
+       rules = nf_tables_chain_alloc_rules(chain, 0);
+       if (!rules) {
+               err = -ENOMEM;
+               goto err1;
+       }
+
+       *rules = NULL;
+       rcu_assign_pointer(chain->rules_gen_0, rules);
+       rcu_assign_pointer(chain->rules_gen_1, rules);
+
        err = nf_tables_register_hook(net, table, chain);
        if (err < 0)
                goto err1;
@@ -1849,19 +1943,7 @@ static int nf_tables_newexpr(const struct nft_ctx *ctx,
                        goto err1;
        }
 
-       if (ops->validate) {
-               const struct nft_data *data = NULL;
-
-               err = ops->validate(ctx, expr, &data);
-               if (err < 0)
-                       goto err2;
-       }
-
        return 0;
-
-err2:
-       if (ops->destroy)
-               ops->destroy(ctx, expr);
 err1:
        expr->ops = NULL;
        return err;
@@ -1920,7 +2002,7 @@ static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain,
        struct nft_rule *rule;
 
        // FIXME: this sucks
-       list_for_each_entry(rule, &chain->rules, list) {
+       list_for_each_entry_rcu(rule, &chain->rules, list) {
                if (handle == rule->handle)
                        return rule;
        }
@@ -2116,6 +2198,7 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb)
        return 0;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getrule(struct net *net, struct sock *nlsk,
                             struct sk_buff *skb, const struct nlmsghdr *nlh,
                             const struct nlattr * const nla[],
@@ -2134,18 +2217,19 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_rules,
                        .done = nf_tables_dump_rules_done,
+                       .module = THIS_MODULE,
                };
 
                if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
                        struct nft_rule_dump_ctx *ctx;
 
-                       ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+                       ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
                        if (!ctx)
                                return -ENOMEM;
 
                        if (nla[NFTA_RULE_TABLE]) {
                                ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
-                                                       GFP_KERNEL);
+                                                       GFP_ATOMIC);
                                if (!ctx->table) {
                                        kfree(ctx);
                                        return -ENOMEM;
@@ -2153,7 +2237,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
                        }
                        if (nla[NFTA_RULE_CHAIN]) {
                                ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
-                                                       GFP_KERNEL);
+                                                       GFP_ATOMIC);
                                if (!ctx->chain) {
                                        kfree(ctx->table);
                                        kfree(ctx);
@@ -2163,7 +2247,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
                        c.data = ctx;
                }
 
-               return netlink_dump_start(nlsk, skb, nlh, &c);
+               return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
        }
 
        table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
@@ -2184,7 +2268,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
                return PTR_ERR(rule);
        }
 
-       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!skb2)
                return -ENOMEM;
 
@@ -2225,6 +2309,53 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx,
        nf_tables_rule_destroy(ctx, rule);
 }
 
+/*
+ * Run the ->validate() callback of every expression in @chain.
+ *
+ * Rules for which nft_is_active_next() is false are skipped, as are
+ * expressions whose ops provide no validate callback.
+ *
+ * Returns 0 when every callback succeeded, otherwise the first negative
+ * value returned by a callback (validation stops at that point).
+ */
+int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
+{
+       struct nft_expr *expr, *last;
+       const struct nft_data *data;    /* output argument of ->validate(); not read here */
+       struct nft_rule *rule;
+       int err;
+
+       list_for_each_entry(rule, &chain->rules, list) {
+               if (!nft_is_active_next(ctx->net, rule))
+                       continue;
+
+               nft_rule_for_each_expr(expr, last, rule) {
+                       if (!expr->ops->validate)
+                               continue;
+
+                       err = expr->ops->validate(ctx, expr, &data);
+                       if (err < 0)
+                               return err;
+               }
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(nft_chain_validate);
+
+/*
+ * Validate every base chain of @table with nft_chain_validate().
+ *
+ * Chains for which nft_is_base_chain() is false are not validated directly
+ * by this loop.  The context handed to the validators carries only the
+ * netns, the table's family and the chain currently being checked.
+ *
+ * Returns 0 on success or the first negative error from
+ * nft_chain_validate().
+ */
+static int nft_table_validate(struct net *net, const struct nft_table *table)
+{
+       struct nft_chain *chain;
+       struct nft_ctx ctx = {
+               .net    = net,
+               .family = table->family,
+       };
+       int err;
+
+       list_for_each_entry(chain, &table->chains, list) {
+               if (!nft_is_base_chain(chain))
+                       continue;
+
+               ctx.chain = chain;
+               err = nft_chain_validate(&ctx, chain);
+               if (err < 0)
+                       return err;
+       }
+
+       return 0;
+}
+
 #define NFT_RULE_MAXEXPRS      128
 
 static struct nft_expr_info *info;
@@ -2352,6 +2483,10 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
                err = nf_tables_newexpr(&ctx, &info[i], expr);
                if (err < 0)
                        goto err2;
+
+               if (info[i].ops->validate)
+                       nft_validate_state_update(net, NFT_VALIDATE_NEED);
+
                info[i].ops = NULL;
                expr = nft_expr_next(expr);
        }
@@ -2395,8 +2530,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
                }
        }
        chain->use++;
-       return 0;
 
+       if (net->nft.validate_state == NFT_VALIDATE_DO)
+               return nft_table_validate(net, table);
+
+       return 0;
 err2:
        nf_tables_rule_release(&ctx, rule);
 err1:
@@ -2655,7 +2793,7 @@ static struct nft_set *nft_set_lookup(const struct nft_table *table,
        if (nla == NULL)
                return ERR_PTR(-EINVAL);
 
-       list_for_each_entry(set, &table->sets, list) {
+       list_for_each_entry_rcu(set, &table->sets, list) {
                if (!nla_strcmp(nla, set->name) &&
                    nft_active_genmask(set, genmask))
                        return set;
@@ -2781,7 +2919,7 @@ static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
        return 0;
 }
 
-static u64 nf_jiffies64_to_msecs(u64 input)
+static __be64 nf_jiffies64_to_msecs(u64 input)
 {
        u64 ms = jiffies64_to_nsecs(input);
 
@@ -2960,6 +3098,7 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb)
        return 0;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getset(struct net *net, struct sock *nlsk,
                            struct sk_buff *skb, const struct nlmsghdr *nlh,
                            const struct nlattr * const nla[],
@@ -2982,17 +3121,18 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_sets,
                        .done = nf_tables_dump_sets_done,
+                       .module = THIS_MODULE,
                };
                struct nft_ctx *ctx_dump;
 
-               ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_KERNEL);
+               ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_ATOMIC);
                if (ctx_dump == NULL)
                        return -ENOMEM;
 
                *ctx_dump = ctx;
                c.data = ctx_dump;
 
-               return netlink_dump_start(nlsk, skb, nlh, &c);
+               return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
        }
 
        /* Only accept unspec with dump */
@@ -3005,7 +3145,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
        if (IS_ERR(set))
                return PTR_ERR(set);
 
-       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (skb2 == NULL)
                return -ENOMEM;
 
@@ -3746,7 +3886,7 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
        ext = nft_set_elem_ext(set, &elem);
 
        err = -ENOMEM;
-       skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+       skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (skb == NULL)
                goto err1;
 
@@ -3768,6 +3908,7 @@ err1:
        return err == -EAGAIN ? -ENOBUFS : err;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
                                struct sk_buff *skb, const struct nlmsghdr *nlh,
                                const struct nlattr * const nla[],
@@ -3792,10 +3933,11 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_set,
                        .done = nf_tables_dump_set_done,
+                       .module = THIS_MODULE,
                };
                struct nft_set_dump_ctx *dump_ctx;
 
-               dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL);
+               dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_ATOMIC);
                if (!dump_ctx)
                        return -ENOMEM;
 
@@ -3803,7 +3945,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
                dump_ctx->ctx = ctx;
 
                c.data = dump_ctx;
-               return netlink_dump_start(nlsk, skb, nlh, &c);
+               return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
        }
 
        if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
@@ -4034,6 +4176,12 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
                                                          d2.type, d2.len);
                        if (err < 0)
                                goto err3;
+
+                       if (d2.type == NFT_DATA_VERDICT &&
+                           (data.verdict.code == NFT_GOTO ||
+                            data.verdict.code == NFT_JUMP))
+                               nft_validate_state_update(ctx->net,
+                                                         NFT_VALIDATE_NEED);
                }
 
                nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len);
@@ -4133,7 +4281,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
        const struct nlattr *attr;
        struct nft_set *set;
        struct nft_ctx ctx;
-       int rem, err = 0;
+       int rem, err;
 
        if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
                return -EINVAL;
@@ -4154,9 +4302,13 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
        nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
                err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags);
                if (err < 0)
-                       break;
+                       return err;
        }
-       return err;
+
+       if (net->nft.validate_state == NFT_VALIDATE_DO)
+               return nft_table_validate(net, ctx.table);
+
+       return 0;
 }
 
 /**
@@ -4426,7 +4578,7 @@ struct nft_object *nft_obj_lookup(const struct nft_table *table,
 {
        struct nft_object *obj;
 
-       list_for_each_entry(obj, &table->objects, list) {
+       list_for_each_entry_rcu(obj, &table->objects, list) {
                if (!nla_strcmp(nla, obj->name) &&
                    objtype == obj->ops->type->type &&
                    nft_active_genmask(obj, genmask))
@@ -4756,12 +4908,12 @@ nft_obj_filter_alloc(const struct nlattr * const nla[])
 {
        struct nft_obj_filter *filter;
 
-       filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+       filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
        if (!filter)
                return ERR_PTR(-ENOMEM);
 
        if (nla[NFTA_OBJ_TABLE]) {
-               filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_KERNEL);
+               filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
                if (!filter->table) {
                        kfree(filter);
                        return ERR_PTR(-ENOMEM);
@@ -4773,6 +4925,7 @@ nft_obj_filter_alloc(const struct nlattr * const nla[])
        return filter;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getobj(struct net *net, struct sock *nlsk,
                            struct sk_buff *skb, const struct nlmsghdr *nlh,
                            const struct nlattr * const nla[],
@@ -4792,6 +4945,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_obj,
                        .done = nf_tables_dump_obj_done,
+                       .module = THIS_MODULE,
                };
 
                if (nla[NFTA_OBJ_TABLE] ||
@@ -4804,7 +4958,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 
                        c.data = filter;
                }
-               return netlink_dump_start(nlsk, skb, nlh, &c);
+               return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
        }
 
        if (!nla[NFTA_OBJ_NAME] ||
@@ -4824,7 +4978,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
                return PTR_ERR(obj);
        }
 
-       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!skb2)
                return -ENOMEM;
 
@@ -4969,7 +5123,7 @@ struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
 {
        struct nft_flowtable *flowtable;
 
-       list_for_each_entry(flowtable, &table->flowtables, list) {
+       list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
                if (!nla_strcmp(nla, flowtable->name) &&
                    nft_active_genmask(flowtable, genmask))
                        return flowtable;
@@ -5430,13 +5584,13 @@ nft_flowtable_filter_alloc(const struct nlattr * const nla[])
 {
        struct nft_flowtable_filter *filter;
 
-       filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+       filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
        if (!filter)
                return ERR_PTR(-ENOMEM);
 
        if (nla[NFTA_FLOWTABLE_TABLE]) {
                filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE],
-                                          GFP_KERNEL);
+                                          GFP_ATOMIC);
                if (!filter->table) {
                        kfree(filter);
                        return ERR_PTR(-ENOMEM);
@@ -5445,6 +5599,7 @@ nft_flowtable_filter_alloc(const struct nlattr * const nla[])
        return filter;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
                                  struct sk_buff *skb,
                                  const struct nlmsghdr *nlh,
@@ -5463,6 +5618,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_flowtable,
                        .done = nf_tables_dump_flowtable_done,
+                       .module = THIS_MODULE,
                };
 
                if (nla[NFTA_FLOWTABLE_TABLE]) {
@@ -5474,7 +5630,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
 
                        c.data = filter;
                }
-               return netlink_dump_start(nlsk, skb, nlh, &c);
+               return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
        }
 
        if (!nla[NFTA_FLOWTABLE_NAME])
@@ -5490,7 +5646,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
        if (IS_ERR(flowtable))
                return PTR_ERR(flowtable);
 
-       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!skb2)
                return -ENOMEM;
 
@@ -5654,7 +5810,7 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk,
        struct sk_buff *skb2;
        int err;
 
-       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+       skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (skb2 == NULL)
                return -ENOMEM;
 
@@ -5676,7 +5832,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_table_policy,
        },
        [NFT_MSG_GETTABLE] = {
-               .call           = nf_tables_gettable,
+               .call_rcu       = nf_tables_gettable,
                .attr_count     = NFTA_TABLE_MAX,
                .policy         = nft_table_policy,
        },
@@ -5691,7 +5847,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_chain_policy,
        },
        [NFT_MSG_GETCHAIN] = {
-               .call           = nf_tables_getchain,
+               .call_rcu       = nf_tables_getchain,
                .attr_count     = NFTA_CHAIN_MAX,
                .policy         = nft_chain_policy,
        },
@@ -5706,7 +5862,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_rule_policy,
        },
        [NFT_MSG_GETRULE] = {
-               .call           = nf_tables_getrule,
+               .call_rcu       = nf_tables_getrule,
                .attr_count     = NFTA_RULE_MAX,
                .policy         = nft_rule_policy,
        },
@@ -5721,7 +5877,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_set_policy,
        },
        [NFT_MSG_GETSET] = {
-               .call           = nf_tables_getset,
+               .call_rcu       = nf_tables_getset,
                .attr_count     = NFTA_SET_MAX,
                .policy         = nft_set_policy,
        },
@@ -5736,7 +5892,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_set_elem_list_policy,
        },
        [NFT_MSG_GETSETELEM] = {
-               .call           = nf_tables_getsetelem,
+               .call_rcu       = nf_tables_getsetelem,
                .attr_count     = NFTA_SET_ELEM_LIST_MAX,
                .policy         = nft_set_elem_list_policy,
        },
@@ -5746,7 +5902,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_set_elem_list_policy,
        },
        [NFT_MSG_GETGEN] = {
-               .call           = nf_tables_getgen,
+               .call_rcu       = nf_tables_getgen,
        },
        [NFT_MSG_NEWOBJ] = {
                .call_batch     = nf_tables_newobj,
@@ -5754,7 +5910,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_obj_policy,
        },
        [NFT_MSG_GETOBJ] = {
-               .call           = nf_tables_getobj,
+               .call_rcu       = nf_tables_getobj,
                .attr_count     = NFTA_OBJ_MAX,
                .policy         = nft_obj_policy,
        },
@@ -5764,7 +5920,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_obj_policy,
        },
        [NFT_MSG_GETOBJ_RESET] = {
-               .call           = nf_tables_getobj,
+               .call_rcu       = nf_tables_getobj,
                .attr_count     = NFTA_OBJ_MAX,
                .policy         = nft_obj_policy,
        },
@@ -5774,7 +5930,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
                .policy         = nft_flowtable_policy,
        },
        [NFT_MSG_GETFLOWTABLE] = {
-               .call           = nf_tables_getflowtable,
+               .call_rcu       = nf_tables_getflowtable,
                .attr_count     = NFTA_FLOWTABLE_MAX,
                .policy         = nft_flowtable_policy,
        },
@@ -5785,6 +5941,27 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
        },
 };
 
+/*
+ * Validate the tables of @net according to net->nft.validate_state:
+ *
+ *   NFT_VALIDATE_SKIP - nothing is validated.
+ *   NFT_VALIDATE_NEED - the state is first switched to NFT_VALIDATE_DO,
+ *                       then handled like NFT_VALIDATE_DO.
+ *   NFT_VALIDATE_DO   - every table in net->nft.tables is passed to
+ *                       nft_table_validate().
+ *
+ * Returns 0 on success.  Any validation failure is reported as -EAGAIN,
+ * regardless of the error nft_table_validate() returned.
+ */
+static int nf_tables_validate(struct net *net)
+{
+       struct nft_table *table;
+
+       switch (net->nft.validate_state) {
+       case NFT_VALIDATE_SKIP:
+               break;
+       case NFT_VALIDATE_NEED:
+               nft_validate_state_update(net, NFT_VALIDATE_DO);
+               /* fall through */
+       case NFT_VALIDATE_DO:
+               list_for_each_entry(table, &net->nft.tables, list) {
+                       if (nft_table_validate(net, table) < 0)
+                               return -EAGAIN;
+               }
+               break;
+       }
+
+       return 0;
+}
+
 static void nft_chain_commit_update(struct nft_trans *trans)
 {
        struct nft_base_chain *basechain;
@@ -5850,21 +6027,166 @@ static void nf_tables_commit_release(struct net *net)
        }
 }
 
+/*
+ * Build chain->rules_next: a NULL-terminated array holding the rules of
+ * @chain for which nft_is_active_next() is true, in list order.
+ *
+ * Does nothing and returns 0 when rules_next is already set or when the
+ * chain itself is not active in the next generation.  Returns -ENOMEM when
+ * the array cannot be allocated, 0 otherwise.
+ */
+static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain)
+{
+       struct nft_rule *rule;
+       unsigned int alloc = 0;
+       int i;
+
+       /* already handled or inactive chain? */
+       if (chain->rules_next || !nft_is_active_next(net, chain))
+               return 0;
+
+       /* start the cursor at the list head so the walk covers every rule */
+       rule = list_entry(&chain->rules, struct nft_rule, list);
+       i = 0;
+
+       /* first pass: count the rules that go into the array */
+       list_for_each_entry_continue(rule, &chain->rules, list) {
+               if (nft_is_active_next(net, rule))
+                       alloc++;
+       }
+
+       chain->rules_next = nf_tables_chain_alloc_rules(chain, alloc);
+       if (!chain->rules_next)
+               return -ENOMEM;
+
+       /*
+        * Second pass: fill the array.  The cursor is reused as left by the
+        * first pass -- presumably back at the list head after a complete
+        * traversal; confirm against list_for_each_entry_continue().
+        */
+       list_for_each_entry_continue(rule, &chain->rules, list) {
+               if (nft_is_active_next(net, rule))
+                       chain->rules_next[i++] = rule;
+       }
+
+       chain->rules_next[i] = NULL;
+       return 0;
+}
+
+/*
+ * Drop the rules_next arrays prepared for this commit: for every
+ * NFT_MSG_NEWRULE / NFT_MSG_DELRULE transaction on net->nft.commit_list,
+ * free the rules_next array of the transaction's chain and reset the
+ * pointer to NULL.
+ */
+static void nf_tables_commit_chain_prepare_cancel(struct net *net)
+{
+       struct nft_trans *trans, *next;
+
+       list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+               struct nft_chain *chain = trans->ctx.chain;
+
+               if (trans->msg_type == NFT_MSG_NEWRULE ||
+                   trans->msg_type == NFT_MSG_DELRULE) {
+                       /*
+                        * Several transactions may refer to the same chain;
+                        * resetting the pointer keeps later iterations from
+                        * freeing the array again.
+                        */
+                       kvfree(chain->rules_next);
+                       chain->rules_next = NULL;
+               }
+       }
+}
+
+/*
+ * RCU callback queued by nf_tables_commit_chain_free_rules_old(): recover
+ * the struct nft_rules_old that embeds @h and free the allocation it points
+ * to via ->start.
+ */
+static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h)
+{
+       struct nft_rules_old *o = container_of(h, struct nft_rules_old, h);
+
+       kvfree(o->start);
+}
+
+/*
+ * Queue the rule-pointer array @rules for freeing via call_rcu().
+ *
+ * The memory right after the array's NULL end marker is used as a
+ * struct nft_rules_old: its ->start is set to @rules and its rcu_head is
+ * handed to call_rcu().  @rules must therefore be NULL-terminated and have
+ * that trailing space available.
+ */
+static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules)
+{
+       struct nft_rule **r = rules;
+       struct nft_rules_old *old;
+
+       /* find the NULL end marker */
+       while (*r)
+               r++;
+
+       r++;    /* rcu_head is after end marker */
+       old = (void *) r;
+       old->start = rules;
+
+       call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old);
+}
+
+/*
+ * Set up the rule array that @chain uses in the next generation.
+ *
+ * nft_gencursor_next() selects the slot to update: rules_gen_1 when it
+ * returns true, rules_gen_0 otherwise.
+ *
+ *  - rules_next == NULL and both slots already share one array: nothing to do.
+ *  - rules_next == NULL and the slots differ: the slot for the next
+ *    generation is pointed at the other slot's array and the array it held
+ *    before is queued for freeing.
+ *  - rules_next != NULL: it is published in the slot for the next generation
+ *    and rules_next is cleared.  The array previously held by that slot is
+ *    queued for freeing unless the other slot still refers to it.
+ */
+static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *chain)
+{
+       struct nft_rule **g0, **g1;
+       bool next_genbit;
+
+       next_genbit = nft_gencursor_next(net);
+
+       g0 = rcu_dereference_protected(chain->rules_gen_0,
+                                      lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+       g1 = rcu_dereference_protected(chain->rules_gen_1,
+                                      lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+
+       /* No changes to this chain? */
+       if (chain->rules_next == NULL) {
+               /* chain had no change in last or next generation */
+               if (g0 == g1)
+                       return;
+               /*
+                * chain had no change in this generation; make sure next
+                * one uses same rules as current generation.
+                */
+               if (next_genbit) {
+                       rcu_assign_pointer(chain->rules_gen_1, g0);
+                       nf_tables_commit_chain_free_rules_old(g1);
+               } else {
+                       rcu_assign_pointer(chain->rules_gen_0, g1);
+                       nf_tables_commit_chain_free_rules_old(g0);
+               }
+
+               return;
+       }
+
+       /* publish the prepared array in the slot of the next generation */
+       if (next_genbit)
+               rcu_assign_pointer(chain->rules_gen_1, chain->rules_next);
+       else
+               rcu_assign_pointer(chain->rules_gen_0, chain->rules_next);
+
+       chain->rules_next = NULL;
+
+       /* shared array is still referenced by the other slot; keep it */
+       if (g0 == g1)
+               return;
+
+       /* retire the array that the overwritten slot used to hold */
+       if (next_genbit)
+               nf_tables_commit_chain_free_rules_old(g1);
+       else
+               nf_tables_commit_chain_free_rules_old(g0);
+}
+
 static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 {
        struct nft_trans *trans, *next;
        struct nft_trans_elem *te;
+       struct nft_chain *chain;
+       struct nft_table *table;
 
-       /* Bump generation counter, invalidate any dump in progress */
-       while (++net->nft.base_seq == 0);
+       /* 0. Validate ruleset, otherwise roll back for error reporting. */
+       if (nf_tables_validate(net) < 0)
+               return -EAGAIN;
 
-       /* A new generation has just started */
-       net->nft.gencursor = nft_gencursor_next(net);
+       /* 1.  Allocate space for next generation rules_gen_X[] */
+       list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+               int ret;
+
+               if (trans->msg_type == NFT_MSG_NEWRULE ||
+                   trans->msg_type == NFT_MSG_DELRULE) {
+                       chain = trans->ctx.chain;
 
-       /* Make sure all packets have left the previous generation before
-        * purging old rules.
+                       ret = nf_tables_commit_chain_prepare(net, chain);
+                       if (ret < 0) {
+                               nf_tables_commit_chain_prepare_cancel(net);
+                               return ret;
+                       }
+               }
+       }
+
+       /* step 2.  Make rules_gen_X visible to packet path */
+       list_for_each_entry(table, &net->nft.tables, list) {
+               list_for_each_entry(chain, &table->chains, list) {
+                       if (!nft_is_active_next(net, chain))
+                               continue;
+                       nf_tables_commit_chain_active(net, chain);
+               }
+       }
+
+       /*
+        * Bump generation counter, invalidate any dump in progress.
+        * Cannot fail after this point.
         */
-       synchronize_rcu();
+       while (++net->nft.base_seq == 0);
+
+       /* step 3. Start new generation, rules_gen_X now in use. */
+       net->nft.gencursor = nft_gencursor_next(net);
 
        list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
                switch (trans->msg_type) {
@@ -6126,6 +6448,11 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
        return 0;
 }
 
+/*
+ * Reset the validation state of @net to NFT_VALIDATE_SKIP.  Installed as the
+ * .cleanup callback of nf_tables_subsys.
+ */
+static void nf_tables_cleanup(struct net *net)
+{
+       nft_validate_state_update(net, NFT_VALIDATE_SKIP);
+}
+
 static bool nf_tables_valid_genid(struct net *net, u32 genid)
 {
        return net->nft.base_seq == genid;
@@ -6138,6 +6465,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
        .cb             = nf_tables_cb,
        .commit         = nf_tables_commit,
        .abort          = nf_tables_abort,
+       .cleanup        = nf_tables_cleanup,
        .valid_genid    = nf_tables_valid_genid,
 };
 
@@ -6221,19 +6549,18 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
 
        list_for_each_entry(rule, &chain->rules, list) {
                nft_rule_for_each_expr(expr, last, rule) {
-                       const struct nft_data *data = NULL;
+                       struct nft_immediate_expr *priv;
+                       const struct nft_data *data;
                        int err;
 
-                       if (!expr->ops->validate)
+                       if (strcmp(expr->ops->type->name, "immediate"))
                                continue;
 
-                       err = expr->ops->validate(ctx, expr, &data);
-                       if (err < 0)
-                               return err;
-
-                       if (data == NULL)
+                       priv = nft_expr_priv(expr);
+                       if (priv->dreg != NFT_REG_VERDICT)
                                continue;
 
+                       data = &priv->data;
                        switch (data->verdict.code) {
                        case NFT_JUMP:
                        case NFT_GOTO:
@@ -6713,6 +7040,8 @@ static int __net_init nf_tables_init_net(struct net *net)
        INIT_LIST_HEAD(&net->nft.tables);
        INIT_LIST_HEAD(&net->nft.commit_list);
        net->nft.base_seq = 1;
+       net->nft.validate_state = NFT_VALIDATE_SKIP;
+
        return 0;
 }
 
index 4f46d2f4e167d31353c2a00d06694caed7de7d86..47cf667b15caca4bf8321584e6bf8f63ac62d222 100644 (file)
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_log.h>
 
-static const char *const comments[__NFT_TRACETYPE_MAX] = {
-       [NFT_TRACETYPE_POLICY]  = "policy",
-       [NFT_TRACETYPE_RETURN]  = "return",
-       [NFT_TRACETYPE_RULE]    = "rule",
-};
-
-static const struct nf_loginfo trace_loginfo = {
-       .type = NF_LOG_TYPE_LOG,
-       .u = {
-               .log = {
-                       .level = LOGLEVEL_WARNING,
-                       .logflags = NF_LOG_DEFAULT_MASK,
-               },
-       },
-};
-
 static noinline void __nft_trace_packet(struct nft_traceinfo *info,
                                        const struct nft_chain *chain,
                                        enum nft_trace_types type)
@@ -133,7 +117,7 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
 
 struct nft_jumpstack {
        const struct nft_chain  *chain;
-       const struct nft_rule   *rule;
+       struct nft_rule *const *rules;
 };
 
 unsigned int
@@ -141,27 +125,29 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
 {
        const struct nft_chain *chain = priv, *basechain = chain;
        const struct net *net = nft_net(pkt);
+       struct nft_rule *const *rules;
        const struct nft_rule *rule;
        const struct nft_expr *expr, *last;
        struct nft_regs regs;
        unsigned int stackptr = 0;
        struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
-       unsigned int gencursor = nft_genmask_cur(net);
+       bool genbit = READ_ONCE(net->nft.gencursor);
        struct nft_traceinfo info;
 
        info.trace = false;
        if (static_branch_unlikely(&nft_trace_enabled))
                nft_trace_init(&info, pkt, &regs.verdict, basechain);
 do_chain:
-       rule = list_entry(&chain->rules, struct nft_rule, list);
+       if (genbit)
+               rules = rcu_dereference(chain->rules_gen_1);
+       else
+               rules = rcu_dereference(chain->rules_gen_0);
+
 next_rule:
+       rule = *rules;
        regs.verdict.code = NFT_CONTINUE;
-       list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
-
-               /* This rule is not active, skip. */
-               if (unlikely(rule->genmask & gencursor))
-                       continue;
-
+       for (; *rules ; rules++) {
+               rule = *rules;
                nft_rule_for_each_expr(expr, last, rule) {
                        if (expr->ops == &nft_cmp_fast_ops)
                                nft_cmp_fast_eval(expr, &regs);
@@ -199,7 +185,7 @@ next_rule:
        case NFT_JUMP:
                BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE);
                jumpstack[stackptr].chain = chain;
-               jumpstack[stackptr].rule  = rule;
+               jumpstack[stackptr].rules = rules + 1;
                stackptr++;
                /* fall through */
        case NFT_GOTO:
@@ -221,7 +207,7 @@ next_rule:
        if (stackptr > 0) {
                stackptr--;
                chain = jumpstack[stackptr].chain;
-               rule  = jumpstack[stackptr].rule;
+               rules = jumpstack[stackptr].rules;
                goto next_rule;
        }
 
index 03ead8a9e90ccfcc1936ee66064269dec4719fbd..4d0da7042affbcc4a2d129c852cf1b6fd2bf04a3 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/uaccess.h>
 #include <net/sock.h>
 #include <linux/init.h>
+#include <linux/sched/signal.h>
 
 #include <net/netlink.h>
 #include <linux/netfilter/nfnetlink.h>
@@ -37,6 +38,8 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
        rcu_dereference_protected(table[(id)].subsys, \
                                  lockdep_nfnl_is_held((id)))
 
+#define NFNL_MAX_ATTR_COUNT    32
+
 static struct {
        struct mutex                            mutex;
        const struct nfnetlink_subsystem __rcu  *subsys;
@@ -76,6 +79,13 @@ EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held);
 
 int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
 {
+       u8 cb_id;
+
+       /* Sanity-check attr_count size to avoid stack buffer overflow. */
+       for (cb_id = 0; cb_id < n->cb_count; cb_id++)
+               if (WARN_ON(n->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT))
+                       return -EINVAL;
+
        nfnl_lock(n->subsys_id);
        if (table[n->subsys_id].subsys) {
                nfnl_unlock(n->subsys_id);
@@ -185,11 +195,17 @@ replay:
        {
                int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
                u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
-               struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+               struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
                struct nlattr *attr = (void *)nlh + min_len;
                int attrlen = nlh->nlmsg_len - min_len;
                __u8 subsys_id = NFNL_SUBSYS_ID(type);
 
+               /* Sanity-check NFNL_MAX_ATTR_COUNT */
+               if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
+                       rcu_read_unlock();
+                       return -ENOMEM;
+               }
+
                err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen,
                                ss->cb[cb_id].policy, extack);
                if (err < 0) {
@@ -330,6 +346,13 @@ replay:
        while (skb->len >= nlmsg_total_size(0)) {
                int msglen, type;
 
+               if (fatal_signal_pending(current)) {
+                       nfnl_err_reset(&err_list);
+                       err = -EINTR;
+                       status = NFNL_BATCH_FAILURE;
+                       goto done;
+               }
+
                memset(&extack, 0, sizeof(extack));
                nlh = nlmsg_hdr(skb);
                err = 0;
@@ -379,10 +402,16 @@ replay:
                {
                        int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
                        u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
-                       struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+                       struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
                        struct nlattr *attr = (void *)nlh + min_len;
                        int attrlen = nlh->nlmsg_len - min_len;
 
+                       /* Sanity-check NFNL_MAX_ATTR_COUNT */
+                       if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
+                               err = -ENOMEM;
+                               goto ack;
+                       }
+
                        err = nla_parse(cda, ss->cb[cb_id].attr_count, attr,
                                        attrlen, ss->cb[cb_id].policy, NULL);
                        if (err < 0)
@@ -441,10 +470,19 @@ done:
                kfree_skb(skb);
                goto replay;
        } else if (status == NFNL_BATCH_DONE) {
-               ss->commit(net, oskb);
+               err = ss->commit(net, oskb);
+               if (err == -EAGAIN) {
+                       status |= NFNL_BATCH_REPLAY;
+                       goto done;
+               } else if (err) {
+                       ss->abort(net, oskb);
+                       netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL);
+               }
        } else {
                ss->abort(net, oskb);
        }
+       if (ss->cleanup)
+               ss->cleanup(net);
 
        nfnl_err_deliver(&err_list, oskb);
        nfnl_unlock(subsys_id);
index 1d99a1efdafcda5e209eadb975c4e53c859d5495..8d1ff654e5aff1dfd5c2ace7693876568ea3377a 100644 (file)
@@ -611,10 +611,10 @@ nla_put_failure:
        return -1;
 }
 
-static int nfnl_compat_get(struct net *net, struct sock *nfnl,
-                          struct sk_buff *skb, const struct nlmsghdr *nlh,
-                          const struct nlattr * const tb[],
-                          struct netlink_ext_ack *extack)
+static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
+                              struct sk_buff *skb, const struct nlmsghdr *nlh,
+                              const struct nlattr * const tb[],
+                              struct netlink_ext_ack *extack)
 {
        int ret = 0, target;
        struct nfgenmsg *nfmsg;
@@ -653,16 +653,21 @@ static int nfnl_compat_get(struct net *net, struct sock *nfnl,
                return -EINVAL;
        }
 
+       if (!try_module_get(THIS_MODULE))
+               return -EINVAL;
+
+       rcu_read_unlock();
        try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name,
                                                 rev, target, &ret),
                                                 fmt, name);
-
        if (ret < 0)
-               return ret;
+               goto out_put;
 
        skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-       if (skb2 == NULL)
-               return -ENOMEM;
+       if (skb2 == NULL) {
+               ret = -ENOMEM;
+               goto out_put;
+       }
 
        /* include the best revision for this extension in the message */
        if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid,
@@ -672,14 +677,16 @@ static int nfnl_compat_get(struct net *net, struct sock *nfnl,
                                  nfmsg->nfgen_family,
                                  name, ret, target) <= 0) {
                kfree_skb(skb2);
-               return -ENOSPC;
+               goto out_put;
        }
 
        ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
                                MSG_DONTWAIT);
        if (ret > 0)
                ret = 0;
-
+out_put:
+       rcu_read_lock();
+       module_put(THIS_MODULE);
        return ret == -EAGAIN ? -ENOBUFS : ret;
 }
 
@@ -691,7 +698,7 @@ static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
 };
 
 static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = {
-       [NFNL_MSG_COMPAT_GET]           = { .call = nfnl_compat_get,
+       [NFNL_MSG_COMPAT_GET]           = { .call_rcu = nfnl_compat_get_rcu,
                                            .attr_count = NFTA_COMPAT_MAX,
                                            .policy = nfnl_compat_policy_get },
 };
index ce13a50b91893fd67a9c8586f7cca53278b1067d..8abb9891cdf22e26e2c17d4cadb4acba1a8dc14b 100644 (file)
 #include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_dup_netdev.h>
+#include <net/neighbour.h>
+#include <net/ip.h>
 
 struct nft_fwd_netdev {
        enum nft_registers      sreg_dev:8;
@@ -32,6 +36,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
 
 static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = {
        [NFTA_FWD_SREG_DEV]     = { .type = NLA_U32 },
+       [NFTA_FWD_SREG_ADDR]    = { .type = NLA_U32 },
+       [NFTA_FWD_NFPROTO]      = { .type = NLA_U32 },
 };
 
 static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
@@ -62,7 +68,133 @@ nla_put_failure:
        return -1;
 }
 
+struct nft_fwd_neigh {
+       enum nft_registers      sreg_dev:8;
+       enum nft_registers      sreg_addr:8;
+       u8                      nfproto;
+};
+
+static void nft_fwd_neigh_eval(const struct nft_expr *expr,
+                             struct nft_regs *regs,
+                             const struct nft_pktinfo *pkt)
+{
+       struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+       void *addr = &regs->data[priv->sreg_addr];
+       int oif = regs->data[priv->sreg_dev];
+       unsigned int verdict = NF_STOLEN;
+       struct sk_buff *skb = pkt->skb;
+       struct net_device *dev;
+       int neigh_table;
+
+       switch (priv->nfproto) {
+       case NFPROTO_IPV4: {
+               struct iphdr *iph;
+
+               if (skb->protocol != htons(ETH_P_IP)) {
+                       verdict = NFT_BREAK;
+                       goto out;
+               }
+               if (skb_try_make_writable(skb, sizeof(*iph))) {
+                       verdict = NF_DROP;
+                       goto out;
+               }
+               iph = ip_hdr(skb);
+               ip_decrease_ttl(iph);
+               neigh_table = NEIGH_ARP_TABLE;
+               break;
+               }
+       case NFPROTO_IPV6: {
+               struct ipv6hdr *ip6h;
+
+               if (skb->protocol != htons(ETH_P_IPV6)) {
+                       verdict = NFT_BREAK;
+                       goto out;
+               }
+               if (skb_try_make_writable(skb, sizeof(*ip6h))) {
+                       verdict = NF_DROP;
+                       goto out;
+               }
+               ip6h = ipv6_hdr(skb);
+               ip6h->hop_limit--;
+               neigh_table = NEIGH_ND_TABLE;
+               break;
+               }
+       default:
+               verdict = NFT_BREAK;
+               goto out;
+       }
+
+       dev = dev_get_by_index_rcu(nft_net(pkt), oif);
+       if (dev == NULL)
+               return;
+
+       skb->dev = dev;
+       neigh_xmit(neigh_table, dev, addr, skb);
+out:
+       regs->verdict.code = verdict;
+}
+
+static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
+                             const struct nft_expr *expr,
+                             const struct nlattr * const tb[])
+{
+       struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+       unsigned int addr_len;
+       int err;
+
+       if (!tb[NFTA_FWD_SREG_DEV] ||
+           !tb[NFTA_FWD_SREG_ADDR] ||
+           !tb[NFTA_FWD_NFPROTO])
+               return -EINVAL;
+
+       priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]);
+       priv->sreg_addr = nft_parse_register(tb[NFTA_FWD_SREG_ADDR]);
+       priv->nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO]));
+
+       switch (priv->nfproto) {
+       case NFPROTO_IPV4:
+               addr_len = sizeof(struct in_addr);
+               break;
+       case NFPROTO_IPV6:
+               addr_len = sizeof(struct in6_addr);
+               break;
+       default:
+               return -EOPNOTSUPP;
+       }
+
+       err = nft_validate_register_load(priv->sreg_dev, sizeof(int));
+       if (err < 0)
+               return err;
+
+       return nft_validate_register_load(priv->sreg_addr, addr_len);
+}
+
+static const struct nft_expr_ops nft_fwd_netdev_ingress_ops;
+
+static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+       struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+
+       if (nft_dump_register(skb, NFTA_FWD_SREG_DEV, priv->sreg_dev) ||
+           nft_dump_register(skb, NFTA_FWD_SREG_ADDR, priv->sreg_addr) ||
+           nla_put_be32(skb, NFTA_FWD_NFPROTO, htonl(priv->nfproto)))
+               goto nla_put_failure;
+
+       return 0;
+
+nla_put_failure:
+       return -1;
+}
+
 static struct nft_expr_type nft_fwd_netdev_type;
+static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
+       .type           = &nft_fwd_netdev_type,
+       .size           = NFT_EXPR_SIZE(sizeof(struct nft_fwd_neigh)),
+       .eval           = nft_fwd_neigh_eval,
+       .init           = nft_fwd_neigh_init,
+       .dump           = nft_fwd_neigh_dump,
+};
+
 static const struct nft_expr_ops nft_fwd_netdev_ops = {
        .type           = &nft_fwd_netdev_type,
        .size           = NFT_EXPR_SIZE(sizeof(struct nft_fwd_netdev)),
@@ -71,10 +203,22 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = {
        .dump           = nft_fwd_netdev_dump,
 };
 
+static const struct nft_expr_ops *
+nft_fwd_select_ops(const struct nft_ctx *ctx,
+                  const struct nlattr * const tb[])
+{
+       if (tb[NFTA_FWD_SREG_ADDR])
+               return &nft_fwd_neigh_netdev_ops;
+       if (tb[NFTA_FWD_SREG_DEV])
+               return &nft_fwd_netdev_ops;
+
+        return ERR_PTR(-EOPNOTSUPP);
+}
+
 static struct nft_expr_type nft_fwd_netdev_type __read_mostly = {
        .family         = NFPROTO_NETDEV,
        .name           = "fwd",
-       .ops            = &nft_fwd_netdev_ops,
+       .select_ops     = nft_fwd_select_ops,
        .policy         = nft_fwd_netdev_policy,
        .maxattr        = NFTA_FWD_MAX,
        .owner          = THIS_MODULE,
index f0fc21f887753c6cc91be5ada551c12385cbb579..c2d237144f747c4e4938ca4807668a400d1018c5 100644 (file)
@@ -177,10 +177,7 @@ static int nft_jhash_map_init(const struct nft_ctx *ctx,
        priv->map = nft_set_lookup_global(ctx->net, ctx->table,
                                          tb[NFTA_HASH_SET_NAME],
                                          tb[NFTA_HASH_SET_ID], genmask);
-       if (IS_ERR(priv->map))
-               return PTR_ERR(priv->map);
-
-       return 0;
+       return PTR_ERR_OR_ZERO(priv->map);
 }
 
 static int nft_symhash_init(const struct nft_ctx *ctx,
@@ -220,10 +217,7 @@ static int nft_symhash_map_init(const struct nft_ctx *ctx,
        priv->map = nft_set_lookup_global(ctx->net, ctx->table,
                                          tb[NFTA_HASH_SET_NAME],
                                          tb[NFTA_HASH_SET_ID], genmask);
-       if (IS_ERR(priv->map))
-               return PTR_ERR(priv->map);
-
-       return 0;
+       return PTR_ERR_OR_ZERO(priv->map);
 }
 
 static int nft_jhash_dump(struct sk_buff *skb,
index aa87ff8beae82cf733303b1b32d8b50ba5af65b7..15adf8ca82c3783efcb510efa85aa894afd1c2da 100644 (file)
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_tables.h>
 
-struct nft_immediate_expr {
-       struct nft_data         data;
-       enum nft_registers      dreg:8;
-       u8                      dlen;
-};
-
 static void nft_immediate_eval(const struct nft_expr *expr,
                               struct nft_regs *regs,
                               const struct nft_pktinfo *pkt)
@@ -101,12 +95,27 @@ nla_put_failure:
 
 static int nft_immediate_validate(const struct nft_ctx *ctx,
                                  const struct nft_expr *expr,
-                                 const struct nft_data **data)
+                                 const struct nft_data **d)
 {
        const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+       const struct nft_data *data;
+       int err;
 
-       if (priv->dreg == NFT_REG_VERDICT)
-               *data = &priv->data;
+       if (priv->dreg != NFT_REG_VERDICT)
+               return 0;
+
+       data = &priv->data;
+
+       switch (data->verdict.code) {
+       case NFT_JUMP:
+       case NFT_GOTO:
+               err = nft_chain_validate(ctx, data->verdict.chain);
+               if (err < 0)
+                       return err;
+               break;
+       default:
+               break;
+       }
 
        return 0;
 }
index a27be36dc0afbb937b39c2712c797b9d1215c3ab..7eef1cffbf1bc83bce99162ce00b2f03bc14f2cb 100644 (file)
@@ -9,12 +9,15 @@
  * Development of this code funded by Astaro AG (http://www.astaro.com/)
  */
 
+#include <linux/audit.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_log.h>
 #include <linux/netdevice.h>
@@ -26,12 +29,93 @@ struct nft_log {
        char                    *prefix;
 };
 
+static bool audit_ip4(struct audit_buffer *ab, struct sk_buff *skb)
+{
+       struct iphdr _iph;
+       const struct iphdr *ih;
+
+       ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_iph), &_iph);
+       if (!ih)
+               return false;
+
+       audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu",
+                        &ih->saddr, &ih->daddr, ih->protocol);
+
+       return true;
+}
+
+static bool audit_ip6(struct audit_buffer *ab, struct sk_buff *skb)
+{
+       struct ipv6hdr _ip6h;
+       const struct ipv6hdr *ih;
+       u8 nexthdr;
+       __be16 frag_off;
+
+       ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h);
+       if (!ih)
+               return false;
+
+       nexthdr = ih->nexthdr;
+       ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), &nexthdr, &frag_off);
+
+       audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu",
+                        &ih->saddr, &ih->daddr, nexthdr);
+
+       return true;
+}
+
+static void nft_log_eval_audit(const struct nft_pktinfo *pkt)
+{
+       struct sk_buff *skb = pkt->skb;
+       struct audit_buffer *ab;
+       int fam = -1;
+
+       if (!audit_enabled)
+               return;
+
+       ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
+       if (!ab)
+               return;
+
+       audit_log_format(ab, "mark=%#x", skb->mark);
+
+       switch (nft_pf(pkt)) {
+       case NFPROTO_BRIDGE:
+               switch (eth_hdr(skb)->h_proto) {
+               case htons(ETH_P_IP):
+                       fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1;
+                       break;
+               case htons(ETH_P_IPV6):
+                       fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1;
+                       break;
+               }
+               break;
+       case NFPROTO_IPV4:
+               fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1;
+               break;
+       case NFPROTO_IPV6:
+               fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1;
+               break;
+       }
+
+       if (fam == -1)
+               audit_log_format(ab, " saddr=? daddr=? proto=-1");
+
+       audit_log_end(ab);
+}
+
 static void nft_log_eval(const struct nft_expr *expr,
                         struct nft_regs *regs,
                         const struct nft_pktinfo *pkt)
 {
        const struct nft_log *priv = nft_expr_priv(expr);
 
+       if (priv->loginfo.type == NF_LOG_TYPE_LOG &&
+           priv->loginfo.u.log.level == LOGLEVEL_AUDIT) {
+               nft_log_eval_audit(pkt);
+               return;
+       }
+
        nf_log_packet(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
                      nft_in(pkt), nft_out(pkt), &priv->loginfo, "%s",
                      priv->prefix);
@@ -84,7 +168,7 @@ static int nft_log_init(const struct nft_ctx *ctx,
                } else {
                        li->u.log.level = LOGLEVEL_WARNING;
                }
-               if (li->u.log.level > LOGLEVEL_DEBUG) {
+               if (li->u.log.level > LOGLEVEL_AUDIT) {
                        err = -EINVAL;
                        goto err1;
                }
@@ -112,6 +196,9 @@ static int nft_log_init(const struct nft_ctx *ctx,
                break;
        }
 
+       if (li->u.log.level == LOGLEVEL_AUDIT)
+               return 0;
+
        err = nf_logger_find_get(ctx->family, li->type);
        if (err < 0)
                goto err1;
@@ -133,6 +220,9 @@ static void nft_log_destroy(const struct nft_ctx *ctx,
        if (priv->prefix != nft_log_null_prefix)
                kfree(priv->prefix);
 
+       if (li->u.log.level == LOGLEVEL_AUDIT)
+               return;
+
        nf_logger_put(ctx->family, li->type);
 }
 
index f52da5e2199fe4e55febfc5bf4366acfddc87889..42e6fadf1417eba7ce4512d43cce339fc627e204 100644 (file)
@@ -149,6 +149,52 @@ nla_put_failure:
        return -1;
 }
 
+static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
+                                      struct nft_set *set,
+                                      const struct nft_set_iter *iter,
+                                      struct nft_set_elem *elem)
+{
+       const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+       const struct nft_data *data;
+
+       if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+           *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
+               return 0;
+
+       data = nft_set_ext_data(ext);
+       switch (data->verdict.code) {
+       case NFT_JUMP:
+       case NFT_GOTO:
+               return nft_chain_validate(ctx, data->verdict.chain);
+       default:
+               return 0;
+       }
+}
+
+static int nft_lookup_validate(const struct nft_ctx *ctx,
+                              const struct nft_expr *expr,
+                              const struct nft_data **d)
+{
+       const struct nft_lookup *priv = nft_expr_priv(expr);
+       struct nft_set_iter iter;
+
+       if (!(priv->set->flags & NFT_SET_MAP) ||
+           priv->set->dtype != NFT_DATA_VERDICT)
+               return 0;
+
+       iter.genmask    = nft_genmask_next(ctx->net);
+       iter.skip       = 0;
+       iter.count      = 0;
+       iter.err        = 0;
+       iter.fn         = nft_lookup_validate_setelem;
+
+       priv->set->ops->walk(ctx, priv->set, &iter);
+       if (iter.err < 0)
+               return iter.err;
+
+       return 0;
+}
+
 static const struct nft_expr_ops nft_lookup_ops = {
        .type           = &nft_lookup_type,
        .size           = NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
@@ -156,6 +202,7 @@ static const struct nft_expr_ops nft_lookup_ops = {
        .init           = nft_lookup_init,
        .destroy        = nft_lookup_destroy,
        .dump           = nft_lookup_dump,
+       .validate       = nft_lookup_validate,
 };
 
 struct nft_expr_type nft_lookup_type __read_mostly = {
index cdbc62a53933e5940ce2049af61ab0d453f8962d..1f4d0854cf70b877a43bd46bbb23f46a51157d21 100644 (file)
@@ -114,10 +114,7 @@ static int nft_ng_inc_map_init(const struct nft_ctx *ctx,
                                          tb[NFTA_NG_SET_NAME],
                                          tb[NFTA_NG_SET_ID], genmask);
 
-       if (IS_ERR(priv->map))
-               return PTR_ERR(priv->map);
-
-       return 0;
+       return PTR_ERR_OR_ZERO(priv->map);
 }
 
 static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
new file mode 100644 (file)
index 0000000..d863370
--- /dev/null
@@ -0,0 +1,143 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_socket.h>
+#include <net/inet_sock.h>
+
+struct nft_socket {
+       enum nft_socket_keys            key:8;
+       union {
+               enum nft_registers      dreg:8;
+       };
+};
+
+static void nft_socket_eval(const struct nft_expr *expr,
+                           struct nft_regs *regs,
+                           const struct nft_pktinfo *pkt)
+{
+       const struct nft_socket *priv = nft_expr_priv(expr);
+       struct sk_buff *skb = pkt->skb;
+       struct sock *sk = skb->sk;
+       u32 *dest = &regs->data[priv->dreg];
+
+       if (!sk)
+               switch(nft_pf(pkt)) {
+               case NFPROTO_IPV4:
+                       sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt));
+                       break;
+#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+               case NFPROTO_IPV6:
+                       sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt));
+                       break;
+#endif
+               default:
+                       WARN_ON_ONCE(1);
+                       regs->verdict.code = NFT_BREAK;
+                       return;
+               }
+
+       if(!sk) {
+               nft_reg_store8(dest, 0);
+               return;
+       }
+
+       /* Cache the socket so later socket matches need no further lookup. */
+       skb->sk = sk;
+
+       switch(priv->key) {
+       case NFT_SOCKET_TRANSPARENT:
+               nft_reg_store8(dest, nf_sk_is_transparent(sk));
+               break;
+       default:
+               WARN_ON(1);
+               regs->verdict.code = NFT_BREAK;
+       }
+}
+
+static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = {
+       [NFTA_SOCKET_KEY]               = { .type = NLA_U32 },
+       [NFTA_SOCKET_DREG]              = { .type = NLA_U32 },
+};
+
+static int nft_socket_init(const struct nft_ctx *ctx,
+                          const struct nft_expr *expr,
+                          const struct nlattr * const tb[])
+{
+       struct nft_socket *priv = nft_expr_priv(expr);
+       unsigned int len;
+
+       if (!tb[NFTA_SOCKET_DREG] || !tb[NFTA_SOCKET_KEY])
+               return -EINVAL;
+
+       switch(ctx->family) {
+       case NFPROTO_IPV4:
+#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+       case NFPROTO_IPV6:
+#endif
+       case NFPROTO_INET:
+               break;
+       default:
+               return -EOPNOTSUPP;
+       }
+
+       priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY]));
+       switch(priv->key) {
+       case NFT_SOCKET_TRANSPARENT:
+               len = sizeof(u8);
+               break;
+       default:
+               return -EOPNOTSUPP;
+       }
+
+       priv->dreg = nft_parse_register(tb[NFTA_SOCKET_DREG]);
+       return nft_validate_register_store(ctx, priv->dreg, NULL,
+                                          NFT_DATA_VALUE, len);
+}
+
+static int nft_socket_dump(struct sk_buff *skb,
+                          const struct nft_expr *expr)
+{
+       const struct nft_socket *priv = nft_expr_priv(expr);
+
+       if (nla_put_u32(skb, NFTA_SOCKET_KEY, htonl(priv->key)))
+               return -1;
+       if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg))
+               return -1;
+       return 0;
+}
+
+static struct nft_expr_type nft_socket_type;
+static const struct nft_expr_ops nft_socket_ops = {
+       .type           = &nft_socket_type,
+       .size           = NFT_EXPR_SIZE(sizeof(struct nft_socket)),
+       .eval           = nft_socket_eval,
+       .init           = nft_socket_init,
+       .dump           = nft_socket_dump,
+};
+
+static struct nft_expr_type nft_socket_type __read_mostly = {
+       .name           = "socket",
+       .ops            = &nft_socket_ops,
+       .policy         = nft_socket_policy,
+       .maxattr        = NFTA_SOCKET_MAX,
+       .owner          = THIS_MODULE,
+};
+
+static int __init nft_socket_module_init(void)
+{
+       return nft_register_expr(&nft_socket_type);
+}
+
+static void __exit nft_socket_module_exit(void)
+{
+       nft_unregister_expr(&nft_socket_type);
+}
+
+module_init(nft_socket_module_init);
+module_exit(nft_socket_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Máté Eckl");
+MODULE_DESCRIPTION("nf_tables socket match module");