bpf: Add socket assign support

author Joe Stringer <joe@wand.net.nz>

Sun, 29 Mar 2020 22:53:38 +0000 (15:53 -0700)

committer Alexei Starovoitov <ast@kernel.org>

Mon, 30 Mar 2020 20:45:04 +0000 (13:45 -0700)
author Joe Stringer <joe@wand.net.nz>
Sun, 29 Mar 2020 22:53:38 +0000 (15:53 -0700)
committer Alexei Starovoitov <ast@kernel.org>
Mon, 30 Mar 2020 20:45:04 +0000 (13:45 -0700)
diff --git a/include/net/sock.h b/include/net/sock.h

index b5cca7bae69bbc0abe75c6ae848d34eb7bd5118f..dc398cee78739b02db4c62cd9e37b5823b21b121 100644 (file)
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1659,6 +1659,7 @@ void sock_rfree(struct sk_buff *skb);
  void sock_efree(struct sk_buff *skb);
  #ifdef CONFIG_INET
  void sock_edemux(struct sk_buff *skb);
+void sock_pfree(struct sk_buff *skb);
  #else
  #define sock_edemux sock_efree
  #endif
@@ -2526,6 +2527,16 @@ void sock_net_set(struct sock *sk, struct net *net)
         write_pnet(&sk->sk_net, net);
  }
  
+/* Report whether @skb has a prefetched socket attached, i.e. whether its
+ * destructor is sock_pfree() (the destructor bpf_sk_assign() installs).
+ * Always false without CONFIG_INET, where sock_pfree() is not declared.
+ */
+static inline bool
+skb_sk_is_prefetched(struct sk_buff *skb)
+{
+#ifdef CONFIG_INET
+       return skb->destructor == sock_pfree;
+#else
+       return false;
+#endif /* CONFIG_INET */
+}
+
  static inline struct sock *skb_steal_sock(struct sk_buff *skb)
  {
         if (skb->sk) {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

index f1fbc36f58d329e4ca4993eefbb49fae6d49118e..9f786a5a44ac3fca22db85353504887aa1afa3c1 100644 (file)
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2983,6 +2983,28 @@ union bpf_attr {
   *             **bpf_get_current_cgroup_id**\ ().
   *     Return
   *             The id is returned or 0 in case the id could not be retrieved.
+ *
+ * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
+ *     Description
+ *             Assign the *sk* to the *skb*. When combined with appropriate
+ *             routing configuration to receive the packet towards the socket,
+ *             will cause *skb* to be delivered to the specified socket.
+ *             Subsequent redirection of *skb* via  **bpf_redirect**\ (),
+ *             **bpf_clone_redirect**\ () or other methods outside of BPF may
+ *             interfere with successful delivery to the socket.
+ *
+ *             This operation is only valid from TC ingress path.
+ *
+ *             The *flags* argument must be zero.
+ *     Return
+ *             0 on success, or a negative errno in case of failure.
+ *
+ *             * **-EINVAL**           Unsupported flags specified.
+ *             * **-ENOENT**           Socket is unavailable for assignment.
+ *             * **-ENETUNREACH**      Socket is unreachable (wrong netns).
+ *             * **-EOPNOTSUPP**       Unsupported operation, for example a
+ *                                     call from outside of TC ingress.
+ *             * **-ESOCKTNOSUPPORT**  Socket type not supported (reuseport).
   */
  #define __BPF_FUNC_MAPPER(FN)          \
         FN(unspec),                     \
@@ -3108,7 +3130,8 @@ union bpf_attr {
         FN(get_ns_current_pid_tgid),    \
         FN(xdp_output),                 \
         FN(get_netns_cookie),           \
-       FN(get_current_ancestor_cgroup_id),
+       FN(get_current_ancestor_cgroup_id),     \
+       FN(sk_assign),
  
  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
   * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c

index bb4a196c88096c4328c99e10f7d90f624bed885d..ac5c1633f8d2723b32d7d1d32adb5d8cd9436a41 100644 (file)
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5918,6 +5918,35 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
         .arg5_type      = ARG_CONST_SIZE,
  };
  
+/* Attach @sk to @skb so that the receive path can deliver the packet to it.
+ *
+ * Only accepted at TC ingress with @flags == 0, and only when @sk lives in
+ * the same network namespace as the receiving device. On success a
+ * reference on @sk is taken and handed over to the skb; the sock_pfree()
+ * destructor installed here releases it.
+ *
+ * Returns 0 on success, or -EINVAL, -EOPNOTSUPP, -ENETUNREACH,
+ * -ESOCKTNOSUPPORT or -ENOENT as checked below.
+ */
+BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
+{
+       if (flags != 0)
+               return -EINVAL;
+       if (!skb_at_tc_ingress(skb))
+               return -EOPNOTSUPP;
+       if (unlikely(dev_net(skb->dev) != sock_net(sk)))
+               return -ENETUNREACH;
+       /* The helper accepts any sock_common, so @sk may be a request or
+        * timewait socket whose sk_reuseport field is not guaranteed to be
+        * initialized. Only apply the reuseport restriction to full sockets.
+        */
+       if (unlikely(sk_fullsock(sk) && sk->sk_reuseport))
+               return -ESOCKTNOSUPPORT;
+       if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
+               return -ENOENT;
+
+       /* Release any socket already attached before installing the new one. */
+       skb_orphan(skb);
+       skb->sk = sk;
+       skb->destructor = sock_pfree;
+
+       return 0;
+}
+
+/* Helper prototype for bpf_sk_assign(): takes the program context, a
+ * socket pointer (sock_common) and a flags value, and returns an integer.
+ * The helper is not restricted to GPL-licensed programs.
+ */
+static const struct bpf_func_proto bpf_sk_assign_proto = {
+       .func           = bpf_sk_assign,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_PTR_TO_SOCK_COMMON,
+       .arg3_type      = ARG_ANYTHING,
+};
+
  #endif /* CONFIG_INET */
  
  bool bpf_helper_changes_pkt_data(void *func)
@@ -6249,6 +6278,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                 return &bpf_skb_ecn_set_ce_proto;
         case BPF_FUNC_tcp_gen_syncookie:
                 return &bpf_tcp_gen_syncookie_proto;
+       case BPF_FUNC_sk_assign:
+               return &bpf_sk_assign_proto;
  #endif
         default:
                 return bpf_base_func_proto(func_id);
diff --git a/net/core/sock.c b/net/core/sock.c

index 0fc8937a7ff4c15862d62be3c518a0cec328276b..87e3a03c905631b336d164d5fbf35318376e6c1f 100644 (file)
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2071,6 +2071,17 @@ void sock_efree(struct sk_buff *skb)
  }
  EXPORT_SYMBOL(sock_efree);
  
+/* Buffer destructor for the prefetch/receive path: releases the socket
+ * attached to @skb by calling sock_gen_put() on skb->sk.
+ *
+ * It is intended for sockets whose reference count may not be held,
+ * e.g. listen sockets. NOTE(review): as written the put is unconditional,
+ * so whoever installs this destructor must hold a reference on skb->sk
+ * (bpf_sk_assign() takes one before installing it) -- confirm for any
+ * other user.
+ */
+#ifdef CONFIG_INET
+void sock_pfree(struct sk_buff *skb)
+{
+       sock_gen_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_pfree);
+#endif /* CONFIG_INET */
+
  kuid_t sock_i_uid(struct sock *sk)
  {
         kuid_t uid;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c

index aa438c6758a7f9fadbf3dddce56ea7821e290cf2..b0c244af1e4d587322b4e3490ad3d878805be670 100644 (file)
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -509,7 +509,8 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
         IPCB(skb)->iif = skb->skb_iif;
  
         /* Must drop socket now because of tproxy. */
-       skb_orphan(skb);
+       if (!skb_sk_is_prefetched(skb))
+               skb_orphan(skb);
  
         return skb;
  
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c

index 7b089d0ac8cddd02431348de564016e82bf9dd54..e96304d8a4a7f70cae11754ac7532c58d61fb261 100644 (file)
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -285,7 +285,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
         rcu_read_unlock();
  
         /* Must drop socket now because of tproxy. */
-       skb_orphan(skb);
+       if (!skb_sk_is_prefetched(skb))
+               skb_orphan(skb);
  
         return skb;
  err:
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c

index 46f47e58b3bea0b13461408db42f75e31da34ebd..54d5652cfe6ca3849680db1d5489067c447908bf 100644 (file)
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -12,6 +12,7 @@
  #include <linux/bpf.h>
  
  #include <net/netlink.h>
+#include <net/sock.h>
  #include <net/pkt_sched.h>
  #include <net/pkt_cls.h>
  
@@ -53,6 +54,8 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
                 bpf_compute_data_pointers(skb);
                 filter_res = BPF_PROG_RUN(filter, skb);
         }
+       if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK)
+               skb_orphan(skb);
         rcu_read_unlock();
  
         /* A BPF program may overwrite the default action opcode.
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h

index f1fbc36f58d329e4ca4993eefbb49fae6d49118e..9f786a5a44ac3fca22db85353504887aa1afa3c1 100644 (file)
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2983,6 +2983,28 @@ union bpf_attr {
   *             **bpf_get_current_cgroup_id**\ ().
   *     Return
   *             The id is returned or 0 in case the id could not be retrieved.
+ *
+ * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
+ *     Description
+ *             Assign the *sk* to the *skb*. When combined with appropriate
+ *             routing configuration to receive the packet towards the socket,
+ *             will cause *skb* to be delivered to the specified socket.
+ *             Subsequent redirection of *skb* via  **bpf_redirect**\ (),
+ *             **bpf_clone_redirect**\ () or other methods outside of BPF may
+ *             interfere with successful delivery to the socket.
+ *
+ *             This operation is only valid from TC ingress path.
+ *
+ *             The *flags* argument must be zero.
+ *     Return
+ *             0 on success, or a negative errno in case of failure.
+ *
+ *             * **-EINVAL**           Unsupported flags specified.
+ *             * **-ENOENT**           Socket is unavailable for assignment.
+ *             * **-ENETUNREACH**      Socket is unreachable (wrong netns).
+ *             * **-EOPNOTSUPP**       Unsupported operation, for example a
+ *                                     call from outside of TC ingress.
+ *             * **-ESOCKTNOSUPPORT**  Socket type not supported (reuseport).
   */
  #define __BPF_FUNC_MAPPER(FN)          \
         FN(unspec),                     \
@@ -3108,7 +3130,8 @@ union bpf_attr {
         FN(get_ns_current_pid_tgid),    \
         FN(xdp_output),                 \
         FN(get_netns_cookie),           \
-       FN(get_current_ancestor_cgroup_id),
+       FN(get_current_ancestor_cgroup_id),     \
+       FN(sk_assign),
  
  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
   * function eBPF program intends to call
author	Joe Stringer <joe@wand.net.nz>
	Sun, 29 Mar 2020 22:53:38 +0000 (15:53 -0700)
committer	Alexei Starovoitov <ast@kernel.org>
	Mon, 30 Mar 2020 20:45:04 +0000 (13:45 -0700)
include/net/sock.h		patch \| blob \| blame \| history
include/uapi/linux/bpf.h		patch \| blob \| blame \| history
net/core/filter.c		patch \| blob \| blame \| history
net/core/sock.c		patch \| blob \| blame \| history
net/ipv4/ip_input.c		patch \| blob \| blame \| history
net/ipv6/ip6_input.c		patch \| blob \| blame \| history
net/sched/act_bpf.c		patch \| blob \| blame \| history
tools/include/uapi/linux/bpf.h		patch \| blob \| blame \| history