bpf: Add get{peer, sock}name attach types for sock_addr
author Daniel Borkmann <daniel@iogearbox.net>
Mon, 18 May 2020 22:45:45 +0000 (00:45 +0200)
committer Alexei Starovoitov <ast@kernel.org>
Tue, 19 May 2020 18:32:04 +0000 (11:32 -0700)
As stated in 983695fa6765 ("bpf: fix unconnected udp hooks"), the objective
for the existing cgroup connect/sendmsg/recvmsg/bind BPF hooks is to be
transparent to applications. In Cilium we make use of these hooks [0] in
order to enable E-W load balancing for existing Kubernetes service types
for all Cilium managed nodes in the cluster. Those backends can be local
or remote. The main advantage of this approach is that it operates as close
as possible to the socket, and therefore allows us to avoid packet-based
NAT, given that in connect/sendmsg/recvmsg hooks we only need to xlate sock
addresses.

This also allows exposing NodePort services on loopback addresses in the
host namespace, for example. As another advantage, this also efficiently
blocks bind requests for applications in the host namespace for exposed
ports. However, one missing item is that we also need to perform reverse
xlation for inet{,6}_getname() hooks such that we can return the service
IP/port tuple back to the application instead of the remote peer address.

The vast majority of applications do not bother about getpeername(), but
on a few occasions we've seen breakage when validating the peer's address
since it unexpectedly returns the backend tuple instead of the service one.
Therefore, this trivial patch adds a getpeername() as well as a
getsockname() BPF cgroup hook for both IPv4 and IPv6, allowing the returned
address to be customised in order to address this situation.

Simple example:

  # ./cilium/cilium service list
  ID   Frontend     Service Type   Backend
  1    1.2.3.4:80   ClusterIP      1 => 10.0.0.10:80

Before; curl's verbose output example, no getpeername() reverse xlation:

  # curl --verbose 1.2.3.4
  * Rebuilt URL to: 1.2.3.4/
  *   Trying 1.2.3.4...
  * TCP_NODELAY set
  * Connected to 1.2.3.4 (10.0.0.10) port 80 (#0)
  > GET / HTTP/1.1
  > Host: 1.2.3.4
  > User-Agent: curl/7.58.0
  > Accept: */*
  [...]

After; with getpeername() reverse xlation:

  # curl --verbose 1.2.3.4
  * Rebuilt URL to: 1.2.3.4/
  *   Trying 1.2.3.4...
  * TCP_NODELAY set
  * Connected to 1.2.3.4 (1.2.3.4) port 80 (#0)
  > GET / HTTP/1.1
  > Host: 1.2.3.4
  > User-Agent: curl/7.58.0
  > Accept: */*
  [...]

Originally, I had both under a BPF_CGROUP_INET{4,6}_GETNAME type and exposed
peer to the context in a fashion similar to inet{,6}_getname(), but API-wise
this is suboptimal as it always forces programs to test for ctx->peer, which
can easily be missed, hence the BPF_CGROUP_INET{4,6}_GET{PEER,SOCK}NAME split.
Similarly, the checked return code is on tnum_range(1, 1), but if a use case
comes up in future, it can easily be changed to return an error code instead.
Helper and ctx member access is the same as with connect/sendmsg/etc hooks.

  [0] https://github.com/cilium/cilium/blob/master/bpf/bpf_sock.c

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Andrey Ignatov <rdna@fb.com>
Link: https://lore.kernel.org/bpf/61a479d759b2482ae3efb45546490bacd796a220.1589841594.git.daniel@iogearbox.net
include/linux/bpf-cgroup.h
include/uapi/linux/bpf.h
kernel/bpf/syscall.c
kernel/bpf/verifier.c
net/core/filter.c
net/ipv4/af_inet.c
net/ipv6/af_inet6.c
tools/include/uapi/linux/bpf.h

index 272626cc3fc9dbd40aa828eeb236a150a2b9774c..c66c545e161a60a42fe72a1530cfb031fb357e5d 100644 (file)
@@ -396,6 +396,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 }
 
 #define cgroup_bpf_enabled (0)
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; })
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
index b9b8a0f63b9134e07fc74c483143c4b203b35d21..97e1fd19ff58ae0b1a69ea293958ba6353931ba4 100644 (file)
@@ -220,6 +220,10 @@ enum bpf_attach_type {
        BPF_MODIFY_RETURN,
        BPF_LSM_MAC,
        BPF_TRACE_ITER,
+       BPF_CGROUP_INET4_GETPEERNAME,
+       BPF_CGROUP_INET6_GETPEERNAME,
+       BPF_CGROUP_INET4_GETSOCKNAME,
+       BPF_CGROUP_INET6_GETSOCKNAME,
        __MAX_BPF_ATTACH_TYPE
 };
 
index 57dfc98289d5319c05330d61998ec0cc2f79c525..431241c74614339dd0ca739c33da83bbda444bbc 100644 (file)
@@ -1978,6 +1978,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
+               case BPF_CGROUP_INET4_GETPEERNAME:
+               case BPF_CGROUP_INET6_GETPEERNAME:
+               case BPF_CGROUP_INET4_GETSOCKNAME:
+               case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UDP4_RECVMSG:
@@ -2767,6 +2771,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
        case BPF_CGROUP_INET6_BIND:
        case BPF_CGROUP_INET4_CONNECT:
        case BPF_CGROUP_INET6_CONNECT:
+       case BPF_CGROUP_INET4_GETPEERNAME:
+       case BPF_CGROUP_INET6_GETPEERNAME:
+       case BPF_CGROUP_INET4_GETSOCKNAME:
+       case BPF_CGROUP_INET6_GETSOCKNAME:
        case BPF_CGROUP_UDP4_SENDMSG:
        case BPF_CGROUP_UDP6_SENDMSG:
        case BPF_CGROUP_UDP4_RECVMSG:
@@ -2912,6 +2920,10 @@ static int bpf_prog_query(const union bpf_attr *attr,
        case BPF_CGROUP_INET6_POST_BIND:
        case BPF_CGROUP_INET4_CONNECT:
        case BPF_CGROUP_INET6_CONNECT:
+       case BPF_CGROUP_INET4_GETPEERNAME:
+       case BPF_CGROUP_INET6_GETPEERNAME:
+       case BPF_CGROUP_INET4_GETSOCKNAME:
+       case BPF_CGROUP_INET6_GETSOCKNAME:
        case BPF_CGROUP_UDP4_SENDMSG:
        case BPF_CGROUP_UDP6_SENDMSG:
        case BPF_CGROUP_UDP4_RECVMSG:
index 9c7d67d65d8cc15539e14c92d8134b5b10bf854b..2ed8351f47a499db20db01213860428905b1f1bc 100644 (file)
@@ -7094,7 +7094,11 @@ static int check_return_code(struct bpf_verifier_env *env)
        switch (env->prog->type) {
        case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
                if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
-                   env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
+                   env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
+                   env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
+                   env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
+                   env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
+                   env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
                        range = tnum_range(1, 1);
                break;
        case BPF_PROG_TYPE_CGROUP_SKB:
index 822d662f97ef93482d2708f9623d30400909d523..bd2853d23b505e34cc8aeb61b9500cfdd590aae2 100644 (file)
@@ -7049,6 +7049,8 @@ static bool sock_addr_is_valid_access(int off, int size,
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_BIND:
                case BPF_CGROUP_INET4_CONNECT:
+               case BPF_CGROUP_INET4_GETPEERNAME:
+               case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP4_RECVMSG:
                        break;
@@ -7060,6 +7062,8 @@ static bool sock_addr_is_valid_access(int off, int size,
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET6_CONNECT:
+               case BPF_CGROUP_INET6_GETPEERNAME:
+               case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                        break;
index fcf0d12a407a90a5ae54a5539b51a5e4429dcdcc..8f5c8c9409d397b70252c1b4329cc4a1ed78c870 100644 (file)
@@ -755,12 +755,11 @@ do_err:
 }
 EXPORT_SYMBOL(inet_accept);
 
-
 /*
  *     This does both peername and sockname.
  */
 int inet_getname(struct socket *sock, struct sockaddr *uaddr,
-                       int peer)
+                int peer)
 {
        struct sock *sk         = sock->sk;
        struct inet_sock *inet  = inet_sk(sk);
@@ -781,6 +780,11 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
                sin->sin_port = inet->inet_sport;
                sin->sin_addr.s_addr = addr;
        }
+       if (cgroup_bpf_enabled)
+               BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+                                           peer ? BPF_CGROUP_INET4_GETPEERNAME :
+                                                  BPF_CGROUP_INET4_GETSOCKNAME,
+                                           NULL);
        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
        return sizeof(*sin);
 }
index 771a462a8322b3e3962ec3de85f3c3934fe3f5e0..3b6fcc0c321ae73d609b66f29331811f979a64cc 100644 (file)
@@ -504,9 +504,8 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock);
 /*
  *     This does both peername and sockname.
  */
-
 int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
-                int peer)
+                 int peer)
 {
        struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr;
        struct sock *sk = sock->sk;
@@ -531,9 +530,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
                        sin->sin6_addr = np->saddr;
                else
                        sin->sin6_addr = sk->sk_v6_rcv_saddr;
-
                sin->sin6_port = inet->inet_sport;
        }
+       if (cgroup_bpf_enabled)
+               BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+                                           peer ? BPF_CGROUP_INET6_GETPEERNAME :
+                                                  BPF_CGROUP_INET6_GETSOCKNAME,
+                                           NULL);
        sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
                                                 sk->sk_bound_dev_if);
        return sizeof(*sin);
index 146c742f1d491a91be76ea5fa06c615530f6a2be..1cddc398404a818f6692f547727292855acb437d 100644 (file)
@@ -220,6 +220,10 @@ enum bpf_attach_type {
        BPF_MODIFY_RETURN,
        BPF_LSM_MAC,
        BPF_TRACE_ITER,
+       BPF_CGROUP_INET4_GETPEERNAME,
+       BPF_CGROUP_INET6_GETPEERNAME,
+       BPF_CGROUP_INET4_GETSOCKNAME,
+       BPF_CGROUP_INET6_GETSOCKNAME,
        __MAX_BPF_ATTACH_TYPE
 };