bpf, sockmap: Pass skb ownership through read_skb
authorJohn Fastabend <john.fastabend@gmail.com>
Tue, 23 May 2023 02:56:05 +0000 (19:56 -0700)
committerDaniel Borkmann <daniel@iogearbox.net>
Tue, 23 May 2023 14:09:47 +0000 (16:09 +0200)
The read_skb hook currently calls consume_skb(), which means that if the
recv_actor program wants to keep using the skb it must increment the
refcount so that consume_skb() doesn't free the sk_buff.

This is problematic because in some error cases under memory pressure
we may need to linearize the sk_buff from sk_psock_skb_ingress_enqueue().
Then we get this,

 skb_linearize()
   __pskb_pull_tail()
     pskb_expand_head()
       BUG_ON(skb_shared(skb))

Because we incremented the users refcnt in sk_psock_verdict_recv(), the
skb is shared (refcnt > 1) and we trip the BUG_ON().

To fix, let's simply pass ownership of the sk_buff through the read_skb
call. Then we can drop the consume from the read_skb handlers and rely on
the verdict recv to do any required kfree.

Bug found while testing in our CI which runs in VMs that hit memory
constraints rather regularly. William tested TCP read_skb handlers.

[  106.536188] ------------[ cut here ]------------
[  106.536197] kernel BUG at net/core/skbuff.c:1693!
[  106.536479] invalid opcode: 0000 [#1] PREEMPT SMP PTI
[  106.536726] CPU: 3 PID: 1495 Comm: curl Not tainted 5.19.0-rc5 #1
[  106.537023] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ArchLinux 1.16.0-1 04/01/2014
[  106.537467] RIP: 0010:pskb_expand_head+0x269/0x330
[  106.538585] RSP: 0018:ffffc90000138b68 EFLAGS: 00010202
[  106.538839] RAX: 000000000000003f RBX: ffff8881048940e8 RCX: 0000000000000a20
[  106.539186] RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff8881048940e8
[  106.539529] RBP: ffffc90000138be8 R08: 00000000e161fd1a R09: 0000000000000000
[  106.539877] R10: 0000000000000018 R11: 0000000000000000 R12: ffff8881048940e8
[  106.540222] R13: 0000000000000003 R14: 0000000000000000 R15: ffff8881048940e8
[  106.540568] FS:  00007f277dde9f00(0000) GS:ffff88813bd80000(0000) knlGS:0000000000000000
[  106.540954] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  106.541227] CR2: 00007f277eeede64 CR3: 000000000ad3e000 CR4: 00000000000006e0
[  106.541569] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  106.541915] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  106.542255] Call Trace:
[  106.542383]  <IRQ>
[  106.542487]  __pskb_pull_tail+0x4b/0x3e0
[  106.542681]  skb_ensure_writable+0x85/0xa0
[  106.542882]  sk_skb_pull_data+0x18/0x20
[  106.543084]  bpf_prog_b517a65a242018b0_bpf_skskb_http_verdict+0x3a9/0x4aa9
[  106.543536]  ? migrate_disable+0x66/0x80
[  106.543871]  sk_psock_verdict_recv+0xe2/0x310
[  106.544258]  ? sk_psock_write_space+0x1f0/0x1f0
[  106.544561]  tcp_read_skb+0x7b/0x120
[  106.544740]  tcp_data_queue+0x904/0xee0
[  106.544931]  tcp_rcv_established+0x212/0x7c0
[  106.545142]  tcp_v4_do_rcv+0x174/0x2a0
[  106.545326]  tcp_v4_rcv+0xe70/0xf60
[  106.545500]  ip_protocol_deliver_rcu+0x48/0x290
[  106.545744]  ip_local_deliver_finish+0xa7/0x150

Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
Reported-by: William Findlay <will@isovalent.com>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: William Findlay <will@isovalent.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20230523025618.113937-2-john.fastabend@gmail.com
net/core/skmsg.c
net/ipv4/tcp.c
net/ipv4/udp.c
net/unix/af_unix.c
net/vmw_vsock/virtio_transport_common.c

index f81883759d381c14d70f126d7281404e786259f9..4a3dc8d272957ea4fba6de6496df3b76ff193a05 100644 (file)
@@ -1183,8 +1183,6 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
        int ret = __SK_DROP;
        int len = skb->len;
 
-       skb_get(skb);
-
        rcu_read_lock();
        psock = sk_psock(sk);
        if (unlikely(!psock)) {
index 4d6392c16b7a5a9a853c27e3a4b258d000738304..e914e344637784cc9e112a03a30ecfda5ad3af39 100644 (file)
@@ -1773,7 +1773,6 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
                WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
                tcp_flags = TCP_SKB_CB(skb)->tcp_flags;
                used = recv_actor(sk, skb);
-               consume_skb(skb);
                if (used < 0) {
                        if (!copied)
                                copied = used;
index aa32afd871ee50968f7bb8152401be60dece1454..9482def1f310379efde1a1a8c86999b4b826cf17 100644 (file)
@@ -1818,7 +1818,7 @@ EXPORT_SYMBOL(__skb_recv_udp);
 int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
        struct sk_buff *skb;
-       int err, copied;
+       int err;
 
 try_again:
        skb = skb_recv_udp(sk, MSG_DONTWAIT, &err);
@@ -1837,10 +1837,7 @@ try_again:
        }
 
        WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
-       copied = recv_actor(sk, skb);
-       kfree_skb(skb);
-
-       return copied;
+       return recv_actor(sk, skb);
 }
 EXPORT_SYMBOL(udp_read_skb);
 
index cc695c9f09ec24297bf214092ab170646306339b..e7728b57a8c70ed3ea9f5a268cb9cc1238168bf4 100644 (file)
@@ -2553,7 +2553,7 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
        struct unix_sock *u = unix_sk(sk);
        struct sk_buff *skb;
-       int err, copied;
+       int err;
 
        mutex_lock(&u->iolock);
        skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
@@ -2561,10 +2561,7 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
        if (!skb)
                return err;
 
-       copied = recv_actor(sk, skb);
-       kfree_skb(skb);
-
-       return copied;
+       return recv_actor(sk, skb);
 }
 
 /*
index e4878551f1402cfb8977d0ac792c92bb8e9b2ccf..b769fc2589315098083f5b2a2c6850438d4265c4 100644 (file)
@@ -1441,7 +1441,6 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto
        struct sock *sk = sk_vsock(vsk);
        struct sk_buff *skb;
        int off = 0;
-       int copied;
        int err;
 
        spin_lock_bh(&vvs->rx_lock);
@@ -1454,9 +1453,7 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto
        if (!skb)
                return err;
 
-       copied = recv_actor(sk, skb);
-       kfree_skb(skb);
-       return copied;
+       return recv_actor(sk, skb);
 }
 EXPORT_SYMBOL_GPL(virtio_transport_read_skb);