bpf: udp: Make sure iter->batch always contains a full bucket snapshot
author Jordan Rife <jordan@jrife.io>
Fri, 2 May 2025 16:15:21 +0000 (09:15 -0700)
committer Martin KaFai Lau <martin.lau@kernel.org>
Fri, 2 May 2025 17:54:37 +0000 (10:54 -0700)
Require that iter->batch always contains a full bucket snapshot. Combined
with the next few patches, this invariant prevents sockets from being
skipped or repeated during iteration. Before, there were two cases where
a call to bpf_iter_udp_batch could capture only part of a bucket:

1. When bpf_iter_udp_realloc_batch() returns -ENOMEM [1].
2. When more sockets are added to the bucket while calling
   bpf_iter_udp_realloc_batch(), making the updated batch size
   insufficient [2].

When the batch size covers only part of a bucket, the iterator can lose
track of which sockets it has already visited, especially if a bucket
has to be processed in more than two batches. This forces a choice
between repeating and skipping sockets, so don't allow partial batches:

1. Stop iteration and propagate -ENOMEM up to userspace if reallocation
   fails instead of continuing with a partial batch.
2. Try bpf_iter_udp_realloc_batch() with GFP_USER just as before, but if
   we still aren't able to capture the full bucket, call
   bpf_iter_udp_realloc_batch() again while holding the bucket lock to
   guarantee the bucket does not change. On the second attempt, use
   GFP_NOWAIT since we hold onto the spin lock; a simplified sketch of
   this two-attempt flow follows below.
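
Below is a minimal userspace sketch of the two-attempt flow described in
point 2. Everything in it (snapshot_bucket(), grow(), fill_from(), the
snode/sbatch types) is invented for illustration; the bucket lock is
reduced to comments and the GFP flags become plain realloc() calls, so it
only models the control flow, not the kernel code:

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Stand-ins for a socket in the bucket and for iter->batch. */
	struct snode { int id; struct snode *next; };
	struct sbatch { struct snode **slots; size_t max_sk, end_sk; };

	static int grow(struct sbatch *b, size_t want)
	{
		struct snode **n = realloc(b->slots, want * sizeof(*n));

		if (!n)
			return -ENOMEM;	/* propagated; never a partial batch */
		b->slots = n;
		b->max_sk = want;
		return 0;
	}

	/* Copy entries starting at @pos into the batch while counting every
	 * entry walked, so the caller can tell whether the batch is full.
	 */
	static size_t fill_from(struct sbatch *b, struct snode *pos, size_t seen)
	{
		for (; pos; pos = pos->next) {
			if (b->end_sk < b->max_sk)
				b->slots[b->end_sk++] = pos;
			seen++;
		}
		return seen;
	}

	static int snapshot_bucket(struct sbatch *b, struct snode *head)
	{
		size_t bucket_sks;
		int err;

		/* lock bucket */
		b->end_sk = 0;
		bucket_sks = fill_from(b, head, 0);
		if (b->end_sk == bucket_sks)
			goto done;

		/* Attempt 1: drop the lock, grow generously (GFP_USER in the
		 * patch), then rescan from the top since the bucket may have
		 * changed while it was unlocked.
		 */
		/* unlock bucket */
		err = grow(b, bucket_sks * 3 / 2);
		if (err)
			return err;
		/* lock bucket */
		b->end_sk = 0;
		bucket_sks = fill_from(b, head, 0);
		if (b->end_sk == bucket_sks)
			goto done;

		/* Attempt 2: keep the lock held so the bucket cannot change,
		 * grow exactly to size (GFP_NOWAIT in the patch), and resume
		 * from the entry after the last one already captured.
		 */
		err = grow(b, bucket_sks);
		if (err)
			return err;
		fill_from(b, b->slots[b->end_sk - 1]->next, b->end_sk);
	done:
		/* unlock bucket */
		return 0;
	}

	int main(void)
	{
		struct snode n[4] = { { .id = 0 }, { .id = 1 }, { .id = 2 }, { .id = 3 } };
		struct sbatch b = { 0 };
		size_t i;

		for (i = 0; i + 1 < 4; i++)
			n[i].next = &n[i + 1];
		if (snapshot_bucket(&b, &n[0]))
			return 1;
		for (i = 0; i < b.end_sk; i++)
			printf("sk %d\n", b.slots[i]->id);
		free(b.slots);
		return 0;
	}

In the sketch, realloc() keeping the already-captured slots intact plays
the role of the memcpy() added to bpf_iter_udp_realloc_batch() below for
the GFP_NOWAIT case.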

Introduce the udp_portaddr_for_each_entry_from macro and use it instead
of udp_portaddr_for_each_entry to make it possible to continue iteration
from an arbitrary socket. This patch needs it in the GFP_NOWAIT case to
fill the rest of a batch starting from the middle of a bucket, and a
later patch in the series needs it to skip sockets that were already
seen.
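
For reference, the new macro follows the existing hlist_for_each_entry_from()
pattern from include/linux/list.h: it continues from whatever the cursor
already points at instead of reinitializing it from the list head. Roughly
(paraphrased; exact definitions may differ between kernel versions):

	#define hlist_for_each_entry(pos, head, member)				\
		for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
		     pos;							\
		     pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

	#define hlist_for_each_entry_from(pos, member)				\
		for (; pos;							\
		     pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

This is what lets bpf_iter_udp_batch() set sk by hand with hlist_entry_safe()
and jump back to fill_batch: the _from variant picks up from sk without
touching the bucket head.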

Testing all scenarios directly is a bit difficult, but I did some manual
testing to exercise the code paths where GFP_NOWAIT is used and where
ERR_PTR(err) is returned. I used the realloc test case included later
in this series to trigger a scenario where a realloc happens inside
bpf_iter_udp_batch, and made a small code tweak to force the first
realloc attempt to allocate a too-small batch, thus requiring
another attempt with GFP_NOWAIT. Some printks showed both reallocs with
the tests passing:

Apr 25 23:16:24 crow kernel: go again GFP_USER
Apr 25 23:16:24 crow kernel: go again GFP_NOWAIT

With this setup, I also forced each of the bpf_iter_udp_realloc_batch
calls to return -ENOMEM to ensure that iteration ends and that the
read() in userspace fails.

[1]: https://lore.kernel.org/bpf/CABi4-ogUtMrH8-NVB6W8Xg_F_KDLq=yy-yu-tKr2udXE2Mu1Lg@mail.gmail.com/
[2]: https://lore.kernel.org/bpf/7ed28273-a716-4638-912d-f86f965e54bb@linux.dev/

Signed-off-by: Jordan Rife <jordan@jrife.io>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
include/linux/udp.h
net/ipv4/udp.c

index 895240177f4f4cbb26b15363600d16874a623daa..4e1a672af4c57f01d10dde906b2114327387ca73 100644 (file)
@@ -216,6 +216,9 @@ static inline void udp_allow_gso(struct sock *sk)
 #define udp_portaddr_for_each_entry(__sk, list) \
        hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node)
 
+#define udp_portaddr_for_each_entry_from(__sk) \
+       hlist_for_each_entry_from(__sk, __sk_common.skc_portaddr_node)
+
 #define udp_portaddr_for_each_entry_rcu(__sk, list) \
        hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node)
 
index 68a77323bc519089238c9ffb283a3a02b1bfd6c6..426a8b7c5cdec8e530f31bb6e55b5cb676aa0317 100644 (file)
@@ -3433,8 +3433,9 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
        int resume_bucket, resume_offset;
        struct udp_table *udptable;
        unsigned int batch_sks = 0;
-       bool resized = false;
+       int resizes = 0;
        struct sock *sk;
+       int err = 0;
 
        resume_bucket = state->bucket;
        resume_offset = iter->offset;
@@ -3455,18 +3456,21 @@ again:
         */
        iter->cur_sk = 0;
        iter->end_sk = 0;
-       iter->st_bucket_done = false;
+       iter->st_bucket_done = true;
        batch_sks = 0;
 
        for (; state->bucket <= udptable->mask; state->bucket++) {
                struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot;
 
                if (hlist_empty(&hslot2->head))
-                       continue;
+                       goto next_bucket;
 
                iter->offset = 0;
                spin_lock_bh(&hslot2->lock);
-               udp_portaddr_for_each_entry(sk, &hslot2->head) {
+               sk = hlist_entry_safe(hslot2->head.first, struct sock,
+                                     __sk_common.skc_portaddr_node);
+fill_batch:
+               udp_portaddr_for_each_entry_from(sk) {
                        if (seq_sk_match(seq, sk)) {
                                /* Resume from the last iterated socket at the
                                 * offset in the bucket before iterator was stopped.
@@ -3483,33 +3487,55 @@ again:
                                batch_sks++;
                        }
                }
+
+               /* Allocate a larger batch and try again. */
+               if (unlikely(resizes <= 1 && iter->end_sk &&
+                            iter->end_sk != batch_sks)) {
+                       resizes++;
+
+                       /* First, try with GFP_USER to maximize the chances of
+                        * grabbing more memory.
+                        */
+                       if (resizes == 1) {
+                               spin_unlock_bh(&hslot2->lock);
+                               err = bpf_iter_udp_realloc_batch(iter,
+                                                                batch_sks * 3 / 2,
+                                                                GFP_USER);
+                               if (err)
+                                       return ERR_PTR(err);
+                               /* Start over. */
+                               goto again;
+                       }
+
+                       /* Next, hold onto the lock, so the bucket doesn't
+                        * change while we get the rest of the sockets.
+                        */
+                       err = bpf_iter_udp_realloc_batch(iter, batch_sks,
+                                                        GFP_NOWAIT);
+                       if (err) {
+                               spin_unlock_bh(&hslot2->lock);
+                               return ERR_PTR(err);
+                       }
+
+                       /* Pick up where we left off. */
+                       sk = iter->batch[iter->end_sk - 1];
+                       sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next,
+                                             struct sock,
+                                             __sk_common.skc_portaddr_node);
+                       batch_sks = iter->end_sk;
+                       goto fill_batch;
+               }
+
                spin_unlock_bh(&hslot2->lock);
 
                if (iter->end_sk)
                        break;
+next_bucket:
+               resizes = 0;
        }
 
-       /* All done: no batch made. */
-       if (!iter->end_sk)
-               return NULL;
-
-       if (iter->end_sk == batch_sks) {
-               /* Batching is done for the current bucket; return the first
-                * socket to be iterated from the batch.
-                */
-               iter->st_bucket_done = true;
-               goto done;
-       }
-       if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2,
-                                                   GFP_USER)) {
-               resized = true;
-               /* After allocating a larger batch, retry one more time to grab
-                * the whole bucket.
-                */
-               goto again;
-       }
-done:
-       return iter->batch[0];
+       WARN_ON_ONCE(iter->end_sk != batch_sks);
+       return iter->end_sk ? iter->batch[0] : NULL;
 }
 
 static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -3873,7 +3899,10 @@ static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
        if (!new_batch)
                return -ENOMEM;
 
-       bpf_iter_udp_put_batch(iter);
+       if (flags != GFP_NOWAIT)
+               bpf_iter_udp_put_batch(iter);
+
+       memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
        kvfree(iter->batch);
        iter->batch = new_batch;
        iter->max_sk = new_batch_sz;