net: Introduce preferred busy-polling
authorBjörn Töpel <bjorn.topel@intel.com>
Mon, 30 Nov 2020 18:51:56 +0000 (19:51 +0100)
committerDaniel Borkmann <daniel@iogearbox.net>
Mon, 30 Nov 2020 23:09:25 +0000 (00:09 +0100)
The existing busy-polling mode, enabled by the SO_BUSY_POLL socket
option or system-wide using the /proc/sys/net/core/busy_read knob, is
opportunistic. That means that if the NAPI context is not
scheduled, it will be polled. If, after busy-polling, the budget is
exceeded, the busy-polling logic will schedule the NAPI onto the
regular softirq handling.

One implication of the behavior above is that a busy/heavily loaded NAPI
context will never enter/allow for busy-polling. Some applications
prefer that most NAPI processing would be done by busy-polling.

This series adds a new socket option, SO_PREFER_BUSY_POLL, that works
in concert with the napi_defer_hard_irqs and gro_flush_timeout
knobs. The napi_defer_hard_irqs and gro_flush_timeout knobs were
introduced in commit 6f8b12d661d0 ("net: napi: add hard irqs deferral
feature"), and allow a user to defer interrupt enabling and
instead schedule the NAPI context from a watchdog timer. When a user
enables the SO_PREFER_BUSY_POLL, again with the other knobs enabled,
and the NAPI context is being processed by a softirq, the softirq NAPI
processing will exit early to allow the busy-polling to be performed.

If the application stops performing busy-polling via a system call,
the watchdog timer defined by gro_flush_timeout will timeout, and
regular softirq handling will resume.

In summary: heavy-traffic applications that prefer busy-polling over
softirq processing should use this option.

Example usage:

  $ echo 2 | sudo tee /sys/class/net/ens785f1/napi_defer_hard_irqs
  $ echo 200000 | sudo tee /sys/class/net/ens785f1/gro_flush_timeout

Note that the timeout should be larger than the userspace processing
window, otherwise the watchdog will timeout and fall back to regular
softirq processing.

Enable the SO_BUSY_POLL/SO_PREFER_BUSY_POLL options on your socket.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/bpf/20201130185205.196029-2-bjorn.topel@gmail.com
arch/alpha/include/uapi/asm/socket.h
arch/mips/include/uapi/asm/socket.h
arch/parisc/include/uapi/asm/socket.h
arch/sparc/include/uapi/asm/socket.h
fs/eventpoll.c
include/linux/netdevice.h
include/net/busy_poll.h
include/net/sock.h
include/uapi/asm-generic/socket.h
net/core/dev.c
net/core/sock.c

index de6c4df610826c262fadadfb5133ada318d2c2bd..538359642554db5472ebae38341abefc88950ad6 100644 (file)
 
 #define SO_DETACH_REUSEPORT_BPF 68
 
+#define SO_PREFER_BUSY_POLL    69
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
index d0a9ed2ca2d6831856daedb3275e7d0a841209a0..e406e73b5e6e70f333d7b16273ac0bbc06268dfd 100644 (file)
 
 #define SO_DETACH_REUSEPORT_BPF 68
 
+#define SO_PREFER_BUSY_POLL    69
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
index 10173c32195e4cdd75b7436ea5ef19ff2cfa0644..1bc46200889d0d88474c14be8ecd2953db00fc78 100644 (file)
 
 #define SO_DETACH_REUSEPORT_BPF 0x4042
 
+#define SO_PREFER_BUSY_POLL    0x4043
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
index 8029b681fc7ca142155666008eebaaf44262f541..99688cf673a4d3c4123003a91ed37780e303afe7 100644 (file)
 
 #define SO_DETACH_REUSEPORT_BPF  0x0047
 
+#define SO_PREFER_BUSY_POLL     0x0048
+
 #if !defined(__KERNEL__)
 
 
index 4df61129566d40a21f7016b0900565cdaf68f463..e11fab3a0b9e8b56331386b99537e34e3e41d3f9 100644 (file)
@@ -397,7 +397,7 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock)
        unsigned int napi_id = READ_ONCE(ep->napi_id);
 
        if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
-               napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
+               napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false);
 }
 
 static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
index 7ce648a564f75f3603bfac0f6cc04f2d5a0b9b5e..52d1cc2bd8a73f27ba7695954645d10836fcfdd1 100644 (file)
@@ -350,23 +350,25 @@ struct napi_struct {
 };
 
 enum {
-       NAPI_STATE_SCHED,       /* Poll is scheduled */
-       NAPI_STATE_MISSED,      /* reschedule a napi */
-       NAPI_STATE_DISABLE,     /* Disable pending */
-       NAPI_STATE_NPSVC,       /* Netpoll - don't dequeue from poll_list */
-       NAPI_STATE_LISTED,      /* NAPI added to system lists */
-       NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
-       NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+       NAPI_STATE_SCHED,               /* Poll is scheduled */
+       NAPI_STATE_MISSED,              /* reschedule a napi */
+       NAPI_STATE_DISABLE,             /* Disable pending */
+       NAPI_STATE_NPSVC,               /* Netpoll - don't dequeue from poll_list */
+       NAPI_STATE_LISTED,              /* NAPI added to system lists */
+       NAPI_STATE_NO_BUSY_POLL,        /* Do not add in napi_hash, no busy polling */
+       NAPI_STATE_IN_BUSY_POLL,        /* sk_busy_loop() owns this NAPI */
+       NAPI_STATE_PREFER_BUSY_POLL,    /* prefer busy-polling over softirq processing*/
 };
 
 enum {
-       NAPIF_STATE_SCHED        = BIT(NAPI_STATE_SCHED),
-       NAPIF_STATE_MISSED       = BIT(NAPI_STATE_MISSED),
-       NAPIF_STATE_DISABLE      = BIT(NAPI_STATE_DISABLE),
-       NAPIF_STATE_NPSVC        = BIT(NAPI_STATE_NPSVC),
-       NAPIF_STATE_LISTED       = BIT(NAPI_STATE_LISTED),
-       NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
-       NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+       NAPIF_STATE_SCHED               = BIT(NAPI_STATE_SCHED),
+       NAPIF_STATE_MISSED              = BIT(NAPI_STATE_MISSED),
+       NAPIF_STATE_DISABLE             = BIT(NAPI_STATE_DISABLE),
+       NAPIF_STATE_NPSVC               = BIT(NAPI_STATE_NPSVC),
+       NAPIF_STATE_LISTED              = BIT(NAPI_STATE_LISTED),
+       NAPIF_STATE_NO_BUSY_POLL        = BIT(NAPI_STATE_NO_BUSY_POLL),
+       NAPIF_STATE_IN_BUSY_POLL        = BIT(NAPI_STATE_IN_BUSY_POLL),
+       NAPIF_STATE_PREFER_BUSY_POLL    = BIT(NAPI_STATE_PREFER_BUSY_POLL),
 };
 
 enum gro_result {
@@ -437,6 +439,11 @@ static inline bool napi_disable_pending(struct napi_struct *n)
        return test_bit(NAPI_STATE_DISABLE, &n->state);
 }
 
+static inline bool napi_prefer_busy_poll(struct napi_struct *n)
+{
+       return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
+}
+
 bool napi_schedule_prep(struct napi_struct *n);
 
 /**
index b001fa91c14eaa685c0dd2e892442ef0744bf4af..0292b8353d7e99b11ca012e13794180484c7a413 100644 (file)
@@ -43,7 +43,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time);
 
 void napi_busy_loop(unsigned int napi_id,
                    bool (*loop_end)(void *, unsigned long),
-                   void *loop_end_arg);
+                   void *loop_end_arg, bool prefer_busy_poll);
 
 #else /* CONFIG_NET_RX_BUSY_POLL */
 static inline unsigned long net_busy_loop_on(void)
@@ -105,7 +105,8 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock)
        unsigned int napi_id = READ_ONCE(sk->sk_napi_id);
 
        if (napi_id >= MIN_NAPI_ID)
-               napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk);
+               napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk,
+                              READ_ONCE(sk->sk_prefer_busy_poll));
 #endif
 }
 
index a5c6ae78df77d08e4b65e634e93040446486bdc2..d49b89b071b6b0ad5ecc1528b2ce82a513c17e05 100644 (file)
@@ -301,6 +301,7 @@ struct bpf_local_storage;
   *    @sk_ack_backlog: current listen backlog
   *    @sk_max_ack_backlog: listen backlog set in listen()
   *    @sk_uid: user id of owner
+  *    @sk_prefer_busy_poll: prefer busypolling over softirq processing
   *    @sk_priority: %SO_PRIORITY setting
   *    @sk_type: socket type (%SOCK_STREAM, etc)
   *    @sk_protocol: which protocol this socket belongs in this network family
@@ -479,6 +480,9 @@ struct sock {
        u32                     sk_ack_backlog;
        u32                     sk_max_ack_backlog;
        kuid_t                  sk_uid;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+       u8                      sk_prefer_busy_poll;
+#endif
        struct pid              *sk_peer_pid;
        const struct cred       *sk_peer_cred;
        long                    sk_rcvtimeo;
index 77f7c1638eb1ce7d3e143bbffd60056e472b1122..7dd02408b7ce08602c86ed0463e7d6c0e986600f 100644 (file)
 
 #define SO_DETACH_REUSEPORT_BPF 68
 
+#define SO_PREFER_BUSY_POLL    69
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
index 60d325bda0d7b4a1ecb7bf7b3352d58bed8b96a2..6f8d2cffb7c5c1104183d80a2e13696f3f289a1f 100644 (file)
@@ -6458,7 +6458,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 
                WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
 
-               new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+               new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
+                             NAPIF_STATE_PREFER_BUSY_POLL);
 
                /* If STATE_MISSED was set, leave STATE_SCHED set,
                 * because we will call napi->poll() one more time.
@@ -6497,8 +6498,29 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
 
 #define BUSY_POLL_BUDGET 8
 
-static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
 {
+       if (!skip_schedule) {
+               gro_normal_list(napi);
+               __napi_schedule(napi);
+               return;
+       }
+
+       if (napi->gro_bitmask) {
+               /* flush too old packets
+                * If HZ < 1000, flush all packets.
+                */
+               napi_gro_flush(napi, HZ >= 1000);
+       }
+
+       gro_normal_list(napi);
+       clear_bit(NAPI_STATE_SCHED, &napi->state);
+}
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll)
+{
+       bool skip_schedule = false;
+       unsigned long timeout;
        int rc;
 
        /* Busy polling means there is a high chance device driver hard irq
@@ -6515,6 +6537,15 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 
        local_bh_disable();
 
+       if (prefer_busy_poll) {
+               napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
+               timeout = READ_ONCE(napi->dev->gro_flush_timeout);
+               if (napi->defer_hard_irqs_count && timeout) {
+                       hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
+                       skip_schedule = true;
+               }
+       }
+
        /* All we really want here is to re-enable device interrupts.
         * Ideally, a new ndo_busy_poll_stop() could avoid another round.
         */
@@ -6525,19 +6556,14 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
         */
        trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
        netpoll_poll_unlock(have_poll_lock);
-       if (rc == BUSY_POLL_BUDGET) {
-               /* As the whole budget was spent, we still own the napi so can
-                * safely handle the rx_list.
-                */
-               gro_normal_list(napi);
-               __napi_schedule(napi);
-       }
+       if (rc == BUSY_POLL_BUDGET)
+               __busy_poll_stop(napi, skip_schedule);
        local_bh_enable();
 }
 
 void napi_busy_loop(unsigned int napi_id,
                    bool (*loop_end)(void *, unsigned long),
-                   void *loop_end_arg)
+                   void *loop_end_arg, bool prefer_busy_poll)
 {
        unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
        int (*napi_poll)(struct napi_struct *napi, int budget);
@@ -6565,12 +6591,18 @@ restart:
                         * we avoid dirtying napi->state as much as we can.
                         */
                        if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
-                                  NAPIF_STATE_IN_BUSY_POLL))
+                                  NAPIF_STATE_IN_BUSY_POLL)) {
+                               if (prefer_busy_poll)
+                                       set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
                                goto count;
+                       }
                        if (cmpxchg(&napi->state, val,
                                    val | NAPIF_STATE_IN_BUSY_POLL |
-                                         NAPIF_STATE_SCHED) != val)
+                                         NAPIF_STATE_SCHED) != val) {
+                               if (prefer_busy_poll)
+                                       set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
                                goto count;
+                       }
                        have_poll_lock = netpoll_poll_lock(napi);
                        napi_poll = napi->poll;
                }
@@ -6588,7 +6620,7 @@ count:
 
                if (unlikely(need_resched())) {
                        if (napi_poll)
-                               busy_poll_stop(napi, have_poll_lock);
+                               busy_poll_stop(napi, have_poll_lock, prefer_busy_poll);
                        preempt_enable();
                        rcu_read_unlock();
                        cond_resched();
@@ -6599,7 +6631,7 @@ count:
                cpu_relax();
        }
        if (napi_poll)
-               busy_poll_stop(napi, have_poll_lock);
+               busy_poll_stop(napi, have_poll_lock, prefer_busy_poll);
        preempt_enable();
 out:
        rcu_read_unlock();
@@ -6650,8 +6682,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
         * NAPI_STATE_MISSED, since we do not react to a device IRQ.
         */
        if (!napi_disable_pending(napi) &&
-           !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+           !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
+               clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
                __napi_schedule_irqoff(napi);
+       }
 
        return HRTIMER_NORESTART;
 }
@@ -6709,6 +6743,7 @@ void napi_disable(struct napi_struct *n)
 
        hrtimer_cancel(&n->timer);
 
+       clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
        clear_bit(NAPI_STATE_DISABLE, &n->state);
 }
 EXPORT_SYMBOL(napi_disable);
@@ -6781,6 +6816,19 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
                goto out_unlock;
        }
 
+       /* The NAPI context has more processing work, but busy-polling
+        * is preferred. Exit early.
+        */
+       if (napi_prefer_busy_poll(n)) {
+               if (napi_complete_done(n, work)) {
+                       /* If timeout is not set, we need to make sure
+                        * that the NAPI is re-scheduled.
+                        */
+                       napi_schedule(n);
+               }
+               goto out_unlock;
+       }
+
        if (n->gro_bitmask) {
                /* flush too old packets
                 * If HZ < 1000, flush all packets.
index 727ea1cc633ca50fdf423c5ddb985c0ae65f71cf..e05f2e52b5a8b86b8dab1a9ec2e5d95e4aca853d 100644 (file)
@@ -1159,6 +1159,12 @@ set_sndbuf:
                                sk->sk_ll_usec = val;
                }
                break;
+       case SO_PREFER_BUSY_POLL:
+               if (valbool && !capable(CAP_NET_ADMIN))
+                       ret = -EPERM;
+               else
+                       WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
+               break;
 #endif
 
        case SO_MAX_PACING_RATE:
@@ -1523,6 +1529,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
        case SO_BUSY_POLL:
                v.val = sk->sk_ll_usec;
                break;
+       case SO_PREFER_BUSY_POLL:
+               v.val = READ_ONCE(sk->sk_prefer_busy_poll);
+               break;
 #endif
 
        case SO_MAX_PACING_RATE: