eventpoll: add support for min-wait

author Jens Axboe <axboe@kernel.dk>

Fri, 19 Aug 2022 03:00:18 +0000 (21:00 -0600)

committer Jens Axboe <axboe@kernel.dk>

Fri, 7 Oct 2022 19:49:28 +0000 (13:49 -0600)
author Jens Axboe <axboe@kernel.dk>
Fri, 19 Aug 2022 03:00:18 +0000 (21:00 -0600)
committer Jens Axboe <axboe@kernel.dk>
Fri, 7 Oct 2022 19:49:28 +0000 (13:49 -0600)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c

index 79aa61a951df5aab8f0190f34891a7aa6931f3e2..ccb8400e225257eb0bc27a36ee339fe5b883af95 100644 (file)
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -39,6 +39,11 @@
  #include <linux/rculist.h>
  #include <net/busy_poll.h>
  
+/*
+ * Default min_wait (usecs) given to each new epoll instance; 0 disables it.
+ */
+#define EPOLL_DEF_MIN_WAIT     0
+
  /*
   * LOCKING:
   * There are three level of locking required by epoll :
@@ -117,6 +122,9 @@ struct eppoll_entry {
         /* The "base" pointer is set to the container "struct epitem" */
         struct epitem *base;
  
+       /* min wait time, valid only if (min_wait_ts & 1) != 0 */
+       ktime_t min_wait_ts;
+
         /*
          * Wait queue item that will be linked to the target file wait
          * queue head.
@@ -217,6 +225,9 @@ struct eventpoll {
         u64 gen;
         struct hlist_head refs;
  
+       /* min wait for epoll_wait(), in usecs; 0 means no min wait */
+       unsigned int min_wait_ts;
+
  #ifdef CONFIG_NET_RX_BUSY_POLL
         /* used to track busy poll napi_id */
         unsigned int napi_id;
@@ -953,6 +964,7 @@ static int ep_alloc(struct eventpoll **pep)
         ep->rbr = RB_ROOT_CACHED;
         ep->ovflist = EP_UNACTIVE_PTR;
         ep->user = user;
+       ep->min_wait_ts = EPOLL_DEF_MIN_WAIT;
  
         *pep = ep;
  
@@ -1747,6 +1759,32 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
         return to;
  }
  
+struct epoll_wq {
+       wait_queue_entry_t wait;        /* entry queued on ep->wq by ep_poll() */
+       struct hrtimer timer;           /* expiry is handled by ep_timer() */
+       ktime_t timeout_ts;             /* timeout passed to ep_poll(), as ktime */
+       ktime_t min_wait_ts;            /* min wait deadline; bit 0 set == armed */
+       struct eventpoll *ep;           /* epoll instance being waited on */
+       bool timed_out;                 /* stop waiting: timeout hit or was zero */
+       int maxevents;                  /* maxevents passed to ep_poll() */
+       int wakeups;                    /* wakeups counted while min wait armed */
+};
+
+static bool ep_should_min_wait(struct epoll_wq *ewq)
+{
+       if (ewq->min_wait_ts & 1) {     /* bit 0 set: min wait still armed */
+               /* approximation: each wakeup is taken as one ready event */
+               if (++ewq->wakeups >= ewq->maxevents)
+                       goto stop_wait;
+               if (ktime_before(ktime_get_ns(), ewq->min_wait_ts))
+                       return true;    /* deadline not reached: keep waiting */
+       }
+
+stop_wait:
+       ewq->min_wait_ts &= ~(u64) 1;   /* disarm, so later calls return false */
+       return false;
+}
+
  /*
   * autoremove_wake_function, but remove even on failure to wake up, because we
   * know that default_wake_function/ttwu will only fail if the thread is already
@@ -1756,27 +1794,37 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
  static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
                                        unsigned int mode, int sync, void *key)
  {
-       int ret = default_wake_function(wq_entry, mode, sync, key);
+       struct epoll_wq *ewq = container_of(wq_entry, struct epoll_wq, wait);
+       int ret;
+
+       /*
+        * Min wait not satisfied yet: swallow this wakeup and stay queued.
+        */
+       if (ep_should_min_wait(ewq))
+               return 0;
  
+       ret = default_wake_function(wq_entry, mode, sync, key);
         list_del_init(&wq_entry->entry);
         return ret;
  }
  
-struct epoll_wq {
-       wait_queue_entry_t wait;
-       struct hrtimer timer;
-       ktime_t timeout_ts;
-       bool timed_out;
-};
-
  static enum hrtimer_restart ep_timer(struct hrtimer *timer)
  {
         struct epoll_wq *ewq = container_of(timer, struct epoll_wq, timer);
         struct task_struct *task = ewq->wait.private;
+       const bool is_min_wait = ewq->min_wait_ts & 1;  /* bit 0: min wait armed */
+
+       if (!is_min_wait || ep_events_available(ewq->ep)) {
+               if (!is_min_wait)
+                       ewq->timed_out = true;  /* not a min wait expiry */
+               ewq->min_wait_ts &= ~(u64) 1;   /* min wait is over either way */
+               wake_up_process(task);
+               return HRTIMER_NORESTART;
+       }
  
-       ewq->timed_out = true;
-       wake_up_process(task);
-       return HRTIMER_NORESTART;
+       ewq->min_wait_ts &= ~(u64) 1;   /* min wait over, nothing ready: re-arm for timeout_ts */
+       hrtimer_set_expires_range_ns(&ewq->timer, ewq->timeout_ts, 0);
+       return HRTIMER_RESTART; /* NOTE(review): timeout_ts unset if no timeout given? */
  }
  
  static void ep_schedule(struct eventpoll *ep, struct epoll_wq *ewq, ktime_t *to,
@@ -1831,12 +1879,14 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
  
         lockdep_assert_irqs_enabled();
  
+       ewq.ep = ep;
         ewq.timed_out = false;
+       ewq.maxevents = maxevents;
+       ewq.wakeups = 0;
  
         if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
                 slack = select_estimate_accuracy(timeout);
-               to = &ewq.timeout_ts;
-               *to = timespec64_to_ktime(*timeout);
+               ewq.timeout_ts = timespec64_to_ktime(*timeout);
         } else if (timeout) {
                 /*
                  * Avoid the unnecessary trip to the wait queue loop, if the
@@ -1845,6 +1895,21 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                 ewq.timed_out = 1;
         }
  
+       /*
+        * If min_wait is set for this epoll instance, note the min_wait
+        * time. Ensure the lowest bit is set in ewq.min_wait_ts, that's
+        * the state bit for whether or not min_wait is enabled.
+        */
+       if (ep->min_wait_ts) {
+               ewq.min_wait_ts = ktime_add_us(ktime_get_ns(),
+                                               ep->min_wait_ts);
+               ewq.min_wait_ts |= (u64) 1;
+               to = &ewq.min_wait_ts;
+       } else {
+               ewq.min_wait_ts = 0;
+               to = &ewq.timeout_ts;
+       }
+
         /*
          * This call is racy: We may or may not see events that are being added
          * to the ready list under the lock (e.g., in IRQ callbacks). For cases
@@ -1913,7 +1978,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                  * important.
                  */
                 eavail = ep_events_available(ep);
-               if (!eavail) {
+               if (!eavail || ewq.min_wait_ts & 1) {
                         __add_wait_queue_exclusive(&ep->wq, &ewq.wait);
                         write_unlock_irq(&ep->lock);
                         ep_schedule(ep, &ewq, to, slack);
@@ -2111,6 +2176,31 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
         if (!f.file)
                 goto error_return;
  
+       /*
+        * We have to check that the file structure underneath the file
+        * descriptor the user passed to us _is_ an eventpoll file.
+        */
+       error = -EINVAL;
+       if (!is_file_epoll(f.file))
+               goto error_fput;
+
+       /*
+        * At this point it is safe to assume that the "private_data" contains
+        * our own data structure.
+        */
+       ep = f.file->private_data;
+
+       /*
+        * Handle EPOLL_CTL_MIN_WAIT upfront as we don't need to care about
+        * the fd being passed in.
+        */
+       if (op == EPOLL_CTL_MIN_WAIT) {
+               /* return old value */
+               error = ep->min_wait_ts;
+               ep->min_wait_ts = epds->data;
+               goto error_fput;
+       }
+
         /* Get the "struct file *" for the target file */
         tf = fdget(fd);
         if (!tf.file)
@@ -2126,12 +2216,10 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
                 ep_take_care_of_epollwakeup(epds);
  
         /*
-        * We have to check that the file structure underneath the file descriptor
-        * the user passed to us _is_ an eventpoll file. And also we do not permit
-        * adding an epoll file descriptor inside itself.
+        * We do not permit adding an epoll file descriptor inside itself.
          */
         error = -EINVAL;
-       if (f.file == tf.file || !is_file_epoll(f.file))
+       if (f.file == tf.file)
                 goto error_tgt_fput;
  
         /*
@@ -2147,12 +2235,6 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
                         goto error_tgt_fput;
         }
  
-       /*
-        * At this point it is safe to assume that the "private_data" contains
-        * our own data structure.
-        */
-       ep = f.file->private_data;
-
         /*
          * When we insert an epoll file descriptor inside another epoll file
          * descriptor, there is the chance of creating closed loops, which are
@@ -2251,7 +2333,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
  {
         struct epoll_event epds;
  
-       if (ep_op_has_event(op) &&
+       if ((ep_op_has_event(op) || op == EPOLL_CTL_MIN_WAIT) &&
             copy_from_user(&epds, event, sizeof(struct epoll_event)))
                 return -EFAULT;
  
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h

index 3337745d81bd6947c6181bd9f2df3eaf013f0b04..cbef635cb7e4a0ed1995a3d5d92ccc1ea246ea72 100644 (file)
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -59,7 +59,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
  /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
  static inline int ep_op_has_event(int op)
  {
-       return op != EPOLL_CTL_DEL;
+       return op != EPOLL_CTL_DEL && op != EPOLL_CTL_MIN_WAIT; /* MIN_WAIT: epoll_ctl() copies it */
  }
  
  #else
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h

index 8a3432d0f0dcb69bf1d0caaf75a363bad02468fa..81ecb1ca36e036eac5776abb254ff4f088426d1a 100644 (file)
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -26,6 +26,7 @@
  #define EPOLL_CTL_ADD 1
  #define EPOLL_CTL_DEL 2
  #define EPOLL_CTL_MOD 3
+#define EPOLL_CTL_MIN_WAIT     4       /* set min wait, usecs in event->data */
  
  /* Epoll event masks */
  #define EPOLLIN                (__force __poll_t)0x00000001
author	Jens Axboe <axboe@kernel.dk>
	Fri, 19 Aug 2022 03:00:18 +0000 (21:00 -0600)
committer	Jens Axboe <axboe@kernel.dk>
	Fri, 7 Oct 2022 19:49:28 +0000 (13:49 -0600)
fs/eventpoll.c		patch \| blob \| blame \| history
include/linux/eventpoll.h		patch \| blob \| blame \| history
include/uapi/linux/eventpoll.h		patch \| blob \| blame \| history