eventpoll: add support for min-wait
Rather than just have a timeout value for waiting on events, add
EPOLL_CTL_MIN_WAIT to allow setting a minimum time that epoll_wait()
should always wait for events to arrive.
For medium workload efficiencies, some production workloads inject
artificial timers or sleeps before calling epoll_wait() to get
better batching and higher efficiencies. While this does help, it's
not as efficient as it could be. By adding support for epoll_wait()
for this directly, we can avoids extra context switches and scheduler
and timer overhead.
As an example, running an AB test on an identical workload at about
~370K reqs/second, without this change and with the sleep hack
mentioned above (using 200 usec as the timeout), we're doing 310K-340K
non-voluntary context switches per second. Idle CPU on the host is 27-34%.
With the the sleep hack removed and epoll set to the same 200 usec
value, we're handling the exact same load but at 292K-315k non-voluntary
context switches and idle CPU of 33-41%, a substantial win.
Basic test case:
struct d {
int p1, p2;
};
static void *fn(void *data)
{
struct d *d = data;
char b = 0x89;
/* Generate 2 events 20 msec apart */
usleep(10000);
write(d->p1, &b, sizeof(b));
usleep(10000);
write(d->p2, &b, sizeof(b));
return NULL;
}
int main(int argc, char *argv[])
{
struct epoll_event ev, events[2];
pthread_t thread;
int p1[2], p2[2];
struct d d;
int efd, ret;
efd = epoll_create1(0);
if (efd < 0) {
perror("epoll_create");
return 1;
}
if (pipe(p1) < 0) {
perror("pipe");
return 1;
}
if (pipe(p2) < 0) {
perror("pipe");
return 1;
}
ev.events = EPOLLIN;
ev.data.fd = p1[0];
if (epoll_ctl(efd, EPOLL_CTL_ADD, p1[0], &ev) < 0) {
perror("epoll add");
return 1;
}
ev.events = EPOLLIN;
ev.data.fd = p2[0];
if (epoll_ctl(efd, EPOLL_CTL_ADD, p2[0], &ev) < 0) {
perror("epoll add");
return 1;
}
/* always wait 200 msec for events */
ev.data.u64 = 200000;
if (epoll_ctl(efd, EPOLL_CTL_MIN_WAIT, -1, &ev) < 0) {
perror("epoll add set timeout");
return 1;
}
d.p1 = p1[1];
d.p2 = p2[1];
pthread_create(&thread, NULL, fn, &d);
/* expect to get 2 events here rather than just 1 */
ret = epoll_wait(efd, events, 2, -1);
printf("epoll_wait=%d\n", ret);
return 0;
}
Signed-off-by: Jens Axboe <axboe@kernel.dk>