2021-十个问题理解Linux epoll工作原理

原文地址

十个问题理解Linux epoll工作原理

epoll是linux特有的一个I/O事件通知机制。很久以来对epoll如何能够高效处理数以百万记的文件描述符很有兴趣。近期学习、研究了epoll源码，在这个过程中关于epoll数据结构和作者的实现思路产生出不少疑惑，在此总结为了10个问题并逐个加以解答和分析。本文基于的内核源码版本是2.6.39版本 。

Question 1：是否所有的文件类型都可以被epoll监视？

答案：不是。看下面这个实验代码：

#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>

#define MAX_EVENTS 1

int main (void)
{
    int epfd;
    epfd = epoll_create(100); /* 创建epoll实例，预计监听100个fd */
    if (epfd < 0) {
        perror ("epoll_create");
    }

    struct epoll_event *events;
    int nr_events, i;
    events = malloc (sizeof (struct epoll_event) * MAX_EVENTS);
    if (!events) {
        perror("malloc");
        return 1;
    }

    /* 打开一个普通文本文件 */
    int target_fd = open ("./11.txt", O_RDONLY);
    printf("target_fd %d\n", target_fd);
    int target_listen_type = EPOLLIN;
    for (i = 0; i < 1; i++) {
        int ret;
        events[i].data.fd = target_fd; /* epoll调用返回后，返回给应用进程的fd号 */
        events[i].events = target_listen_type; /* 需要监听的事件类型 */
        ret = epoll_ctl (epfd, EPOLL_CTL_ADD, target_fd, &events[i]); /* 注册fd到epoll实例上 */
        if (ret) {
     printf("ret %d, errno %d\n", ret, errno);
            perror ("epoll_ctl");
        }
    }

    /* 应用进程阻塞在epoll上，超时时长置为-1表示一直等到有目标事件才会返回 */
    nr_events = epoll_wait(epfd, events, MAX_EVENTS, -1);
    if (nr_events < 0) {
        perror ("epoll_wait");
        free(events);
        return 1;
    }
    for (i = 0; i < nr_events; i++) {
        /* 打印出处于就绪状态的fd及其事件 */
        printf("event=%d on fd=%d\n", events[i].events, events[i].data.fd);
    }
    free (events);
    close(epfd);
    return 0;
}

编译、运行上面的代码，会打印出下列信息：

gcc epoll_test.c -o epdemo
./epdemo
target_fd 4
ret -1, errno 1
epoll_ctl: Operation not permitted

正常打开了"txt"文件fd=4,但调用epoll_ctl监视这个fd时却ret=-1失败了,并且错误码为1，错误信息为"Operation not permitted"。错误码指明这个fd不能够被epoll监视。

那什么样的fd才可以被epoll监视呢？

只有底层驱动实现了file_operations中poll函数的文件类型才可以被epoll监视！socket类型的文件驱动是实现了poll函数的，因此才可以被epoll监视。struct file_operations声明位置是在include/linux/fs.h中。

Question 2：ep->wq的作用是什么？

答案：wq是一个等待队列，用来保存对某一个epoll实例调用epoll_wait()的所有进程。

一个进程调用epoll_wait()后，如果当前还没有任何事件发生，需要让当前进程挂起等待（放到ep->wq里）；当epoll实例监视的文件上有事件发生后，需要唤醒ep->wq上的进程去继续执行用户态的业务逻辑。之所以要用一个等待队列来维护关注这个epoll的进程，是因为有时候调用epoll_wait()的不只一个进程，当多个进程都在关注同一个epoll实例时，休眠的进程们通过这个等待队列就可以逐个被唤醒了。

多个进程关注同一个epoll实例，那么有事件发生后先唤醒谁？后唤醒谁？还是一起全唤醒？这涉及到一个称为“惊群效应”的问题。

Question 3：什么是epoll惊群？

答案：多个进程等待在ep->wq上，事件触发后所有进程都被唤醒，但只有其中1个进程能够成功继续执行的现象。其他被白白唤起的进程等于做了无用功，可能会造成系统负载过高的问题。下面这段代码能够直观感受什么是epoll惊群：

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/epoll.h>
#include <netdb.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/wait.h>

#define PROCESS_NUM 10

static int create_and_bind (char *port)
{
    int fd = socket(PF_INET, SOCK_STREAM, 0);
    struct sockaddr_in serveraddr;
    serveraddr.sin_family = AF_INET;
    serveraddr.sin_addr.s_addr = htonl(INADDR_ANY);
    serveraddr.sin_port = htons(atoi(port));
    bind(fd, (struct sockaddr*)&serveraddr, sizeof(serveraddr));
    return fd;
}

static int make_socket_non_blocking (int sfd)
{
    int flags, s;

    flags = fcntl (sfd, F_GETFL, 0);
    if (flags == -1)
    {
        perror ("fcntl");
        return -1;
    }

    flags |= O_NONBLOCK;
    s = fcntl (sfd, F_SETFL, flags);
    if (s == -1)
    {
        perror ("fcntl");
        return -1;
    }

    return 0;
}

#define MAXEVENTS 64

int main (int argc, char *argv[])
{
    int sfd, s;
    int efd;
    struct epoll_event event;
    struct epoll_event *events;

    sfd = create_and_bind("8001");
    if (sfd == -1)
        abort ();

    s = make_socket_non_blocking (sfd);
    if (s == -1)
        abort ();

    s = listen(sfd, SOMAXCONN);
    if (s == -1)
    {
        perror ("listen");
        abort ();
    }

    efd = epoll_create(MAXEVENTS);
    if (efd == -1)
    {
        perror("epoll_create");
        abort();
    }

    event.data.fd = sfd;
    //event.events = EPOLLIN | EPOLLET;
    event.events = EPOLLIN;
    s = epoll_ctl(efd, EPOLL_CTL_ADD, sfd, &event);
    if (s == -1)
    {
        perror("epoll_ctl");
        abort();
    }

    /* Buffer where events are returned */
    events = calloc(MAXEVENTS, sizeof event);
    int k;
    for(k = 0; k < PROCESS_NUM; k++)
    {
        int pid = fork();
        if(pid == 0)
        {

            /* The event loop */
            while (1)
            {
                int n, i;
                n = epoll_wait(efd, events, MAXEVENTS, -1);
                printf("process %d return from epoll_wait!\n", getpid());
             for (i = 0; i < n; i++)
                {
                    if ((events[i].events & EPOLLERR) || (events[i].events & EPOLLHUP) || (!(events[i].events & EPOLLIN)))
                    {
                        /* An error has occured on this fd, or the socket is not ready for reading (why were we notified then?) */
                        fprintf (stderr, "epoll error\n");
                        close (events[i].data.fd);
                        continue;
                    }
                    else if (sfd == events[i].data.fd)
                    {
                        /* We have a notification on the listening socket, which means one or more incoming connections. */
                        struct sockaddr in_addr;
                        socklen_t in_len;
                        int infd;
                        char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];

                        in_len = sizeof in_addr;
                        infd = accept(sfd, &in_addr, &in_len);
                        if (infd == -1)
                        {
                            printf("process %d accept failed!\n", getpid());
                            break;
                        }
                        printf("process %d accept successed!\n", getpid());

                        /* Make the incoming socket non-blocking and add it to the list of fds to monitor. */
                        close(infd);
                    }
                }
            }
        }
    }
    int status;
    wait(&status);
    free (events);
    close (sfd);
    return EXIT_SUCCESS;
}

将服务端的监听socket fd加入到epoll_wait的监视集合中，这样当有客户端想要建立连接，就会事件触发epoll_wait返回。此时如果10个进程同时在epoll_wait同一个epoll实例就出现了惊群效应。所有10个进程都被唤起，但只有一个能成功accept。

为了解决epoll惊群，内核后续的高版本又提供了EPOLLEXCLUSIVE选项和SO_REUSEPORT选项，我个人理解两种解决方案思路上的不同点在于：EPOLLEXCLUSIVE是在唤起进程阶段起作用，只唤起排在队列最前面的1个进程；而SO_REUSEPORT是在分配连接时起作用，相当于每个进程自己都有一个独立的epoll实例，内核来决策把连接分配给哪个epoll。

Question 4：ep->poll_wait的作用是什么？

答案：ep->poll_wait是epoll实例中另一个等待队列。当被监视的文件是一个epoll类型时，需要用这个等待队列来处理递归唤醒。

在阅读内核代码过程中，ep->wq还算挺好理解，但我发现伴随着ep->wq唤醒，还有一个ep->poll_wait的唤醒过程。比如下面这段代码，在eventpoll.c中出现了很多次：

/* If the file is already "ready" we drop it inside the ready list */
    if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
        list_add_tail(&epi->rdllink, &ep->rdllist);

        /* Notify waiting tasks that events are available */
        if (waitqueue_active(&ep->wq))
            wake_up_locked(&ep->wq);
        if (waitqueue_active(&ep->poll_wait))
            pwake++;
    }

    spin_unlock_irqrestore(&ep->lock, flags);

    atomic_long_inc(&ep->user->epoll_watches);

    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(&ep->poll_wait);

查阅很多资料后才搞明白其实epoll也是一种文件类型，其底层驱动也实现了file_operations中的poll函数，因此一个epoll类型的fd可以被其他epoll实例监视。而epoll类型的fd只会有“读就绪”的事件。当epoll所监视的非epoll类型文件有“读就绪”事件时，当前epoll也会进入“读就绪”状态。

因此如果一个epoll实例监视了另一个epoll就会出现递归。举个例子，如图所示：

epollfd1监视了2个“非epoll”类型的fd
epollfd2监视了epollfd1和2个“非epoll”类型的fd

如果epollfd1所监视的2个fd中有可读事件触发，fd的ep_poll_callback回调函数会触发将fd放到epollfd1的rdllist中。此时epollfd1本身的可读事件也会触发，就需要从epollfd1的poll_wait等待队列中找到epollfd2，调用epollfd1的ep_poll_callback(将epollfd1放到epollfd2的rdllist中)。因此ep->poll_wait是用来处理epoll间嵌套监视的情况的。

Question 5：ep->rdllist的作用是什么？

答案：epoll实例中包含就绪事件的fd组成的链表。

通过扫描ep->rdllist链表，内核可以轻松获取当前有事件触发的fd。而不是像select()/poll()那样全量扫描所有被监视的fd，再从中找出有事件就绪的。因此可以说这一点决定了epoll的性能是远高于select/poll的。

看到这里你可能又产生了一个小小的疑问：为什么epoll中事件就绪的fd会“主动”跑到rdllist中去，而不用全量扫描就能找到它们呢？ 这是因为每当调用epoll_ctl新增一个被监视的fd时，都会注册一下这个fd的回调函数ep_poll_callback，当网卡收到数据包会触发一个中断，中断处理函数再回调ep_poll_callback将这个fd所属的“epitem”添加至epoll实例中的rdllist中。

Question 6：ep->ovflist的作用是什么？

答案：在rdllist被占用时，用来在不持有ep->lock的情况下收集有就绪事件的fd。

当epoll上已经有了一些就绪事件的时候，内核需要扫描rdllist将就绪的fd返回给用户态。这一步通过ep_scan_ready_list函数来实现。其中sproc是一个回调函数(也就是ep_send_events_proc函数)，来处理数据从内核态到用户态的复制。

/**
 * ep_scan_ready_list - Scans the ready list in a way that makes possible for the scan code, to call f_op->poll(). Also allows for O(NumReady) performance.
 * @ep: Pointer to the epoll private data structure.
 * @sproc: Pointer to the scan callback.
 * @priv: Private opaque data passed to the @sproc callback.
 * Returns: The same integer error code returned by the @sproc callback.
 */
static int ep_scan_ready_list(struct eventpoll *ep,
                  int (*sproc)(struct eventpoll *,
                       struct list_head *, void *),
                  void *priv)

由于rdllist链表业务非常繁忙（epoll增加监视文件、修改监视文件、有事件触发…等情况都需要操作rdllist)，所以在复制数据到用户空间时，加了一个ep->mtx互斥锁来保护epoll自身数据结构线程安全，此时其他执行流程里有争抢ep->mtx的操作都会因命中ep->mtx进入休眠。

但加锁期间很可能有新事件源源不断地产生，进而调用ep_poll_callback(ep_poll_callback不用争抢ep->mtx所以不会休眠)，新触发的事件需要一个地方来收集，不然就丢事件了。这个用来临时收集新事件的链表就是ovflist。我的理解是：引入ovflist后新产生的事件就不用因为想向rdllist里写而去和ep_send_events_proc争抢自旋锁(ep->lock),同时ep_send_events_proc也可以放心大胆地在无锁(不持有ep->lock)的情况下修改rdllist。

看代码时会发现，还有一个txlist链表，这个链表用来最后向用户态复制数据，rdllist要先把自己的数据全部转移到txlist，然后rdllist自己被清空。ep_send_events_proc遍历txlist处理向用户空间复制，复制成功后如果是水平触发(LT)还要把这个事件还回rdllist，等待下一次epoll_wait来获取它。

ovflist上的fd会合入rdllist上等待下一次扫描；如果txlist上的fd没有处理完，最后也会合入rdllist。这3个链表的关系是这样：

Question 7：epitem->pwqlist队列的作用是什么？

答案：用来保存这个epitem的poll等待队列。

首先介绍下什么是epitem。epitem是epoll中很重要的一种数据结构，是红黑树和rdllist的基本组成元素。需要监听的文件和事件信息，都被包装在epitem结构里。

struct epitem {
    struct rb_node rbn;  // 用于加入红黑树
    struct list_head rdllink; // 用于加入rdllist
    struct epoll_filefd ffd; // 包含被监视文件的文件指针和fd信息
    struct list_head pwqlist; // poll等待队列
    struct eventpoll *ep; // 所属的epoll实例
    struct epoll_event event;  // 关注的事件
    /* 其他成员省略 */
};

回忆一下上文说到，每当用户调用epoll_ctl()新增一个监视文件，都要给这个文件注册一个回调函数ep_poll_callback,当网卡收到数据后软中断会调用这个ep_poll_callback把这个epitem加入到ep->rdllist中。

pwdlist就是跟ep_poll_callback注册相关的。

当调用epoll_ctl()新增一个监视文件后，内核会为这个epitem创建一个eppoll_entry对象，通过eppoll_entry->wait_queue_t->wait_queue_func_t来设置ep_poll_callback。pwdlist为什么要做成一个队列呢，直接设置成eppoll_entry对象不就行了吗？实际上不同文件类型实现file_operations->poll用到等待队列数量可能不同。虽然大多数都是1个，但也有例外。比如“scullpipe”类型的文件就用到了2个等待队列。

pwqlist、epitem、fd、epoll_entry、ep_poll_callback间的关系是这样：

Question 8：epmutex、ep->mtx、ep->lock 3把锁的区别是？

答案：锁的粒度和使用目的不同。

epmutex是一个全局互斥锁，epoll中一共只有3个地方用到这把锁。分别是ep_free()销毁一个epoll实例时、eventpoll_release_file()清理从epoll中已经关闭的文件时、epoll_ctl()时避免epoll间嵌套调用时形成死锁。我的理解是epmutex的锁粒度最大，用来处理跨epoll实例级别的同步操作。
ep->mtx是一个epoll内部的互斥锁，在ep_scan_ready_list()扫描就绪列表、eventpoll_release_file()中执行ep_remove()删除一个被监视文件、ep_loop_check_proc()检查epoll是否有循环嵌套或过深嵌套、还有epoll_ctl()操作被监视文件增删改等处有使用。可以看出上述的函数里都会涉及对epoll实例中rdllist或红黑树的访问，因此我的理解是ep->mtx是一个epoll实例内的互斥锁，用来保护epoll实例内部的数据结构的线程安全。
ep->lock是一个epoll实例内部的自旋锁，用来保护ep->rdllist的线程安全。自旋锁的特点是得不到锁时不会引起进程休眠，所以在ep_poll_callback中只能使用ep->lock，否则就会丢事件。

Question 9：epoll使用红黑树的目的是什么？

答案：用来维护一个epoll实例中所有的epitem。

用户态调用epoll_ctl()来操作epoll的监视文件时，需要增、删、改、查等动作有着比较高的效率。尤其是当epoll监视的文件数量达到百万级的时候，选用不同的数据结构带来的效率差异可能非常大。

从时间(增、删、改、查、按序遍历)、空间(存储空间大小、扩展性)等方面考量，红黑树都是非常优秀的数据结构(当然这以红黑树比较高的实现复杂度作为代价)。epoll红黑树中的epitem是按什么顺序组织的。阅读代码可以发现是先比较2个文件指针的地址大小，如果相同再比较文件fd的大小。

/* Compare RB tree keys */
static inline int ep_cmp_ffd(struct epoll_filefd *p1, struct epoll_filefd *p2)
{
    return (p1->file > p2->file ? +1 : (p1->file < p2->file ? -1 : p1->fd - p2->fd));
}

epoll、epitem、和红黑树间的组织关系是这样：

Question 10：什么是水平触发、边缘触发？

答案：水平触发(LT)和边缘触发(ET)是epoll_wait的2种工作模式。水平触发：关注点是数据（读操作缓冲区不为空，写操作缓冲区不为满），epoll_wait总会返回就绪。LT是epoll的默认工作模式。

边缘触发：关注点是变化，只有监视的文件上有数据变化发生（读操作关注有数据写进缓冲区，写操作关注数据从缓冲区取走），epoll_wait才会返回。

看一个实验 ,直观感受下2种模式的区别,客户端都是输入“abcdefgh” 8个字符，服务端每次接收2个字符。

水平触发时，客户端输入8个字符触发了一次读就绪事件，由于被监视文件上还有数据可读故一直返回读就绪，服务端4次循环每次都能取到2个字符，直到8个字符全部读完。

边缘触发时，客户端同样输入8个字符但服务端一次循环读到2个字符后这个读就绪事件就没有了。等客户端再输入一个字符串后，服务端关注到了数据的“变化”继续从缓冲区读接下来的2个字符“c”和”d”。

小结

本文通过10个问题，其实也是从10个不同的视角去观察epoll这间宏伟的殿堂。至此也基本介绍完了epoll从监视事件，到内部数据结构组织、事件处理，最后到epoll_wait返回的整体工作过程。最后附上一张epoll相关数据结构间的关系图，在学习epoll过程中它曾解答了我心中不少的疑惑，我愿称之为灯塔~

最近更新于0001-01-01