On Sun, 2004-05-30 at 13:26 +0100, Joe Orton wrote:
> Interesting... some nits:
> 
> - C++ // comments bad ;)
> - CPP #directives must not have leading whitespace before the #

Fixed in the updated patch.

> - needs to cope with the fact that glibc implements epoll* returning 
> ENOSYS or whatever on 2.4 kernels IIRC, via autoconf or runtime checks

I would prefer doing this in autoconf, unless people are willing to
accept a much larger patch that could make the pollset back end
selectable at runtime.  The other issue with this is if APR was built on
a machine running 2.6, it might not be usable on a machine with 2.4.
That would be the other advantage of making the pollset back end runtime
selectable.  

Another goal of mine is to make a KQueue() based back end for FreeBSD
sooner or later.

> An issue I mentioned before w.r.t. removal of apr_poll() in HEAD: this
> type of implementation will surely end up being slower than using poll()
> for polling on small numbers of fds (a not uncommon case), due to the
> fact that it requires three rather than one system call?  Some
> benchmarks might be interesting.

I did a simple benchmark with an epoll enabled HTTPd 2.1 yesterday.
[mostly to see if it all worked correctly :) ]

It did a few transactions a second _faster_ than a normal poll() based
HTTPd 2.1.  Not significantly better or worse for Apache's common case
of not very many sockets.

Maybe later this week I can make up some more official benchmarks. 

> I wondered whether apr_poll() should remain using poll() with it's low
> constant overhead, and the apr_pollset_* interface alone should use the
> high-constant-overhead interfaces like epoll or /dev/poll?

Maybe.  I don't think poll's overhead is much less than epoll.  We will
have to benchmark it to find a real answer.

-Paul Querna
Index: configure.in
===================================================================
RCS file: /home/cvspublic/apr/configure.in,v
retrieving revision 1.581
diff -u -r1.581 configure.in
--- configure.in	24 May 2004 09:33:26 -0000	1.581
+++ configure.in	30 May 2004 21:44:52 -0000
@@ -636,6 +636,13 @@
 
 AC_CHECK_FUNCS(poll)
 
+haveepoll=no
+AC_CHECK_FUNCS(epoll_ctl, [haveepoll=yes], )
+if test "x$haveepoll" = "xyes" ; then
+        AC_DEFINE(HAVE_EPOLL, 1,
+                [Define if your system supports the epoll system calls])
+fi
+
 dnl ----------------------------- Checking for missing POSIX thread functions
 AC_CHECK_FUNCS([getpwnam_r getpwuid_r getgrnam_r getgrgid_r])
 
Index: poll/unix/poll.c
===================================================================
RCS file: /home/cvspublic/apr/poll/unix/poll.c,v
retrieving revision 1.44
diff -u -r1.44 poll.c
--- poll/unix/poll.c	13 Feb 2004 09:38:33 -0000	1.44
+++ poll/unix/poll.c	30 May 2004 21:44:54 -0000
@@ -25,15 +25,142 @@
 #if HAVE_SYS_POLL_H
 #include <sys/poll.h>
 #endif
-
+#ifdef HAVE_EPOLL
+#include <sys/epoll.h>
+#endif
 
 #ifdef NETWARE
 #define HAS_SOCKETS(dt) (dt == APR_POLL_SOCKET) ? 1 : 0
 #define HAS_PIPES(dt) (dt == APR_POLL_FILE) ? 1 : 0
 #endif
 
+#ifdef HAVE_EPOLL
+static apr_int16_t get_event(apr_int16_t event)
+{
+    apr_int16_t rv = 0;
 
-#ifdef HAVE_POLL    /* We can just use poll to do our socket polling. */
+    if (event & APR_POLLIN)
+        rv |= EPOLLIN;        
+    if (event & APR_POLLPRI)
+        rv |= EPOLLPRI;        
+    if (event & APR_POLLOUT)
+        rv |= EPOLLOUT;       
+    if (event & APR_POLLERR)
+        rv |= EPOLLERR;        
+    if (event & APR_POLLHUP)
+        rv |= EPOLLHUP;        
+    /**
+     * APR_POLLNVAL is not handled by epoll?
+     */
+    return rv;
+}
+
+static apr_int16_t get_revent(apr_int16_t event)
+{
+    apr_int16_t rv = 0;
+
+    if (event & EPOLLIN)
+        rv |= APR_POLLIN;
+    if (event & EPOLLPRI)
+        rv |= APR_POLLPRI;
+    if (event & EPOLLOUT)
+        rv |= APR_POLLOUT;
+    if (event & EPOLLERR)
+        rv |= APR_POLLERR;
+    if (event & EPOLLHUP)
+        rv |= APR_POLLHUP;
+    /**
+     * APR_POLLNVAL is not handled by epoll?
+     */
+
+    return rv;
+}        
+
+#define SMALL_POLLSET_LIMIT  8
+
+APR_DECLARE(apr_status_t) apr_poll(apr_pollfd_t *aprset, apr_int32_t num,
+                      apr_int32_t *nsds, apr_interval_time_t timeout)
+{
+    int i, num_to_poll;
+    int epoll_fd;
+    struct epoll_event ev;
+#ifdef HAVE_VLA
+    /* XXX: I trust that this is a segv when insufficient stack exists? */
+    struct epoll_event pollset[num];
+#elif defined(HAVE_ALLOCA)
+    struct epoll_event *pollset = alloca(sizeof(struct epoll_event) * num);
+    if (!pollset)
+        return APR_ENOMEM;
+#else
+    struct epoll_event tmp_pollset[SMALL_POLLSET_LIMIT];
+    struct epoll_event *pollset;
+
+    if (num <= SMALL_POLLSET_LIMIT) {
+        pollset = tmp_pollset;
+    }
+    else {
+        /* This does require O(n) to copy the descriptors to the internal
+         * mapping.
+         */
+        pollset = malloc(sizeof(struct epoll_event) * num);
+        /* The other option is adding an apr_pool_abort() fn to invoke
+         * the pool's out of memory handler
+         */
+        if (!pollset)
+            return APR_ENOMEM;
+    }
+#endif
+
+    epoll_fd = epoll_create(num);
+    if(epoll_fd < 0) 
+        return APR_ENOMEM;
+
+    for (i = 0; i < num; i++) {
+        ev.events = get_event(aprset[i].reqevents);
+        if (aprset[i].desc_type == APR_POLL_SOCKET) {
+            ev.data.fd = aprset[i].desc.s->socketdes;
+            epoll_ctl(epoll_fd, EPOLL_CTL_ADD, aprset[i].desc.s->socketdes, &ev);
+        }
+        else if (aprset[i].desc_type == APR_POLL_FILE) {
+            ev.data.fd = aprset[i].desc.f->filedes;
+            epoll_ctl(epoll_fd, EPOLL_CTL_ADD, aprset[i].desc.f->filedes, &ev);
+        }
+        else {
+            break;
+        }
+    }
+    num_to_poll = i;
+
+    if (timeout > 0) {
+        timeout /= 1000; /* convert microseconds to milliseconds */
+    }
+
+    i = epoll_wait(epoll_fd, pollset, num_to_poll, timeout);
+    (*nsds) = i;
+
+    for (i = 0; i < num; i++) {
+        aprset[i].rtnevents = get_revent(pollset[i].events);
+    }
+    
+#if !defined(HAVE_VLA) && !defined(HAVE_ALLOCA)
+    if (num > SMALL_POLLSET_LIMIT) {
+        free(pollset);
+    }
+#endif
+
+    close(epoll_fd);
+
+    if ((*nsds) < 0) {
+        return apr_get_netos_error();
+    }
+    if ((*nsds) == 0) {
+        return APR_TIMEUP;
+    }
+    return APR_SUCCESS;
+
+}
+
+#elif defined(HAVE_POLL)    /* We can just use poll to do our socket polling. */
 
 static apr_int16_t get_event(apr_int16_t event)
 {
@@ -286,7 +413,10 @@
 struct apr_pollset_t {
     apr_uint32_t nelts;
     apr_uint32_t nalloc;
-#ifdef HAVE_POLL
+#ifdef HAVE_EPOLL
+    int epoll_fd;
+    struct epoll_event *pollset;
+#elif defined(HAVE_POLL)
     struct pollfd *pollset;
 #else
     fd_set readset, writeset, exceptset;
@@ -305,7 +435,7 @@
                                              apr_pool_t *p,
                                              apr_uint32_t flags)
 {
-#if !defined(HAVE_POLL) && defined(FD_SETSIZE)
+#if !defined(HAVE_EPOLL) && !defined(HAVE_POLL) && defined(FD_SETSIZE)
     if (size > FD_SETSIZE) {
         *pollset = NULL;
         return APR_EINVAL;
@@ -314,7 +444,10 @@
     *pollset = apr_palloc(p, sizeof(**pollset));
     (*pollset)->nelts = 0;
     (*pollset)->nalloc = size;
-#ifdef HAVE_POLL
+#ifdef HAVE_EPOLL
+    (*pollset)->epoll_fd = epoll_create(size);
+    (*pollset)->pollset = apr_palloc(p, size * sizeof(struct epoll_event));
+#elif defined(HAVE_POLL)
     (*pollset)->pollset = apr_palloc(p, size * sizeof(struct pollfd));
 #else
     FD_ZERO(&((*pollset)->readset));
@@ -333,16 +466,20 @@
 
 APR_DECLARE(apr_status_t) apr_pollset_destroy(apr_pollset_t *pollset)
 {
-    /* A no-op function for now.  If we later implement /dev/poll
-     * support, we'll need to close the /dev/poll fd here
-     */
+#ifdef HAVE_EPOLL
+    close(pollset->epoll_fd);
+#endif
     return APR_SUCCESS;
 }
 
 APR_DECLARE(apr_status_t) apr_pollset_add(apr_pollset_t *pollset,
                                           const apr_pollfd_t *descriptor)
 {
-#ifndef HAVE_POLL
+#ifdef HAVE_EPOLL
+    struct epoll_event ev;
+    int ret = -1;
+#endif
+#if !defined(HAVE_POLL)
     apr_os_sock_t fd;
 #endif
 
@@ -351,7 +488,20 @@
     }
 
     pollset->query_set[pollset->nelts] = *descriptor;
-#ifdef HAVE_POLL
+#ifdef HAVE_EPOLL
+    ev.events = get_event(descriptor->reqevents);
+    if (descriptor->desc_type == APR_POLL_SOCKET) {
+        ev.data.fd = descriptor->desc.s->socketdes;
+        ret = epoll_ctl(pollset->epoll_fd, EPOLL_CTL_ADD, descriptor->desc.s->socketdes, &ev);
+    }
+    else {
+        ev.data.fd = descriptor->desc.f->filedes;
+        ret = epoll_ctl(pollset->epoll_fd, EPOLL_CTL_ADD, descriptor->desc.f->filedes, &ev);
+    }
+    if(0 != ret) {
+        return APR_EBADF;
+    } 
+#elif defined(HAVE_POLL)
 
     if (descriptor->desc_type == APR_POLL_SOCKET) {
         pollset->pollset[pollset->nelts].fd = descriptor->desc.s->socketdes;
@@ -420,11 +570,47 @@
                                              const apr_pollfd_t *descriptor)
 {
     apr_uint32_t i;
+#ifdef HAVE_EPOLL
+    struct epoll_event ev;
+    int ret = -1;
+#endif
 #ifndef HAVE_POLL
     apr_os_sock_t fd;
 #endif
 
-#ifdef HAVE_POLL
+#ifdef HAVE_EPOLL
+    for (i = 0; i < pollset->nelts; i++) {
+        if (descriptor->desc.s == pollset->query_set[i].desc.s) {
+            /* Found an instance of the fd: remove this and any other copies */
+            apr_uint32_t dst = i;
+            apr_uint32_t old_nelts = pollset->nelts;
+            pollset->nelts--;
+            for (i++; i < old_nelts; i++) {
+                if (descriptor->desc.s == pollset->query_set[i].desc.s) {
+                    pollset->nelts--;
+                }
+                else {
+                    pollset->query_set[dst] = pollset->query_set[i];
+                    dst++;
+                }
+            }
+            ev.events = get_event(descriptor->reqevents);
+            if (descriptor->desc_type == APR_POLL_SOCKET) {
+                ev.data.fd = descriptor->desc.s->socketdes;
+                ret = epoll_ctl(pollset->epoll_fd, EPOLL_CTL_DEL, descriptor->desc.s->socketdes, &ev);
+            }
+            else {
+                ev.data.fd = descriptor->desc.f->filedes;
+                ret = epoll_ctl(pollset->epoll_fd, EPOLL_CTL_DEL, descriptor->desc.f->filedes, &ev);
+            }
+            if(ret < 0) {
+                return APR_EBADF;
+            }
+
+            return APR_SUCCESS;
+        }
+    }
+#elif defined(HAVE_POLL)
     for (i = 0; i < pollset->nelts; i++) {
         if (descriptor->desc.s == pollset->query_set[i].desc.s) {
             /* Found an instance of the fd: remove this and any other copies */
@@ -486,7 +672,58 @@
     return APR_NOTFOUND;
 }
 
-#ifdef HAVE_POLL
+#ifdef HAVE_EPOLL
+APR_DECLARE(apr_status_t) apr_pollset_poll(apr_pollset_t *pollset,
+                                           apr_interval_time_t timeout,
+                                           apr_int32_t *num,
+                                           const apr_pollfd_t **descriptors)
+{
+    int rv;
+    apr_uint32_t i, j, k;
+
+    if (timeout > 0) {
+        timeout /= 1000;
+    }
+
+    rv = epoll_wait(pollset->epoll_fd, pollset->pollset , pollset->nelts, timeout);
+    (*num) = rv;
+    if (rv < 0) {
+        return apr_get_netos_error();
+    }
+    if (rv == 0) {
+        return APR_TIMEUP;
+    }
+    j = 0;
+    for (i = 0; i < pollset->nelts; i++) {
+        if (pollset->pollset[i].events != 0) {
+            /* TODO: Is there a better way to re-associate our data? */
+            for(k = 0; k < pollset->nelts; k++) {
+                if(pollset->query_set[k].desc_type == APR_POLL_SOCKET &&
+                    pollset->query_set[k].desc.s->socketdes == pollset->pollset[i].data.fd
+                    ) {
+                    pollset->result_set[j] = pollset->query_set[k];
+                    pollset->result_set[j].rtnevents =
+                        get_revent(pollset->pollset[i].events);
+                    j++;
+                    break;
+                } 
+                else if(pollset->query_set[k].desc_type == APR_POLL_FILE &&
+                        pollset->query_set[k].desc.f->filedes == pollset->pollset[i].data.fd
+                ) {
+                    pollset->result_set[j] = pollset->query_set[k];
+                    pollset->result_set[j].rtnevents =
+                        get_revent(pollset->pollset[i].events);
+                    j++;
+                    break;
+                }
+            }
+        }
+    }
+    if (descriptors)
+        *descriptors = pollset->result_set;
+    return APR_SUCCESS;
+}
+#elif defined(HAVE_POLL)
 APR_DECLARE(apr_status_t) apr_pollset_poll(apr_pollset_t *pollset,
                                            apr_interval_time_t timeout,
                                            apr_int32_t *num,
Index: test/testpoll.c
===================================================================
RCS file: /home/cvspublic/apr/test/testpoll.c,v
retrieving revision 1.33
diff -u -r1.33 testpoll.c
--- test/testpoll.c	26 May 2004 14:50:27 -0000	1.33
+++ test/testpoll.c	30 May 2004 21:44:54 -0000
@@ -489,10 +489,12 @@
     rv = apr_pollset_poll(pollset, 1000, &num, &hot_files);
     ABTS_INT_EQUAL(tc, APR_SUCCESS, rv);
     ABTS_INT_EQUAL(tc, 2, num);
-    ABTS_PTR_EQUAL(tc, (void *)1, hot_files[0].client_data);
-    ABTS_PTR_EQUAL(tc, s[0], hot_files[0].desc.s);
-    ABTS_PTR_EQUAL(tc, (void *)4, hot_files[1].client_data);
-    ABTS_PTR_EQUAL(tc, s[3], hot_files[1].desc.s);
+    ABTS_ASSERT(tc, "Incorrect socket in result set",
+            ((hot_files[0].desc.s == s[0]) && (hot_files[1].desc.s == s[3])) ||
+            ((hot_files[0].desc.s == s[3]) && (hot_files[1].desc.s == s[0])));
+    ABTS_ASSERT(tc, "Incorrect client data in result set",
+            ((hot_files[0].client_data == (void *)1) && (hot_files[1].client_data == (void *)4)) ||
+            ((hot_files[0].client_data == (void *)4) && (hot_files[1].client_data == (void *)1)));
 }
 
 abts_suite *testpoll(abts_suite *suite)

Reply via email to