On Sun, 2004-05-30 at 13:26 +0100, Joe Orton wrote: > Interesting... some nits: > > - C++ // comments bad ;) > - CPP #directives must not have leading whitespace before the #
Fixed in the updated patch. > - needs to cope with the fact that glibc implements epoll* returning > ENOSYS or whatever on 2.4 kernels IIRC, via autoconf or runtime checks I would prefer doing this in autoconf, unless people are willing to accept a much larger patch that could make the pollset back end selectable at runtime. The other issue with this is if APR was built on a machine running 2.6, it might not be usable on a machine with 2.4. That would be the other advantage of making the pollset back end runtime selectable. Another goal of mine is to make a KQueue() based back end for FreeBSD sooner or later. > An issue I mentioned before w.r.t. removal of apr_poll() in HEAD: this > type of implementation will surely end up being slower than using poll() > for polling on small numbers of fds (a not uncommon case), due to the > fact that it requires three rather than one system call? Some > benchmarks might be interesting. I did a simple benchmark with an epoll enabled HTTPd 2.1 yesterday. [mostly to see if it all worked correctly :) ] It did a few transactions a second _faster_ than a normal poll() based HTTPd 2.1. Not significantly better or worse for Apache's common case of not very many sockets. Maybe later this week I can make up some more official benchmarks. > I wondered whether apr_poll() should remain using poll() with it's low > constant overhead, and the apr_pollset_* interface alone should use the > high-constant-overhead interfaces like epoll or /dev/poll? Maybe. I don't think poll's overhead is much less than epoll. We will have to benchmark it to find a real answer. -Paul Querna
Index: configure.in =================================================================== RCS file: /home/cvspublic/apr/configure.in,v retrieving revision 1.581 diff -u -r1.581 configure.in --- configure.in 24 May 2004 09:33:26 -0000 1.581 +++ configure.in 30 May 2004 21:44:52 -0000 @@ -636,6 +636,13 @@ AC_CHECK_FUNCS(poll) +haveepoll=no +AC_CHECK_FUNCS(epoll_ctl, [haveepoll=yes], ) +if test "x$haveepoll" = "xyes" ; then + AC_DEFINE(HAVE_EPOLL, 1, + [Define if your system supports the epoll system calls]) +fi + dnl ----------------------------- Checking for missing POSIX thread functions AC_CHECK_FUNCS([getpwnam_r getpwuid_r getgrnam_r getgrgid_r]) Index: poll/unix/poll.c =================================================================== RCS file: /home/cvspublic/apr/poll/unix/poll.c,v retrieving revision 1.44 diff -u -r1.44 poll.c --- poll/unix/poll.c 13 Feb 2004 09:38:33 -0000 1.44 +++ poll/unix/poll.c 30 May 2004 21:44:54 -0000 @@ -25,15 +25,142 @@ #if HAVE_SYS_POLL_H #include <sys/poll.h> #endif - +#ifdef HAVE_EPOLL +#include <sys/epoll.h> +#endif #ifdef NETWARE #define HAS_SOCKETS(dt) (dt == APR_POLL_SOCKET) ? 1 : 0 #define HAS_PIPES(dt) (dt == APR_POLL_FILE) ? 1 : 0 #endif +#ifdef HAVE_EPOLL +static apr_int16_t get_event(apr_int16_t event) +{ + apr_int16_t rv = 0; -#ifdef HAVE_POLL /* We can just use poll to do our socket polling. */ + if (event & APR_POLLIN) + rv |= EPOLLIN; + if (event & APR_POLLPRI) + rv |= EPOLLPRI; + if (event & APR_POLLOUT) + rv |= EPOLLOUT; + if (event & APR_POLLERR) + rv |= EPOLLERR; + if (event & APR_POLLHUP) + rv |= EPOLLHUP; + /** + * APR_POLLNVAL is not handled by epoll? + */ + return rv; +} + +static apr_int16_t get_revent(apr_int16_t event) +{ + apr_int16_t rv = 0; + + if (event & EPOLLIN) + rv |= APR_POLLIN; + if (event & EPOLLPRI) + rv |= APR_POLLPRI; + if (event & EPOLLOUT) + rv |= APR_POLLOUT; + if (event & EPOLLERR) + rv |= APR_POLLERR; + if (event & EPOLLHUP) + rv |= APR_POLLHUP; + /** + * APR_POLLNVAL is not handled by epoll? + */ + + return rv; +} + +#define SMALL_POLLSET_LIMIT 8 + +APR_DECLARE(apr_status_t) apr_poll(apr_pollfd_t *aprset, apr_int32_t num, + apr_int32_t *nsds, apr_interval_time_t timeout) +{ + int i, num_to_poll; + int epoll_fd; + struct epoll_event ev; +#ifdef HAVE_VLA + /* XXX: I trust that this is a segv when insufficient stack exists? */ + struct epoll_event pollset[num]; +#elif defined(HAVE_ALLOCA) + struct epoll_event *pollset = alloca(sizeof(struct epoll_event) * num); + if (!pollset) + return APR_ENOMEM; +#else + struct epoll_event tmp_pollset[SMALL_POLLSET_LIMIT]; + struct epoll_event *pollset; + + if (num <= SMALL_POLLSET_LIMIT) { + pollset = tmp_pollset; + } + else { + /* This does require O(n) to copy the descriptors to the internal + * mapping. + */ + pollset = malloc(sizeof(struct epoll_event) * num); + /* The other option is adding an apr_pool_abort() fn to invoke + * the pool's out of memory handler + */ + if (!pollset) + return APR_ENOMEM; + } +#endif + + epoll_fd = epoll_create(num); + if(epoll_fd < 0) + return APR_ENOMEM; + + for (i = 0; i < num; i++) { + ev.events = get_event(aprset[i].reqevents); + if (aprset[i].desc_type == APR_POLL_SOCKET) { + ev.data.fd = aprset[i].desc.s->socketdes; + epoll_ctl(epoll_fd, EPOLL_CTL_ADD, aprset[i].desc.s->socketdes, &ev); + } + else if (aprset[i].desc_type == APR_POLL_FILE) { + ev.data.fd = aprset[i].desc.f->filedes; + epoll_ctl(epoll_fd, EPOLL_CTL_ADD, aprset[i].desc.f->filedes, &ev); + } + else { + break; + } + } + num_to_poll = i; + + if (timeout > 0) { + timeout /= 1000; /* convert microseconds to milliseconds */ + } + + i = epoll_wait(epoll_fd, pollset, num_to_poll, timeout); + (*nsds) = i; + + for (i = 0; i < num; i++) { + aprset[i].rtnevents = get_revent(pollset[i].events); + } + +#if !defined(HAVE_VLA) && !defined(HAVE_ALLOCA) + if (num > SMALL_POLLSET_LIMIT) { + free(pollset); + } +#endif + + close(epoll_fd); + + if ((*nsds) < 0) { + return apr_get_netos_error(); + } + if ((*nsds) == 0) { + return APR_TIMEUP; + } + return APR_SUCCESS; + +} + +#elif defined(HAVE_POLL) /* We can just use poll to do our socket polling. */ static apr_int16_t get_event(apr_int16_t event) { @@ -286,7 +413,10 @@ struct apr_pollset_t { apr_uint32_t nelts; apr_uint32_t nalloc; -#ifdef HAVE_POLL +#ifdef HAVE_EPOLL + int epoll_fd; + struct epoll_event *pollset; +#elif defined(HAVE_POLL) struct pollfd *pollset; #else fd_set readset, writeset, exceptset; @@ -305,7 +435,7 @@ apr_pool_t *p, apr_uint32_t flags) { -#if !defined(HAVE_POLL) && defined(FD_SETSIZE) +#if !defined(HAVE_EPOLL) && !defined(HAVE_POLL) && defined(FD_SETSIZE) if (size > FD_SETSIZE) { *pollset = NULL; return APR_EINVAL; @@ -314,7 +444,10 @@ *pollset = apr_palloc(p, sizeof(**pollset)); (*pollset)->nelts = 0; (*pollset)->nalloc = size; -#ifdef HAVE_POLL +#ifdef HAVE_EPOLL + (*pollset)->epoll_fd = epoll_create(size); + (*pollset)->pollset = apr_palloc(p, size * sizeof(struct epoll_event)); +#elif defined(HAVE_POLL) (*pollset)->pollset = apr_palloc(p, size * sizeof(struct pollfd)); #else FD_ZERO(&((*pollset)->readset)); @@ -333,16 +466,20 @@ APR_DECLARE(apr_status_t) apr_pollset_destroy(apr_pollset_t *pollset) { - /* A no-op function for now. If we later implement /dev/poll - * support, we'll need to close the /dev/poll fd here - */ +#ifdef HAVE_EPOLL + close(pollset->epoll_fd); +#endif return APR_SUCCESS; } APR_DECLARE(apr_status_t) apr_pollset_add(apr_pollset_t *pollset, const apr_pollfd_t *descriptor) { -#ifndef HAVE_POLL +#ifdef HAVE_EPOLL + struct epoll_event ev; + int ret = -1; +#endif +#if !defined(HAVE_POLL) apr_os_sock_t fd; #endif @@ -351,7 +488,20 @@ } pollset->query_set[pollset->nelts] = *descriptor; -#ifdef HAVE_POLL +#ifdef HAVE_EPOLL + ev.events = get_event(descriptor->reqevents); + if (descriptor->desc_type == APR_POLL_SOCKET) { + ev.data.fd = descriptor->desc.s->socketdes; + ret = epoll_ctl(pollset->epoll_fd, EPOLL_CTL_ADD, descriptor->desc.s->socketdes, &ev); + } + else { + ev.data.fd = descriptor->desc.f->filedes; + ret = epoll_ctl(pollset->epoll_fd, EPOLL_CTL_ADD, descriptor->desc.f->filedes, &ev); + } + if(0 != ret) { + return APR_EBADF; + } +#elif defined(HAVE_POLL) if (descriptor->desc_type == APR_POLL_SOCKET) { pollset->pollset[pollset->nelts].fd = descriptor->desc.s->socketdes; @@ -420,11 +570,47 @@ const apr_pollfd_t *descriptor) { apr_uint32_t i; +#ifdef HAVE_EPOLL + struct epoll_event ev; + int ret = -1; +#endif #ifndef HAVE_POLL apr_os_sock_t fd; #endif -#ifdef HAVE_POLL +#ifdef HAVE_EPOLL + for (i = 0; i < pollset->nelts; i++) { + if (descriptor->desc.s == pollset->query_set[i].desc.s) { + /* Found an instance of the fd: remove this and any other copies */ + apr_uint32_t dst = i; + apr_uint32_t old_nelts = pollset->nelts; + pollset->nelts--; + for (i++; i < old_nelts; i++) { + if (descriptor->desc.s == pollset->query_set[i].desc.s) { + pollset->nelts--; + } + else { + pollset->query_set[dst] = pollset->query_set[i]; + dst++; + } + } + ev.events = get_event(descriptor->reqevents); + if (descriptor->desc_type == APR_POLL_SOCKET) { + ev.data.fd = descriptor->desc.s->socketdes; + ret = epoll_ctl(pollset->epoll_fd, EPOLL_CTL_DEL, descriptor->desc.s->socketdes, &ev); + } + else { + ev.data.fd = descriptor->desc.f->filedes; + ret = epoll_ctl(pollset->epoll_fd, EPOLL_CTL_DEL, descriptor->desc.f->filedes, &ev); + } + if(ret < 0) { + return APR_EBADF; + } + + return APR_SUCCESS; + } + } +#elif defined(HAVE_POLL) for (i = 0; i < pollset->nelts; i++) { if (descriptor->desc.s == pollset->query_set[i].desc.s) { /* Found an instance of the fd: remove this and any other copies */ @@ -486,7 +672,58 @@ return APR_NOTFOUND; } -#ifdef HAVE_POLL +#ifdef HAVE_EPOLL +APR_DECLARE(apr_status_t) apr_pollset_poll(apr_pollset_t *pollset, + apr_interval_time_t timeout, + apr_int32_t *num, + const apr_pollfd_t **descriptors) +{ + int rv; + apr_uint32_t i, j, k; + + if (timeout > 0) { + timeout /= 1000; + } + + rv = epoll_wait(pollset->epoll_fd, pollset->pollset , pollset->nelts, timeout); + (*num) = rv; + if (rv < 0) { + return apr_get_netos_error(); + } + if (rv == 0) { + return APR_TIMEUP; + } + j = 0; + for (i = 0; i < pollset->nelts; i++) { + if (pollset->pollset[i].events != 0) { + /* TODO: Is there a better way to re-associate our data? */ + for(k = 0; k < pollset->nelts; k++) { + if(pollset->query_set[k].desc_type == APR_POLL_SOCKET && + pollset->query_set[k].desc.s->socketdes == pollset->pollset[i].data.fd + ) { + pollset->result_set[j] = pollset->query_set[k]; + pollset->result_set[j].rtnevents = + get_revent(pollset->pollset[i].events); + j++; + break; + } + else if(pollset->query_set[k].desc_type == APR_POLL_FILE && + pollset->query_set[k].desc.f->filedes == pollset->pollset[i].data.fd + ) { + pollset->result_set[j] = pollset->query_set[k]; + pollset->result_set[j].rtnevents = + get_revent(pollset->pollset[i].events); + j++; + break; + } + } + } + } + if (descriptors) + *descriptors = pollset->result_set; + return APR_SUCCESS; +} +#elif defined(HAVE_POLL) APR_DECLARE(apr_status_t) apr_pollset_poll(apr_pollset_t *pollset, apr_interval_time_t timeout, apr_int32_t *num, Index: test/testpoll.c =================================================================== RCS file: /home/cvspublic/apr/test/testpoll.c,v retrieving revision 1.33 diff -u -r1.33 testpoll.c --- test/testpoll.c 26 May 2004 14:50:27 -0000 1.33 +++ test/testpoll.c 30 May 2004 21:44:54 -0000 @@ -489,10 +489,12 @@ rv = apr_pollset_poll(pollset, 1000, &num, &hot_files); ABTS_INT_EQUAL(tc, APR_SUCCESS, rv); ABTS_INT_EQUAL(tc, 2, num); - ABTS_PTR_EQUAL(tc, (void *)1, hot_files[0].client_data); - ABTS_PTR_EQUAL(tc, s[0], hot_files[0].desc.s); - ABTS_PTR_EQUAL(tc, (void *)4, hot_files[1].client_data); - ABTS_PTR_EQUAL(tc, s[3], hot_files[1].desc.s); + ABTS_ASSERT(tc, "Incorrect socket in result set", + ((hot_files[0].desc.s == s[0]) && (hot_files[1].desc.s == s[3])) || + ((hot_files[0].desc.s == s[3]) && (hot_files[1].desc.s == s[0]))); + ABTS_ASSERT(tc, "Incorrect client data in result set", + ((hot_files[0].client_data == (void *)1) && (hot_files[1].client_data == (void *)4)) || + ((hot_files[0].client_data == (void *)4) && (hot_files[1].client_data == (void *)1))); } abts_suite *testpoll(abts_suite *suite)