Module Name: src Committed By: rmind Date: Thu Jul 8 12:23:31 UTC 2010
Modified Files: src/sys/kern: sys_select.c src/sys/sys: fd_set.h lwp.h param.h selinfo.h types.h Log Message: Implement direct select/poll support, currently effective for socket and pipe subsystems. Avoids overhead of second selscan() on wake-up, and thus improves performance on certain workloads (especially when polling on many file-descriptors). Also, clean-up sys/fd_set.h header and improve macros. Welcome to 5.99.36! To generate a diff of this commit: cvs rdiff -u -r1.22 -r1.23 src/sys/kern/sys_select.c cvs rdiff -u -r1.2 -r1.3 src/sys/sys/fd_set.h cvs rdiff -u -r1.136 -r1.137 src/sys/sys/lwp.h cvs rdiff -u -r1.370 -r1.371 src/sys/sys/param.h cvs rdiff -u -r1.7 -r1.8 src/sys/sys/selinfo.h cvs rdiff -u -r1.86 -r1.87 src/sys/sys/types.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/sys/kern/sys_select.c diff -u src/sys/kern/sys_select.c:1.22 src/sys/kern/sys_select.c:1.23 --- src/sys/kern/sys_select.c:1.22 Sun Apr 25 15:55:24 2010 +++ src/sys/kern/sys_select.c Thu Jul 8 12:23:31 2010 @@ -1,11 +1,11 @@ -/* $NetBSD: sys_select.c,v 1.22 2010/04/25 15:55:24 ad Exp $ */ +/* $NetBSD: sys_select.c,v 1.23 2010/07/08 12:23:31 rmind Exp $ */ /*- * Copyright (c) 2007, 2008, 2009, 2010 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation - * by Andrew Doran. + * by Andrew Doran and Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -84,7 +84,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.22 2010/04/25 15:55:24 ad Exp $"); +__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.23 2010/07/08 12:23:31 rmind Exp $"); #include <sys/param.h> #include <sys/systm.h> @@ -109,7 +109,12 @@ /* Flags for lwp::l_selflag. */ #define SEL_RESET 0 /* awoken, interrupted, or not yet polling */ #define SEL_SCANNING 1 /* polling descriptors */ -#define SEL_BLOCKING 2 /* about to block on select_cv */ +#define SEL_BLOCKING 2 /* blocking and waiting for event */ +#define SEL_EVENT 3 /* interrupted, events set directly */ + +/* Operations: either select() or poll(). */ +#define SELOP_SELECT 1 +#define SELOP_POLL 2 /* * Per-cluster state for select()/poll(). For a system with fewer @@ -125,10 +130,16 @@ uint32_t sc_mask; } selcluster_t; -static inline int selscan(char *, u_int, register_t *); -static inline int pollscan(struct pollfd *, u_int, register_t *); +static inline int selscan(char *, const int, const size_t, register_t *); +static inline int pollscan(struct pollfd *, const int, register_t *); static void selclear(void); +static const int sel_flag[] = { + POLLRDNORM | POLLHUP | POLLERR, + POLLWRNORM | POLLHUP | POLLERR, + POLLRDBAND +}; + static syncobj_t select_sobj = { SOBJ_SLEEPQ_FIFO, sleepq_unsleep, @@ -137,7 +148,7 @@ syncobj_noowner, }; -static selcluster_t *selcluster[SELCLUSTERS]; +static selcluster_t *selcluster[SELCLUSTERS] __read_mostly; /* * Select system call. @@ -206,8 +217,8 @@ * sel_do_scan: common code to perform the scan on descriptors. */ static int -sel_do_scan(void *fds, u_int nfds, struct timespec *ts, sigset_t *mask, - register_t *retval, int selpoll) +sel_do_scan(const int op, void *fds, const int nf, const size_t ni, + struct timespec *ts, sigset_t *mask, register_t *retval) { lwp_t * const l = curlwp; proc_t * const p = l->l_proc; @@ -237,6 +248,14 @@ lock = sc->sc_lock; l->l_selcluster = sc; SLIST_INIT(&l->l_selwait); + + l->l_selret = 0; + if (op == SELOP_SELECT) { + l->l_selbits = (char *)fds + ni * 3; + l->l_selni = ni; + } else { + l->l_selbits = NULL; + } for (;;) { int ncoll; @@ -250,28 +269,51 @@ l->l_selflag = SEL_SCANNING; ncoll = sc->sc_ncoll; - if (selpoll) { - error = selscan((char *)fds, nfds, retval); + if (op == SELOP_SELECT) { + error = selscan((char *)fds, nf, ni, retval); } else { - error = pollscan((struct pollfd *)fds, nfds, retval); + error = pollscan((struct pollfd *)fds, nf, retval); } - if (error || *retval) break; if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0) break; + /* + * Acquire the lock and perform the (re)checks. Note, if + * collision has occured, then our state does not matter, + * as we must perform re-scan. Therefore, check it first. + */ +state_check: mutex_spin_enter(lock); - if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) { + if (__predict_false(sc->sc_ncoll != ncoll)) { + /* Collision: perform re-scan. */ mutex_spin_exit(lock); continue; } + if (__predict_true(l->l_selflag == SEL_EVENT)) { + /* Events occured, they are set directly. */ + mutex_spin_exit(lock); + KASSERT(l->l_selret != 0); + *retval = l->l_selret; + break; + } + if (__predict_true(l->l_selflag == SEL_RESET)) { + /* Events occured, but re-scan is requested. */ + mutex_spin_exit(lock); + continue; + } + KASSERT(l->l_selflag == SEL_SCANNING); + /* Nothing happen, therefore - sleep. */ l->l_selflag = SEL_BLOCKING; l->l_kpriority = true; sleepq_enter(&sc->sc_sleepq, l, lock); sleepq_enqueue(&sc->sc_sleepq, sc, "select", &select_sobj); error = sleepq_block(timo, true); - if (error != 0) + if (error != 0) { break; + } + /* Awoken: need to check the state. */ + goto state_check; } selclear(); @@ -326,7 +368,7 @@ getbits(ex, 2); #undef getbits - error = sel_do_scan(bits, nd, ts, mask, retval, 1); + error = sel_do_scan(SELOP_SELECT, bits, nd, ni, ts, mask, retval); if (error == 0 && u_in != NULL) error = copyout(bits + ni * 3, u_in, ni); if (error == 0 && u_ou != NULL) @@ -340,30 +382,32 @@ } static inline int -selscan(char *bits, u_int nfd, register_t *retval) +selscan(char *bits, const int nfd, const size_t ni, register_t *retval) { - static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR, - POLLWRNORM | POLLHUP | POLLERR, - POLLRDBAND }; fd_mask *ibitp, *obitp; - int msk, i, j, fd, ni, n; - fd_mask ibits, obits; + int msk, i, j, fd, n; file_t *fp; - ni = howmany(nfd, NFDBITS) * sizeof(fd_mask); ibitp = (fd_mask *)(bits + ni * 0); obitp = (fd_mask *)(bits + ni * 3); n = 0; for (msk = 0; msk < 3; msk++) { for (i = 0; i < nfd; i += NFDBITS) { + fd_mask ibits, obits; + ibits = *ibitp++; obits = 0; while ((j = ffs(ibits)) && (fd = i + --j) < nfd) { ibits &= ~(1 << j); if ((fp = fd_getfile(fd)) == NULL) return (EBADF); - if ((*fp->f_ops->fo_poll)(fp, flag[msk])) { + /* + * Setup an argument to selrecord(), which is + * a file descriptor number. + */ + curlwp->l_selrec = fd; + if ((*fp->f_ops->fo_poll)(fp, sel_flag[msk])) { obits |= (1 << j); n++; } @@ -464,7 +508,7 @@ if (error) goto fail; - error = sel_do_scan(fds, nfds, ts, mask, retval, 0); + error = sel_do_scan(SELOP_POLL, fds, nfds, ni, ts, mask, retval); if (error == 0) error = copyout(fds, u_fds, ni); fail: @@ -474,12 +518,11 @@ } static inline int -pollscan(struct pollfd *fds, u_int nfd, register_t *retval) +pollscan(struct pollfd *fds, const int nfd, register_t *retval) { - int i, n; file_t *fp; + int i, n = 0; - n = 0; for (i = 0; i < nfd; i++, fds++) { if (fds->fd < 0) { fds->revents = 0; @@ -487,6 +530,12 @@ fds->revents = POLLNVAL; n++; } else { + /* + * Perform poll: registers select request or returns + * the events which are set. Setup an argument for + * selrecord(), which is a pointer to struct pollfd. + */ + curlwp->l_selrec = (uintptr_t)fds; fds->revents = (*fp->f_ops->fo_poll)(fp, fds->events | POLLERR | POLLHUP); if (fds->revents != 0) @@ -498,7 +547,6 @@ return (0); } -/*ARGSUSED*/ int seltrue(dev_t dev, int events, lwp_t *l) { @@ -539,28 +587,73 @@ other = sip->sel_lwp; if (other == selector) { - /* `selector' has already claimed it. */ + /* 1. We (selector) already claimed to be the first LWP. */ KASSERT(sip->sel_cluster = sc); } else if (other == NULL) { /* - * First named waiter, although there may be unnamed - * waiters (collisions). Issue a memory barrier to - * ensure that we access sel_lwp (above) before other - * fields - this guards against a call to selclear(). + * 2. No first LWP, therefore we (selector) are the first. + * + * There may be unnamed waiters (collisions). Issue a memory + * barrier to ensure that we access sel_lwp (above) before + * other fields - this guards against a call to selclear(). */ membar_enter(); sip->sel_lwp = selector; SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain); + /* Copy the argument, which is for selnotify(). */ + sip->sel_fdinfo = selector->l_selrec; /* Replace selinfo's lock with the chosen cluster's lock. */ sip->sel_cluster = sc; } else { - /* Multiple waiters: record a collision. */ + /* 3. Multiple waiters: record a collision. */ sip->sel_collision |= sc->sc_mask; KASSERT(sip->sel_cluster != NULL); } } /* + * sel_setevents: a helper function for selnotify(), to set the events + * for LWP sleeping in selcommon() or pollcommon(). + */ +static inline void +sel_setevents(lwp_t *l, struct selinfo *sip, const int events) +{ + const int oflag = l->l_selflag; + + /* + * If we require re-scan or it was required by somebody else, + * then just (re)set SEL_RESET and return. + */ + if (__predict_false(events == 0 || oflag == SEL_RESET)) { + l->l_selflag = SEL_RESET; + return; + } + /* + * Direct set. Note: select state of LWP is locked. First, + * determine whether it is selcommon() or pollcommon(). + */ + if (l->l_selbits != NULL) { + fd_mask *fds = (fd_mask *)l->l_selbits; + const int ni = l->l_selni; + const int fd = sip->sel_fdinfo; + int n; + + for (n = 0; n < 3; n++) { + if (sel_flag[n] | events) { + fds[fd >> __NFDSHIFT] |= (fd & __NFDMASK); + } + fds = (fd_mask *)((char *)fds + ni); + } + } else { + struct pollfd *pfd = (void *)sip->sel_fdinfo; + pfd->revents |= events; + } + /* Indicate direct set and note the event (cluster lock is held). */ + l->l_selflag = SEL_EVENT; + l->l_selret++; +} + +/* * Do a wakeup when a selectable event occurs. Concurrency issues: * * As per selrecord(), the caller's object lock is held. If there @@ -590,14 +683,18 @@ mutex_spin_enter(lock); /* Still there? */ if (sip->sel_lwp != NULL) { + /* + * Set the events for our LWP and indicate that. + * Otherwise, request for a full re-scan. + */ l = sip->sel_lwp; + oflag = l->l_selflag; + sel_setevents(l, sip, events); /* * If thread is sleeping, wake it up. If it's not * yet asleep, it will notice the change in state * and will re-poll the descriptors. */ - oflag = l->l_selflag; - l->l_selflag = SEL_RESET; if (oflag == SEL_BLOCKING && l->l_mutex == lock) { KASSERT(l->l_wchan == sc); sleepq_unsleep(l, false); @@ -710,7 +807,7 @@ * Concurrency issues: we only need guard against a call to selclear() * by a thread exiting sel_do_scan(). The caller has prevented further * references being made to the selinfo record via selrecord(), and it - * won't call selwakeup() again. + * will not call selnotify() again. */ void seldestroy(struct selinfo *sip) Index: src/sys/sys/fd_set.h diff -u src/sys/sys/fd_set.h:1.2 src/sys/sys/fd_set.h:1.3 --- src/sys/sys/fd_set.h:1.2 Sun Dec 11 12:25:20 2005 +++ src/sys/sys/fd_set.h Thu Jul 8 12:23:31 2010 @@ -1,4 +1,4 @@ -/* $NetBSD: fd_set.h,v 1.2 2005/12/11 12:25:20 christos Exp $ */ +/* $NetBSD: fd_set.h,v 1.3 2010/07/08 12:23:31 rmind Exp $ */ /*- * Copyright (c) 1992, 1993 @@ -39,44 +39,45 @@ #include <machine/int_types.h> /* - * Implementation dependent defines, hidden from user space. X/Open does not - * specify them. + * Implementation dependent defines, hidden from user space. + * POSIX does not specify them. */ -#define __NBBY 8 /* number of bits in a byte */ -typedef __int32_t __fd_mask; -/* bits per mask */ -#define __NFDBITS ((unsigned int)sizeof(__fd_mask) * __NBBY) +typedef __int32_t __fd_mask; -#define __howmany(x, y) (((x) + ((y) - 1)) / (y)) +/* 32 = 2 ^ 5 */ +#define __NFDBITS (32) +#define __NFDSHIFT (5) +#define __NFDMASK (__NFDBITS - 1) /* - * Select uses bit masks of file descriptors in longs. These macros - * manipulate such bit fields (the filesystem macros use chars). - * FD_SETSIZE may be defined by the user, but the default here should - * be enough for most uses. + * Select uses bit fields of file descriptors. These macros manipulate + * such bit fields. Note: FD_SETSIZE may be defined by the user. */ + #ifndef FD_SETSIZE #define FD_SETSIZE 256 #endif +#define __NFD_SIZE (((FD_SETSIZE) + (__NFDBITS - 1)) / __NFDBITS) + typedef struct fd_set { - __fd_mask fds_bits[__howmany(FD_SETSIZE, __NFDBITS)]; + __fd_mask fds_bits[__NFD_SIZE]; } fd_set; #define FD_SET(n, p) \ - ((p)->fds_bits[(n)/__NFDBITS] |= (1 << ((n) % __NFDBITS))) + ((p)->fds_bits[(n) >> __NFDSHIFT] |= (1 << ((n) & __NFDMASK))) #define FD_CLR(n, p) \ - ((p)->fds_bits[(n)/__NFDBITS] &= ~(1 << ((n) % __NFDBITS))) + ((p)->fds_bits[(n) >> __NFDSHIFT] &= ~(1 << ((n) & __NFDMASK))) #define FD_ISSET(n, p) \ - ((p)->fds_bits[(n)/__NFDBITS] & (1 << ((n) % __NFDBITS))) + ((p)->fds_bits[(n) >> __NFDSHIFT] & (1 << ((n) & __NFDMASK))) #if __GNUC_PREREQ__(2, 95) #define FD_ZERO(p) (void)__builtin_memset((p), 0, sizeof(*(p))) #else #define FD_ZERO(p) do { \ fd_set *__fds = (p); \ unsigned int __i; \ - for (__i = 0; __i < __howmany(FD_SETSIZE, __NFDBITS); __i++) \ + for (__i = 0; __i < __NFD_SIZE; __i++) \ __fds->fds_bits[__i] = 0; \ } while (/* CONSTCOND */ 0) #endif /* GCC 2.95 */ @@ -86,11 +87,8 @@ */ #if defined(_NETBSD_SOURCE) -#define fd_mask __fd_mask -#define NFDBITS __NFDBITS -#ifndef howmany -#define howmany(a, b) __howmany(a, b) -#endif +#define fd_mask __fd_mask +#define NFDBITS __NFDBITS #if __GNUC_PREREQ__(2, 95) #define FD_COPY(f, t) (void)__builtin_memcpy((t), (f), sizeof(*(f))) @@ -98,7 +96,7 @@ #define FD_COPY(f, t) do { \ fd_set *__f = (f), *__t = (t); \ unsigned int __i; \ - for (__i = 0; __i < __howmany(FD_SETSIZE, __NFDBITS); __i++) \ + for (__i = 0; __i < __NFD_SIZE; __i++) \ __t->fds_bits[__i] = __f->fds_bits[__i]; \ } while (/* CONSTCOND */ 0) #endif /* GCC 2.95 */ Index: src/sys/sys/lwp.h diff -u src/sys/sys/lwp.h:1.136 src/sys/sys/lwp.h:1.137 --- src/sys/sys/lwp.h:1.136 Wed Jul 7 01:30:38 2010 +++ src/sys/sys/lwp.h Thu Jul 8 12:23:31 2010 @@ -1,4 +1,4 @@ -/* $NetBSD: lwp.h,v 1.136 2010/07/07 01:30:38 chs Exp $ */ +/* $NetBSD: lwp.h,v 1.137 2010/07/08 12:23:31 rmind Exp $ */ /*- * Copyright (c) 2001, 2006, 2007, 2008, 2009, 2010 @@ -134,11 +134,17 @@ int l_prflag; /* p: process level flags */ u_int l_refcnt; /* p: reference count on this LWP */ lwpid_t l_lid; /* (: LWP identifier; local to proc */ - int l_selflag; /* S: select() flags */ - SLIST_HEAD(,selinfo) l_selwait; /* S: descriptors waited on */ - struct selcluster *l_selcluster;/* !: associated select data */ char *l_name; /* (: name, optional */ + /* State of select() or poll() */ + int l_selflag; /* S: polling state flags */ + SLIST_HEAD(,selinfo) l_selwait; /* S: descriptors waited on */ + int l_selret; /* S: return value of select/poll */ + uintptr_t l_selrec; /* (: argument for selrecord() */ + struct selcluster *l_selcluster;/* (: associated cluster data */ + void * l_selbits; /* (: select() bit-field */ + size_t l_selni; /* (: size of a single bit-field */ + /* Signals */ int l_sigrestore; /* p: need to restore old sig mask */ sigset_t l_sigwaitset; /* p: signals being waited for */ Index: src/sys/sys/param.h diff -u src/sys/sys/param.h:1.370 src/sys/sys/param.h:1.371 --- src/sys/sys/param.h:1.370 Wed Jul 7 01:37:35 2010 +++ src/sys/sys/param.h Thu Jul 8 12:23:31 2010 @@ -1,4 +1,4 @@ -/* $NetBSD: param.h,v 1.370 2010/07/07 01:37:35 chs Exp $ */ +/* $NetBSD: param.h,v 1.371 2010/07/08 12:23:31 rmind Exp $ */ /*- * Copyright (c) 1982, 1986, 1989, 1993 @@ -63,7 +63,7 @@ * 2.99.9 (299000900) */ -#define __NetBSD_Version__ 599003500 /* NetBSD 5.99.35 */ +#define __NetBSD_Version__ 599003600 /* NetBSD 5.99.36 */ #define __NetBSD_Prereq__(M,m,p) (((((M) * 100000000) + \ (m) * 1000000) + (p) * 100) <= __NetBSD_Version__) Index: src/sys/sys/selinfo.h diff -u src/sys/sys/selinfo.h:1.7 src/sys/sys/selinfo.h:1.8 --- src/sys/sys/selinfo.h:1.7 Sun Apr 25 15:55:24 2010 +++ src/sys/sys/selinfo.h Thu Jul 8 12:23:31 2010 @@ -1,4 +1,4 @@ -/* $NetBSD: selinfo.h,v 1.7 2010/04/25 15:55:24 ad Exp $ */ +/* $NetBSD: selinfo.h,v 1.8 2010/07/08 12:23:31 rmind Exp $ */ /*- * Copyright (c) 2008, 2010 The NetBSD Foundation, Inc. @@ -74,6 +74,7 @@ struct klist sel_klist; /* knotes attached to this selinfo */ void *sel_cluster; /* current cluster association */ struct lwp *sel_lwp; /* first LWP to be notified */ + uintptr_t sel_fdinfo; /* selected descriptor by first LWP */ SLIST_ENTRY(selinfo) sel_chain; /* entry on LWP's list of selinfo */ uint32_t sel_collision; /* mask of colliding cpus */ uint32_t sel_reserved[3];/* reserved for future expansion */ Index: src/sys/sys/types.h diff -u src/sys/sys/types.h:1.86 src/sys/sys/types.h:1.87 --- src/sys/sys/types.h:1.86 Sat Mar 7 21:59:25 2009 +++ src/sys/sys/types.h Thu Jul 8 12:23:31 2010 @@ -1,4 +1,4 @@ -/* $NetBSD: types.h,v 1.86 2009/03/07 21:59:25 ad Exp $ */ +/* $NetBSD: types.h,v 1.87 2010/07/08 12:23:31 rmind Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993, 1994 @@ -311,7 +311,8 @@ #ifdef _NETBSD_SOURCE #include <sys/fd_set.h> -#define NBBY __NBBY + +#define NBBY 8 typedef struct kauth_cred *kauth_cred_t;