Module Name:    src
Committed By:   rmind
Date:           Thu Jul  8 12:23:31 UTC 2010

Modified Files:
        src/sys/kern: sys_select.c
        src/sys/sys: fd_set.h lwp.h param.h selinfo.h types.h

Log Message:
Implement direct select/poll support, currently effective for socket and
pipe subsystems.  Avoids overhead of second selscan() on wake-up, and thus
improves performance on certain workloads (especially when polling on many
file-descriptors).  Also, clean-up sys/fd_set.h header and improve macros.

Welcome to 5.99.36!


To generate a diff of this commit:
cvs rdiff -u -r1.22 -r1.23 src/sys/kern/sys_select.c
cvs rdiff -u -r1.2 -r1.3 src/sys/sys/fd_set.h
cvs rdiff -u -r1.136 -r1.137 src/sys/sys/lwp.h
cvs rdiff -u -r1.370 -r1.371 src/sys/sys/param.h
cvs rdiff -u -r1.7 -r1.8 src/sys/sys/selinfo.h
cvs rdiff -u -r1.86 -r1.87 src/sys/sys/types.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/kern/sys_select.c
diff -u src/sys/kern/sys_select.c:1.22 src/sys/kern/sys_select.c:1.23
--- src/sys/kern/sys_select.c:1.22	Sun Apr 25 15:55:24 2010
+++ src/sys/kern/sys_select.c	Thu Jul  8 12:23:31 2010
@@ -1,11 +1,11 @@
-/*	$NetBSD: sys_select.c,v 1.22 2010/04/25 15:55:24 ad Exp $	*/
+/*	$NetBSD: sys_select.c,v 1.23 2010/07/08 12:23:31 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2007, 2008, 2009, 2010 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
- * by Andrew Doran.
+ * by Andrew Doran and Mindaugas Rasiukevicius.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -84,7 +84,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.22 2010/04/25 15:55:24 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.23 2010/07/08 12:23:31 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -109,7 +109,12 @@
 /* Flags for lwp::l_selflag. */
 #define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
 #define	SEL_SCANNING	1	/* polling descriptors */
-#define	SEL_BLOCKING	2	/* about to block on select_cv */
+#define	SEL_BLOCKING	2	/* blocking and waiting for event */
+#define	SEL_EVENT	3	/* interrupted, events set directly */
+
+/* Operations: either select() or poll(). */
+#define	SELOP_SELECT	1
+#define	SELOP_POLL	2
 
 /*
  * Per-cluster state for select()/poll().  For a system with fewer
@@ -125,10 +130,16 @@
 	uint32_t	sc_mask;
 } selcluster_t;
 
-static inline int	selscan(char *, u_int, register_t *);
-static inline int	pollscan(struct pollfd *, u_int, register_t *);
+static inline int	selscan(char *, const int, const size_t, register_t *);
+static inline int	pollscan(struct pollfd *, const int, register_t *);
 static void		selclear(void);
 
+static const int sel_flag[] = {
+	POLLRDNORM | POLLHUP | POLLERR,
+	POLLWRNORM | POLLHUP | POLLERR,
+	POLLRDBAND
+};
+
 static syncobj_t select_sobj = {
 	SOBJ_SLEEPQ_FIFO,
 	sleepq_unsleep,
@@ -137,7 +148,7 @@
 	syncobj_noowner,
 };
 
-static selcluster_t	*selcluster[SELCLUSTERS];
+static selcluster_t	*selcluster[SELCLUSTERS] __read_mostly;
 
 /*
  * Select system call.
@@ -206,8 +217,8 @@
  * sel_do_scan: common code to perform the scan on descriptors.
  */
 static int
-sel_do_scan(void *fds, u_int nfds, struct timespec *ts, sigset_t *mask,
-    register_t *retval, int selpoll)
+sel_do_scan(const int op, void *fds, const int nf, const size_t ni,
+    struct timespec *ts, sigset_t *mask, register_t *retval)
 {
 	lwp_t		* const l = curlwp;
 	proc_t		* const p = l->l_proc;
@@ -237,6 +248,14 @@
 	lock = sc->sc_lock;
 	l->l_selcluster = sc;
 	SLIST_INIT(&l->l_selwait);
+
+	l->l_selret = 0;
+	if (op == SELOP_SELECT) {
+		l->l_selbits = (char *)fds + ni * 3;
+		l->l_selni = ni;
+	} else {
+		l->l_selbits = NULL;
+	}
 	for (;;) {
 		int ncoll;
 
@@ -250,28 +269,51 @@
 		l->l_selflag = SEL_SCANNING;
 		ncoll = sc->sc_ncoll;
 
-		if (selpoll) {
-			error = selscan((char *)fds, nfds, retval);
+		if (op == SELOP_SELECT) {
+			error = selscan((char *)fds, nf, ni, retval);
 		} else {
-			error = pollscan((struct pollfd *)fds, nfds, retval);
+			error = pollscan((struct pollfd *)fds, nf, retval);
 		}
-
 		if (error || *retval)
 			break;
 		if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
 			break;
+		/*
+		 * Acquire the lock and perform the (re)checks.  Note, if
+		 * collision has occured, then our state does not matter,
+		 * as we must perform re-scan.  Therefore, check it first.
+		 */
+state_check:
 		mutex_spin_enter(lock);
-		if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) {
+		if (__predict_false(sc->sc_ncoll != ncoll)) {
+			/* Collision: perform re-scan. */
 			mutex_spin_exit(lock);
 			continue;
 		}
+		if (__predict_true(l->l_selflag == SEL_EVENT)) {
+			/* Events occured, they are set directly. */
+			mutex_spin_exit(lock);
+			KASSERT(l->l_selret != 0);
+			*retval = l->l_selret;
+			break;
+		}
+		if (__predict_true(l->l_selflag == SEL_RESET)) {
+			/* Events occured, but re-scan is requested. */
+			mutex_spin_exit(lock);
+			continue;
+		}
+		KASSERT(l->l_selflag == SEL_SCANNING);
+		/* Nothing happen, therefore - sleep. */
 		l->l_selflag = SEL_BLOCKING;
 		l->l_kpriority = true;
 		sleepq_enter(&sc->sc_sleepq, l, lock);
 		sleepq_enqueue(&sc->sc_sleepq, sc, "select", &select_sobj);
 		error = sleepq_block(timo, true);
-		if (error != 0)
+		if (error != 0) {
 			break;
+		}
+		/* Awoken: need to check the state. */
+		goto state_check;
 	}
 	selclear();
 
@@ -326,7 +368,7 @@
 	getbits(ex, 2);
 #undef	getbits
 
-	error = sel_do_scan(bits, nd, ts, mask, retval, 1);
+	error = sel_do_scan(SELOP_SELECT, bits, nd, ni, ts, mask, retval);
 	if (error == 0 && u_in != NULL)
 		error = copyout(bits + ni * 3, u_in, ni);
 	if (error == 0 && u_ou != NULL)
@@ -340,30 +382,32 @@
 }
 
 static inline int
-selscan(char *bits, u_int nfd, register_t *retval)
+selscan(char *bits, const int nfd, const size_t ni, register_t *retval)
 {
-	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
-			       POLLWRNORM | POLLHUP | POLLERR,
-			       POLLRDBAND };
 	fd_mask *ibitp, *obitp;
-	int msk, i, j, fd, ni, n;
-	fd_mask ibits, obits;
+	int msk, i, j, fd, n;
 	file_t *fp;
 
-	ni = howmany(nfd, NFDBITS) * sizeof(fd_mask);
 	ibitp = (fd_mask *)(bits + ni * 0);
 	obitp = (fd_mask *)(bits + ni * 3);
 	n = 0;
 
 	for (msk = 0; msk < 3; msk++) {
 		for (i = 0; i < nfd; i += NFDBITS) {
+			fd_mask ibits, obits;
+
 			ibits = *ibitp++;
 			obits = 0;
 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
 				ibits &= ~(1 << j);
 				if ((fp = fd_getfile(fd)) == NULL)
 					return (EBADF);
-				if ((*fp->f_ops->fo_poll)(fp, flag[msk])) {
+				/*
+				 * Setup an argument to selrecord(), which is
+				 * a file descriptor number.
+				 */
+				curlwp->l_selrec = fd;
+				if ((*fp->f_ops->fo_poll)(fp, sel_flag[msk])) {
 					obits |= (1 << j);
 					n++;
 				}
@@ -464,7 +508,7 @@
 	if (error)
 		goto fail;
 
-	error = sel_do_scan(fds, nfds, ts, mask, retval, 0);
+	error = sel_do_scan(SELOP_POLL, fds, nfds, ni, ts, mask, retval);
 	if (error == 0)
 		error = copyout(fds, u_fds, ni);
  fail:
@@ -474,12 +518,11 @@
 }
 
 static inline int
-pollscan(struct pollfd *fds, u_int nfd, register_t *retval)
+pollscan(struct pollfd *fds, const int nfd, register_t *retval)
 {
-	int i, n;
 	file_t *fp;
+	int i, n = 0;
 
-	n = 0;
 	for (i = 0; i < nfd; i++, fds++) {
 		if (fds->fd < 0) {
 			fds->revents = 0;
@@ -487,6 +530,12 @@
 			fds->revents = POLLNVAL;
 			n++;
 		} else {
+			/*
+			 * Perform poll: registers select request or returns
+			 * the events which are set.  Setup an argument for
+			 * selrecord(), which is a pointer to struct pollfd.
+			 */
+			curlwp->l_selrec = (uintptr_t)fds;
 			fds->revents = (*fp->f_ops->fo_poll)(fp,
 			    fds->events | POLLERR | POLLHUP);
 			if (fds->revents != 0)
@@ -498,7 +547,6 @@
 	return (0);
 }
 
-/*ARGSUSED*/
 int
 seltrue(dev_t dev, int events, lwp_t *l)
 {
@@ -539,28 +587,73 @@
 	other = sip->sel_lwp;
 
 	if (other == selector) {
-		/* `selector' has already claimed it. */
+		/* 1. We (selector) already claimed to be the first LWP. */
 		KASSERT(sip->sel_cluster = sc);
 	} else if (other == NULL) {
 		/*
-		 * First named waiter, although there may be unnamed
-		 * waiters (collisions).  Issue a memory barrier to
-		 * ensure that we access sel_lwp (above) before other
-		 * fields - this guards against a call to selclear().
+		 * 2. No first LWP, therefore we (selector) are the first.
+		 *
+		 * There may be unnamed waiters (collisions).  Issue a memory
+		 * barrier to ensure that we access sel_lwp (above) before
+		 * other fields - this guards against a call to selclear().
 		 */
 		membar_enter();
 		sip->sel_lwp = selector;
 		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
+		/* Copy the argument, which is for selnotify(). */
+		sip->sel_fdinfo = selector->l_selrec;
 		/* Replace selinfo's lock with the chosen cluster's lock. */
 		sip->sel_cluster = sc;
 	} else {
-		/* Multiple waiters: record a collision. */
+		/* 3. Multiple waiters: record a collision. */
 		sip->sel_collision |= sc->sc_mask;
 		KASSERT(sip->sel_cluster != NULL);
 	}
 }
 
 /*
+ * sel_setevents: a helper function for selnotify(), to set the events
+ * for LWP sleeping in selcommon() or pollcommon().
+ */
+static inline void
+sel_setevents(lwp_t *l, struct selinfo *sip, const int events)
+{
+	const int oflag = l->l_selflag;
+
+	/*
+	 * If we require re-scan or it was required by somebody else,
+	 * then just (re)set SEL_RESET and return.
+	 */
+	if (__predict_false(events == 0 || oflag == SEL_RESET)) {
+		l->l_selflag = SEL_RESET;
+		return;
+	}
+	/*
+	 * Direct set.  Note: select state of LWP is locked.  First,
+	 * determine whether it is selcommon() or pollcommon().
+	 */
+	if (l->l_selbits != NULL) {
+		fd_mask *fds = (fd_mask *)l->l_selbits;
+		const int ni = l->l_selni;
+		const int fd = sip->sel_fdinfo;
+		int n;
+
+		for (n = 0; n < 3; n++) {
+			if (sel_flag[n] | events) {
+				fds[fd >> __NFDSHIFT] |= (fd & __NFDMASK);
+			}
+			fds = (fd_mask *)((char *)fds + ni);
+		}
+	} else {
+		struct pollfd *pfd = (void *)sip->sel_fdinfo;
+		pfd->revents |= events;
+	}
+	/* Indicate direct set and note the event (cluster lock is held). */
+	l->l_selflag = SEL_EVENT;
+	l->l_selret++;
+}
+
+/*
  * Do a wakeup when a selectable event occurs.  Concurrency issues:
  *
  * As per selrecord(), the caller's object lock is held.  If there
@@ -590,14 +683,18 @@
 		mutex_spin_enter(lock);
 		/* Still there? */
 		if (sip->sel_lwp != NULL) {
+			/*
+			 * Set the events for our LWP and indicate that.
+			 * Otherwise, request for a full re-scan.
+			 */
 			l = sip->sel_lwp;
+			oflag = l->l_selflag;
+			sel_setevents(l, sip, events);
 			/*
 			 * If thread is sleeping, wake it up.  If it's not
 			 * yet asleep, it will notice the change in state
 			 * and will re-poll the descriptors.
 			 */
-			oflag = l->l_selflag;
-			l->l_selflag = SEL_RESET;
 			if (oflag == SEL_BLOCKING && l->l_mutex == lock) {
 				KASSERT(l->l_wchan == sc);
 				sleepq_unsleep(l, false);
@@ -710,7 +807,7 @@
  * Concurrency issues: we only need guard against a call to selclear()
  * by a thread exiting sel_do_scan().  The caller has prevented further
  * references being made to the selinfo record via selrecord(), and it
- * won't call selwakeup() again.
+ * will not call selnotify() again.
  */
 void
 seldestroy(struct selinfo *sip)

Index: src/sys/sys/fd_set.h
diff -u src/sys/sys/fd_set.h:1.2 src/sys/sys/fd_set.h:1.3
--- src/sys/sys/fd_set.h:1.2	Sun Dec 11 12:25:20 2005
+++ src/sys/sys/fd_set.h	Thu Jul  8 12:23:31 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: fd_set.h,v 1.2 2005/12/11 12:25:20 christos Exp $	*/
+/*	$NetBSD: fd_set.h,v 1.3 2010/07/08 12:23:31 rmind Exp $	*/
 
 /*-
  * Copyright (c) 1992, 1993
@@ -39,44 +39,45 @@
 #include <machine/int_types.h>
 
 /*
- * Implementation dependent defines, hidden from user space. X/Open does not
- * specify them.
+ * Implementation dependent defines, hidden from user space.
+ * POSIX does not specify them.
  */
-#define	__NBBY		8		/* number of bits in a byte */
-typedef __int32_t	__fd_mask;
 
-/* bits per mask */
-#define __NFDBITS	((unsigned int)sizeof(__fd_mask) * __NBBY)
+typedef	__int32_t	__fd_mask;
 
-#define	__howmany(x, y)	(((x) + ((y) - 1)) / (y))
+/* 32 = 2 ^ 5 */
+#define	__NFDBITS	(32)
+#define	__NFDSHIFT	(5)
+#define	__NFDMASK	(__NFDBITS - 1)
 
 /*
- * Select uses bit masks of file descriptors in longs.  These macros
- * manipulate such bit fields (the filesystem macros use chars).
- * FD_SETSIZE may be defined by the user, but the default here should
- * be enough for most uses.
+ * Select uses bit fields of file descriptors.  These macros manipulate
+ * such bit fields.  Note: FD_SETSIZE may be defined by the user.
  */
+
 #ifndef	FD_SETSIZE
 #define	FD_SETSIZE	256
 #endif
 
+#define	__NFD_SIZE	(((FD_SETSIZE) + (__NFDBITS - 1)) / __NFDBITS)
+
 typedef	struct fd_set {
-	__fd_mask	fds_bits[__howmany(FD_SETSIZE, __NFDBITS)];
+	__fd_mask	fds_bits[__NFD_SIZE];
 } fd_set;
 
 #define	FD_SET(n, p)	\
-    ((p)->fds_bits[(n)/__NFDBITS] |= (1 << ((n) % __NFDBITS)))
+    ((p)->fds_bits[(n) >> __NFDSHIFT] |= (1 << ((n) & __NFDMASK)))
 #define	FD_CLR(n, p)	\
-    ((p)->fds_bits[(n)/__NFDBITS] &= ~(1 << ((n) % __NFDBITS)))
+    ((p)->fds_bits[(n) >> __NFDSHIFT] &= ~(1 << ((n) & __NFDMASK)))
 #define	FD_ISSET(n, p)	\
-    ((p)->fds_bits[(n)/__NFDBITS] & (1 << ((n) % __NFDBITS)))
+    ((p)->fds_bits[(n) >> __NFDSHIFT] & (1 << ((n) & __NFDMASK)))
 #if __GNUC_PREREQ__(2, 95)
 #define	FD_ZERO(p)	(void)__builtin_memset((p), 0, sizeof(*(p)))
 #else
 #define	FD_ZERO(p)	do {						\
 	fd_set *__fds = (p);						\
 	unsigned int __i;						\
-	for (__i = 0; __i < __howmany(FD_SETSIZE, __NFDBITS); __i++)	\
+	for (__i = 0; __i < __NFD_SIZE; __i++)				\
 		__fds->fds_bits[__i] = 0;				\
 	} while (/* CONSTCOND */ 0)
 #endif /* GCC 2.95 */
@@ -86,11 +87,8 @@
  */
 #if defined(_NETBSD_SOURCE)
 
-#define fd_mask __fd_mask
-#define NFDBITS __NFDBITS
-#ifndef howmany
-#define howmany(a, b) __howmany(a, b)
-#endif
+#define	fd_mask		__fd_mask
+#define	NFDBITS		__NFDBITS
 
 #if __GNUC_PREREQ__(2, 95)
 #define	FD_COPY(f, t)	(void)__builtin_memcpy((t), (f), sizeof(*(f)))
@@ -98,7 +96,7 @@
 #define	FD_COPY(f, t)	do {						\
 	fd_set *__f = (f), *__t = (t);					\
 	unsigned int __i;						\
-	for (__i = 0; __i < __howmany(FD_SETSIZE, __NFDBITS); __i++)	\
+	for (__i = 0; __i < __NFD_SIZE; __i++)				\
 		__t->fds_bits[__i] = __f->fds_bits[__i];		\
 	} while (/* CONSTCOND */ 0)
 #endif /* GCC 2.95 */

Index: src/sys/sys/lwp.h
diff -u src/sys/sys/lwp.h:1.136 src/sys/sys/lwp.h:1.137
--- src/sys/sys/lwp.h:1.136	Wed Jul  7 01:30:38 2010
+++ src/sys/sys/lwp.h	Thu Jul  8 12:23:31 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: lwp.h,v 1.136 2010/07/07 01:30:38 chs Exp $	*/
+/*	$NetBSD: lwp.h,v 1.137 2010/07/08 12:23:31 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2001, 2006, 2007, 2008, 2009, 2010
@@ -134,11 +134,17 @@
 	int		l_prflag;	/* p: process level flags */
 	u_int		l_refcnt;	/* p: reference count on this LWP */
 	lwpid_t		l_lid;		/* (: LWP identifier; local to proc */
-	int		l_selflag;	/* S: select() flags */
-	SLIST_HEAD(,selinfo) l_selwait;	/* S: descriptors waited on */
-	struct selcluster *l_selcluster;/* !: associated select data */
 	char		*l_name;	/* (: name, optional */
 
+	/* State of select() or poll() */
+	int		l_selflag;	/* S: polling state flags */
+	SLIST_HEAD(,selinfo) l_selwait;	/* S: descriptors waited on */
+	int		l_selret;	/* S: return value of select/poll */
+	uintptr_t	l_selrec;	/* (: argument for selrecord() */
+	struct selcluster *l_selcluster;/* (: associated cluster data */
+	void *		l_selbits;	/* (: select() bit-field */
+	size_t		l_selni;	/* (: size of a single bit-field */
+
 	/* Signals */
 	int		l_sigrestore;	/* p: need to restore old sig mask */
 	sigset_t	l_sigwaitset;	/* p: signals being waited for */

Index: src/sys/sys/param.h
diff -u src/sys/sys/param.h:1.370 src/sys/sys/param.h:1.371
--- src/sys/sys/param.h:1.370	Wed Jul  7 01:37:35 2010
+++ src/sys/sys/param.h	Thu Jul  8 12:23:31 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: param.h,v 1.370 2010/07/07 01:37:35 chs Exp $	*/
+/*	$NetBSD: param.h,v 1.371 2010/07/08 12:23:31 rmind Exp $	*/
 
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
@@ -63,7 +63,7 @@
  *	2.99.9		(299000900)
  */
 
-#define	__NetBSD_Version__	599003500	/* NetBSD 5.99.35 */
+#define	__NetBSD_Version__	599003600	/* NetBSD 5.99.36 */
 
 #define __NetBSD_Prereq__(M,m,p) (((((M) * 100000000) + \
     (m) * 1000000) + (p) * 100) <= __NetBSD_Version__)

Index: src/sys/sys/selinfo.h
diff -u src/sys/sys/selinfo.h:1.7 src/sys/sys/selinfo.h:1.8
--- src/sys/sys/selinfo.h:1.7	Sun Apr 25 15:55:24 2010
+++ src/sys/sys/selinfo.h	Thu Jul  8 12:23:31 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: selinfo.h,v 1.7 2010/04/25 15:55:24 ad Exp $	*/
+/*	$NetBSD: selinfo.h,v 1.8 2010/07/08 12:23:31 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2008, 2010 The NetBSD Foundation, Inc.
@@ -74,6 +74,7 @@
 	struct klist	sel_klist;	/* knotes attached to this selinfo */
 	void		*sel_cluster;	/* current cluster association */
 	struct lwp	*sel_lwp;	/* first LWP to be notified */
+	uintptr_t	sel_fdinfo;	/* selected descriptor by first LWP */
 	SLIST_ENTRY(selinfo) sel_chain;	/* entry on LWP's list of selinfo */
 	uint32_t	sel_collision;	/* mask of colliding cpus */
 	uint32_t	sel_reserved[3];/* reserved for future expansion */

Index: src/sys/sys/types.h
diff -u src/sys/sys/types.h:1.86 src/sys/sys/types.h:1.87
--- src/sys/sys/types.h:1.86	Sat Mar  7 21:59:25 2009
+++ src/sys/sys/types.h	Thu Jul  8 12:23:31 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: types.h,v 1.86 2009/03/07 21:59:25 ad Exp $	*/
+/*	$NetBSD: types.h,v 1.87 2010/07/08 12:23:31 rmind Exp $	*/
 
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993, 1994
@@ -311,7 +311,8 @@
 
 #ifdef _NETBSD_SOURCE
 #include <sys/fd_set.h>
-#define	NBBY	__NBBY
+
+#define	NBBY			8
 
 typedef struct kauth_cred *kauth_cred_t;
 

Reply via email to