Module Name: src Committed By: ad Date: Fri Oct 13 19:07:09 UTC 2023
Modified Files: src/sys/ddb: db_command.c db_interface.h db_xxx.c src/sys/kern: sys_pipe.c src/sys/sys: pipe.h src/usr.bin/fstat: fstat.c Log Message: Simplify/streamline pipes a little bit: - Allocate only one struct pipe not two (no need to be bidirectional here). - Then use f_flag (FREAD/FWRITE) to figure out what to do in the fileops. - Never wake the other side or acquire long-term (I/O) lock unless needed. - Whenever possible, defer wakeups until after locks have been released. - Do some things locklessly in pipe_ioctl() and pipe_poll(). Some notable results: - -30% latency on a 486DX2/66 doing 1 byte ping-pong within a single process. - 2.5x less lock contention during "make cleandir" of src on a 48 CPU machine. - 1.5x bandwidth with 1kB messages on the same 48 CPU machine (8kB: same b/w). To generate a diff of this commit: cvs rdiff -u -r1.186 -r1.187 src/sys/ddb/db_command.c cvs rdiff -u -r1.41 -r1.42 src/sys/ddb/db_interface.h cvs rdiff -u -r1.77 -r1.78 src/sys/ddb/db_xxx.c cvs rdiff -u -r1.164 -r1.165 src/sys/kern/sys_pipe.c cvs rdiff -u -r1.39 -r1.40 src/sys/sys/pipe.h cvs rdiff -u -r1.118 -r1.119 src/usr.bin/fstat/fstat.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/sys/ddb/db_command.c diff -u src/sys/ddb/db_command.c:1.186 src/sys/ddb/db_command.c:1.187 --- src/sys/ddb/db_command.c:1.186 Sat Oct 7 20:27:20 2023 +++ src/sys/ddb/db_command.c Fri Oct 13 19:07:08 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: db_command.c,v 1.186 2023/10/07 20:27:20 ad Exp $ */ +/* $NetBSD: db_command.c,v 1.187 2023/10/13 19:07:08 ad Exp $ */ /* * Copyright (c) 1996, 1997, 1998, 1999, 2002, 2009, 2019 @@ -61,7 +61,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: db_command.c,v 1.186 2023/10/07 20:27:20 ad Exp $"); +__KERNEL_RCSID(0, "$NetBSD: db_command.c,v 1.187 2023/10/13 19:07:08 ad Exp $"); #ifdef _KERNEL_OPT #include "opt_aio.h" @@ -301,6 +301,8 @@ static const struct db_command db_show_c 0 ,"List all used memory pages.",NULL,NULL) }, { DDB_ADD_CMD("panic", db_show_panic, 0, "Print the current panic string",NULL,NULL) }, + { DDB_ADD_CMD("pipe", db_show_pipe, + 0 ,"Show the contents of a pipe.",NULL,NULL) }, { DDB_ADD_CMD("pool", db_pool_print_cmd, 0, "Print the pool at address.", "[/clp] address",NULL) }, /* added from all sub cmds */ Index: src/sys/ddb/db_interface.h diff -u src/sys/ddb/db_interface.h:1.41 src/sys/ddb/db_interface.h:1.42 --- src/sys/ddb/db_interface.h:1.41 Sat Oct 7 20:27:20 2023 +++ src/sys/ddb/db_interface.h Fri Oct 13 19:07:08 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: db_interface.h,v 1.41 2023/10/07 20:27:20 ad Exp $ */ +/* $NetBSD: db_interface.h,v 1.42 2023/10/13 19:07:08 ad Exp $ */ /*- * Copyright (c) 1995, 2023 The NetBSD Foundation, Inc. 
@@ -86,6 +86,9 @@ void db_show_sleepq(db_expr_t, bool, db /* kern/kern_condvar.c */ void db_show_condvar(db_expr_t, bool, db_expr_t, const char *); +/* kern/sys_pipe.c */ +void db_show_pipe(db_expr_t, bool, db_expr_t, const char *); + /* kern/sys_select.c */ void db_show_selinfo(db_expr_t, bool, db_expr_t, const char *); Index: src/sys/ddb/db_xxx.c diff -u src/sys/ddb/db_xxx.c:1.77 src/sys/ddb/db_xxx.c:1.78 --- src/sys/ddb/db_xxx.c:1.77 Sun Oct 8 15:03:16 2023 +++ src/sys/ddb/db_xxx.c Fri Oct 13 19:07:08 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: db_xxx.c,v 1.77 2023/10/08 15:03:16 martin Exp $ */ +/* $NetBSD: db_xxx.c,v 1.78 2023/10/13 19:07:08 ad Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 @@ -37,7 +37,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: db_xxx.c,v 1.77 2023/10/08 15:03:16 martin Exp $"); +__KERNEL_RCSID(0, "$NetBSD: db_xxx.c,v 1.78 2023/10/13 19:07:08 ad Exp $"); #ifdef _KERNEL_OPT #include "opt_kgdb.h" @@ -72,6 +72,7 @@ __KERNEL_RCSID(0, "$NetBSD: db_xxx.c,v 1 #include <sys/condvar.h> #include <sys/sleepq.h> #include <sys/selinfo.h> +#include <sys/pipe.h> #include <ddb/ddb.h> #include <ddb/db_user.h> @@ -366,6 +367,51 @@ db_show_sleepq(db_expr_t addr, bool hadd } void +db_show_pipe(db_expr_t addr, bool haddr, db_expr_t count, const char *modif) +{ + struct pipe pipe, *ppipe = (struct pipe *)addr; + + db_read_bytes(addr, sizeof(pipe), (char *)&pipe); + + db_printf("pipe_lock\t\t%p\n", pipe.pipe_lock); + + db_printf("pipe_read\t\t"); + db_show_condvar((db_addr_t)&ppipe->pipe_read, false, 0, modif); + + db_printf("pipe_write\t\t"); + db_show_condvar((db_addr_t)&ppipe->pipe_write, false, 0, modif); + + db_printf("pipe_busy\t\t"); + db_show_condvar((db_addr_t)&ppipe->pipe_busy, false, 0, modif); + + db_printf("pipe_buffer.cnt\t\t%ld\n", (long)pipe.pipe_buffer.cnt); + db_printf("pipe_buffer.in\t\t%d\n", pipe.pipe_buffer.in); + db_printf("pipe_buffer.out\t\t%d\n", pipe.pipe_buffer.out); + db_printf("pipe_buffer.size\t%ld\n", 
(long)pipe.pipe_buffer.size); + db_printf("pipe_buffer.buffer\t%p\n", pipe.pipe_buffer.buffer); + + db_printf("pipe_wrsel\t\t"); + db_show_selinfo((db_addr_t)&ppipe->pipe_wrsel, false, 0, modif); + db_printf("pipe_rdsel\t\t"); + db_show_selinfo((db_addr_t)&ppipe->pipe_rdsel, false, 0, modif); + + db_printf("pipe_atime\t\t"); + db_print_timespec(&pipe.pipe_atime); + + db_printf("\npipe_mtime\t\t"); + db_print_timespec(&pipe.pipe_mtime); + + db_printf("\npipe_btime\t\t"); + db_print_timespec(&pipe.pipe_btime); + + db_printf("\npipe_kmem\t\t%lx\n", (long)pipe.pipe_kmem); + db_printf("pipe_owner\t\t%p\n", pipe.pipe_owner); + db_printf("pipe_wrpgid\t\t%d\n", pipe.pipe_wrpgid); + db_printf("pipe_rdpgid\t\t%d\n", pipe.pipe_rdpgid); + db_printf("pipe_state\t\t%#08x\n", pipe.pipe_state); +} + +void db_show_selinfo(db_expr_t addr, bool haddr, db_expr_t count, const char *modif) { struct selinfo sel; Index: src/sys/kern/sys_pipe.c diff -u src/sys/kern/sys_pipe.c:1.164 src/sys/kern/sys_pipe.c:1.165 --- src/sys/kern/sys_pipe.c:1.164 Thu Oct 5 19:44:26 2023 +++ src/sys/kern/sys_pipe.c Fri Oct 13 19:07:08 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: sys_pipe.c,v 1.164 2023/10/05 19:44:26 ad Exp $ */ +/* $NetBSD: sys_pipe.c,v 1.165 2023/10/13 19:07:08 ad Exp $ */ /*- * Copyright (c) 2003, 2007, 2008, 2009, 2023 The NetBSD Foundation, Inc. 
@@ -55,7 +55,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.164 2023/10/05 19:44:26 ad Exp $"); +__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.165 2023/10/13 19:07:08 ad Exp $"); #include <sys/param.h> #include <sys/systm.h> @@ -126,57 +126,42 @@ static u_int nbigpipe = 0; */ static u_int amountpipekva = 0; -static void pipeclose(struct pipe *); -static void pipe_free_kmem(struct pipe *); -static int pipe_create(struct pipe **, pool_cache_t, struct timespec *); -static int pipelock(struct pipe *, bool); -static inline void pipeunlock(struct pipe *); -static void pipeselwakeup(struct pipe *, struct pipe *, int); -static int pipespace(struct pipe *, int); +static bool pipebusy(struct pipe *); +static bool pipeunbusy(struct pipe *); +static void pipeselwakeup(struct pipe *, int, int); static int pipe_ctor(void *, void *, int); static void pipe_dtor(void *, void *); -static pool_cache_t pipe_wr_cache; -static pool_cache_t pipe_rd_cache; +static pool_cache_t pipe_cache __read_mostly; void pipe_init(void) { - /* Writer side is not automatically allocated KVA. */ - pipe_wr_cache = pool_cache_init(sizeof(struct pipe), 0, 0, 0, "pipewr", - NULL, IPL_NONE, pipe_ctor, pipe_dtor, NULL); - KASSERT(pipe_wr_cache != NULL); - - /* Reader side gets preallocated KVA. */ - pipe_rd_cache = pool_cache_init(sizeof(struct pipe), 0, 0, 0, "piperd", - NULL, IPL_NONE, pipe_ctor, pipe_dtor, (void *)1); - KASSERT(pipe_rd_cache != NULL); + pipe_cache = pool_cache_init(sizeof(struct pipe), COHERENCY_UNIT, 0, 0, + "pipe", NULL, IPL_NONE, pipe_ctor, pipe_dtor, NULL); + KASSERT(pipe_cache != NULL); } static int pipe_ctor(void *arg, void *obj, int flags) { - struct pipe *pipe; - vaddr_t va; - - pipe = obj; + struct pipe *pipe = obj; memset(pipe, 0, sizeof(struct pipe)); - if (arg != NULL) { - /* Preallocate space. 
*/ - va = uvm_km_alloc(kernel_map, PIPE_SIZE, 0, - UVM_KMF_PAGEABLE | UVM_KMF_WAITVA); - KASSERT(va != 0); - pipe->pipe_kmem = va; - atomic_add_int(&amountpipekva, PIPE_SIZE); - } - cv_init(&pipe->pipe_rcv, "pipe_rd"); - cv_init(&pipe->pipe_wcv, "pipe_wr"); - cv_init(&pipe->pipe_draincv, "pipe_drn"); - cv_init(&pipe->pipe_lkcv, "pipe_lk"); - selinit(&pipe->pipe_sel); - pipe->pipe_state = PIPE_SIGNALR; + pipe->pipe_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); + cv_init(&pipe->pipe_read, "piperd"); + cv_init(&pipe->pipe_write, "pipewr"); + cv_init(&pipe->pipe_busy, "pipebusy"); + selinit(&pipe->pipe_rdsel); + selinit(&pipe->pipe_wrsel); + pipe->pipe_kmem = uvm_km_alloc(kernel_map, PIPE_SIZE, 0, + UVM_KMF_PAGEABLE | UVM_KMF_WAITVA); + pipe->pipe_state = PIPE_SIGNALR | PIPE_RDOPEN | PIPE_WROPEN; + pipe->pipe_buffer.buffer = (void *)pipe->pipe_kmem; + pipe->pipe_buffer.size = PIPE_SIZE; + KASSERT(pipe->pipe_kmem != 0); + atomic_add_int(&amountpipekva, PIPE_SIZE); return 0; } @@ -184,20 +169,16 @@ pipe_ctor(void *arg, void *obj, int flag static void pipe_dtor(void *arg, void *obj) { - struct pipe *pipe; + struct pipe *pipe = obj; - pipe = obj; - - cv_destroy(&pipe->pipe_rcv); - cv_destroy(&pipe->pipe_wcv); - cv_destroy(&pipe->pipe_draincv); - cv_destroy(&pipe->pipe_lkcv); - seldestroy(&pipe->pipe_sel); - if (pipe->pipe_kmem != 0) { - uvm_km_free(kernel_map, pipe->pipe_kmem, PIPE_SIZE, - UVM_KMF_PAGEABLE); - atomic_add_int(&amountpipekva, -PIPE_SIZE); - } + cv_destroy(&pipe->pipe_read); + cv_destroy(&pipe->pipe_write); + cv_destroy(&pipe->pipe_busy); + seldestroy(&pipe->pipe_rdsel); + seldestroy(&pipe->pipe_wrsel); + mutex_obj_free(pipe->pipe_lock); + uvm_km_free(kernel_map, pipe->pipe_kmem, PIPE_SIZE, UVM_KMF_PAGEABLE); + atomic_add_int(&amountpipekva, -PIPE_SIZE); } /* @@ -206,8 +187,7 @@ pipe_dtor(void *arg, void *obj) int pipe1(struct lwp *l, int *fildes, int flags) { - struct pipe *rpipe, *wpipe; - struct timespec nt; + struct pipe *pipe; file_t *rf, *wf; int fd, 
error; proc_t *p; @@ -215,162 +195,104 @@ pipe1(struct lwp *l, int *fildes, int fl if (flags & ~(O_CLOEXEC|O_NONBLOCK|O_NOSIGPIPE)) return EINVAL; p = curproc; - rpipe = wpipe = NULL; - getnanotime(&nt); - if ((error = pipe_create(&rpipe, pipe_rd_cache, &nt)) || - (error = pipe_create(&wpipe, pipe_wr_cache, &nt))) { - goto free2; - } - rpipe->pipe_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); - wpipe->pipe_lock = rpipe->pipe_lock; - mutex_obj_hold(wpipe->pipe_lock); + + pipe = pool_cache_get(pipe_cache, PR_WAITOK); + getnanotime(&pipe->pipe_atime); + pipe->pipe_mtime = pipe->pipe_atime; + pipe->pipe_btime = pipe->pipe_atime; error = fd_allocfile(&rf, &fd); - if (error) - goto free2; + if (error) { + pool_cache_put(pipe_cache, pipe); + return error; + } fildes[0] = fd; error = fd_allocfile(&wf, &fd); - if (error) - goto free3; + if (error) { + fd_abort(p, rf, fildes[0]); + pool_cache_put(pipe_cache, pipe); + return error; + } fildes[1] = fd; rf->f_flag = FREAD | flags; rf->f_type = DTYPE_PIPE; - rf->f_pipe = rpipe; + rf->f_pipe = pipe; rf->f_ops = &pipeops; fd_set_exclose(l, fildes[0], (flags & O_CLOEXEC) != 0); wf->f_flag = FWRITE | flags; wf->f_type = DTYPE_PIPE; - wf->f_pipe = wpipe; + wf->f_pipe = pipe; wf->f_ops = &pipeops; fd_set_exclose(l, fildes[1], (flags & O_CLOEXEC) != 0); - rpipe->pipe_peer = wpipe; - wpipe->pipe_peer = rpipe; - fd_affix(p, rf, fildes[0]); fd_affix(p, wf, fildes[1]); - return (0); -free3: - fd_abort(p, rf, fildes[0]); -free2: - pipeclose(wpipe); - pipeclose(rpipe); - - return (error); -} - -/* - * Allocate kva for pipe circular buffer, the space is pageable - * This routine will 'realloc' the size of a pipe safely, if it fails - * it will retain the old buffer. - * If it fails it will return ENOMEM. - */ -static int -pipespace(struct pipe *pipe, int size) -{ - void *buffer; - - /* - * Allocate pageable virtual address space. Physical memory is - * allocated on demand. 
- */ - if (size == PIPE_SIZE && pipe->pipe_kmem != 0) { - buffer = (void *)pipe->pipe_kmem; - } else { - buffer = (void *)uvm_km_alloc(kernel_map, round_page(size), - 0, UVM_KMF_PAGEABLE); - if (buffer == NULL) - return (ENOMEM); - atomic_add_int(&amountpipekva, size); - } - - /* free old resources if we're resizing */ - pipe_free_kmem(pipe); - pipe->pipe_buffer.buffer = buffer; - pipe->pipe_buffer.size = size; - pipe->pipe_buffer.in = 0; - pipe->pipe_buffer.out = 0; - pipe->pipe_buffer.cnt = 0; - return (0); -} - -/* - * Initialize and allocate VM and memory for pipe. - */ -static int -pipe_create(struct pipe **pipep, pool_cache_t cache, struct timespec *nt) -{ - struct pipe *pipe; - int error; - - pipe = pool_cache_get(cache, PR_WAITOK); - KASSERT(pipe != NULL); - *pipep = pipe; - error = 0; - pipe->pipe_atime = pipe->pipe_mtime = pipe->pipe_btime = *nt; - pipe->pipe_lock = NULL; - if (cache == pipe_rd_cache) { - error = pipespace(pipe, PIPE_SIZE); - } else { - pipe->pipe_buffer.buffer = NULL; - pipe->pipe_buffer.size = 0; - pipe->pipe_buffer.in = 0; - pipe->pipe_buffer.out = 0; - pipe->pipe_buffer.cnt = 0; - } - return error; + return 0; } /* - * Lock a pipe for I/O, blocking other access - * Called with pipe spin lock held. + * Busy a pipe for I/O, blocking other access. Called with pipe lock held. + * NB: curlwp may already hold the pipe busy. 
*/ -static int -pipelock(struct pipe *pipe, bool catch_p) +static bool +pipebusy(struct pipe *pipe) { - int error; + struct lwp *l = curlwp; + bool blocked = false; KASSERT(mutex_owned(pipe->pipe_lock)); - while (pipe->pipe_state & PIPE_LOCKFL) { - if (catch_p) { - error = cv_wait_sig(&pipe->pipe_lkcv, pipe->pipe_lock); - if (error != 0) { - return error; - } - } else - cv_wait(&pipe->pipe_lkcv, pipe->pipe_lock); + if (pipe->pipe_owner != l) { + while (__predict_false(pipe->pipe_owner != NULL)) { + cv_wait(&pipe->pipe_busy, pipe->pipe_lock); + blocked = true; + } + pipe->pipe_owner = l; } - pipe->pipe_state |= PIPE_LOCKFL; - - return 0; + return blocked; } /* - * unlock a pipe I/O lock + * Unbusy a pipe for I/O, if held busy by curlwp. */ -static inline void -pipeunlock(struct pipe *pipe) +static bool +pipeunbusy(struct pipe *pipe) { - KASSERT(pipe->pipe_state & PIPE_LOCKFL); + KASSERT(mutex_owned(pipe->pipe_lock)); - pipe->pipe_state &= ~PIPE_LOCKFL; - cv_signal(&pipe->pipe_lkcv); + if (pipe->pipe_owner == curlwp) { + pipe->pipe_owner = NULL; + return true; + } else + return false; } /* - * Select/poll wakup. This also sends SIGIO to peer connected to - * 'sigpipe' side of pipe. + * Select/poll wakeup. This also sends SIGIO to peer. 
*/ static void -pipeselwakeup(struct pipe *selp, struct pipe *sigp, int code) +pipeselwakeup(struct pipe *pipe, int side, int code) { - int band; + struct selinfo *selp; + int band, flag; + pid_t pgid; + + KASSERT(mutex_owned(pipe->pipe_lock)); + + if (side == FREAD) { + selp = &pipe->pipe_rdsel; + pgid = pipe->pipe_rdpgid; + flag = PIPE_RDASYNC; + } else { + selp = &pipe->pipe_wrsel; + pgid = pipe->pipe_wrpgid; + flag = PIPE_WRASYNC; + } switch (code) { case POLL_IN: @@ -393,26 +315,22 @@ pipeselwakeup(struct pipe *selp, struct break; } - selnotify(&selp->pipe_sel, band, NOTE_SUBMIT); - - if (sigp == NULL || (sigp->pipe_state & PIPE_ASYNC) == 0) - return; + selnotify(selp, band, NOTE_SUBMIT); - fownsignal(sigp->pipe_pgid, SIGIO, code, band, selp); + if (pgid != 0 && (pipe->pipe_state & flag) != 0) + fownsignal(pgid, SIGIO, code, band, pipe); } static int pipe_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred, int flags) { - struct pipe *rpipe = fp->f_pipe; - struct pipebuf *bp = &rpipe->pipe_buffer; - kmutex_t *lock = rpipe->pipe_lock; - int error; - size_t nread = 0; - size_t size; - size_t ocnt; - unsigned int wakeup_state = 0; + struct pipe *pipe = fp->f_pipe; + struct pipebuf *bp = &pipe->pipe_buffer; + size_t size, cnt, ocnt, nread = 0; + kmutex_t *lock = pipe->pipe_lock; + int error = 0; + bool unbusy; /* * Try to avoid locking the pipe if we have nothing to do. @@ -425,32 +343,30 @@ pipe_read(file_t *fp, off_t *offset, str */ if ((fp->f_flag & FNONBLOCK) != 0) { if (__predict_false(uio->uio_resid == 0)) - return (0); + return 0; if (atomic_load_relaxed(&bp->cnt) == 0 && - (atomic_load_relaxed(&rpipe->pipe_state) & PIPE_EOF) == 0) - return (EAGAIN); + (atomic_load_relaxed(&pipe->pipe_state) & PIPE_EOF) == 0) + return EAGAIN; } mutex_enter(lock); - ++rpipe->pipe_busy; ocnt = bp->cnt; -again: - error = pipelock(rpipe, true); - if (error) - goto unlocked_error; - while (uio->uio_resid) { /* * Normal pipe buffer receive. 
*/ if (bp->cnt > 0) { + /* If pipebusy() blocked then re-validate. */ + if (pipebusy(pipe)) + continue; size = bp->size - bp->out; if (size > bp->cnt) size = bp->cnt; if (size > uio->uio_resid) size = uio->uio_resid; + KASSERT(pipe->pipe_owner == curlwp); mutex_exit(lock); error = uiomove((char *)bp->buffer + bp->out, size, uio); mutex_enter(lock); @@ -460,7 +376,6 @@ again: bp->out += size; if (bp->out >= bp->size) bp->out = 0; - bp->cnt -= size; /* @@ -486,142 +401,117 @@ again: * Detect EOF condition. * Read returns 0 on EOF, no need to set error. */ - if (rpipe->pipe_state & PIPE_EOF) + if ((pipe->pipe_state & PIPE_EOF) != 0) break; /* * Don't block on non-blocking I/O. */ - if (fp->f_flag & FNONBLOCK) { + if ((fp->f_flag & FNONBLOCK) != 0) { error = EAGAIN; break; } /* - * Unlock the pipe buffer for our remaining processing. - * We will either break out with an error or we will - * sleep and relock to loop. - */ - pipeunlock(rpipe); - -#if 1 /* XXX (dsl) I'm sure these aren't needed here ... */ - /* - * We want to read more, wake up select/poll. - */ - pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_OUT); - - /* - * If the "write-side" is blocked, wake it up now. + * Awaken the other side (including select/poll/kqueue) + * then sleep ASAP to minimise contention. */ - cv_broadcast(&rpipe->pipe_wcv); -#endif - - if (wakeup_state & PIPE_RESTART) { - error = ERESTART; - goto unlocked_error; - } - - /* Now wait until the pipe is filled */ - error = cv_wait_sig(&rpipe->pipe_rcv, lock); - if (error != 0) - goto unlocked_error; - wakeup_state = rpipe->pipe_state; - goto again; + pipeselwakeup(pipe, FWRITE, POLL_OUT); + if (pipeunbusy(pipe)) + cv_signal(&pipe->pipe_busy); + cv_broadcast(&pipe->pipe_write); + if ((error = cv_wait_sig(&pipe->pipe_read, lock)) != 0) + break; } + /* + * Update timestamp and drop the long term lock (if held). 
+ */ if (error == 0) - getnanotime(&rpipe->pipe_atime); - pipeunlock(rpipe); - -unlocked_error: - --rpipe->pipe_busy; - if (rpipe->pipe_busy == 0) { - rpipe->pipe_state &= ~PIPE_RESTART; - cv_broadcast(&rpipe->pipe_draincv); - } - if (bp->cnt < MINPIPESIZE) { - cv_broadcast(&rpipe->pipe_wcv); - } + getnanotime(&pipe->pipe_atime); + unbusy = pipeunbusy(pipe); /* * If anything was read off the buffer, signal to the writer it's * possible to write more data. Also send signal if we are here for the * first time after last write. */ - if ((bp->size - bp->cnt) >= PIPE_BUF - && (ocnt != bp->cnt || (rpipe->pipe_state & PIPE_SIGNALR))) { - pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_OUT); - rpipe->pipe_state &= ~PIPE_SIGNALR; + cnt = bp->cnt; + if (bp->size - cnt >= PIPE_BUF + && (ocnt != cnt || (pipe->pipe_state & PIPE_SIGNALR) != 0)) { + pipe->pipe_state &= ~PIPE_SIGNALR; + pipeselwakeup(pipe, FWRITE, POLL_OUT); } + /* + * Release the mutex and only then wake the other side, to minimise + * contention. + */ mutex_exit(lock); - return (error); + if (unbusy) + cv_signal(&pipe->pipe_busy); + if (cnt < MINPIPESIZE) + cv_broadcast(&pipe->pipe_write); + + return error; } static int pipe_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred, int flags) { - struct pipe *wpipe, *rpipe; - struct pipebuf *bp; - kmutex_t *lock; + struct pipe *pipe = fp->f_pipe; + struct pipebuf *bp = &pipe->pipe_buffer; + kmutex_t *lock = pipe->pipe_lock; + size_t cnt, space, orig_resid = uio->uio_resid; + bool unbusy; int error; - unsigned int wakeup_state = 0; - - /* We want to write to our peer */ - rpipe = fp->f_pipe; - lock = rpipe->pipe_lock; - error = 0; - - mutex_enter(lock); - wpipe = rpipe->pipe_peer; - - /* - * Detect loss of pipe read side, issue SIGPIPE if lost. 
- */ - if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) != 0) { - mutex_exit(lock); - return EPIPE; - } - ++wpipe->pipe_busy; - - /* Acquire the long-term pipe lock */ - if ((error = pipelock(wpipe, true)) != 0) { - --wpipe->pipe_busy; - if (wpipe->pipe_busy == 0) { - wpipe->pipe_state &= ~PIPE_RESTART; - cv_broadcast(&wpipe->pipe_draincv); - } - mutex_exit(lock); - return (error); - } - - bp = &wpipe->pipe_buffer; /* * If it is advantageous to resize the pipe buffer, do so. */ - if ((uio->uio_resid > PIPE_SIZE) && - (nbigpipe < maxbigpipes) && - (bp->size <= PIPE_SIZE) && (bp->cnt == 0)) { - - if (pipespace(wpipe, BIG_PIPE_SIZE) == 0) + mutex_enter(lock); + if (uio->uio_resid > PIPE_SIZE && + (pipe->pipe_state & PIPE_RESIZED) == 0 && + nbigpipe < maxbigpipes && bp->cnt == 0) { + size_t size = round_page(BIG_PIPE_SIZE); + void *buffer = (void *)uvm_km_alloc(kernel_map, size, + 0, UVM_KMF_PAGEABLE); + if (buffer != NULL) { + atomic_add_int(&amountpipekva, size); atomic_inc_uint(&nbigpipe); + pipe->pipe_buffer.buffer = buffer; + pipe->pipe_buffer.size = size; + pipe->pipe_buffer.in = 0; + pipe->pipe_buffer.out = 0; + pipe->pipe_buffer.cnt = 0; + } + pipe->pipe_state |= PIPE_RESIZED; } - while (uio->uio_resid) { - size_t space; - - space = bp->size - bp->cnt; + while (uio->uio_resid > 0) { + /* + * If read side has gone away, we just issue a signal to + * ourselves. + */ + if ((pipe->pipe_state & PIPE_EOF) != 0) { + error = EPIPE; + break; + } /* Writes of size <= PIPE_BUF must be atomic. */ - if ((space < uio->uio_resid) && (uio->uio_resid <= PIPE_BUF)) + space = bp->size - bp->cnt; + if (space < uio->uio_resid && uio->uio_resid <= PIPE_BUF) space = 0; if (space > 0) { int size; /* Transfer size */ int segsize; /* first segment to transfer */ + /* If pipebusy() blocked then re-validate. */ + if (pipebusy(pipe)) + continue; + /* * Transfer size is minimum of uio transfer * and free space in pipe buffer. 
@@ -642,6 +532,7 @@ pipe_write(file_t *fp, off_t *offset, st segsize = size; /* Transfer first segment */ + KASSERT(pipe->pipe_owner == curlwp); mutex_exit(lock); error = uiomove((char *)bp->buffer + bp->in, segsize, uio); @@ -668,83 +559,63 @@ pipe_write(file_t *fp, off_t *offset, st bp->cnt += size; KASSERT(bp->cnt <= bp->size); - wakeup_state = 0; - } else { - /* - * If the "read-side" has been blocked, wake it up now. - */ - cv_broadcast(&wpipe->pipe_rcv); - - /* - * Don't block on non-blocking I/O. - */ - if (fp->f_flag & FNONBLOCK) { - error = EAGAIN; - break; - } - - /* - * We have no more space and have something to offer, - * wake up select/poll. - */ - if (bp->cnt) - pipeselwakeup(wpipe, wpipe, POLL_IN); - - if (wakeup_state & PIPE_RESTART) { - error = ERESTART; - break; - } - - /* - * If read side wants to go away, we just issue a signal - * to ourselves. - */ - if (wpipe->pipe_state & PIPE_EOF) { - error = EPIPE; - break; - } + continue; + } - pipeunlock(wpipe); - error = cv_wait_sig(&wpipe->pipe_wcv, lock); - (void)pipelock(wpipe, false); - if (error != 0) - break; - wakeup_state = wpipe->pipe_state; + /* + * Don't block on non-blocking I/O. + */ + if ((fp->f_flag & FNONBLOCK) != 0) { + error = EAGAIN; + break; } - } - --wpipe->pipe_busy; - if (wpipe->pipe_busy == 0) { - wpipe->pipe_state &= ~PIPE_RESTART; - cv_broadcast(&wpipe->pipe_draincv); - } - if (bp->cnt > 0) { - cv_broadcast(&wpipe->pipe_rcv); + /* + * Awaken the other side (including select/poll/kqueue) then + * sleep ASAP to minimise contention. + */ + pipeselwakeup(pipe, FREAD, POLL_IN); + if (pipeunbusy(pipe)) + cv_signal(&pipe->pipe_busy); + cv_broadcast(&pipe->pipe_read); + if ((error = cv_wait_sig(&pipe->pipe_write, lock)) != 0) + break; } /* * Don't return EPIPE if I/O was successful */ - if (error == EPIPE && bp->cnt == 0 && uio->uio_resid == 0) + if (error == EPIPE && uio->uio_resid != orig_resid) error = 0; + /* + * Update timestamp and drop the long term lock (if held). 
+ */ if (error == 0) - getnanotime(&wpipe->pipe_mtime); + getnanotime(&pipe->pipe_mtime); + unbusy = pipeunbusy(pipe); /* - * We have something to offer, wake up select/poll. + * Arrange for next read(2) to do a signal. */ - if (bp->cnt) - pipeselwakeup(wpipe, wpipe, POLL_IN); + pipe->pipe_state |= PIPE_SIGNALR; /* - * Arrange for next read(2) to do a signal. + * We have something to offer, wake up select/poll. */ - wpipe->pipe_state |= PIPE_SIGNALR; + if ((cnt = bp->cnt) > 0) + pipeselwakeup(pipe, FREAD, POLL_IN); - pipeunlock(wpipe); + /* + * Release the mutex then wake other side, to minimise contention. + */ mutex_exit(lock); - return (error); + if (unbusy) + cv_signal(&pipe->pipe_busy); + if (cnt > 0) + cv_broadcast(&pipe->pipe_read); + + return error; } /* @@ -755,131 +626,130 @@ pipe_ioctl(file_t *fp, u_long cmd, void { struct pipe *pipe = fp->f_pipe; kmutex_t *lock = pipe->pipe_lock; + int flag; switch (cmd) { - case FIONBIO: - return (0); + return 0; case FIOASYNC: + flag = (fp->f_flag & FREAD) != 0 ? 
PIPE_RDASYNC : PIPE_WRASYNC; mutex_enter(lock); - if (*(int *)data) { - pipe->pipe_state |= PIPE_ASYNC; - } else { - pipe->pipe_state &= ~PIPE_ASYNC; - } + if (*(int *)data) + pipe->pipe_state |= flag; + else + pipe->pipe_state &= ~flag; mutex_exit(lock); - return (0); + return 0; case FIONREAD: - mutex_enter(lock); - *(int *)data = pipe->pipe_buffer.cnt; - mutex_exit(lock); - return (0); + if ((fp->f_flag & FREAD) != 0) + *(int *)data = + atomic_load_relaxed(&pipe->pipe_buffer.cnt); + else + *(int *)data = 0; + return 0; case FIONWRITE: - /* Look at other side */ - mutex_enter(lock); - pipe = pipe->pipe_peer; - if (pipe == NULL) - *(int *)data = 0; + if ((fp->f_flag & FWRITE) != 0) + *(int *)data = + atomic_load_relaxed(&pipe->pipe_buffer.cnt); else - *(int *)data = pipe->pipe_buffer.cnt; - mutex_exit(lock); + *(int *)data = 0; return (0); case FIONSPACE: - /* Look at other side */ - mutex_enter(lock); - pipe = pipe->pipe_peer; - if (pipe == NULL) - *(int *)data = 0; - else + if ((fp->f_flag & FWRITE) != 0) { + mutex_enter(lock); *(int *)data = pipe->pipe_buffer.size - pipe->pipe_buffer.cnt; - mutex_exit(lock); + mutex_exit(lock); + } else + *(int *)data = 0; return (0); case TIOCSPGRP: case FIOSETOWN: - return fsetown(&pipe->pipe_pgid, cmd, data); + return fsetown((fp->f_flag & FREAD) != 0 ? + &pipe->pipe_rdpgid : &pipe->pipe_wrpgid, cmd, data); case TIOCGPGRP: case FIOGETOWN: - return fgetown(pipe->pipe_pgid, cmd, data); + return fgetown((fp->f_flag & FREAD) != 0 ? 
+ pipe->pipe_rdpgid : pipe->pipe_wrpgid, cmd, data); + default: + return EPASSTHROUGH; } - return (EPASSTHROUGH); } int pipe_poll(file_t *fp, int events) { - struct pipe *rpipe = fp->f_pipe; - struct pipe *wpipe; - int eof = 0; + struct pipe *pipe = fp->f_pipe; + kmutex_t *lock = pipe->pipe_lock; int revents = 0; - mutex_enter(rpipe->pipe_lock); - wpipe = rpipe->pipe_peer; - - if (events & (POLLIN | POLLRDNORM)) - if ((rpipe->pipe_buffer.cnt > 0) || - (rpipe->pipe_state & PIPE_EOF)) - revents |= events & (POLLIN | POLLRDNORM); - - eof |= (rpipe->pipe_state & PIPE_EOF); + /* Unlocked fast path for make(1). */ + if ((fp->f_flag & FREAD) != 0 && + atomic_load_relaxed(&pipe->pipe_buffer.cnt) != 0 && + (atomic_load_relaxed(&pipe->pipe_state) & PIPE_EOF) == 0 && + (events & (POLLIN | POLLRDNORM)) != 0 && + (events & (POLLOUT | POLLWRNORM)) == 0) + return events & (POLLIN | POLLRDNORM); - if (wpipe == NULL) - revents |= events & (POLLOUT | POLLWRNORM); - else { - if (events & (POLLOUT | POLLWRNORM)) - if ((wpipe->pipe_state & PIPE_EOF) || ( - (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) - revents |= events & (POLLOUT | POLLWRNORM); + mutex_enter(lock); - eof |= (wpipe->pipe_state & PIPE_EOF); + if ((fp->f_flag & FREAD) != 0) { + if ((events & (POLLIN | POLLRDNORM)) != 0) { + if (pipe->pipe_buffer.cnt > 0 || + (pipe->pipe_state & PIPE_EOF) != 0) + revents |= events & (POLLIN | POLLRDNORM); + selrecord(curlwp, &pipe->pipe_rdsel); + } + } else if ((events & (POLLOUT | POLLWRNORM)) != 0) { + KASSERT((fp->f_flag & FWRITE) != 0); + size_t space = pipe->pipe_buffer.size - pipe->pipe_buffer.cnt; + if ((pipe->pipe_state & PIPE_EOF) != 0) + revents |= events & (POLLOUT | POLLWRNORM); + if ((pipe->pipe_state & PIPE_EOF) || space >= PIPE_BUF) + revents |= events & (POLLOUT | POLLWRNORM); + selrecord(curlwp, &pipe->pipe_wrsel); } - if (wpipe == NULL || eof) + if ((pipe->pipe_state & PIPE_EOF) != 0) revents |= POLLHUP; - if (revents == 0) { - if (events & (POLLIN | 
POLLRDNORM)) - selrecord(curlwp, &rpipe->pipe_sel); - - if (events & (POLLOUT | POLLWRNORM)) - selrecord(curlwp, &wpipe->pipe_sel); - } - mutex_exit(rpipe->pipe_lock); + mutex_exit(lock); - return (revents); + return revents; } static int pipe_stat(file_t *fp, struct stat *ub) { struct pipe *pipe = fp->f_pipe; + kmutex_t *lock = pipe->pipe_lock; - mutex_enter(pipe->pipe_lock); memset(ub, 0, sizeof(*ub)); + + mutex_enter(lock); ub->st_mode = S_IFIFO | S_IRUSR | S_IWUSR; ub->st_blksize = pipe->pipe_buffer.size; - if (ub->st_blksize == 0 && pipe->pipe_peer) - ub->st_blksize = pipe->pipe_peer->pipe_buffer.size; ub->st_size = pipe->pipe_buffer.cnt; ub->st_blocks = (ub->st_size) ? 1 : 0; ub->st_atimespec = pipe->pipe_atime; ub->st_mtimespec = pipe->pipe_mtime; - ub->st_ctimespec = ub->st_birthtimespec = pipe->pipe_btime; + ub->st_ctimespec = pipe->pipe_btime; + ub->st_birthtimespec = pipe->pipe_btime; ub->st_uid = kauth_cred_geteuid(fp->f_cred); ub->st_gid = kauth_cred_getegid(fp->f_cred); + mutex_exit(lock); /* * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen. * XXX (st_dev, st_ino) should be unique. */ - mutex_exit(pipe->pipe_lock); return 0; } @@ -887,29 +757,90 @@ static int pipe_close(file_t *fp) { struct pipe *pipe = fp->f_pipe; + kmutex_t *lock = pipe->pipe_lock; + u_int state; + + KASSERT(cv_is_valid(&pipe->pipe_read)); + KASSERT(cv_is_valid(&pipe->pipe_write)); + KASSERT(cv_is_valid(&pipe->pipe_busy)); fp->f_pipe = NULL; - pipeclose(pipe); - return (0); + + /* + * If the other side is blocked, wake it up. + * + * Any knote objects still left in the list are the one attached by + * peer. Since no one will traverse this list, we just clear it. + * + * XXX Exposes select/kqueue internals. 
+ */ + mutex_enter(lock); + pipebusy(pipe); + state = pipe->pipe_state | PIPE_EOF; + if ((fp->f_flag & FREAD) != 0) { + KASSERT((state & PIPE_RDOPEN) != 0); + SLIST_INIT(&pipe->pipe_rdsel.sel_klist); + pipe->pipe_rdpgid = 0; + state &= ~(PIPE_RDASYNC | PIPE_RDOPEN); + pipeselwakeup(pipe, FWRITE, POLL_HUP); + cv_broadcast(&pipe->pipe_write); + } else { + KASSERT((fp->f_flag & FWRITE) != 0); + KASSERT((state & PIPE_WROPEN) != 0); + SLIST_INIT(&pipe->pipe_wrsel.sel_klist); + pipe->pipe_wrpgid = 0; + state &= ~(PIPE_WRASYNC | PIPE_WROPEN); + pipeselwakeup(pipe, FREAD, POLL_HUP); + cv_broadcast(&pipe->pipe_read); + } + pipe->pipe_state = state; + pipeunbusy(pipe); + cv_signal(&pipe->pipe_busy); + mutex_exit(lock); + + /* + * NB: now that the mutex is released, we cannot touch "pipe" any + * more unless we are the last guy out, since nothing else is + * keeping the data structure around. This also means we have to + * wake the other side with the mutex held above. + */ + if ((state & (PIPE_RDOPEN | PIPE_WROPEN)) != 0) + return 0; + + /* Both sides are closed, free resources. */ + pipe->pipe_state = PIPE_SIGNALR | PIPE_RDOPEN | PIPE_WROPEN; + pipe->pipe_buffer.in = 0; + pipe->pipe_buffer.out = 0; + pipe->pipe_buffer.cnt = 0; + if (pipe->pipe_buffer.buffer != (void *)pipe->pipe_kmem) { + uvm_km_free(kernel_map, (vaddr_t)pipe->pipe_buffer.buffer, + pipe->pipe_buffer.size, UVM_KMF_PAGEABLE); + atomic_add_int(&amountpipekva, -pipe->pipe_buffer.size); + atomic_dec_uint(&nbigpipe); + pipe->pipe_buffer.buffer = (void *)pipe->pipe_kmem; + pipe->pipe_buffer.size = PIPE_SIZE; + } + pool_cache_put(pipe_cache, pipe); + + return 0; } static void pipe_restart(file_t *fp) { struct pipe *pipe = fp->f_pipe; + kmutex_t *lock = pipe->pipe_lock; /* * Unblock blocked reads/writes in order to allow close() to complete. * System calls return ERESTART so that the fd is revalidated. * (Partial writes return the transfer length.) 
*/ - mutex_enter(pipe->pipe_lock); - pipe->pipe_state |= PIPE_RESTART; - /* Wakeup both cvs, maybe we only need one, but maybe there are some - * other paths where wakeup is needed, and it saves deciding which! */ - cv_broadcast(&pipe->pipe_rcv); - cv_broadcast(&pipe->pipe_wcv); - mutex_exit(pipe->pipe_lock); + mutex_enter(lock); + cv_fdrestart(&pipe->pipe_read); + cv_fdrestart(&pipe->pipe_write); + cv_fdrestart(&pipe->pipe_busy); + mutex_exit(lock); } static int @@ -933,148 +864,41 @@ pipe_posix_fadvise(struct file *fp, off_ } static void -pipe_free_kmem(struct pipe *pipe) -{ - - if (pipe->pipe_buffer.buffer != NULL) { - if (pipe->pipe_buffer.size > PIPE_SIZE) { - atomic_dec_uint(&nbigpipe); - } - if (pipe->pipe_buffer.buffer != (void *)pipe->pipe_kmem) { - uvm_km_free(kernel_map, - (vaddr_t)pipe->pipe_buffer.buffer, - pipe->pipe_buffer.size, UVM_KMF_PAGEABLE); - atomic_add_int(&amountpipekva, - -pipe->pipe_buffer.size); - } - pipe->pipe_buffer.buffer = NULL; - } -} - -/* - * Shutdown the pipe. - */ -static void -pipeclose(struct pipe *pipe) -{ - kmutex_t *lock; - struct pipe *ppipe; - - if (pipe == NULL) - return; - - KASSERT(cv_is_valid(&pipe->pipe_rcv)); - KASSERT(cv_is_valid(&pipe->pipe_wcv)); - KASSERT(cv_is_valid(&pipe->pipe_draincv)); - KASSERT(cv_is_valid(&pipe->pipe_lkcv)); - - lock = pipe->pipe_lock; - if (lock == NULL) - /* Must have failed during create */ - goto free_resources; - - mutex_enter(lock); - pipeselwakeup(pipe, pipe, POLL_HUP); - - /* - * If the other side is blocked, wake it up saying that - * we want to close it down. - */ - pipe->pipe_state |= PIPE_EOF; - if (pipe->pipe_busy) { - while (pipe->pipe_busy) { - cv_broadcast(&pipe->pipe_wcv); - cv_wait_sig(&pipe->pipe_draincv, lock); - } - } - - /* - * Disconnect from peer. 
- */ - if ((ppipe = pipe->pipe_peer) != NULL) { - pipeselwakeup(ppipe, ppipe, POLL_HUP); - ppipe->pipe_state |= PIPE_EOF; - cv_broadcast(&ppipe->pipe_rcv); - ppipe->pipe_peer = NULL; - } - - /* - * Any knote objects still left in the list are - * the one attached by peer. Since no one will - * traverse this list, we just clear it. - * - * XXX Exposes select/kqueue internals. - */ - SLIST_INIT(&pipe->pipe_sel.sel_klist); - - KASSERT((pipe->pipe_state & PIPE_LOCKFL) == 0); - mutex_exit(lock); - mutex_obj_free(lock); - - /* - * Free resources. - */ - free_resources: - pipe->pipe_pgid = 0; - pipe->pipe_state = PIPE_SIGNALR; - pipe->pipe_peer = NULL; - pipe->pipe_lock = NULL; - pipe_free_kmem(pipe); - if (pipe->pipe_kmem != 0) { - pool_cache_put(pipe_rd_cache, pipe); - } else { - pool_cache_put(pipe_wr_cache, pipe); - } -} - -static void filt_pipedetach(struct knote *kn) { - struct pipe *pipe; - kmutex_t *lock; - - pipe = ((file_t *)kn->kn_obj)->f_pipe; - lock = pipe->pipe_lock; + struct file *fp = kn->kn_obj; + struct pipe *pipe = fp->f_pipe; + kmutex_t *lock = pipe->pipe_lock; mutex_enter(lock); - switch(kn->kn_filter) { - case EVFILT_WRITE: - /* Need the peer structure, not our own. */ - pipe = pipe->pipe_peer; - - /* If reader end already closed, just return. */ - if (pipe == NULL) { - mutex_exit(lock); - return; - } - - break; - default: - /* Nothing to do. 
*/ - break; - } - KASSERT(kn->kn_hook == pipe); - selremove_knote(&pipe->pipe_sel, kn); + if ((fp->f_flag & FREAD) != 0) { + if ((pipe->pipe_state & PIPE_RDOPEN) != 0) + selremove_knote(&pipe->pipe_rdsel, kn); + } else if ((pipe->pipe_state & PIPE_WROPEN) != 0) + selremove_knote(&pipe->pipe_wrsel, kn); mutex_exit(lock); } static int filt_piperead(struct knote *kn, long hint) { - struct pipe *rpipe = ((file_t *)kn->kn_obj)->f_pipe; - struct pipe *wpipe; + struct file *fp = kn->kn_obj; + struct pipe *pipe = fp->f_pipe; + kmutex_t *lock = pipe->pipe_lock; int rv; if ((hint & NOTE_SUBMIT) == 0) { - mutex_enter(rpipe->pipe_lock); + mutex_enter(lock); } - wpipe = rpipe->pipe_peer; - kn->kn_data = rpipe->pipe_buffer.cnt; - if ((rpipe->pipe_state & PIPE_EOF) || - (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { + if ((fp->f_flag & FREAD) != 0) + kn->kn_data = pipe->pipe_buffer.cnt; + else + kn->kn_data = 0; + + if ((pipe->pipe_state & PIPE_EOF) != 0) { knote_set_eof(kn, 0); rv = 1; } else { @@ -1082,7 +906,7 @@ filt_piperead(struct knote *kn, long hin } if ((hint & NOTE_SUBMIT) == 0) { - mutex_exit(rpipe->pipe_lock); + mutex_exit(lock); } return rv; } @@ -1090,26 +914,29 @@ filt_piperead(struct knote *kn, long hin static int filt_pipewrite(struct knote *kn, long hint) { - struct pipe *rpipe = ((file_t *)kn->kn_obj)->f_pipe; - struct pipe *wpipe; + struct file *fp = kn->kn_obj; + struct pipe *pipe = fp->f_pipe; + kmutex_t *lock = pipe->pipe_lock; int rv; if ((hint & NOTE_SUBMIT) == 0) { - mutex_enter(rpipe->pipe_lock); + mutex_enter(lock); } - wpipe = rpipe->pipe_peer; - if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { + if ((pipe->pipe_state & PIPE_EOF)) { kn->kn_data = 0; knote_set_eof(kn, 0); rv = 1; - } else { - kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; + } else if ((fp->f_flag & FWRITE) != 0) { + kn->kn_data = pipe->pipe_buffer.size - pipe->pipe_buffer.cnt; rv = kn->kn_data >= PIPE_BUF; + } else { + kn->kn_data = 0; + rv = 0; } if 
((hint & NOTE_SUBMIT) == 0) { - mutex_exit(rpipe->pipe_lock); + mutex_exit(lock); } return rv; } @@ -1131,36 +958,37 @@ static const struct filterops pipe_wfilt static int pipe_kqfilter(file_t *fp, struct knote *kn) { - struct pipe *pipe; - kmutex_t *lock; - - pipe = ((file_t *)kn->kn_obj)->f_pipe; - lock = pipe->pipe_lock; - - mutex_enter(lock); + struct pipe *pipe = ((file_t *)kn->kn_obj)->f_pipe; + kmutex_t *lock = pipe->pipe_lock; switch (kn->kn_filter) { case EVFILT_READ: + if ((fp->f_flag & FREAD) == 0) + return EINVAL; + mutex_enter(lock); kn->kn_fop = &pipe_rfiltops; + kn->kn_hook = pipe; + selrecord_knote(&pipe->pipe_rdsel, kn); + mutex_exit(lock); break; case EVFILT_WRITE: + if ((fp->f_flag & FWRITE) == 0) + return EINVAL; + mutex_enter(lock); kn->kn_fop = &pipe_wfiltops; - pipe = pipe->pipe_peer; - if (pipe == NULL) { + if ((pipe->pipe_state & PIPE_EOF) != 0) { /* Other end of pipe has been closed. */ mutex_exit(lock); - return (EBADF); + return EBADF; } + kn->kn_hook = pipe; + selrecord_knote(&pipe->pipe_wrsel, kn); + mutex_exit(lock); break; default: - mutex_exit(lock); - return (EINVAL); + return EINVAL; } - kn->kn_hook = pipe; - selrecord_knote(&pipe->pipe_sel, kn); - mutex_exit(lock); - return (0); } Index: src/sys/sys/pipe.h diff -u src/sys/sys/pipe.h:1.39 src/sys/sys/pipe.h:1.40 --- src/sys/sys/pipe.h:1.39 Wed Oct 4 22:19:58 2023 +++ src/sys/sys/pipe.h Fri Oct 13 19:07:09 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: pipe.h,v 1.39 2023/10/04 22:19:58 ad Exp $ */ +/* $NetBSD: pipe.h,v 1.40 2023/10/13 19:07:09 ad Exp $ */ /* * Copyright (c) 1996 John S. Dyson @@ -75,34 +75,34 @@ struct pipebuf { /* * Bits in pipe_state. */ -#define PIPE_ASYNC 0x001 /* Async I/O */ -#define PIPE_EOF 0x010 /* Pipe is in EOF condition */ -#define PIPE_SIGNALR 0x020 /* Do selwakeup() on read(2) */ -#define PIPE_LOCKFL 0x100 /* Process has exclusive access to - pointers/data. 
*/ -/* unused 0x200 */ -#define PIPE_RESTART 0x400 /* Return ERESTART to blocked syscalls */ +#define PIPE_RDASYNC 0x001 /* Async I/O on reader side */ +#define PIPE_WRASYNC 0x002 /* Async I/O on writer side */ +#define PIPE_RDOPEN 0x010 /* Reader side open */ +#define PIPE_WROPEN 0x020 /* Writer side open */ +#define PIPE_EOF 0x100 /* Pipe is in EOF condition */ +#define PIPE_SIGNALR 0x200 /* Do selwakeup() on read(2) */ +#define PIPE_RESIZED 0x400 /* Attempted to resize */ /* * Per-pipe data structure. * Two of these are linked together to produce bi-directional pipes. */ struct pipe { - kmutex_t *pipe_lock; /* pipe mutex */ - kcondvar_t pipe_rcv; /* cv for readers */ - kcondvar_t pipe_wcv; /* cv for writers */ - kcondvar_t pipe_draincv; /* cv for close */ - kcondvar_t pipe_lkcv; /* locking */ - struct pipebuf pipe_buffer; /* data storage */ - struct selinfo pipe_sel; /* for compat with select */ - struct timespec pipe_atime; /* time of last access */ - struct timespec pipe_mtime; /* time of last modify */ - struct timespec pipe_btime; /* time of creation */ - struct pipe *pipe_peer; /* link with other direction */ - pid_t pipe_pgid; /* process group for sigio */ - u_int pipe_state; /* pipe status info */ - int pipe_busy; /* busy flag, to handle rundown */ - vaddr_t pipe_kmem; /* preallocated PIPE_SIZE buffer */ + kmutex_t *pipe_lock; /* pipe mutex */ + struct lwp *pipe_owner; /* who holds the pipe busy */ + u_int pipe_state; /* pipe status info */ + struct pipebuf pipe_buffer; /* data storage */ + kcondvar_t pipe_read; /* cv for readers */ + kcondvar_t pipe_write; /* cv for writers */ + kcondvar_t pipe_busy; /* cv for locking */ + struct selinfo pipe_wrsel; /* for compat with select */ + struct selinfo pipe_rdsel; /* for compat with select */ + struct timespec pipe_atime; /* time of last access */ + struct timespec pipe_mtime; /* time of last modify */ + struct timespec pipe_btime; /* time of creation */ + pid_t pipe_wrpgid; /* process group for sigio */ + pid_t 
pipe_rdpgid; /* process group for sigio */ + vaddr_t pipe_kmem; /* preallocated PIPE_SIZE buffer */ }; /* Index: src/usr.bin/fstat/fstat.c diff -u src/usr.bin/fstat/fstat.c:1.118 src/usr.bin/fstat/fstat.c:1.119 --- src/usr.bin/fstat/fstat.c:1.118 Mon Jul 10 02:31:55 2023 +++ src/usr.bin/fstat/fstat.c Fri Oct 13 19:07:09 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: fstat.c,v 1.118 2023/07/10 02:31:55 christos Exp $ */ +/* $NetBSD: fstat.c,v 1.119 2023/10/13 19:07:09 ad Exp $ */ /*- * Copyright (c) 1988, 1993 @@ -39,7 +39,7 @@ __COPYRIGHT("@(#) Copyright (c) 1988, 19 #if 0 static char sccsid[] = "@(#)fstat.c 8.3 (Berkeley) 5/2/95"; #else -__RCSID("$NetBSD: fstat.c,v 1.118 2023/07/10 02:31:55 christos Exp $"); +__RCSID("$NetBSD: fstat.c,v 1.119 2023/10/13 19:07:09 ad Exp $"); #endif #endif /* not lint */ @@ -1271,6 +1271,7 @@ static void ptrans(struct file *fp, struct pipe *cpipe, int i) { struct pipe cp; + int flag; PREFIX(i); @@ -1281,12 +1282,12 @@ ptrans(struct file *fp, struct pipe *cpi } /* pipe descriptor is either read or write, never both */ - (void)printf("* pipe %p %s %p %s%s%s", cpipe, + flag = (fp->f_flag & FWRITE) ? PIPE_WRASYNC : PIPE_RDASYNC; + (void)printf("* pipe %p %s %s%s%s", cpipe, (fp->f_flag & FWRITE) ? "->" : "<-", - cp.pipe_peer, (fp->f_flag & FWRITE) ? "w" : "r", (fp->f_flag & FNONBLOCK) ? "n" : "", - (cp.pipe_state & PIPE_ASYNC) ? "a" : ""); + (cp.pipe_state & flag) ? "a" : ""); oprint(fp, "\n"); return; bad: