Re: Enable EVFILT_EXCEPT

2020-08-21 Thread Visa Hankala
On Fri, Aug 21, 2020 at 09:32:13AM +0200, Martin Pieuchot wrote:
> The kqueue-based poll(2) backend is still a WIP due to regressions in
> the kqueue layer.  In the meantime should we expose EVFILT_EXCEPT to
> userland?  The diff below should be enough to allow userland apps to
> use the new code paths. 

The EVFILT_EXCEPT macro is not guarded by #ifdef _KERNEL or similar.
It is already visible to userspace, and some pieces of software may have
tried to use it but have failed so far.
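
For context, a minimal userland sketch of the registration path this
enables; the helper name is made up and fflags is left at 0 because the
conditions reported depend on the descriptor type:

#include <sys/types.h>
#include <sys/event.h>
#include <err.h>
#include <unistd.h>

/*
 * Illustrative only: register fd with EVFILT_EXCEPT and block until an
 * exceptional condition is reported, then return the filter's fflags.
 */
int
wait_for_exception(int fd)
{
	struct kevent kev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	EV_SET(&kev, fd, EVFILT_EXCEPT, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent: register");
	if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
		err(1, "kevent: wait");
	close(kq);
	return (int)kev.fflags;
}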

OK visa@

> Index: sys/event.h
> ===
> RCS file: /cvs/src/sys/sys/event.h,v
> retrieving revision 1.44
> diff -u -p -r1.44 event.h
> --- sys/event.h   22 Jun 2020 13:14:32 -  1.44
> +++ sys/event.h   21 Aug 2020 07:09:31 -
> @@ -41,7 +41,7 @@
>  #define EVFILT_DEVICE(-8)/* devices */
>  #define EVFILT_EXCEPT(-9)/* exceptional conditions */
>  
> -#define EVFILT_SYSCOUNT  8
> +#define EVFILT_SYSCOUNT  9
>  
>  #define EV_SET(kevp, a, b, c, d, e, f) do {  \
>   struct kevent *__kevp = (kevp); \
> 



Re: Log mutex for msgbuf concurrency control

2020-08-20 Thread Visa Hankala
On Tue, Aug 18, 2020 at 03:06:58PM +, Visa Hankala wrote:
> This diff introduces a mutex that serializes access to the kernel
> message buffers. At the moment, the buffers do not have clear
> concurrency control.
> 
> The mutex controls access to all the modifiable fields of struct msgbuf.
> It also protects logsoftc.sc_state for managing thread wakeups.
> 
> A tricky thing with the diff is that the code is indirectly used in many
> different contexts and in varied machine-specific functions. Luckily,
> the code already utilizes splhigh() in critical parts, so the transition
> to using a mutex possibly is not such a big step any longer.
> 
> To avoid creating problems with lock order, the code does not take
> new locks when the log mutex is held. This is visible in logwakeup()
> and logread(). It looks like selrecord() does not acquire new locks,
> so no extra steps are needed in logpoll().
> 
> In logread(), the code assumes that there is only one reader. This keeps
> the data extraction logic simple.
> 
> An additional benefit (?) of the diff is that now uiomove(9) is called
> without splhigh().
> 
> I have tested the diff on amd64 and octeon. However, additional tests
> on these and other platforms would be very helpful.

Here is an updated diff that has sleep_setup()'s priority parameter
fixed. The mistake was noticed by mpi@.

Index: kern/subr_log.c
===
RCS file: src/sys/kern/subr_log.c,v
retrieving revision 1.68
diff -u -p -r1.68 subr_log.c
--- kern/subr_log.c 18 Aug 2020 13:41:49 -  1.68
+++ kern/subr_log.c 20 Aug 2020 13:52:05 -
@@ -52,6 +52,7 @@
 #include 
 #include 
 #include 
+#include <sys/mutex.h>
 #include 
 
 #ifdef KTRACE
@@ -69,8 +70,12 @@
 #define LOG_ASYNC  0x04
 #define LOG_RDWAIT 0x08
 
+/*
+ * Locking:
+ * L   log_mtx
+ */
 struct logsoftc {
-   int sc_state;   /* see above for possibilities */
+   int sc_state;   /* [L] see above for possibilities */
struct  selinfo sc_selp;/* process waiting on select call */
struct  sigio_ref sc_sigio; /* async I/O registration */
int sc_need_wakeup; /* if set, wake up waiters */
@@ -83,6 +88,14 @@ struct   msgbuf *msgbufp;/* the mapped b
 struct msgbuf *consbufp;   /* console message buffer. */
 struct file *syslogf;
 
+/*
+ * Lock that serializes access to log message buffers.
+ * This should be kept as a leaf lock in order not to constrain where
+ * printf(9) can be used.
+ */
+struct mutex log_mtx =
+MUTEX_INITIALIZER_FLAGS(IPL_HIGH, "logmtx", MTX_NOWITNESS);
+
 void filt_logrdetach(struct knote *kn);
 int filt_logread(struct knote *kn, long hint);
 
@@ -144,13 +157,11 @@ initconsbuf(void)
 void
 msgbuf_putchar(struct msgbuf *mbp, const char c)
 {
-   int s;
-
if (mbp->msg_magic != MSG_MAGIC)
/* Nothing we can do */
return;
 
-   s = splhigh();
+   mtx_enter(&log_mtx);
mbp->msg_bufc[mbp->msg_bufx++] = c;
if (mbp->msg_bufx < 0 || mbp->msg_bufx >= mbp->msg_bufs)
mbp->msg_bufx = 0;
@@ -160,20 +171,19 @@ msgbuf_putchar(struct msgbuf *mbp, const
mbp->msg_bufr = 0;
mbp->msg_bufd++;
}
-   splx(s);
+   mtx_leave(&log_mtx);
 }
 
 size_t
 msgbuf_getlen(struct msgbuf *mbp)
 {
long len;
-   int s;
 
-   s = splhigh();
+   mtx_enter(&log_mtx);
len = mbp->msg_bufx - mbp->msg_bufr;
if (len < 0)
len += mbp->msg_bufs;
-   splx(s);
+   mtx_leave(&log_mtx);
return (len);
 }
 
@@ -208,34 +218,47 @@ logclose(dev_t dev, int flag, int mode, 
 int
 logread(dev_t dev, struct uio *uio, int flag)
 {
+   struct sleep_state sls;
struct msgbuf *mbp = msgbufp;
-   size_t l;
-   int s, error = 0;
+   size_t l, rpos;
+   int error = 0;
 
-   s = splhigh();
+   mtx_enter(&log_mtx);
while (mbp->msg_bufr == mbp->msg_bufx) {
if (flag & IO_NDELAY) {
error = EWOULDBLOCK;
goto out;
}
logsoftc.sc_state |= LOG_RDWAIT;
-   error = tsleep_nsec(mbp, LOG_RDPRI | PCATCH, "klog", INFSLP);
+   mtx_leave(&log_mtx);
+   /*
+* Set up and enter sleep manually instead of using msleep()
+* to keep log_mtx as a leaf lock.
+*/
+   sleep_setup(&sls, mbp, LOG_RDPRI | PCATCH, "klog");
+   sleep_setup_signal(&sls);
+   sleep_finish(&sls, logsoftc.sc_state & LOG_RDWAIT);
+   error = sleep_finish_signal(&sls);
+   mtx_enter(&log_mtx);
if (error)

Re: Fewer pool_get() in kqueue_register()

2020-08-19 Thread Visa Hankala
On Wed, Aug 19, 2020 at 12:10:12PM +0200, Martin Pieuchot wrote:
> On 18/08/20(Tue) 15:30, Visa Hankala wrote:
> > On Tue, Aug 18, 2020 at 11:04:47AM +0200, Martin Pieuchot wrote:
> > > Diff below changes the order of operations in kqueue_register() to get
> > > rid of an unnecessary pool_get().  When an event is already present on
> > > the list try to acquire it first.  Note that knote_acquire() may sleep
> > > in which case the list might have changed so the lookup has to always
> > > begin from the start.
> > > 
> > > This will help with lazy removal of knote in poll/select.  In this
> > > scenario EV_ADD is generally always done with a knote already on the
> > > list.
> > 
> > Some of the overhead could be absorbed by using a pool cache, as shown
> > in the diff below. However, I am not suggesting that the cache should
> > be taken into use right now. The frequency of knote pool usage is
> > relatively low currently; there are other pools that would benefit more
> > from caching.
> 
> Agreed, this is a nice idea to revisit.  Do you have a way to measure
> which pool could benefit from caches?

I think systat(1)'s pool view gives a starting point.

> > A related question is what implications the increased use of the pool
> > cache feature would have under memory pressure.
> 
> Do you have suggestion on how to measure this as well?  Could dt(4)
> probes or evtcount() help us? 

I do not have a specific tool in mind, just a vague question: "Can the
system start to perform poorly more easily than before?" However, only
experimentation will tell.



Re: Fewer pool_get() in kqueue_register()

2020-08-18 Thread Visa Hankala
On Tue, Aug 18, 2020 at 11:04:47AM +0200, Martin Pieuchot wrote:
> Diff below changes the order of operations in kqueue_register() to get
> rid of an unnecessary pool_get().  When an event is already present on
> the list try to acquire it first.  Note that knote_acquire() may sleep
> in which case the list might have changed so the lookup has to always
> begin from the start.
> 
> This will help with lazy removal of knote in poll/select.  In this
> scenario EV_ADD is generally always done with a knote already on the
> list.

Some of the overhead could be absorbed by using a pool cache, as shown
in the diff below. However, I am not suggesting that the cache should
be taken into use right now. The frequency of knote pool usage is
relatively low currently; there are other pools that would benefit more
from caching.

A related question is what implications the increased use of the pool
cache feature would have under memory pressure.
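
For background, per-CPU caching is transparent to the pool's users: once
pool_cache_init() has been called, the existing pool_get()/pool_put()
call sites are served from per-CPU lists when possible. A rough sketch
(the pool setup mirrors the knote pool in the diff below; the IPL value,
PR_ZERO and the helper name are illustrative):

#include <sys/param.h>
#include <sys/event.h>
#include <sys/pool.h>

struct pool knote_pool;

void
knote_pool_setup(void)
{
	pool_init(&knote_pool, sizeof(struct knote), 0, IPL_NONE, PR_WAITOK,
	    "knotepl", NULL);
	pool_cache_init(&knote_pool);	/* enable the per-CPU caches */
}

struct knote *
knote_alloc(void)
{
	/* Unchanged call site: the per-CPU cache is used internally. */
	return (pool_get(&knote_pool, PR_WAITOK | PR_ZERO));
}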

Index: kern/init_main.c
===
RCS file: src/sys/kern/init_main.c,v
retrieving revision 1.300
diff -u -p -r1.300 init_main.c
--- kern/init_main.c16 Jun 2020 05:09:29 -  1.300
+++ kern/init_main.c18 Aug 2020 15:09:38 -
@@ -71,6 +71,7 @@
 #include 
 #endif
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -148,7 +149,6 @@ voidcrypto_init(void);
 void   db_ctf_init(void);
 void   prof_init(void);
 void   init_exec(void);
-void   kqueue_init(void);
 void   futex_init(void);
 void   taskq_init(void);
 void   timeout_proc_init(void);
@@ -431,7 +431,9 @@ main(void *framep)
prof_init();
 #endif
 
-   mbcpuinit();/* enable per cpu mbuf data */
+   /* Enable per cpu data. */
+   mbcpuinit();
+   kqueue_init_percpu();
 
/* init exec and emul */
init_exec();
Index: kern/kern_event.c
===
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.142
diff -u -p -r1.142 kern_event.c
--- kern/kern_event.c   12 Aug 2020 13:49:24 -  1.142
+++ kern/kern_event.c   18 Aug 2020 15:09:38 -
@@ -205,6 +205,12 @@ kqueue_init(void)
PR_WAITOK, "knotepl", NULL);
 }
 
+void
+kqueue_init_percpu(void)
+{
+   pool_cache_init(&knote_pool);
+}
+
 int
 filt_fileattach(struct knote *kn)
 {
Index: sys/event.h
===
RCS file: src/sys/sys/event.h,v
retrieving revision 1.44
diff -u -p -r1.44 event.h
--- sys/event.h 22 Jun 2020 13:14:32 -  1.44
+++ sys/event.h 18 Aug 2020 15:09:38 -
@@ -210,6 +210,8 @@ extern void knote_activate(struct knote 
 extern voidknote_remove(struct proc *p, struct knlist *list);
 extern voidknote_fdclose(struct proc *p, int fd);
 extern voidknote_processexit(struct proc *);
+extern voidkqueue_init(void);
+extern voidkqueue_init_percpu(void);
 extern int kqueue_register(struct kqueue *kq,
struct kevent *kev, struct proc *p);
 extern int filt_seltrue(struct knote *kn, long hint);



Log mutex for msgbuf concurrency control

2020-08-18 Thread Visa Hankala
This diff introduces a mutex that serializes access to the kernel
message buffers. At the moment, the buffers do not have clear
concurrency control.

The mutex controls access to all the modifiable fields of struct msgbuf.
It also protects logsoftc.sc_state for managing thread wakeups.

A tricky thing with the diff is that the code is indirectly used in many
different contexts and in varied machine-specific functions. Luckily,
the code already utilizes splhigh() in critical parts, so the transition
to using a mutex possibly is not such a big step any longer.

To avoid creating problems with lock order, the code does not take
new locks when the log mutex is held. This is visible in logwakeup()
and logread(). It looks like selrecord() does not acquire new locks,
so no extra steps are needed in logpoll().
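
To illustrate the leaf-lock discipline (a sketch only, not the actual
logwakeup()/logread() code): state is updated under log_mtx, the mutex is
dropped, and only then are wakeup(9) and selwakeup() called, since those
may take other locks:

/* Illustrative sketch; belongs conceptually in subr_log.c. */
void
log_notify_readers(void)
{
	int do_wakeup = 0;

	mtx_enter(&log_mtx);
	if (logsoftc.sc_state & LOG_RDWAIT) {
		logsoftc.sc_state &= ~LOG_RDWAIT;
		do_wakeup = 1;
	}
	mtx_leave(&log_mtx);

	/* Other subsystems' locks are taken only after log_mtx is released. */
	if (do_wakeup)
		wakeup(msgbufp);
	selwakeup(&logsoftc.sc_selp);
}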

In logread(), the code assumes that there is only one reader. This keeps
the data extraction logic simple.

An additional benefit (?) of the diff is that now uiomove(9) is called
without splhigh().

I have tested the diff on amd64 and octeon. However, additional tests
on these and other platforms would be very helpful.

OK?

Index: kern/subr_log.c
===
RCS file: src/sys/kern/subr_log.c,v
retrieving revision 1.68
diff -u -p -r1.68 subr_log.c
--- kern/subr_log.c 18 Aug 2020 13:41:49 -  1.68
+++ kern/subr_log.c 18 Aug 2020 14:10:25 -
@@ -52,6 +52,7 @@
 #include 
 #include 
 #include 
+#include <sys/mutex.h>
 #include 
 
 #ifdef KTRACE
@@ -69,8 +70,12 @@
 #define LOG_ASYNC  0x04
 #define LOG_RDWAIT 0x08
 
+/*
+ * Locking:
+ * L   log_mtx
+ */
 struct logsoftc {
-   int sc_state;   /* see above for possibilities */
+   int sc_state;   /* [L] see above for possibilities */
struct  selinfo sc_selp;/* process waiting on select call */
struct  sigio_ref sc_sigio; /* async I/O registration */
int sc_need_wakeup; /* if set, wake up waiters */
@@ -83,6 +88,14 @@ struct   msgbuf *msgbufp;/* the mapped b
 struct msgbuf *consbufp;   /* console message buffer. */
 struct file *syslogf;
 
+/*
+ * Lock that serializes access to log message buffers.
+ * This should be kept as a leaf lock in order not to constrain where
+ * printf(9) can be used.
+ */
+struct mutex log_mtx =
+MUTEX_INITIALIZER_FLAGS(IPL_HIGH, "logmtx", MTX_NOWITNESS);
+
 void filt_logrdetach(struct knote *kn);
 int filt_logread(struct knote *kn, long hint);
 
@@ -144,13 +157,11 @@ initconsbuf(void)
 void
 msgbuf_putchar(struct msgbuf *mbp, const char c)
 {
-   int s;
-
if (mbp->msg_magic != MSG_MAGIC)
/* Nothing we can do */
return;
 
-   s = splhigh();
+   mtx_enter(&log_mtx);
mbp->msg_bufc[mbp->msg_bufx++] = c;
if (mbp->msg_bufx < 0 || mbp->msg_bufx >= mbp->msg_bufs)
mbp->msg_bufx = 0;
@@ -160,20 +171,19 @@ msgbuf_putchar(struct msgbuf *mbp, const
mbp->msg_bufr = 0;
mbp->msg_bufd++;
}
-   splx(s);
+   mtx_leave(&log_mtx);
 }
 
 size_t
 msgbuf_getlen(struct msgbuf *mbp)
 {
long len;
-   int s;
 
-   s = splhigh();
+   mtx_enter(&log_mtx);
len = mbp->msg_bufx - mbp->msg_bufr;
if (len < 0)
len += mbp->msg_bufs;
-   splx(s);
+   mtx_leave(&log_mtx);
return (len);
 }
 
@@ -208,34 +218,47 @@ logclose(dev_t dev, int flag, int mode, 
 int
 logread(dev_t dev, struct uio *uio, int flag)
 {
+   struct sleep_state sls;
struct msgbuf *mbp = msgbufp;
-   size_t l;
-   int s, error = 0;
+   size_t l, rpos;
+   int error = 0;
 
-   s = splhigh();
+   mtx_enter(&log_mtx);
while (mbp->msg_bufr == mbp->msg_bufx) {
if (flag & IO_NDELAY) {
error = EWOULDBLOCK;
goto out;
}
logsoftc.sc_state |= LOG_RDWAIT;
-   error = tsleep_nsec(mbp, LOG_RDPRI | PCATCH, "klog", INFSLP);
+   mtx_leave(&log_mtx);
+   /*
+* Set up and enter sleep manually instead of using msleep()
+* to keep log_mtx as a leaf lock.
+*/
+   sleep_setup(&sls, mbp, LOG_RDWAIT | PCATCH, "klog");
+   sleep_setup_signal(&sls);
+   sleep_finish(&sls, logsoftc.sc_state & LOG_RDWAIT);
+   error = sleep_finish_signal(&sls);
+   mtx_enter(&log_mtx);
if (error)
goto out;
}
-   logsoftc.sc_state &= ~LOG_RDWAIT;
 
if (mbp->msg_bufd > 0) {
char buf[64];
+   long ndropped;
 
+   ndropped = mbp->msg_bufd;
+   mtx_leave(&log_mtx);
l = snprintf(buf, sizeof(buf),
"<%d>klog: dropped %ld byte%s, message buffer full\n",
-   

Re: Fix kn_data returned by filt_logread()

2020-08-16 Thread Visa Hankala
On Sun, Aug 16, 2020 at 11:39:31AM +, Visa Hankala wrote:
> The kernel message buffer is circular. This is not handled properly
> by filt_logread(). When the write index wraps, and becomes smaller than
> the read index, filt_logread() reports an incorrect number of available
> bytes. This has not affected the activation of the event, though,
> because the reported number is non-zero even in the incorrect case.
> 
> The error looks like the following with ktrace:
> 
>  60243 syslogd  STRU  struct kevent { ident=10, filter=EVFILT_READ, 
> flags=0x1, fflags=0<>, data=18446744073709551615, udata=0x34c7e21000 }

Here is an updated diff. It uses the same buffer length computation
for kn_data and FIONREAD. These interfaces, kevent(2) and ioctl(2),
should indicate the same number of available bytes after all. Also,
now the computation has the more compact form used by FIONREAD.
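
As a worked example of the shared computation, with made-up values:

	msg_bufs = 100, msg_bufr = 90, msg_bufx = 10 (writer has wrapped)
	len = msg_bufx - msg_bufr = -80
	len < 0, so len += msg_bufs  ->  len = 20 bytes available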

OK?

Index: kern/subr_log.c
===
RCS file: src/sys/kern/subr_log.c,v
retrieving revision 1.66
diff -u -p -r1.66 subr_log.c
--- kern/subr_log.c 7 Apr 2020 13:27:51 -   1.66
+++ kern/subr_log.c 17 Aug 2020 05:37:41 -
@@ -95,6 +95,7 @@ const struct filterops logread_filtops =
 
 int dosendsyslog(struct proc *, const char *, size_t, int, enum uio_seg);
 void logtick(void *);
+size_t msgbuf_getlen(struct msgbuf *);
 
 void
 initmsgbuf(caddr_t buf, size_t bufsize)
@@ -163,6 +163,20 @@ msgbuf_putchar(struct msgbuf *mbp, const
splx(s);
 }
 
+size_t
+msgbuf_getlen(struct msgbuf *mbp)
+{
+   long len;
+   int s;
+
+   s = splhigh();
+   len = mbp->msg_bufx - mbp->msg_bufr;
+   if (len < 0)
+   len += mbp->msg_bufs;
+   splx(s);
+   return (len);
+}
+
 int
 logopen(dev_t dev, int flags, int mode, struct proc *p)
 {
@@ -297,14 +311,10 @@ filt_logrdetach(struct knote *kn)
 int
 filt_logread(struct knote *kn, long hint)
 {
-   struct  msgbuf *p = (struct  msgbuf *)kn->kn_hook;
-   int s, event = 0;
+   struct msgbuf *mbp = kn->kn_hook;
 
-   s = splhigh();
-   kn->kn_data = (int)(p->msg_bufx - p->msg_bufr);
-   event = (p->msg_bufx != p->msg_bufr);
-   splx(s);
-   return (event);
+   kn->kn_data = msgbuf_getlen(mbp);
+   return (kn->kn_data != 0);
 }
 
 void
@@ -357,19 +367,13 @@ int
 logioctl(dev_t dev, u_long com, caddr_t data, int flag, struct proc *p)
 {
struct file *fp;
-   long l;
-   int error, s;
+   int error;
 
switch (com) {
 
/* return number of characters immediately available */
case FIONREAD:
-   s = splhigh();
-   l = msgbufp->msg_bufx - msgbufp->msg_bufr;
-   splx(s);
-   if (l < 0)
-   l += msgbufp->msg_bufs;
-   *(int *)data = l;
+   *(int *)data = (int)msgbuf_getlen(msgbufp);
break;
 
case FIONBIO:



Fix kn_data returned by filt_logread()

2020-08-16 Thread Visa Hankala
The kernel message buffer is circular. This is not handled properly
by filt_logread(). When the write index wraps, and becomes smaller than
the read index, filt_logread() reports an incorrect number of available
bytes. This has not affected the activation of the event, though,
because the reported number is non-zero even in the incorrect case.

The error looks like the following with ktrace:

 60243 syslogd  STRU  struct kevent { ident=10, filter=EVFILT_READ, 
flags=0x1, fflags=0<>, data=18446744073709551615, udata=0x34c7e21000 }
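
For reference, the data value above is just a negative length printed as
an unsigned 64-bit number:

	msg_bufx - msg_bufr = -1   (write index wrapped to one slot behind
	                            the read index)
	(uint64_t)-1 = 2^64 - 1 = 18446744073709551615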


The diff below adds handling for the wrapped index case.

OK?

Index: kern/subr_log.c
===
RCS file: src/sys/kern/subr_log.c,v
retrieving revision 1.66
diff -u -p -r1.66 subr_log.c
--- kern/subr_log.c 7 Apr 2020 13:27:51 -   1.66
+++ kern/subr_log.c 16 Aug 2020 10:57:59 -
@@ -297,14 +296,16 @@ filt_logrdetach(struct knote *kn)
 int
 filt_logread(struct knote *kn, long hint)
 {
-   struct  msgbuf *p = (struct  msgbuf *)kn->kn_hook;
-   int s, event = 0;
+   struct msgbuf *mbp = kn->kn_hook;
+   int s;
 
s = splhigh();
-   kn->kn_data = (int)(p->msg_bufx - p->msg_bufr);
-   event = (p->msg_bufx != p->msg_bufr);
+   if (mbp->msg_bufx >= mbp->msg_bufr)
+   kn->kn_data = mbp->msg_bufx - mbp->msg_bufr;
+   else
+   kn->kn_data = (mbp->msg_bufs - mbp->msg_bufr) + mbp->msg_bufx;
splx(s);
-   return (event);
+   return (kn->kn_data != 0);
 }
 
 void



Remove unnecessary field from struct msgbuf

2020-08-16 Thread Visa Hankala
The msg_bufl field of struct msgbuf is written but never read. The value
was used by kernfs which is no longer present, so the code could be
cleaned up a little by removing the field.

On some systems the message buffer data are preserved across a reboot.
However, the preservation is best-effort only, and initmsgbuf() refuses
to use the old data if the struct's size has changed.

Changing the struct affects at least dmesg(8), which has to be
recompiled.
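
A rough sketch of the kind of validation meant above (not the exact
initmsgbuf() code); new_bufs stands for the buffer size computed for the
current boot:

	if (mbp->msg_magic != MSG_MAGIC || mbp->msg_bufs != new_bufs ||
	    mbp->msg_bufx < 0 || mbp->msg_bufx >= new_bufs ||
	    mbp->msg_bufr < 0 || mbp->msg_bufr >= new_bufs) {
		/* Stale or differently-sized buffer: reinitialize it. */
		bzero(buf, bufsize);
		mbp->msg_magic = MSG_MAGIC;
		mbp->msg_bufs = new_bufs;
	}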

OK?

Index: kern/subr_log.c
===
RCS file: src/sys/kern/subr_log.c,v
retrieving revision 1.66
diff -u -p -r1.66 subr_log.c
--- kern/subr_log.c 7 Apr 2020 13:27:51 -   1.66
+++ kern/subr_log.c 16 Aug 2020 10:57:59 -
@@ -151,7 +151,6 @@ msgbuf_putchar(struct msgbuf *mbp, const
 
s = splhigh();
mbp->msg_bufc[mbp->msg_bufx++] = c;
-   mbp->msg_bufl = lmin(mbp->msg_bufl+1, mbp->msg_bufs);
if (mbp->msg_bufx < 0 || mbp->msg_bufx >= mbp->msg_bufs)
mbp->msg_bufx = 0;
/* If the buffer is full, keep the most recent data. */
Index: sys/msgbuf.h
===
RCS file: src/sys/sys/msgbuf.h,v
retrieving revision 1.11
diff -u -p -r1.11 msgbuf.h
--- sys/msgbuf.h23 Jun 2016 13:15:21 -  1.11
+++ sys/msgbuf.h16 Aug 2020 10:57:59 -
@@ -38,7 +38,6 @@ structmsgbuf {
longmsg_bufx;   /* write pointer */
longmsg_bufr;   /* read pointer */
longmsg_bufs;   /* real msg_bufc size (bytes) */
-   longmsg_bufl;   /* # chars, <= msg_bufs */
longmsg_bufd;   /* number of dropped bytes */
charmsg_bufc[1];/* buffer */
 };



Re: kqueue_scan_setup/finish

2020-08-15 Thread Visa Hankala
On Fri, Aug 14, 2020 at 12:31:33PM +0200, Martin Pieuchot wrote:
> The previous change introducing the kqueue_scan_setup()/finish() API
> required to switch poll(2) internals to use the kqueue mechanism has
> been backed out.  The reason for the regression is still unknown, so
> let's take a baby step approach.
> 
> Diff below introduces the new API with only minimal changes.  It should
> not introduce any change in behavior.

There is a subtle change in behaviour: the markers' kn_filter and
kn_status are now initialized only once, at the start of the scan.
Previously, they were also set at the start of each retry.

This diff should be tested at least by those who had problems with
the original kqueue_scan_state patch. The original patch could cause
data loss, and the exact cause is still unknown. Hence extra caution is
warranted.
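
For reference, a sketch of what the setup/finish pair amounts to with
this approach (field names as in the kqueue_scan_state diff posted
earlier; treat this as an outline, not the committed code):

void
kqueue_scan_setup(struct kqueue_scan_state *scan, struct kqueue *kq)
{
	memset(scan, 0, sizeof(*scan));
	KQREF(kq);
	scan->kqs_kq = kq;
	/* The markers are now initialized once, not on every retry. */
	scan->kqs_start.kn_filter = EVFILT_MARKER;
	scan->kqs_start.kn_status = KN_PROCESSING;
	scan->kqs_end.kn_filter = EVFILT_MARKER;
	scan->kqs_end.kn_status = KN_PROCESSING;
}

void
kqueue_scan_finish(struct kqueue_scan_state *scan)
{
	KQRELE(scan->kqs_kq);
}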

> Index: kern/kern_event.c
> ===
> RCS file: /cvs/src/sys/kern/kern_event.c,v
> retrieving revision 1.142
> diff -u -p -r1.142 kern_event.c
> --- kern/kern_event.c 12 Aug 2020 13:49:24 -  1.142
> +++ kern/kern_event.c 14 Aug 2020 10:13:38 -
> @@ -64,9 +64,6 @@ voidKQREF(struct kqueue *);
>  void KQRELE(struct kqueue *);
>  
>  int  kqueue_sleep(struct kqueue *, struct timespec *);
> -int  kqueue_scan(struct kqueue *kq, int maxevents,
> - struct kevent *ulistp, struct timespec *timeout,
> - struct kevent *kev, struct proc *p, int *retval);
>  
>  int  kqueue_read(struct file *, struct uio *, int);
>  int  kqueue_write(struct file *, struct uio *, int);
> @@ -554,6 +551,7 @@ out:
>  int
>  sys_kevent(struct proc *p, void *v, register_t *retval)
>  {
> + struct kqueue_scan_state scan;
>   struct filedesc* fdp = p->p_fd;
>   struct sys_kevent_args /* {
>   syscallarg(int) fd;
> @@ -635,11 +633,12 @@ sys_kevent(struct proc *p, void *v, regi
>   goto done;
>   }
>  
> - KQREF(kq);
> + kqueue_scan_setup(&scan, kq);
>   FRELE(fp, p);
> - error = kqueue_scan(kq, SCARG(uap, nevents), SCARG(uap, eventlist),
> + error = kqueue_scan(&scan, SCARG(uap, nevents), SCARG(uap, eventlist),
>   tsp, kev, p, &n);
> - KQRELE(kq);
> + kqueue_scan_finish(&scan);
> +
>   *retval = n;
>   return (error);
>  
> @@ -895,11 +894,13 @@ kqueue_sleep(struct kqueue *kq, struct t
>  }
>  
>  int
> -kqueue_scan(struct kqueue *kq, int maxevents, struct kevent *ulistp,
> -struct timespec *tsp, struct kevent *kev, struct proc *p, int *retval)
> +kqueue_scan(struct kqueue_scan_state *scan, int maxevents,
> +struct kevent *ulistp, struct timespec *tsp, struct kevent *kev,
> +struct proc *p, int *retval)
>  {
> + struct kqueue *kq = scan->kqs_kq;
>   struct kevent *kevp;
> - struct knote mend, mstart, *kn;
> + struct knote *kn;
>   int s, count, nkev, error = 0;
>  
>   nkev = 0;
> @@ -909,9 +910,6 @@ kqueue_scan(struct kqueue *kq, int maxev
>   if (count == 0)
>   goto done;
>  
> - memset(&mstart, 0, sizeof(mstart));
> - memset(&mend, 0, sizeof(mend));
> -
>  retry:
>   KASSERT(count == maxevents);
>   KASSERT(nkev == 0);
> @@ -939,18 +937,16 @@ retry:
>   goto done;
>   }
>  
> - mstart.kn_filter = EVFILT_MARKER;
> - mstart.kn_status = KN_PROCESSING;
> - TAILQ_INSERT_HEAD(&kq->kq_head, &mstart, kn_tqe);
> - mend.kn_filter = EVFILT_MARKER;
> - mend.kn_status = KN_PROCESSING;
> - TAILQ_INSERT_TAIL(&kq->kq_head, &mend, kn_tqe);
> + TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe);
> + TAILQ_INSERT_HEAD(&kq->kq_head, &scan->kqs_start, kn_tqe);
>   while (count) {
> - kn = TAILQ_NEXT(&mstart, kn_tqe);
> + kn = TAILQ_NEXT(&scan->kqs_start, kn_tqe);
>   if (kn->kn_filter == EVFILT_MARKER) {
> - if (kn == &mend) {
> - TAILQ_REMOVE(&kq->kq_head, &mend, kn_tqe);
> - TAILQ_REMOVE(&kq->kq_head, &mstart, kn_tqe);
> + if (kn == &scan->kqs_end) {
> + TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end,
> + kn_tqe);
> + TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start,
> + kn_tqe);
>   splx(s);
>   if (count == maxevents)
>   goto retry;
> @@ -958,8 +954,9 @@ retry:
>   }
>  
>   /* Move start marker past another thread's marker. */
> - TAILQ_REMOVE(&kq->kq_head, &mstart, kn_tqe);
> - TAILQ_INSERT_AFTER(&kq->kq_head, kn, &mstart, kn_tqe);
> + TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
> + TAILQ_INSERT_AFTER(&kq->kq_head, kn, &scan->kqs_start,
> + kn_tqe);
>   continue;
>   }
>  
> @@ -1029,8 +1026,8 @@ retry:
>   break;
>   }
>   }
> 

Reduce kqueue_scan()'s stack usage

2020-08-09 Thread Visa Hankala
sys_kevent() and kqueue_scan() consume a relatively large amount of
kernel stack space, 352 and 544 bytes, respectively, on amd64. The
kqueue_scan()'s portion can be reduced by 256 bytes, down to 288 bytes,
by reusing sys_kevent()'s kev[] array.
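
The 256-byte figure corresponds to the moved array (assuming KQ_NEVENTS
is 8 and sizeof(struct kevent) is 32 bytes on amd64):

	sizeof(kev) = KQ_NEVENTS * sizeof(struct kevent) = 8 * 32 = 256 bytes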

This diff overlaps with the not-yet-committed kqueue_scan() refactoring.
However, there is an unsolved system hang issue with the related
kqueue_scan_state patch. The change below reduces the likelihood of
kernel stack exhaustion, though it is unlikely to help with the hang.
Still, I think the reduction of stack usage is useful in itself.

The added asserts state that the retry branch should be taken only
if no events have been collected.

OK?

Index: kern/kern_event.c
===
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.141
diff -u -p -r1.141 kern_event.c
--- kern/kern_event.c   4 Jul 2020 08:33:43 -   1.141
+++ kern/kern_event.c   9 Aug 2020 14:51:35 -
@@ -66,7 +66,7 @@ void  KQRELE(struct kqueue *);
 intkqueue_sleep(struct kqueue *, struct timespec *);
 intkqueue_scan(struct kqueue *kq, int maxevents,
struct kevent *ulistp, struct timespec *timeout,
-   struct proc *p, int *retval);
+   struct kevent *kev, struct proc *p, int *retval);
 
 intkqueue_read(struct file *, struct uio *, int);
 intkqueue_write(struct file *, struct uio *, int);
@@ -638,7 +638,7 @@ sys_kevent(struct proc *p, void *v, regi
KQREF(kq);
FRELE(fp, p);
error = kqueue_scan(kq, SCARG(uap, nevents), SCARG(uap, eventlist),
-   tsp, p, &n);
+   tsp, kev, p, &n);
KQRELE(kq);
*retval = n;
return (error);
@@ -896,12 +896,14 @@ kqueue_sleep(struct kqueue *kq, struct t
 
 int
 kqueue_scan(struct kqueue *kq, int maxevents, struct kevent *ulistp,
-struct timespec *tsp, struct proc *p, int *retval)
+struct timespec *tsp, struct kevent *kev, struct proc *p, int *retval)
 {
struct kevent *kevp;
struct knote mend, mstart, *kn;
-   int s, count, nkev = 0, error = 0;
-   struct kevent kev[KQ_NEVENTS];
+   int s, count, nkev, error = 0;
+
+   nkev = 0;
+   kevp = kev;
 
count = maxevents;
if (count == 0)
@@ -911,12 +913,14 @@ kqueue_scan(struct kqueue *kq, int maxev
	memset(&mend, 0, sizeof(mend));
 
 retry:
+   KASSERT(count == maxevents);
+   KASSERT(nkev == 0);
+
if (kq->kq_state & KQ_DYING) {
error = EBADF;
goto done;
}
 
-   kevp = &kev[0];
s = splhigh();
if (kq->kq_count == 0) {
if (tsp != NULL && !timespecisset(tsp)) {
@@ -1019,7 +1023,7 @@ retry:
sizeof(struct kevent) * nkev);
ulistp += nkev;
nkev = 0;
-   kevp = &kev[0];
+   kevp = kev;
s = splhigh();
if (error)
break;



Use klist_invalidate() in knote_processexit()

2020-06-29 Thread Visa Hankala
knote_processexit() uses knote_remove() to clear any remaining knotes
from &pr->ps_klist when process `pr' exits. All EVFILT_PROC knotes are
removed by the KNOTE(&pr->ps_klist, NOTE_EXIT) call. However, ps_klist
can contain EVFILT_SIGNAL knotes after the KNOTE().

To reserve knote_remove() for the kqueue subsystem's internal use, the
diff below makes knote_processexit() use klist_invalidate() instead.
For now, knote_remove() and klist_invalidate() are quite similar.
However, more differences will arise with fine-grained locking, and
knote_remove() will no longer be appropriate for knote_processexit().

To keep the existing behaviour with EVFILT_SIGNAL knotes, the diff
adjusts klist_invalidate() to drop non-fd-based knotes instead of
activating them with an EOF condition. This overloading of FILTEROP_ISFD
should be fine because EVFILT_PROC, EVFILT_SIGNAL and EVFILT_TIMER
are the only filter types that are not fd-based; EVFILT_PROC and
EVFILT_SIGNAL use the same knote list, whereas EVFILT_TIMER does not
use a knote list. The existing uses of klist_invalidate() should not
be affected.

OK?

Index: kern/kern_event.c
===
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.140
diff -u -p -r1.140 kern_event.c
--- kern/kern_event.c   22 Jun 2020 13:14:32 -  1.140
+++ kern/kern_event.c   29 Jun 2020 16:10:14 -
@@ -1336,10 +1336,12 @@ knote_processexit(struct proc *p)
 {
struct process *pr = p->p_p;
 
+   KASSERT(p == curproc);
+
	KNOTE(&pr->ps_klist, NOTE_EXIT);
 
/* remove other knotes hanging off the process */
-   knote_remove(p, &pr->ps_klist.kl_list);
+   klist_invalidate(&pr->ps_klist);
 }
 
 void
@@ -1446,6 +1448,7 @@ void
 klist_invalidate(struct klist *list)
 {
struct knote *kn;
+   struct proc *p = curproc;
int s;
 
/*
@@ -1460,10 +1463,15 @@ klist_invalidate(struct klist *list)
continue;
splx(s);
kn->kn_fop->f_detach(kn);
-   kn->kn_fop = &dead_filtops;
-   knote_activate(kn);
-   s = splhigh();
-   knote_release(kn);
+   if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
+   kn->kn_fop = &dead_filtops;
+   knote_activate(kn);
+   s = splhigh();
+   knote_release(kn);
+   } else {
+   knote_drop(kn, p);
+   s = splhigh();
+   }
}
splx(s);
 }



Retire <machine/stdarg.h>

2020-06-19 Thread Visa Hankala
The <machine/stdarg.h> header has been unhooked from the headers that
used to include it for a while, and no one has complained. Nothing seems
to reference <machine/stdarg.h> any longer, so the following files should
be ready for removal:

sys/arch/alpha/include/stdarg.h
sys/arch/amd64/include/stdarg.h
sys/arch/arm/include/stdarg.h
sys/arch/arm64/include/stdarg.h
sys/arch/armv7/include/stdarg.h
sys/arch/hppa/include/stdarg.h
sys/arch/i386/include/stdarg.h
sys/arch/landisk/include/stdarg.h
sys/arch/loongson/include/stdarg.h
sys/arch/luna88k/include/stdarg.h
sys/arch/m88k/include/stdarg.h
sys/arch/macppc/include/stdarg.h
sys/arch/mips64/include/stdarg.h
sys/arch/octeon/include/stdarg.h
sys/arch/powerpc/include/stdarg.h
sys/arch/sgi/include/stdarg.h
sys/arch/sh/include/stdarg.h
sys/arch/sparc64/include/stdarg.h

OK to remove them?



Block interrupts when changing struct process' ps_klist

2020-06-13 Thread Visa Hankala
The ps_klist member of struct process can be accessed from interrupt
context as a result of signal sending. Currently, interrupts are not
blocked when ps_klist is modified, which allows race conditions.

The patch below guards ps_klist insertions and removals with splhigh().
The list should only be modified from process context.

Note that in practice the highest interrupt priority level where kqueue
can be used is IPL_SCHED. It is not safe to access the scheduler from
a higher level.

OK?

Index: kern/kern_event.c
===
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.136
diff -u -p -r1.136 kern_event.c
--- kern/kern_event.c   12 Jun 2020 09:34:17 -  1.136
+++ kern/kern_event.c   13 Jun 2020 15:30:07 -
@@ -246,6 +246,7 @@ int
 filt_procattach(struct knote *kn)
 {
struct process *pr;
+   int s;
 
if ((curproc->p_p->ps_flags & PS_PLEDGE) &&
(curproc->p_p->ps_pledge & PLEDGE_PROC) == 0)
@@ -275,7 +276,9 @@ filt_procattach(struct knote *kn)
}
 
/* XXX lock the proc here while adding to the list? */
+   s = splhigh();
	klist_insert(&pr->ps_klist, kn);
+   splx(s);
 
return (0);
 }
@@ -292,12 +295,15 @@ void
 filt_procdetach(struct knote *kn)
 {
struct process *pr = kn->kn_ptr.p_process;
+   int s;
 
if (kn->kn_status & KN_DETACHED)
return;
 
/* XXX locking?  this might modify another process. */
+   s = splhigh();
	klist_remove(&pr->ps_klist, kn);
+   splx(s);
 }
 
 int
@@ -326,10 +332,10 @@ filt_proc(struct knote *kn, long hint)
 
s = splhigh();
kn->kn_status |= KN_DETACHED;
-   splx(s);
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
kn->kn_data = W_EXITCODE(pr->ps_xexit, pr->ps_xsig);
	klist_remove(&pr->ps_klist, kn);
+   splx(s);
return (1);
}
 
Index: kern/kern_sig.c
===
RCS file: src/sys/kern/kern_sig.c,v
retrieving revision 1.256
diff -u -p -r1.256 kern_sig.c
--- kern/kern_sig.c 7 Apr 2020 13:27:51 -   1.256
+++ kern/kern_sig.c 13 Jun 2020 15:30:07 -
@@ -1803,6 +1803,7 @@ int
 filt_sigattach(struct knote *kn)
 {
struct process *pr = curproc->p_p;
+   int s;
 
if (kn->kn_id >= NSIG)
return EINVAL;
@@ -1811,7 +1812,9 @@ filt_sigattach(struct knote *kn)
kn->kn_flags |= EV_CLEAR;   /* automatically set */
 
/* XXX lock the proc here while adding to the list? */
+   s = splhigh();
	klist_insert(&pr->ps_klist, kn);
+   splx(s);
 
return (0);
 }
@@ -1820,8 +1823,11 @@ void
 filt_sigdetach(struct knote *kn)
 {
struct process *pr = kn->kn_ptr.p_process;
+   int s;
 
+   s = splhigh();
	klist_remove(&pr->ps_klist, kn);
+   splx(s);
 }
 
 /*



Call FRELE() without fdplock in dup*(2)

2020-06-10 Thread Visa Hankala
A while ago, finishdup() was changed to release fdplock before calling
closef() to avoid potential lock order problems in the file close path.
However, the dup* code can still invoke that path with fdplock held
through FRELE(). That is fixed by the diff below.

(In principle, the FRELE(fp, p) could be eliminated by getting fp while
the file descriptor table is locked. As long as the lock is held, other
threads are unable to close the file descriptor and hence the temporary
file reference is unnecessary.)

OK?

Index: kern/kern_descrip.c
===
RCS file: src/sys/kern/kern_descrip.c,v
retrieving revision 1.201
diff -u -p -r1.201 kern_descrip.c
--- kern/kern_descrip.c 13 Mar 2020 10:07:01 -  1.201
+++ kern/kern_descrip.c 10 Jun 2020 14:16:12 -
@@ -301,13 +301,14 @@ restart:
return (EBADF);
fdplock(fdp);
if ((error = fdalloc(p, 0, )) != 0) {
-   FRELE(fp, p);
if (error == ENOSPC) {
fdexpand(p);
fdpunlock(fdp);
+   FRELE(fp, p);
goto restart;
}
fdpunlock(fdp);
+   FRELE(fp, p);
return (error);
}
/* No need for FRELE(), finishdup() uses current ref. */
@@ -373,13 +374,14 @@ restart:
fdplock(fdp);
if (new >= fdp->fd_nfiles) {
if ((error = fdalloc(p, new, )) != 0) {
-   FRELE(fp, p);
if (error == ENOSPC) {
fdexpand(p);
fdpunlock(fdp);
+   FRELE(fp, p);
goto restart;
}
fdpunlock(fdp);
+   FRELE(fp, p);
return (error);
}
if (new != i)
@@ -433,13 +435,14 @@ restart:
}
fdplock(fdp);
if ((error = fdalloc(p, newmin, )) != 0) {
-   FRELE(fp, p);
if (error == ENOSPC) {
fdexpand(p);
fdpunlock(fdp);
+   FRELE(fp, p);
goto restart;
}
fdpunlock(fdp);
+   FRELE(fp, p);
} else {
int dupflags = 0;
 



Re: Kill NFS-only kqueue poller thread

2020-05-31 Thread Visa Hankala
On Sun, May 31, 2020 at 09:40:47AM +0200, Martin Pieuchot wrote:
> On 30/05/20(Sat) 15:49, Visa Hankala wrote:
> > [...] 
> > Local filesystems can observe changes at the source, which makes polling
> > unnecessary. NFS clients do not have that benefit. The NFSv3 protocol
> > lacks a mechanism to notify clients of changes.
> > 
> > The NFS polling mechanism is in use for example when running tail -f
> > on a remote file. Directory monitoring can be utilized for example
> > by a directory browser application or a distributed batch processing
> > system.
> > 
> > > > It is true that the code is not perfect, but having even basic and
> > > > best-effort functionality with kevent(2) and NFS is better than nothing
> > > > in my opinion. The kqueue subsystem should not dumb down for emulating
> > > > poll(2) and select(2).
> > > 
> > > In that case we've to accept or work-around with the fact that a thread
> > > is allocated, an operation that can fail, in the kqfilter().  Do you have
> > > an opinion on that?  What should we do for other FSs is uniformity wanted?
> > 
> > Can't the emulation of poll(2) and select(2) operate without the poller?
> > These system calls are supposed to return true for regular files for
> > reading and writing. No polling of remote state is needed for that.
> > 
> > The read behaviour of poll(2) and select(2) differs from kevent(2)'s
> > EVFILT_READ. Consequently, there has to be an indicator that makes
> > f_event work in poll(2)/select(2) mode. I guess the same indicator can
> > control nfs_kqfilter().
> 
> So are you saying that poll(2) is broken on NFS and suggest that I extend
> the kqfilter to match this behavior?

I am not saying that. I am saying that poll(2) and kevent(2) have
different semantics with regular files. Apart from the write filter,
which is missing at the moment, the current NFS poll(2) and kevent(2)
behaviour looks about the same as on the other filesystems.

poll(2) returns true immediately, whereas kevent(2) EVFILT_READ returns
true once the file pointer is not at the end of file (unless NOTE_EOF
is set).
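
A userland sketch of the EVFILT_READ side of that difference (the helper
is illustrative; fflags is left at 0, so the event does not fire at EOF):

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

/* Returns how many bytes remain before EOF, or 0 if already at EOF. */
long
bytes_until_eof(const char *path)
{
	struct kevent kev;
	const struct timespec zero = { 0, 0 };
	long left = 0;
	int fd, kq;

	if ((fd = open(path, O_RDONLY)) == -1)
		err(1, "open");
	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent: register");
	/* With a zero timeout, no event means the offset is at EOF. */
	if (kevent(kq, NULL, 0, &kev, 1, &zero) == 1)
		left = (long)kev.data;
	close(kq);
	close(fd);
	return left;
}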

For the semantics of poll(2), the NFS poller is pointless. However,
the poller is necessary for approximating kevent(2) semantics on NFS.



Re: Kill NFS-only kqueue poller thread

2020-05-30 Thread Visa Hankala
On Sat, May 30, 2020 at 03:34:06PM +0200, Martin Pieuchot wrote:
> On 30/05/20(Sat) 09:22, Visa Hankala wrote:
> > On Thu, May 28, 2020 at 12:11:20PM +0200, Martin Pieuchot wrote:
> > > When it comes to kqueue filters NFS is special.  A custom thread is
> > > created when the first event is registered.  Its purpose is to poll
> > > for changes every 2.5sec.  This logic has been inherited from NetBSD
> > > and is not present in FreeBSD.
> > > 
> > > Since filt_nfsread() only check `n_size' of a given nfsnode having a
> > > different context to emulate stat(2) behavior in kernel is questionable.
> > > So the diff below gets rid of this logic.  The rationals are KISS,
> > > coherency with nfs_poll() and match what other FS kqueue filters do.
> > > 
> > > While here add a missing write filter.
> > > 
> > > Matching the nfs_poll() logic and the write filter are necessary to keep
> > > the current behavior of poll(2) & select(2) once they start querying
> > > kqfilter handlers.
> > 
> > I think it is not good to remove nfs_kqpoll(). The function does more
> > than just support the read filter. It generates various vnode events.
> > If the poller is removed, it becomes impossible for example to monitor
> > an NFS directory with kevent(2).
> 
> What do you mean with "impossible to monitor a NFS directory"?  Are you
> saying that NFS being is the only FS to have a specific thread to implement
> a poller is wanted feature?  If it is wanted, why is it NFS only?  Which
> use case involve monitoring a NFS directory with kevent(2)?  When are we
> aware of the existence of a "nfskqpoll" kernel thread?

Local filesystems can observe changes at the source, which makes polling
unnecessary. NFS clients do not have that benefit. The NFSv3 protocol
lacks a mechanism to notify clients of changes.

The NFS polling mechanism is in use for example when running tail -f
on a remote file. Directory monitoring can be utilized for example
by a directory browser application or a distributed batch processing
system.

> > It is true that the code is not perfect, but having even basic and
> > best-effort functionality with kevent(2) and NFS is better than nothing
> > in my opinion. The kqueue subsystem should not dumb down for emulating
> > poll(2) and select(2).
> 
> In that case we've to accept or work-around with the fact that a thread
> is allocated, an operation that can fail, in the kqfilter().  Do you have
> an opinion on that?  What should we do for other FSs is uniformity wanted?

Can't the emulation of poll(2) and select(2) operate without the poller?
These system calls are supposed to return true for regular files for
reading and writing. No polling of remote state is needed for that.

The read behaviour of poll(2) and select(2) differs from kevent(2)'s
EVFILT_READ. Consequently, there has to be an indicator that makes
f_event work in poll(2)/select(2) mode. I guess the same indicator can
control nfs_kqfilter().



Outdated BUGS section in kqueue.2

2020-05-30 Thread Visa Hankala
The BUGS section in kqueue.2 is outdated. The FIFO limitation does not
seem to exist anymore, AIO is not implemented at all in the kernel,
and watching vnodes looks possible on all filesystem types that
allow writing.

OK to remove the section?

Index: kqueue.2
===
RCS file: src/lib/libc/sys/kqueue.2,v
retrieving revision 1.39
diff -u -p -r1.39 kqueue.2
--- kqueue.21 Jul 2019 16:52:02 -   1.39
+++ kqueue.230 May 2020 09:30:12 -
@@ -565,7 +565,3 @@ The
 .Fn kqueue
 system and this manual page were written by
 .An Jonathan Lemon Aq Mt jle...@freebsd.org .
-.Sh BUGS
-It is currently not possible to watch FIFOs or AIO that reside
-on anything but a UFS file system.
-Watching a vnode is possible on UFS, NFS and MS-DOS file systems.



Re: Kill NFS-only kqueue poller thread

2020-05-30 Thread Visa Hankala
On Thu, May 28, 2020 at 12:11:20PM +0200, Martin Pieuchot wrote:
> When it comes to kqueue filters NFS is special.  A custom thread is
> created when the first event is registered.  Its purpose is to poll
> for changes every 2.5sec.  This logic has been inherited from NetBSD
> and is not present in FreeBSD.
> 
> Since filt_nfsread() only check `n_size' of a given nfsnode having a
> different context to emulate stat(2) behavior in kernel is questionable.
> So the diff below gets rid of this logic.  The rationals are KISS,
> coherency with nfs_poll() and match what other FS kqueue filters do.
> 
> While here add a missing write filter.
> 
> Matching the nfs_poll() logic and the write filter are necessary to keep
> the current behavior of poll(2) & select(2) once they start querying
> kqfilter handlers.

I think it is not good to remove nfs_kqpoll(). The function does more
than just support the read filter. It generates various vnode events.
If the poller is removed, it becomes impossible for example to monitor
an NFS directory with kevent(2).

It is true that the code is not perfect, but having even basic and
best-effort functionality with kevent(2) and NFS is better than nothing
in my opinion. The kqueue subsystem should not dumb down for emulating
poll(2) and select(2).



Retire <machine/varargs.h>

2020-05-23 Thread Visa Hankala
The <machine/varargs.h> header is not used any longer. Consequently,
it should be safe to remove the following files:

sys/arch/alpha/include/varargs.h
sys/arch/hppa/include/varargs.h
sys/arch/i386/include/varargs.h
sys/arch/landisk/include/varargs.h
sys/arch/loongson/include/varargs.h
sys/arch/luna88k/include/varargs.h
sys/arch/m88k/include/varargs.h
sys/arch/macppc/include/varargs.h
sys/arch/mips64/include/varargs.h
sys/arch/octeon/include/varargs.h
sys/arch/powerpc/include/varargs.h
sys/arch/sgi/include/varargs.h
sys/arch/sh/include/varargs.h
sys/arch/sparc64/include/varargs.h

OK to remove them?


(As a leftover of gcc 2.95 times, the <varargs.h> header should be ripe
for removal as well. clang's copy of <varargs.h> gives an #error,
whereas attempts to use the header with gcc 4 fail because of missing
symbol __builtin_varargs_start.)



Update stale comment in <sys/syslog.h>

2020-05-20 Thread Visa Hankala
The <sys/syslog.h> header contains an outdated comment about the
headers in which va_list is typedef'd and should be updated.

The comment is not fully consistent with the syslog(3) manual page
because the manual does mention <stdarg.h>. However, the header's
point still seems valid.

OK?

Index: sys/syslog.h
===
RCS file: src/sys/sys/syslog.h,v
retrieving revision 1.16
diff -u -p -r1.16 syslog.h
--- sys/syslog.h8 Aug 2017 14:23:23 -   1.16
+++ sys/syslog.h20 May 2020 16:11:59 -
@@ -183,11 +183,10 @@ struct syslog_data {
 #ifndef _KERNEL
 
 /*
- * Don't use va_list in the vsyslog() prototype.   Va_list is typedef'd in two
- * places ( and ), so if we include one
- * of them here we may collide with the utility's includes.  It's unreasonable
- * for utilities to have to include one of them to include syslog.h, so we get
- * __va_list from  and use it.
+ * Don't use va_list in the vsyslog() prototype.   Va_list is typedef'd
+ * in .  Including it here may collide with the utility's includes.
+ * It's unreasonable for utilities to have to include it to include ,
+ * so we get __va_list from  and use it.
  */
 #include 
 #include 



Remove sparc64 mutex.S

2020-05-18 Thread Visa Hankala
sparc64 has used the MI mutex since 2018. However, the MD code
still exists. The diff below removes it.

OK?

Index: arch/sparc64/sparc64/mutex.S
===
RCS file: arch/sparc64/sparc64/mutex.S
diff -N arch/sparc64/sparc64/mutex.S
--- arch/sparc64/sparc64/mutex.S23 Apr 2019 13:35:12 -  1.10
+++ /dev/null   1 Jan 1970 00:00:00 -
@@ -1,126 +0,0 @@
-/* $OpenBSD: mutex.S,v 1.10 2019/04/23 13:35:12 visa Exp $ */
-
-/*
- * Copyright (c) 2007 Mark Kettenis
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-/*
- * Mutex implementation based on Example 9 from Appendix J of
- * SPARC-V9/R1.4.5, The SPARC Architecture Manual.
- */
-
-#include "assym.h"
-#include 
-#include 
-
-#ifdef MULTIPROCESSOR
-
-#define GET_CURCPU(ci) \
-   ldx [%g7 + CI_SELF], ci
-
-#else
-
-#define GET_CURCPU(ci) \
-   set CPUINFO_VA, ci
-
-#endif
-
-
-ENTRY(__mtx_init)
-   stx %g0, [%o0 + MTX_OWNER]
-   stw %o1, [%o0 + MTX_WANTIPL]
-   retl
-stw%g0, [%o0 + MTX_OLDIPL]
-
-ENTRY(mtx_enter)
-   rdpr%pil, %g4
-   GET_CURCPU(%g1)
-1:
-   ld  [%o0 + MTX_WANTIPL], %g5
-   cmp %g4, %g5
-   bge 2f
-nop
-   wrpr%g5, %pil
-2:
-   mov %g1, %g5
-/*
- * The assembler doesn't like the next line, even if MTX_OWNER is 0.
- */
-!  casx[%o0 + MTX_OWNER], %g0, %g5
-   casx[%o0], %g0, %g5
-   tst %g5
-   be  4f
-nop
-   wrpr%g4, %pil
-3:
-   ldx [%o0 + MTX_OWNER], %g5
-   tst %g5
-   bne 3b
-nop
-   ba,a1b
-4:
-   stw %g4, [%o0 + MTX_OLDIPL]
-#ifdef DIAGNOSTIC
-   ld  [%g1 + CI_MUTEX_LEVEL], %g5
-   add %g5, 1, %g5
-   st  %g5, [%g1 + CI_MUTEX_LEVEL]
-#endif
-   retl
-membar #LoadLoad | #LoadStore
-
-ENTRY(mtx_enter_try)
-   rdpr%pil, %g4
-   GET_CURCPU(%g1)
-1:
-   ld  [%o0 + MTX_WANTIPL], %g5
-   bge 2f
-nop
-   wrpr%g5, %pil
-2:
-   mov %g1, %g5
-/*
- * The assembler doesn't like the next line, even if MTX_OWNER is 0.
- */
-!  casx[%o0 + MTX_OWNER], %g0, %g5
-   casx[%o0], %g0, %g5
-   tst %g5
-   be  3f
-nop
-   wrpr%g4, %pil
-   retl
-mov0, %o0
-3:
-   stw %g4, [%o0 + MTX_OLDIPL]
-#ifdef DIAGNOSTIC
-   ld  [%g1 + CI_MUTEX_LEVEL], %g5
-   add %g5, 1, %g5
-   st  %g5, [%g1 + CI_MUTEX_LEVEL]
-#endif
-   membar  #LoadLoad | #LoadStore
-   retl
-mov1, %o0
-
-ENTRY(mtx_leave)
-#ifdef DIAGNOSTIC
-   GET_CURCPU(%g1)
-   ld  [%g1 + CI_MUTEX_LEVEL], %g5
-   sub %g5, 1, %g5
-   st  %g5, [%g1 + CI_MUTEX_LEVEL]
-#endif
-   ld  [%o0 + MTX_OLDIPL], %g1
-   membar  #StoreStore | #LoadStore
-   stx %g0, [%o0 + MTX_OWNER]
-   retl
-wrpr   %g1, %pil



Convert octrtc(4) to use <dev/clock_subr.h>

2020-05-18 Thread Visa Hankala
This diff converts octrtc(4) to use the <dev/clock_subr.h> interface,
to reduce use of the machine-dependent tod interface.

Some highlights for reviewing:

* dt.dt_year contains the actual year, while tt->year has base 1900.
* dt.dt_wday uses range 0-6, whereas tt->dow's range is 1-7.
* octrtc_gettime() no longer extracts day of week because
  clock_ymdhms_to_secs() does not use the value.
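
As a worked example of the year handling (illustrative register values),
with the century bit set in tod[5] and tod[6] = 0x20:

	FROMBCD(0x20) = 2*10 + 0 = 20
	old interface:  tt->year   = 100 + 20 = 120    (base 1900, i.e. 2020)
	new interface:  dt.dt_year = 2000 + 20 = 2020  (actual year)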

Testing is welcome. I don't have the hardware to test this myself.

OK?

Index: arch/octeon/dev/octrtc.c
===
RCS file: src/sys/arch/octeon/dev/octrtc.c,v
retrieving revision 1.11
diff -u -p -r1.11 octrtc.c
--- arch/octeon/dev/octrtc.c20 Nov 2017 15:20:03 -  1.11
+++ arch/octeon/dev/octrtc.c16 May 2020 16:04:12 -
@@ -21,7 +21,7 @@
 #include 
 #include 
 
-#include 
+#include <dev/clock_subr.h>
 
 #include 
 #include 
@@ -37,6 +37,11 @@
 #define MIO_TWS_SW_TWSI_EXT0x000118001018ULL
 #define OCTRTC_REG 0x68
 
+struct octrtc_softc {
+   struct device   sc_dev;
+   struct todr_chip_handle sc_todr;
+};
+
 struct cfdriver octrtc_cd = {
NULL, "octrtc", DV_DULL
 };
@@ -44,15 +49,15 @@ struct cfdriver octrtc_cd = {
 intoctrtc_match(struct device *, void *, void *);
 void   octrtc_attach(struct device *, struct device *, void *);
 
-void   octrtc_gettime(void *, time_t, struct tod_time *);
+intoctrtc_gettime(struct todr_chip_handle *, struct timeval *);
 intoctrtc_read(uint8_t *, char);
 
-void   octrtc_settime(void *, struct tod_time *);
+intoctrtc_settime(struct todr_chip_handle *, struct timeval *);
 intoctrtc_write(uint8_t);
 
 
 struct cfattach octrtc_ca = {
-   sizeof(struct device), octrtc_match, octrtc_attach,
+   sizeof(struct octrtc_softc), octrtc_match, octrtc_attach,
 };
 
 
@@ -105,16 +110,18 @@ octrtc_attach(struct device *parent, str
 {
struct octrtc_softc *sc = (struct octrtc_softc *)self;
 
-   sys_tod.tod_cookie = sc;
-   sys_tod.tod_get = octrtc_gettime;
-   sys_tod.tod_set = octrtc_settime;
+   sc->sc_todr.cookie = sc;
+   sc->sc_todr.todr_gettime = octrtc_gettime;
+   sc->sc_todr.todr_settime = octrtc_settime;
+   todr_attach(&sc->sc_todr);
 
printf(": DS1337\n");
 }
 
-void
-octrtc_gettime(void *cookie, time_t unused, struct tod_time *tt)
+int
+octrtc_gettime(struct todr_chip_handle *handle, struct timeval *tv)
 {
+   struct clock_ymdhms dt;
uint8_t tod[8];
uint8_t check;
int i, rc;
@@ -126,14 +133,14 @@ octrtc_gettime(void *cookie, time_t unus
	rc = octrtc_read(&tod[0], 1);   /* ia read */
if (rc) {
DPRINTF(("octrtc_read(0) failed %d", rc));
-   return;
+   return EIO;
}
 
for (i = 1; i < 8; i++) {
	rc = octrtc_read(&tod[i], 0);   /* current address */
if (rc) {
DPRINTF(("octrtc_read(%d) failed %d", i, rc));
-   return;
+   return EIO;
}
DPRINTF(("%#X ", tod[i]));
}
@@ -141,8 +148,8 @@ octrtc_gettime(void *cookie, time_t unus
/* Check against time-wrap */
	rc = octrtc_read(&check, 1);/* ia read */
if (rc) {
-   DPRINTF(("octrtc_read(check) failed %d", i, rc));
-   return;
+   DPRINTF(("octrtc_read(check) failed %d", rc));
+   return EIO;
}
if ((check & 0xf) == (tod[0] & 0xf))
break;
@@ -158,15 +165,24 @@ octrtc_gettime(void *cookie, time_t unus
FROMBCD(tod[1] & 0x7f), /* minute */
FROMBCD(tod[0] & 0x7f)));   /* second */
 
-   tt->year = ((tod[5] & 0x80) ? 100 : 0) + FROMBCD(tod[6]);
-   tt->mon = FROMBCD(tod[5] & 0x1f);
-   tt->day = FROMBCD(tod[4] & 0x3f);
-   tt->dow = (tod[3] & 0x7);
-   tt->hour = FROMBCD(tod[2] & 0x3f);
+   dt.dt_year = ((tod[5] & 0x80) ? 2000 : 1900) + FROMBCD(tod[6]);
+   dt.dt_mon = FROMBCD(tod[5] & 0x1f);
+   dt.dt_day = FROMBCD(tod[4] & 0x3f);
+   dt.dt_hour = FROMBCD(tod[2] & 0x3f);
if ((tod[2] & 0x40) && (tod[2] & 0x20)) /* adjust AM/PM format */
-   tt->hour = (tt->hour + 12) % 24;
-   tt->min = FROMBCD(tod[1] & 0x7f);
-   tt->sec = FROMBCD(tod[0] & 0x7f);
+   dt.dt_hour = (dt.dt_hour + 12) % 24;
+   dt.dt_min = FROMBCD(tod[1] & 0x7f);
+   dt.dt_sec = FROMBCD(tod[0] & 0x7f);
+
+   if (dt.dt_sec > 59 || dt.dt_min > 59 || dt.dt_hour > 23 ||
+   dt.dt_day > 31 || dt.dt_day == 0 ||
+   dt.dt_mon > 12 || dt.dt_mon == 0 ||
+   dt.dt_year < POSIX_BASE_YEAR)
+   return EINVAL;
+
+   tv->tv_sec = clock_ymdhms_to_secs(&dt);
+   tv->tv_usec = 0;
+   return 0;
 }
 
 int
@@ -217,46 

Struct for kqueue scan state

2020-04-20 Thread Visa Hankala
This diff introduces a struct for kqueue scan state. It eases making
scans piecewise by keeping track of the scan's end point. The end point
prevents the scan from recollecting events that are already being
reported to userspace.

Below is an overview of the goal. It is achieved by combining this diff
with mpi@'s kqueue_scan() refactoring.

kqueue_scan_setup(&scan, kq);
while (room for events) {
        ...
        ready = kqueue_scan(&scan, ...);
        if (ready == 0)
                break;
        ...
}
kqueue_scan_finish(&scan);

The diff takes a conservative approach with the scan start marker and
reinserts it for each round in kqueue_scan().

As the diff does not provide any immediate improvement, it will wait
until after the release.

OK?

Index: kern/kern_event.c
===
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.131
diff -u -p -r1.131 kern_event.c
--- kern/kern_event.c   7 Apr 2020 13:27:51 -   1.131
+++ kern/kern_event.c   20 Apr 2020 14:17:59 -
@@ -62,7 +62,7 @@ void  KQREF(struct kqueue *);
 void   KQRELE(struct kqueue *);
 
 intkqueue_sleep(struct kqueue *, struct timespec *);
-intkqueue_scan(struct kqueue *kq, int maxevents,
+intkqueue_scan(struct kqueue_scan_state *scan, int maxevents,
struct kevent *ulistp, struct timespec *timeout,
struct proc *p, int *retval);
 
@@ -529,6 +529,7 @@ out:
 int
 sys_kevent(struct proc *p, void *v, register_t *retval)
 {
+   struct kqueue_scan_state scan;
struct filedesc* fdp = p->p_fd;
struct sys_kevent_args /* {
syscallarg(int) fd;
@@ -612,8 +613,10 @@ sys_kevent(struct proc *p, void *v, regi
 
KQREF(kq);
FRELE(fp, p);
-   error = kqueue_scan(kq, SCARG(uap, nevents), SCARG(uap, eventlist),
+   kqueue_scan_setup(&scan, kq);
+   error = kqueue_scan(&scan, SCARG(uap, nevents), SCARG(uap, eventlist),
	tsp, p, &n);
+   kqueue_scan_finish(&scan);
KQRELE(kq);
*retval = n;
return (error);
@@ -870,11 +873,12 @@ kqueue_sleep(struct kqueue *kq, struct t
 }
 
 int
-kqueue_scan(struct kqueue *kq, int maxevents, struct kevent *ulistp,
-struct timespec *tsp, struct proc *p, int *retval)
+kqueue_scan(struct kqueue_scan_state *scan, int maxevents,
+struct kevent *ulistp, struct timespec *tsp, struct proc *p, int *retval)
 {
struct kevent *kevp;
-   struct knote mend, mstart, *kn;
+   struct knote *kn;
+   struct kqueue *kq = scan->kqs_kq;
int s, count, nkev = 0, error = 0;
struct kevent kev[KQ_NEVENTS];
 
@@ -882,9 +886,6 @@ kqueue_scan(struct kqueue *kq, int maxev
if (count == 0)
goto done;
 
-   memset(&mstart, 0, sizeof(mstart));
-   memset(&mend, 0, sizeof(mend));
-
 retry:
if (kq->kq_state & KQ_DYING) {
error = EBADF;
@@ -894,7 +895,8 @@ retry:
kevp = [0];
s = splhigh();
if (kq->kq_count == 0) {
-   if (tsp != NULL && !timespecisset(tsp)) {
+   if ((tsp != NULL && !timespecisset(tsp)) ||
+   scan->kqs_nevent != 0) {
splx(s);
error = 0;
goto done;
@@ -910,27 +912,40 @@ retry:
goto done;
}
 
-   mstart.kn_filter = EVFILT_MARKER;
-   mstart.kn_status = KN_PROCESSING;
-   TAILQ_INSERT_HEAD(&kq->kq_head, &mstart, kn_tqe);
-   mend.kn_filter = EVFILT_MARKER;
-   mend.kn_status = KN_PROCESSING;
-   TAILQ_INSERT_TAIL(&kq->kq_head, &mend, kn_tqe);
+   /*
+* Put the end marker in the queue to limit the scan to the events
+* that are currently active.  This prevents events from being
+* recollected if they reactivate during scan.
+*
+* If a partial scan has been performed already but no events have
+* been collected, reposition the end marker to make any new events
+* reachable.
+*/
+   if (!scan->kqs_queued) {
+   TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe);
+   scan->kqs_queued = 1;
+   } else if (scan->kqs_nevent == 0) {
+   TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe);
+   TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe);
+   }
+
+   TAILQ_INSERT_HEAD(&kq->kq_head, &scan->kqs_start, kn_tqe);
while (count) {
-   kn = TAILQ_NEXT(&mstart, kn_tqe);
+   kn = TAILQ_NEXT(&scan->kqs_start, kn_tqe);
if (kn->kn_filter == EVFILT_MARKER) {
-   if (kn == &mend) {
-   TAILQ_REMOVE(&kq->kq_head, &mend, kn_tqe);
-   TAILQ_REMOVE(&kq->kq_head, &mstart, kn_tqe);
+   if (kn == &scan->kqs_end) {
+   TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start,
+   kn_tqe);
splx(s);
-   

stacktrace_save() sync

2020-04-15 Thread Visa Hankala
This diff:

* upgrades stacktrace_save() to stacktrace_save_at() if the latter
  is missing on the architecture,
* defines stacktrace_save() as an inline function in <sys/stacktrace.h>
  to replace MD definitions.
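
The MI wrapper is essentially the trivial forwarding function that the MD
copies duplicate today; a sketch of the inline version, assuming it lands
in <sys/stacktrace.h>:

static inline void
stacktrace_save(struct stacktrace *st)
{
	stacktrace_save_at(st, 0);
}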

OK?

Index: arch/amd64/amd64/db_trace.c
===
RCS file: src/sys/arch/amd64/amd64/db_trace.c,v
retrieving revision 1.51
diff -u -p -r1.51 db_trace.c
--- arch/amd64/amd64/db_trace.c 29 Mar 2020 15:14:28 -  1.51
+++ arch/amd64/amd64/db_trace.c 15 Apr 2020 15:55:08 -
@@ -288,12 +288,6 @@ stacktrace_save_at(struct stacktrace *st
}
 }
 
-void
-stacktrace_save(struct stacktrace *st)
-{
-   return stacktrace_save_at(st, 0);
-}
-
 vaddr_t
 db_get_pc(struct trapframe *tf)
 {
Index: arch/arm64/arm64/db_trace.c
===
RCS file: src/sys/arch/arm64/arm64/db_trace.c,v
retrieving revision 1.10
diff -u -p -r1.10 db_trace.c
--- arch/arm64/arm64/db_trace.c 29 Mar 2020 15:14:28 -  1.10
+++ arch/arm64/arm64/db_trace.c 15 Apr 2020 15:55:08 -
@@ -150,7 +150,7 @@ db_stack_trace_print(db_expr_t addr, int
 }
 
 void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, unsigned int skip)
 {
struct callframe *frame, *lastframe, *limit;
struct proc *p = curproc;
@@ -166,7 +166,10 @@ stacktrace_save(struct stacktrace *st)
sizeof(struct trapframe) - 0x10);
 
while (st->st_count < STACKTRACE_MAX) {
-   st->st_pc[st->st_count++] = frame->f_lr;
+   if (skip == 0)
+   st->st_pc[st->st_count++] = frame->f_lr;
+   else
+   skip--;
 
lastframe = frame;
frame = frame->f_frame;
Index: arch/hppa/hppa/db_interface.c
===
RCS file: src/sys/arch/hppa/hppa/db_interface.c,v
retrieving revision 1.47
diff -u -p -r1.47 db_interface.c
--- arch/hppa/hppa/db_interface.c   20 Jan 2020 15:58:23 -  1.47
+++ arch/hppa/hppa/db_interface.c   15 Apr 2020 15:55:09 -
@@ -316,7 +316,7 @@ db_stack_trace_print(db_expr_t addr, int
 }
 
 void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, unsigned int skip)
 {
register_t *fp, pc, rp;
int i;
@@ -327,7 +327,10 @@ stacktrace_save(struct stacktrace *st)
 
st->st_count = 0;
for (i = 0; i < STACKTRACE_MAX; i++) {
-   st->st_pc[st->st_count++] = rp;
+   if (skip == 0)
+   st->st_pc[st->st_count++] = rp;
+   else
+   skip--;
 
/* next frame */
pc = rp;
Index: arch/i386/i386/db_trace.c
===
RCS file: src/sys/arch/i386/i386/db_trace.c,v
retrieving revision 1.40
diff -u -p -r1.40 db_trace.c
--- arch/i386/i386/db_trace.c   29 Mar 2020 15:14:28 -  1.40
+++ arch/i386/i386/db_trace.c   15 Apr 2020 15:55:09 -
@@ -293,12 +293,6 @@ stacktrace_save_at(struct stacktrace *st
}
 }
 
-void
-stacktrace_save(struct stacktrace *st)
-{
-   return stacktrace_save_at(st, 0);
-}
-
 vaddr_t
 db_get_pc(struct trapframe *tf)
 {
Index: arch/mips64/mips64/trap.c
===
RCS file: src/sys/arch/mips64/mips64/trap.c,v
retrieving revision 1.143
diff -u -p -r1.143 trap.c
--- arch/mips64/mips64/trap.c   20 Jan 2020 15:58:23 -  1.143
+++ arch/mips64/mips64/trap.c   15 Apr 2020 15:55:09 -
@@ -1477,7 +1477,7 @@ end:
 
 #ifdef DDB
 void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, unsigned int skip)
 {
extern char k_general[];
extern char u_general[];
@@ -1503,8 +1503,12 @@ stacktrace_save(struct stacktrace *st)
if (!VALID_ADDRESS(pc) || !VALID_ADDRESS(sp))
break;
 
-   if (!first)
-   st->st_pc[st->st_count++] = pc;
+   if (!first) {
+   if (skip == 0)
+   st->st_pc[st->st_count++] = pc;
+   else
+   skip--;
+   }
first = 0;
 
/* Determine the start address of the current subroutine. */
Index: arch/powerpc/ddb/db_trace.c
===
RCS file: src/sys/arch/powerpc/ddb/db_trace.c,v
retrieving revision 1.15
diff -u -p -r1.15 db_trace.c
--- arch/powerpc/ddb/db_trace.c 10 Apr 2020 07:23:21 -  1.15
+++ arch/powerpc/ddb/db_trace.c 15 Apr 2020 15:55:09 -
@@ -253,9 +253,3 @@ stacktrace_save_at(struct stacktrace *st
break;
}
 }
-
-void
-stacktrace_save(struct stacktrace *st)
-{
-   return stacktrace_save_at(st, 0);
-}
Index: arch/sparc64/sparc64/db_trace.c

Re: Simplify NET_LOCK() variations

2020-04-12 Thread Visa Hankala
On Sun, Apr 12, 2020 at 03:26:14PM +0200, Martin Pieuchot wrote:
> The existing variations of the NET_LOCK() macros are confusing.  We're
> now all aware of this fact.  So let's keep them simple to prevent future
> mistakes :)
> 
> The diff below reduces the current set of methods to the following:
> 
>   NET_LOCK()/NET_UNLOCK()
>   NET_ASSERT_LOCKED()
>   NET_ASSERT_UNLOCKED()
> 
> On top of those, two extra methods are provided for a *very specific*
> purpose:
> 
>   NET_RLOCK_IN_IOCTL()
>   NET_RUNLOCK_IN_IOCTL()
> 
> "IN_IOCTL()" mean they should only be used in ioctl(2) context ;)  The
> purpose is to keep previous behavior where read-only ioctl(2) dot not
> wait for packet processing to finish. 
> 
> To achieve this NET_LOCK() and NET_UNLOCK() now contain some magic to
> ensure the only packet processing thread grabbing a read-lock is the
> softnet thread.  In other words, one doesn't need to be aware of all
> this magic, just imagine that the Network Stack is big-locked and use
> NET_LOCK()/NET_UNLOCK() everywhere.

I think it is a potential source of error if NET_LOCK() is sensitive
to context. To understand what this code does, you have to know which
thread calls it.

I would prefer to have a separate NET_RLOCK variant for explicit use
by softnet threads, say NET_RLOCK_IN_SOFTNET() or NET_RLOCK_SOFTNET(),
which can assert that the calling context is correct and read-lock
netlock.



Fix error path of VOP_IOCTL() in sr_hotspare()

2020-04-05 Thread Visa Hankala
In sr_hotspare(), the error path of VOP_IOCTL() appears to do a
redundant VOP_CLOSE() and vput(). The diff below fixes that. The fail
branch will close the vnode because `open' is true.

OK?

Index: dev/softraid.c
===
RCS file: src/sys/dev/softraid.c,v
retrieving revision 1.399
diff -u -p -r1.399 softraid.c
--- dev/softraid.c  10 Mar 2020 08:41:19 -  1.399
+++ dev/softraid.c  5 Apr 2020 12:08:08 -
@@ -2846,8 +2846,6 @@ sr_hotspare(struct sr_softc *sc, dev_t d
NOCRED, curproc)) {
DNPRINTF(SR_D_META, "%s: sr_hotspare ioctl failed\n",
DEVNAME(sc));
-   VOP_CLOSE(vn, FREAD | FWRITE, NOCRED, curproc);
-   vput(vn);
goto fail;
}
if (label.d_partitions[part].p_fstype != FS_RAID) {



Change knote list head type

2020-04-04 Thread Visa Hankala
Below is a mostly mechanical patch that wraps the SLIST head of knotes
inside another struct. The patch also introduces functions for adding
and removing knotes from these lists.

My intent is to extend the list struct so that the system can assert
when the knote list should be locked. The idea is that when you invoke
KNOTE(), you should hold the lock that serializes access to the knote
list. Combined with NOTE_SUBMIT as a signal to f_event to skip locking,
this should keep the locking patterns tractable.

OK?
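
For reference, a minimal sketch of what the wrapper and its helpers could
look like, based on the description above (the field and function shapes
are illustrative, not necessarily the exact committed versions):

struct klist {
	SLIST_HEAD(, knote)	kl_list;
};

void
klist_insert(struct klist *klist, struct knote *kn)
{
	SLIST_INSERT_HEAD(&klist->kl_list, kn, kn_selnext);
}

void
klist_remove(struct klist *klist, struct knote *kn)
{
	SLIST_REMOVE(&klist->kl_list, kn, knote, kn_selnext);
}

Extending struct klist later, for example with a reference to the lock that
serializes the list, then only touches this one place.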

Index: arch/arm64/dev/apm.c
===
RCS file: src/sys/arch/arm64/dev/apm.c,v
retrieving revision 1.3
diff -u -p -r1.3 apm.c
--- arch/arm64/dev/apm.c20 Feb 2020 16:56:51 -  1.3
+++ arch/arm64/dev/apm.c4 Apr 2020 13:25:56 -
@@ -271,7 +271,7 @@ filt_apmrdetach(struct knote *kn)
 {
struct apm_softc *sc = (struct apm_softc *)kn->kn_hook;
 
-   SLIST_REMOVE(&sc->sc_note, kn, knote, kn_selnext);
+   klist_remove(&sc->sc_note, kn);
 }
 
 int
@@ -303,7 +303,7 @@ apmkqfilter(dev_t dev, struct knote *kn)
}
 
kn->kn_hook = (caddr_t)sc;
-   SLIST_INSERT_HEAD(&sc->sc_note, kn, kn_selnext);
+   klist_insert(&sc->sc_note, kn);
 
return (0);
 }
Index: arch/i386/i386/apm.c
===
RCS file: src/sys/arch/i386/i386/apm.c,v
retrieving revision 1.122
diff -u -p -r1.122 apm.c
--- arch/i386/i386/apm.c20 Feb 2020 16:56:51 -  1.122
+++ arch/i386/i386/apm.c4 Apr 2020 13:25:57 -
@@ -1118,7 +1118,7 @@ filt_apmrdetach(struct knote *kn)
struct apm_softc *sc = (struct apm_softc *)kn->kn_hook;
 
rw_enter_write(>sc_lock);
-   SLIST_REMOVE(&sc->sc_note, kn, knote, kn_selnext);
+   klist_remove(&sc->sc_note, kn);
rw_exit_write(>sc_lock);
 }
 
@@ -1152,7 +1152,7 @@ apmkqfilter(dev_t dev, struct knote *kn)
kn->kn_hook = (caddr_t)sc;
 
rw_enter_write(>sc_lock);
-   SLIST_INSERT_HEAD(&sc->sc_note, kn, kn_selnext);
+   klist_insert(&sc->sc_note, kn);
rw_exit_write(>sc_lock);
return (0);
 }
Index: arch/loongson/dev/apm.c
===
RCS file: src/sys/arch/loongson/dev/apm.c,v
retrieving revision 1.35
diff -u -p -r1.35 apm.c
--- arch/loongson/dev/apm.c 20 Feb 2020 16:56:51 -  1.35
+++ arch/loongson/dev/apm.c 4 Apr 2020 13:25:58 -
@@ -292,7 +292,7 @@ filt_apmrdetach(struct knote *kn)
 {
struct apm_softc *sc = (struct apm_softc *)kn->kn_hook;
 
-   SLIST_REMOVE(&sc->sc_note, kn, knote, kn_selnext);
+   klist_remove(&sc->sc_note, kn);
 }
 
 int
@@ -324,7 +324,7 @@ apmkqfilter(dev_t dev, struct knote *kn)
}
 
kn->kn_hook = (caddr_t)sc;
-   SLIST_INSERT_HEAD(&sc->sc_note, kn, kn_selnext);
+   klist_insert(&sc->sc_note, kn);
 
return (0);
 }
Index: arch/macppc/dev/apm.c
===
RCS file: src/sys/arch/macppc/dev/apm.c,v
retrieving revision 1.21
diff -u -p -r1.21 apm.c
--- arch/macppc/dev/apm.c   20 Feb 2020 16:56:51 -  1.21
+++ arch/macppc/dev/apm.c   4 Apr 2020 13:25:58 -
@@ -305,7 +305,7 @@ filt_apmrdetach(struct knote *kn)
 {
struct apm_softc *sc = (struct apm_softc *)kn->kn_hook;
 
-   SLIST_REMOVE(&sc->sc_note, kn, knote, kn_selnext);
+   klist_remove(&sc->sc_note, kn);
 }
 
 int
@@ -337,7 +337,7 @@ apmkqfilter(dev_t dev, struct knote *kn)
}
 
kn->kn_hook = (caddr_t)sc;
-   SLIST_INSERT_HEAD(&sc->sc_note, kn, kn_selnext);
+   klist_insert(&sc->sc_note, kn);
 
return (0);
 }
Index: dev/hotplug.c
===
RCS file: src/sys/dev/hotplug.c,v
retrieving revision 1.19
diff -u -p -r1.19 hotplug.c
--- dev/hotplug.c   20 Feb 2020 16:56:52 -  1.19
+++ dev/hotplug.c   4 Apr 2020 13:25:58 -
@@ -211,7 +211,7 @@ hotplugkqfilter(dev_t dev, struct knote 
}
 
s = splbio();
-   SLIST_INSERT_HEAD(klist, kn, kn_selnext);
+   klist_insert(klist, kn);
splx(s);
return (0);
 }
@@ -222,7 +222,7 @@ filt_hotplugrdetach(struct knote *kn)
int s;
 
s = splbio();
-   SLIST_REMOVE(&hotplug_sel.si_note, kn, knote, kn_selnext);
+   klist_remove(&hotplug_sel.si_note, kn);
splx(s);
 }
 
Index: dev/midi.c
===
RCS file: src/sys/dev/midi.c,v
retrieving revision 1.46
diff -u -p -r1.46 midi.c
--- dev/midi.c  20 Feb 2020 16:56:52 -  1.46
+++ dev/midi.c  4 Apr 2020 13:25:58 -
@@ -363,7 +363,7 @@ midikqfilter(dev_t dev, struct knote *kn
kn->kn_hook = (void *)sc;
 
mtx_enter(&audio_lock);
-   SLIST_INSERT_HEAD(klist, kn, kn_selnext);
+   klist_insert(klist, kn);
mtx_leave(&audio_lock);
 done:
device_unref(>dev);
@@ -376,7 +376,7 @@ filt_midirdetach(struct knote *kn)

Constify pledgenames[]

2020-04-03 Thread Visa Hankala
This diff makes pledgenames[] const.

OK?

Index: sys/kern/kern_pledge.c
===
RCS file: src/sys/kern/kern_pledge.c,v
retrieving revision 1.261
diff -u -p -r1.261 kern_pledge.c
--- sys/kern/kern_pledge.c  15 Feb 2020 09:35:48 -  1.261
+++ sys/kern/kern_pledge.c  3 Apr 2020 03:32:10 -
@@ -527,7 +527,7 @@ pledge_syscall(struct proc *p, int code,
 int
 pledge_fail(struct proc *p, int error, uint64_t code)
 {
-   char *codes = "";
+   const char *codes = "";
int i;
struct sigaction sa;
 
Index: sys/sys/pledge.h
===
RCS file: src/sys/sys/pledge.h,v
retrieving revision 1.40
diff -u -p -r1.40 pledge.h
--- sys/sys/pledge.h19 Jun 2019 16:55:51 -  1.40
+++ sys/sys/pledge.h3 Apr 2020 03:32:10 -
@@ -72,9 +72,9 @@
 #define PLEDGE_YPACTIVE0x8000ULL   /* YP use detected and 
allowed */
 
 #ifdef PLEDGENAMES
-static struct {
+static const struct {
uint64_tbits;
-   char*name;
+   const char  *name;
 } pledgenames[] = {
{ PLEDGE_RPATH, "rpath" },
{ PLEDGE_WPATH, "wpath" },
Index: usr.bin/kdump/kdump.c
===
RCS file: src/usr.bin/kdump/kdump.c,v
retrieving revision 1.141
diff -u -p -r1.141 kdump.c
--- usr.bin/kdump/kdump.c   18 Jan 2020 23:56:31 -  1.141
+++ usr.bin/kdump/kdump.c   3 Apr 2020 03:32:10 -
@@ -1375,7 +1375,7 @@ ktrexec(const char *ptr, size_t len)
 static void
 ktrpledge(struct ktr_pledge *pledge, size_t len)
 {
-   char *name = "";
+   const char *name = "";
int i;
 
if (len < sizeof(struct ktr_pledge))



Avoid selwakeup() in kqueue_wakeup()

2020-04-02 Thread Visa Hankala
selwakeup(sip) calls KNOTE(&sip->si_note, 0), which implies that
kqueue_wakeup() should not call selwakeup() directly. Otherwise,
a contrived program can trigger deep recursion.

The diff below moves selwakeup() from kqueue_wakeup() to kqueue_task().
In addition to preventing the recursion, this change is necessary with
the current select/poll implementation to make kqueue_wakeup() free of
the kernel lock.

OK?
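
For illustration, the recursion can be provoked from userland with a chain
of kqueues where each kqueue watches the previous one and every kqueue is
simultaneously being polled, so that each wakeup takes the selwakeup() path.
A rough sketch of such a contrived program (hypothetical test code, error
handling omitted):

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <poll.h>
#include <unistd.h>

#define DEPTH	500

int
main(void)
{
	struct kevent kev;
	struct pollfd pfd[DEPTH];
	int fds[2], kq[DEPTH], i;

	pipe(fds);

	/* kq[0] watches the pipe; kq[i] watches kq[i - 1]. */
	for (i = 0; i < DEPTH; i++) {
		kq[i] = kqueue();
		EV_SET(&kev, i == 0 ? fds[0] : kq[i - 1], EVFILT_READ,
		    EV_ADD, 0, 0, NULL);
		kevent(kq[i], &kev, 1, NULL, 0, NULL);
		pfd[i].fd = kq[i];
		pfd[i].events = POLLIN;
	}

	if (fork() == 0) {
		/* Child: fire the innermost event once the parent sleeps. */
		sleep(1);
		write(fds[1], "x", 1);
		_exit(0);
	}

	/* Parent: poll every kqueue so each one is in the KQ_SEL state. */
	poll(pfd, DEPTH, INFTIM);
	return 0;
}

With the unpatched code, the write in the child walks the whole chain of
kqueue_wakeup() -> selwakeup() -> KNOTE() calls in a single kernel stack.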

Index: kern/kern_event.c
===
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.129
diff -u -p -r1.129 kern_event.c
--- kern/kern_event.c   2 Apr 2020 07:00:25 -   1.129
+++ kern/kern_event.c   3 Apr 2020 03:44:00 -
@@ -1102,7 +1102,12 @@ kqueue_task(void *arg)
 {
struct kqueue *kq = arg;
 
-   KNOTE(&kq->kq_sel.si_note, 0);
+   if (kq->kq_state & KQ_SEL) {
+   kq->kq_state &= ~KQ_SEL;
+   selwakeup(&kq->kq_sel);
+   } else {
+   KNOTE(&kq->kq_sel.si_note, 0);
+   }
KQRELE(kq);
 }
 
@@ -1114,10 +1119,7 @@ kqueue_wakeup(struct kqueue *kq)
kq->kq_state &= ~KQ_SLEEP;
wakeup(kq);
}
-   if (kq->kq_state & KQ_SEL) {
-   kq->kq_state &= ~KQ_SEL;
-   selwakeup(&kq->kq_sel);
-   } else if (!SLIST_EMPTY(&kq->kq_sel.si_note)) {
+   if ((kq->kq_state & KQ_SEL) || !SLIST_EMPTY(&kq->kq_sel.si_note)) {
/* Defer activation to avoid recursion. */
KQREF(kq);
if (!task_add(systq, &kq->kq_task))



Check ktrstart() error in doktrace()

2020-03-21 Thread Visa Hankala
This diff makes doktrace() check the outcome of ktrstart() and skip
tracing if the trace file header cannot be written.

OK?

Index: kern/kern_ktrace.c
===
RCS file: src/sys/kern/kern_ktrace.c,v
retrieving revision 1.101
diff -u -p -r1.101 kern_ktrace.c
--- kern/kern_ktrace.c  21 Mar 2020 08:58:50 -  1.101
+++ kern/kern_ktrace.c  22 Mar 2020 05:17:23 -
@@ -54,7 +54,7 @@
 
 void   ktrinitheaderraw(struct ktr_header *, uint, pid_t, pid_t);
 void   ktrinitheader(struct ktr_header *, struct proc *, int);
-void   ktrstart(struct proc *, struct vnode *, struct ucred *);
+intktrstart(struct proc *, struct vnode *, struct ucred *);
 intktrops(struct proc *, struct process *, int, int, struct vnode *,
struct ucred *);
 intktrsetchildren(struct proc *, struct process *, int, int,
@@ -141,13 +141,13 @@ ktrinitheader(struct ktr_header *kth, st
memcpy(kth->ktr_comm, pr->ps_comm, MAXCOMLEN);
 }
 
-void
+int
 ktrstart(struct proc *p, struct vnode *vp, struct ucred *cred)
 {
struct ktr_header kth;
 
ktrinitheaderraw(&kth, htobe32(KTR_START), -1, -1);
-   ktrwriteraw(p, vp, cred, &kth, NULL);
+   return (ktrwriteraw(p, vp, cred, &kth, NULL));
 }
 
 void
@@ -449,7 +449,9 @@ doktrace(struct vnode *vp, int ops, int 
if (ops == KTROP_SET) {
if (suser(p) == 0)
facs |= KTRFAC_ROOT;
-   ktrstart(p, vp, cred);
+   error = ktrstart(p, vp, cred);
+   if (error != 0)
+   goto done;
}
/*
 * do it



Fix tsleep(9) during execve(2)

2020-03-21 Thread Visa Hankala
The recent change of initializing sls_sig = 0 in sleep_setup()
(r1.164 of kern_synch.c) has introduced a regression with execve(2).
execve(2) sets the current process in single thread mode for replacing
the program image. In this mode, sleep_setup_signal() cancels a sleep,
making tsleep(9) with PCATCH return immediately. Previously, when
sls_sig was initialized with value 1, tsleep() with PCATCH returned
EINTR during exec. Now it returns zero.

The previous behaviour was not exactly right but did not seem to cause
apparent harm. The current situation is different. For example, the
call chain shown below can hang. The sleep does not actually happen so
vwaitforio() can spin because the pending I/O does not get finished.

tsleep
vwaitforio
nfs_flush
nfs_close
VOP_CLOSE
vn_closefile
fdrop
closef
fdcloseexec
sys_execve

The hang was reported and offending commit located by Pavel Korovin.

I think the proper way to fix the problem is to tighten the conditions
when sleep_setup_signal() cancels the sleep. Instead of checking
p->p_p->ps_single != NULL, which affects the whole process, the thread
should determine if P_SUSPSINGLE is set in p->p_flag. This limits the
cancelling to the threads that have been suspended for single thread
mode. The thread that runs execve(2) does not have the flag set.

OK?

Index: kern/kern_synch.c
===
RCS file: src/sys/kern/kern_synch.c,v
retrieving revision 1.166
diff -u -p -r1.166 kern_synch.c
--- kern/kern_synch.c   20 Mar 2020 17:13:51 -  1.166
+++ kern/kern_synch.c   22 Mar 2020 04:25:32 -
@@ -480,7 +480,7 @@ sleep_setup_signal(struct sleep_state *s
 * stopped, p->p_wchan will be 0 upon return from CURSIG.
 */
atomic_setbits_int(&p->p_flag, P_SINTR);
-   if (p->p_p->ps_single != NULL || (sls->sls_sig = CURSIG(p)) != 0) {
+   if ((p->p_flag & P_SUSPSINGLE) || (sls->sls_sig = CURSIG(p)) != 0) {
unsleep(p);
p->p_stat = SONPROC;
sls->sls_do_sleep = 0;



Fix a panic with unlocked soclose()

2020-03-11 Thread Visa Hankala
jcs@ has reported the following panic which started appearing as
a consequence of the recent file close unlocking:

panic: kernel diagnostic assertion "timo || _kernel_lock_held()" failed: file 
"/usr/src/sys/kern/kern_synch.c", line 123

panic
__assert
tsleep
usbd_transfer
usbd_do_request_flags
ure_iff
ure_ioctl
in6_delmulti
in6_leavegroup
in6_freemoptions
in_pcbdetach
udp_detach
soclose

The above shows that soclose() can call parts of the kernel that still
need the kernel lock. This is now apparent because close(2) is unlocked.
However, the issue has been present longer. If a thread raced with the
closing of a socket file descriptor, the final FRELE() might be invoked
in an unlocked context, triggering a similar situation.

I think most of the socket handling is now under NET_LOCK(). However,
calls to the driver layer from the socket code need to be serialized
with the kernel lock. The following diff does that for SIOCDELMULTI
ioctl requests that can be invoked from the socket close path. I wonder
if this is actually enough and if most of soclose() should be put back
under KERNEL_LOCK() as a precaution...

Index: netinet/in.c
===
RCS file: src/sys/netinet/in.c,v
retrieving revision 1.168
diff -u -p -r1.168 in.c
--- netinet/in.c1 Dec 2019 21:12:42 -   1.168
+++ netinet/in.c11 Mar 2020 16:26:44 -
@@ -929,7 +929,9 @@ in_delmulti(struct in_multi *inm)
sizeof(struct sockaddr_in);
satosin(&ifr.ifr_addr)->sin_family = AF_INET;
satosin(&ifr.ifr_addr)->sin_addr = inm->inm_addr;
+   KERNEL_LOCK();
(*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr);
+   KERNEL_UNLOCK();
 
TAILQ_REMOVE(&ifp->if_maddrlist, &inm->inm_ifma,
ifma_list);
Index: netinet/ip_mroute.c
===
RCS file: src/sys/netinet/ip_mroute.c,v
retrieving revision 1.128
diff -u -p -r1.128 ip_mroute.c
--- netinet/ip_mroute.c 2 Sep 2019 13:12:09 -   1.128
+++ netinet/ip_mroute.c 11 Mar 2020 16:26:44 -
@@ -772,7 +772,9 @@ vif_delete(struct ifnet *ifp)
satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in);
satosin(&ifr.ifr_addr)->sin_family = AF_INET;
satosin(&ifr.ifr_addr)->sin_addr = zeroin_addr;
+   KERNEL_LOCK();
(*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr);
+   KERNEL_UNLOCK();
 
free(v, M_MRTABLE, sizeof(*v));
 }
Index: netinet6/in6.c
===
RCS file: src/sys/netinet6/in6.c,v
retrieving revision 1.234
diff -u -p -r1.234 in6.c
--- netinet6/in6.c  18 Nov 2019 22:08:59 -  1.234
+++ netinet6/in6.c  11 Mar 2020 16:26:44 -
@@ -1100,7 +1100,9 @@ in6_delmulti(struct in6_multi *in6m)
ifr.ifr_addr.sin6_len = sizeof(struct sockaddr_in6);
ifr.ifr_addr.sin6_family = AF_INET6;
ifr.ifr_addr.sin6_addr = in6m->in6m_addr;
+   KERNEL_LOCK();
(*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr);
+   KERNEL_UNLOCK();
 
TAILQ_REMOVE(&ifp->if_maddrlist, &in6m->in6m_ifma,
ifma_list);
Index: netinet6/ip6_mroute.c
===
RCS file: src/sys/netinet6/ip6_mroute.c,v
retrieving revision 1.122
diff -u -p -r1.122 ip6_mroute.c
--- netinet6/ip6_mroute.c   4 Sep 2019 16:13:49 -   1.122
+++ netinet6/ip6_mroute.c   11 Mar 2020 16:26:44 -
@@ -565,7 +565,9 @@ ip6_mrouter_detach(struct ifnet *ifp)
memset(&ifr, 0, sizeof(ifr));
ifr.ifr_addr.sin6_family = AF_INET6;
ifr.ifr_addr.sin6_addr = in6addr_any;
+   KERNEL_LOCK();
(*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr);
+   KERNEL_UNLOCK();
 
free(m6, M_MRTABLE, sizeof(*m6));
 }



Fix use of WITNESS_UNLOCK() in rw_exit_{read,write}()

2020-02-29 Thread Visa Hankala
There is a bug in how rw_exit_read() and rw_exit_write() use
WITNESS_UNLOCK(). The fast paths call WITNESS_UNLOCK() after releasing
the actual lock. This leads to a use-after-free in the checker if the
lock is dynamically allocated and another thread happens to free it
too early.

The following diff splits rw_exit() into two separate functions, one
which is the rw_exit(9) frontend, and the other, rw_do_exit(), which is
common between rw_exit(9), rw_exit_read(9) and rw_exit_write(9). This
makes it possible to call WITNESS_UNLOCK() before the actual lock
release without turning the code into an #ifdef maze.

The diff additionally does the following tweaks:

* Invoke membar_exit_before_atomic() only once even if the slow path is
  taken. The initial barrier is enough for the release semantics.

* Read rwl_owner for rw_cas() as late as possible so that the CAS
  operation would be less likely to fail. Note that in rw_exit() the
  purpose of the read is different; it determines the type of the lock
  (read / write).

* Put the rw_cas() in rw_exit() (now rw_do_exit()) inside
  __predict_false(). This compacts the resulting machine code a bit.

The code could be simplified by removing the fast path from
rw_exit_read() and rw_exit_write(), and always calling rw_do_exit().
However, that would reduce performance a tiny amount. I saw about 0.5%
increase in kernel build time in a test setup. The patch below does
not seem to affect performance negatively.

OK?

Index: kern/kern_rwlock.c
===
RCS file: src/sys/kern/kern_rwlock.c,v
retrieving revision 1.44
diff -u -p -r1.44 kern_rwlock.c
--- kern/kern_rwlock.c  30 Nov 2019 11:19:17 -  1.44
+++ kern/kern_rwlock.c  29 Feb 2020 16:24:59 -
@@ -25,6 +25,8 @@
 #include 
 #include 
 
+void   rw_do_exit(struct rwlock *, unsigned long);
+
 /* XXX - temporary measure until proc0 is properly aligned */
 #define RW_PROC(p) (((long)p) & ~RWLOCK_MASK)
 
@@ -129,31 +131,31 @@ rw_enter_write(struct rwlock *rwl)
 void
 rw_exit_read(struct rwlock *rwl)
 {
-   unsigned long owner = rwl->rwl_owner;
+   unsigned long owner;
 
rw_assert_rdlock(rwl);
+   WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
 
membar_exit_before_atomic();
+   owner = rwl->rwl_owner;
if (__predict_false((owner & RWLOCK_WAIT) ||
rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
-   rw_exit(rwl);
-   else
-   WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
+   rw_do_exit(rwl, 0);
 }
 
 void
 rw_exit_write(struct rwlock *rwl)
 {
-   unsigned long owner = rwl->rwl_owner;
+   unsigned long owner;
 
rw_assert_wrlock(rwl);
+   WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
 
membar_exit_before_atomic();
+   owner = rwl->rwl_owner;
if (__predict_false((owner & RWLOCK_WAIT) ||
rw_cas(&rwl->rwl_owner, owner, 0)))
-   rw_exit(rwl);
-   else
-   WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
+   rw_do_exit(rwl, RWLOCK_WRLOCK);
 }
 
 #ifdef DIAGNOSTIC
@@ -314,22 +316,29 @@ retry:
 void
 rw_exit(struct rwlock *rwl)
 {
-   unsigned long owner = rwl->rwl_owner;
-   int wrlock = owner & RWLOCK_WRLOCK;
-   unsigned long set;
+   unsigned long wrlock;
 
/* Avoid deadlocks after panic or in DDB */
if (panicstr || db_active)
return;
 
+   wrlock = rwl->rwl_owner & RWLOCK_WRLOCK;
if (wrlock)
rw_assert_wrlock(rwl);
else
rw_assert_rdlock(rwl);
-
WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0);
 
membar_exit_before_atomic();
+   rw_do_exit(rwl, wrlock);
+}
+
+/* membar_exit_before_atomic() has to precede call of this function. */
+void
+rw_do_exit(struct rwlock *rwl, unsigned long wrlock)
+{
+   unsigned long owner, set;
+
do {
owner = rwl->rwl_owner;
if (wrlock)
@@ -337,7 +346,7 @@ rw_exit(struct rwlock *rwl)
else
set = (owner - RWLOCK_READ_INCR) &
~(RWLOCK_WAIT|RWLOCK_WRWANT);
-   } while (rw_cas(&rwl->rwl_owner, owner, set));
+   } while (__predict_false(rw_cas(&rwl->rwl_owner, owner, set)));
 
if (owner & RWLOCK_WAIT)
wakeup(rwl);



Generic flags field in struct filterops

2020-02-19 Thread Visa Hankala
Eventually, it will become necessary to add new properties to
struct filterops. One such property is whether the filter is safe to use
without the kernel lock.

The diff below replaces the f_isfd field with a generic flags field in
struct filterops, to allow adding new properties without cluttering
the struct.

OK?
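
With the change, the struct and the new flag could look roughly like this
(sketch only; the exact flag value is illustrative):

#define FILTEROP_ISFD	0x00000001	/* ident is a file descriptor */

struct filterops {
	int	f_flags;
	int	(*f_attach)(struct knote *kn);
	void	(*f_detach)(struct knote *kn);
	int	(*f_event)(struct knote *kn, long hint);
};

Later properties, such as a flag marking a filter safe to run without the
kernel lock, can then be added without growing the struct any further.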

Index: arch/arm64/dev/apm.c
===
RCS file: src/sys/arch/arm64/dev/apm.c,v
retrieving revision 1.2
diff -u -p -r1.2 apm.c
--- arch/arm64/dev/apm.c31 Dec 2019 13:48:31 -  1.2
+++ arch/arm64/dev/apm.c19 Feb 2020 15:08:23 -
@@ -88,7 +88,7 @@ int apmkqfilter(dev_t dev, struct knote 
 int apm_getdefaultinfo(struct apm_power_info *);
 
 const struct filterops apmread_filtops = {
-   .f_isfd = 1,
+   .f_flags= FILTEROP_ISFD,
.f_attach   = NULL,
.f_detach   = filt_apmrdetach,
.f_event= filt_apmread,
Index: arch/i386/i386/apm.c
===
RCS file: src/sys/arch/i386/i386/apm.c,v
retrieving revision 1.121
diff -u -p -r1.121 apm.c
--- arch/i386/i386/apm.c31 Dec 2019 13:48:31 -  1.121
+++ arch/i386/i386/apm.c19 Feb 2020 15:08:23 -
@@ -104,7 +104,7 @@ voidfilt_apmrdetach(struct knote *kn);
 intfilt_apmread(struct knote *kn, long hint);
 
 const struct filterops apmread_filtops = {
-   .f_isfd = 1,
+   .f_flags= FILTEROP_ISFD,
.f_attach   = NULL,
.f_detach   = filt_apmrdetach,
.f_event= filt_apmread,
Index: arch/loongson/dev/apm.c
===
RCS file: src/sys/arch/loongson/dev/apm.c,v
retrieving revision 1.34
diff -u -p -r1.34 apm.c
--- arch/loongson/dev/apm.c 16 Feb 2020 23:37:23 -  1.34
+++ arch/loongson/dev/apm.c 19 Feb 2020 15:08:23 -
@@ -94,7 +94,7 @@ int apm_getdefaultinfo(struct apm_power_
 int apm_suspend(int state);
 
 const struct filterops apmread_filtops = {
-   .f_isfd = 1,
+   .f_flags= FILTEROP_ISFD,
.f_attach   = NULL,
.f_detach   = filt_apmrdetach,
.f_event= filt_apmread,
Index: arch/macppc/dev/apm.c
===
RCS file: src/sys/arch/macppc/dev/apm.c,v
retrieving revision 1.20
diff -u -p -r1.20 apm.c
--- arch/macppc/dev/apm.c   31 Dec 2019 13:48:31 -  1.20
+++ arch/macppc/dev/apm.c   19 Feb 2020 15:08:23 -
@@ -86,7 +86,7 @@ int filt_apmread(struct knote *kn, long 
 int apmkqfilter(dev_t dev, struct knote *kn);
 
 const struct filterops apmread_filtops = {
-   .f_isfd = 1,
+   .f_flags= FILTEROP_ISFD,
.f_attach   = NULL,
.f_detach   = filt_apmrdetach,
.f_event= filt_apmread,
Index: dev/hotplug.c
===
RCS file: src/sys/dev/hotplug.c,v
retrieving revision 1.18
diff -u -p -r1.18 hotplug.c
--- dev/hotplug.c   31 Dec 2019 13:48:31 -  1.18
+++ dev/hotplug.c   19 Feb 2020 15:08:23 -
@@ -39,7 +39,7 @@ void filt_hotplugrdetach(struct knote *)
 int  filt_hotplugread(struct knote *, long);
 
 const struct filterops hotplugread_filtops = {
-   .f_isfd = 1,
+   .f_flags= FILTEROP_ISFD,
.f_attach   = NULL,
.f_detach   = filt_hotplugrdetach,
.f_event= filt_hotplugread,
Index: dev/midi.c
===
RCS file: src/sys/dev/midi.c,v
retrieving revision 1.45
diff -u -p -r1.45 midi.c
--- dev/midi.c  31 Dec 2019 13:48:31 -  1.45
+++ dev/midi.c  19 Feb 2020 15:08:23 -
@@ -67,7 +67,7 @@ void filt_midiwdetach(struct knote *);
 int filt_midiwrite(struct knote *, long);
 
 const struct filterops midiwrite_filtops = {
-   .f_isfd = 1,
+   .f_flags= FILTEROP_ISFD,
.f_attach   = NULL,
.f_detach   = filt_midiwdetach,
.f_event= filt_midiwrite,
@@ -77,7 +77,7 @@ void filt_midirdetach(struct knote *);
 int filt_midiread(struct knote *, long);
 
 const struct filterops midiread_filtops = {
-   .f_isfd = 1,
+   .f_flags= FILTEROP_ISFD,
.f_attach   = NULL,
.f_detach   = filt_midirdetach,
.f_event= filt_midiread,
Index: dev/rnd.c
===
RCS file: src/sys/dev/rnd.c,v
retrieving revision 1.200
diff -u -p -r1.200 rnd.c
--- dev/rnd.c   31 Dec 2019 13:48:31 -  1.200
+++ dev/rnd.c   19 Feb 2020 15:08:23 -
@@ -242,14 +242,14 @@ static void _rs_seed(u_char *, size_t);
 static void _rs_clearseed(const void *p, size_t s);
 
 const struct filterops randomread_filtops = {
-   .f_isfd = 1,
+   .f_flags= FILTEROP_ISFD,
   

Release fdplock before closef() in finishdup()

2020-02-18 Thread Visa Hankala
The following diff makes finishdup() release the file descriptor table
lock before calling closef(). This makes the order of operations similar
to that of fdrelease().

The dup* code still has an issue with file closing that this diff does
not address. If fdalloc() fails in dodup3(), sys_dup() or sys_fcntl(),
the code releases the reference to the file while still holding fdplock.
The reference is acquired before locking the fdp, so another thread
might have closed the file descriptor and the current thread now holds
the last file reference. In that case the FRELE() invokes the file
close path.

OK?

Index: kern/kern_descrip.c
===
RCS file: src/sys/kern/kern_descrip.c,v
retrieving revision 1.199
diff -u -p -r1.199 kern_descrip.c
--- kern/kern_descrip.c 18 Feb 2020 03:47:18 -  1.199
+++ kern/kern_descrip.c 18 Feb 2020 16:29:51 -
@@ -307,14 +307,11 @@ restart:
fdpunlock(fdp);
goto restart;
}
-   goto out;
+   fdpunlock(fdp);
+   return (error);
}
/* No need for FRELE(), finishdup() uses current ref. */
-   error = finishdup(p, fp, old, new, retval, 0);
-
-out:
-   fdpunlock(fdp);
-   return (error);
+   return (finishdup(p, fp, old, new, retval, 0));
 }
 
 /*
@@ -382,7 +379,8 @@ restart:
fdpunlock(fdp);
goto restart;
}
-   goto out;
+   fdpunlock(fdp);
+   return (error);
}
if (new != i)
panic("dup2: fdalloc");
@@ -394,11 +392,7 @@ restart:
dupflags |= DUPF_CLOEXEC;
 
/* No need for FRELE(), finishdup() uses current ref. */
-   error = finishdup(p, fp, old, new, retval, dupflags);
-
-out:
-   fdpunlock(fdp);
-   return (error);
+   return (finishdup(p, fp, old, new, retval, dupflags));
 }
 
 /*
@@ -445,6 +439,7 @@ restart:
fdpunlock(fdp);
goto restart;
}
+   fdpunlock(fdp);
} else {
int dupflags = 0;
 
@@ -454,8 +449,6 @@ restart:
/* No need for FRELE(), finishdup() uses current ref. */
error = finishdup(p, fp, fd, i, retval, dupflags);
}
-
-   fdpunlock(fdp);
return (error);
 
case F_GETFD:
@@ -657,23 +650,24 @@ finishdup(struct proc *p, struct file *f
 {
struct file *oldfp;
struct filedesc *fdp = p->p_fd;
+   int error;
 
fdpassertlocked(fdp);
KASSERT(fp->f_iflags & FIF_INSERTED);
 
if (fp->f_count >= FDUP_MAX_COUNT) {
-   FRELE(fp, p);
-   return (EDEADLK);
+   error = EDEADLK;
+   goto fail;
}
 
oldfp = fd_getfile(fdp, new);
if ((dupflags & DUPF_DUP2) && oldfp == NULL) {
if (fd_inuse(fdp, new)) {
-   FRELE(fp, p);
-   return (EBUSY);
-   }
+   error = EBUSY;
+   goto fail;
+   }
fd_used(fdp, new);
-   }
+   }
 
/*
 * Use `fd_fplock' to synchronize with fd_getfile() so that
@@ -690,10 +684,18 @@ finishdup(struct proc *p, struct file *f
 
if (oldfp != NULL) {
knote_fdclose(p, new);
+   fdpunlock(fdp);
closef(oldfp, p);
+   } else {
+   fdpunlock(fdp);
}
 
return (0);
+
+fail:
+   fdpunlock(fdp);
+   FRELE(fp, p);
+   return (error);
 }
 
 void



Set UF_EXCLOSE inside finishdup()

2020-02-16 Thread Visa Hankala
The closing of a file can be a complex operation, and hence it would be
good to hold as few locks as possible when calling closef(). In
particular, the code behind close(2) already releases the file
descriptor table lock before closef(). This might not be strictly
necessary when the type of the file is known beforehand. However,
general file descriptor operations should be careful.

The following diff moves the setting of the UF_EXCLOSE fd flag inside
finishdup(). This makes it possible to release fdplock before closef() in
finishdup() in a subsequent patch.

This changes the order of setting UF_EXCLOSE relative to
knote_fdclose(). However, that should not be a problem because the flag
concerns file descriptor handling and not kqueue.

OK?

Index: kern/kern_descrip.c
===
RCS file: src/sys/kern/kern_descrip.c,v
retrieving revision 1.198
diff -u -p -r1.198 kern_descrip.c
--- kern/kern_descrip.c 5 Feb 2020 17:03:13 -   1.198
+++ kern/kern_descrip.c 16 Feb 2020 16:56:14 -
@@ -82,6 +82,9 @@ int finishdup(struct proc *, struct file
 int find_last_set(struct filedesc *, int);
 int dodup3(struct proc *, int, int, int, register_t *);
 
+#define DUPF_CLOEXEC   0x01
+#define DUPF_DUP2  0x02
+
 struct pool file_pool;
 struct pool fdesc_pool;
 
@@ -350,7 +353,7 @@ dodup3(struct proc *p, int old, int new,
 {
struct filedesc *fdp = p->p_fd;
struct file *fp;
-   int i, error;
+   int dupflags, error, i;
 
 restart:
if ((fp = fd_getfile(fdp, old)) == NULL)
@@ -385,10 +388,13 @@ restart:
panic("dup2: fdalloc");
fd_unused(fdp, new);
}
+
+   dupflags = DUPF_DUP2;
+   if (flags & O_CLOEXEC)
+   dupflags |= DUPF_CLOEXEC;
+
/* No need for FRELE(), finishdup() uses current ref. */
-   error = finishdup(p, fp, old, new, retval, 1);
-   if (!error && flags & O_CLOEXEC)
-   fdp->fd_ofileflags[new] |= UF_EXCLOSE;
+   error = finishdup(p, fp, old, new, retval, dupflags);
 
 out:
fdpunlock(fdp);
@@ -440,11 +446,13 @@ restart:
goto restart;
}
} else {
-   /* No need for FRELE(), finishdup() uses current ref. */
-   error = finishdup(p, fp, fd, i, retval, 0);
+   int dupflags = 0;
+
+   if (SCARG(uap, cmd) == F_DUPFD_CLOEXEC)
+   dupflags |= DUPF_CLOEXEC;
 
-   if (!error && SCARG(uap, cmd) == F_DUPFD_CLOEXEC)
-   fdp->fd_ofileflags[i] |= UF_EXCLOSE;
+   /* No need for FRELE(), finishdup() uses current ref. */
+   error = finishdup(p, fp, fd, i, retval, dupflags);
}
 
fdpunlock(fdp);
@@ -645,7 +653,7 @@ out:
  */
 int
 finishdup(struct proc *p, struct file *fp, int old, int new,
-register_t *retval, int dup2)
+register_t *retval, int dupflags)
 {
struct file *oldfp;
struct filedesc *fdp = p->p_fd;
@@ -659,7 +667,7 @@ finishdup(struct proc *p, struct file *f
}
 
oldfp = fd_getfile(fdp, new);
-   if (dup2 && oldfp == NULL) {
+   if ((dupflags & DUPF_DUP2) && oldfp == NULL) {
if (fd_inuse(fdp, new)) {
FRELE(fp, p);
return (EBUSY);
@@ -676,6 +684,8 @@ finishdup(struct proc *p, struct file *f
mtx_leave(&fdp->fd_fplock);
 
fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
+   if (dupflags & DUPF_CLOEXEC)
+   fdp->fd_ofileflags[new] |= UF_EXCLOSE;
*retval = new;
 
if (oldfp != NULL) {



Re: Raise spl for updating kn_status

2020-02-16 Thread Visa Hankala
On Sat, Feb 15, 2020 at 09:42:53PM +0100, Martin Pieuchot wrote:
> On 15/02/20(Sat) 16:56, Visa Hankala wrote:
> > When I added the knote_acquire() call in kqueue_register(), I overlooked
> > the fact that the knote could be modified by a (soft) interrupt.
> > Interrupts have to be blocked when changing kn_status. Otherwise the
> > state could become confused.
> 
> Can the same knote be modified by different interrupt handlers or are we
> just considering a race between process and interrupt contexts?

In theory, the same knote could be modified by different interrupt
handlers if the associated event source used more than one interrupt.
The kernel lock already ensures that only one CPU at a time can access
a knote. This diff fixes a situation where the interrupt context is not
blocked properly. The particular case in my mind is timer events.
However, the same issue applies to all situations where activation
can happen from interrupt context.

> > The diff below adds splhigh() where kn_status is modified. This also
> > ensures that the steps of knote_activate() run as an uninterrupted
> > sequence. As a consequence, knote_enqueue() and knote_dequeue() can rely
> > on the caller to raise the spl.
> 
> Do you think it would make sense to document which fields required a SPL
> protection just like we started doing with locks?

Maybe, though I am somewhat hesitant to start adding more formalism.
With mutexes the SPL requirement is implied, though not necessarily
fully accurate. For example, the same mutex can protect some fields
that are accessed from process context only and some other fields that
are accessed from both process and interrupt contexts. With tricky code,
instead of SPL it might be good to indicate somehow if multiple contexts
are involved though.



Raise spl for updating kn_status

2020-02-15 Thread Visa Hankala
When I added the knote_acquire() call in kqueue_register(), I overlooked
the fact that the knote could be modified by a (soft) interrupt.
Interrupts have to be blocked when changing kn_status. Otherwise the
state could become confused.

The diff below adds splhigh() where kn_status is modified. This also
ensures that the steps of knote_activate() run as an uninterrupted
sequence. As a consequence, knote_enqueue() and knote_dequeue() can rely
on the caller to raise the spl.

OK?

Index: kern/kern_event.c
===
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.124
diff -u -p -r1.124 kern_event.c
--- kern/kern_event.c   14 Feb 2020 16:50:25 -  1.124
+++ kern/kern_event.c   15 Feb 2020 16:52:41 -
@@ -312,8 +312,11 @@ filt_proc(struct knote *kn, long hint)
 */
if (event == NOTE_EXIT) {
struct process *pr = kn->kn_ptr.p_process;
+   int s;
 
+   s = splhigh();
kn->kn_status |= KN_DETACHED;
+   splx(s);
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
kn->kn_data = W_EXITCODE(pr->ps_xexit, pr->ps_xsig);
SLIST_REMOVE(&pr->ps_klist, kn, knote, kn_selnext);
@@ -712,13 +715,16 @@ again:
SLIST_FOREACH(kn, list, kn_link) {
if (kev->filter == kn->kn_filter &&
kev->ident == kn->kn_id) {
-   if (knote_acquire(kn) == 0) {
+   s = splhigh();
+   if (!knote_acquire(kn)) {
+   splx(s);
if (fp != NULL) {
FRELE(fp, p);
fp = NULL;
}
goto again;
}
+   splx(s);
break;
}
}
@@ -820,7 +826,9 @@ again:
splx(s);
}
 
+   s = splhigh();
knote_release(kn);
+   splx(s);
 done:
if (fp != NULL)
FRELE(fp, p);
@@ -1157,6 +1165,7 @@ kqueue_expand_list(struct kqueue *kq, in
 int
 knote_acquire(struct knote *kn)
 {
+   splassert(IPL_HIGH);
KASSERT(kn->kn_filter != EVFILT_MARKER);
 
if (kn->kn_status & KN_PROCESSING) {
@@ -1175,6 +1184,7 @@ knote_acquire(struct knote *kn)
 void
 knote_release(struct knote *kn)
 {
+   splassert(IPL_HIGH);
KASSERT(kn->kn_filter != EVFILT_MARKER);
KASSERT(kn->kn_status & KN_PROCESSING);
 
@@ -1192,9 +1202,13 @@ knote_release(struct knote *kn)
 void
 knote_activate(struct knote *kn)
 {
+   int s;
+
+   s = splhigh();
kn->kn_status |= KN_ACTIVE;
if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)
knote_enqueue(kn);
+   splx(s);
 }
 
 /*
@@ -1217,10 +1231,15 @@ void
 knote_remove(struct proc *p, struct klist *list)
 {
struct knote *kn;
+   int s;
 
while ((kn = SLIST_FIRST(list)) != NULL) {
-   if (!knote_acquire(kn))
+   s = splhigh();
+   if (!knote_acquire(kn)) {
+   splx(s);
continue;
+   }
+   splx(s);
kn->kn_fop->f_detach(kn);
knote_drop(kn, p);
}
@@ -1297,6 +1316,7 @@ knote_drop(struct knote *kn, struct proc
 {
struct kqueue *kq = kn->kn_kq;
struct klist *list;
+   int s;
 
KASSERT(kn->kn_filter != EVFILT_MARKER);
 
@@ -1306,12 +1326,14 @@ knote_drop(struct knote *kn, struct proc
list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
 
SLIST_REMOVE(list, kn, knote, kn_link);
+   s = splhigh();
if (kn->kn_status & KN_QUEUED)
knote_dequeue(kn);
if (kn->kn_status & KN_WAITING) {
kn->kn_status &= ~KN_WAITING;
wakeup(kn);
}
+   splx(s);
if (kn->kn_fop->f_isfd)
FRELE(kn->kn_fp, p);
pool_put(&knote_pool, kn);
@@ -1322,8 +1344,8 @@ void
 knote_enqueue(struct knote *kn)
 {
struct kqueue *kq = kn->kn_kq;
-   int s = splhigh();
 
+   splassert(IPL_HIGH);
KASSERT(kn->kn_filter != EVFILT_MARKER);
KASSERT((kn->kn_status & KN_QUEUED) == 0);
 
@@ -1332,7 +1354,6 @@ knote_enqueue(struct knote *kn)
kn->kn_status |= KN_QUEUED;
kq->kq_count++;
kqueue_check(kq);
-   splx(s);
kqueue_wakeup(kq);
 }
 
@@ -1340,8 +1361,8 @@ void
 knote_dequeue(struct knote *kn)
 {
struct kqueue *kq = kn->kn_kq;
-   int s = splhigh();
 
+   splassert(IPL_HIGH);
KASSERT(kn->kn_filter != EVFILT_MARKER);
KASSERT(kn->kn_status & KN_QUEUED);
 
@@ -1350,13 +1371,13 @@ 

Re: const*ify cfattach

2020-02-15 Thread Visa Hankala
On Fri, Feb 14, 2020 at 06:08:31PM +0100, Martin Pieuchot wrote:
> On 13/02/20(Thu) 17:23, Visa Hankala wrote:
> > On Thu, Feb 13, 2020 at 06:07:01PM +0100, Martin Pieuchot wrote:
> > > On 13/02/20(Thu) 16:53, Visa Hankala wrote:
> > >  [...]
> > > > I wonder if this constification should also be reflected in the output
> > > > of config(8).
> > > 
> > > What do you mean?
> > 
> > The program generates file ioconf.c that contains lines like these:
> > 
> > extern struct cfattach nsphy_ca;
> > extern struct cfattach nsphyter_ca;
> > extern struct cfattach inphy_ca;
> > extern struct cfattach iophy_ca;
> > extern struct cfattach rlphy_ca;
> > extern struct cfattach lxtphy_ca;
> > extern struct cfattach ukphy_ca;
> > extern struct cfattach brgphy_ca;
> > extern struct cfattach rgephy_ca;
> > extern struct cfattach ciphy_ca;
> > extern struct cfattach scsibus_ca;
> > extern struct cfattach cd_ca;
> > extern struct cfattach sd_ca;
> > 
> > These are non-const even though the actual struct instances are
> > read-only. Later in the same file, the struct cfdata array references
> > these structs through non-const pointers. This has worked so far, but
> > the situation is not ideal because the compiler makes wrong assumptions.
> 
> Below is a full diff that:
>  - add the "const" prefix to cfattach in config(8)
>  - add it, when required, to kern/subr_autoconf.c and sys/device.h
>  - drop the keyword in dev/rd.c

Please commit your initial const struct cfattach diff without the rd.c
part first. You have my OK for that. The change should be fine because
there already are many existing instances of the use of const with
struct cfattach.

The config(8) change should be reviewed separately. Given that the
cf_attach pointer of struct cfdata gets the const qualifier, couldn't
rd_ca now become const as well?

To find any inconsistencies of autoconf structs quickly, it would be
nice if config(8) put the extern declarations in a header file, say
"ioconf.h". Then this header could be included in driver files, maybe
as part of  under #ifdef _KERNEL. However, this would
create a point that easily triggers a rebuild of a large portion of
the kernel. Hence it might be easiest to allow looseness and rely on
discipline. (:
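
As a hypothetical example, such an ioconf.h would carry only the extern
declarations with the proper qualifiers, e.g.:

/* ioconf.h -- hypothetical config(8) output */
extern const struct cfattach nsphy_ca;
extern const struct cfattach ukphy_ca;
extern const struct cfattach scsibus_ca;
extern const struct cfattach cd_ca;
extern const struct cfattach sd_ca;

ioconf.c and the drivers would then share a single declaration instead of
repeating it, and mismatches would show up at compile time.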



Re: const*ify cfattach

2020-02-13 Thread Visa Hankala
On Thu, Feb 13, 2020 at 06:07:01PM +0100, Martin Pieuchot wrote:
> On 13/02/20(Thu) 16:53, Visa Hankala wrote:
> > On Thu, Feb 13, 2020 at 12:00:35PM +0100, Martin Pieuchot wrote:
> > > These structures are only used by autoconf(9) and don't need to be
> > > modified.  Some subsystems already define most of them as 'const'.
> > > Diff below turn all the remaining one as such.
> > > 
> > > Only a single driver, de(4), needed a modification apart from adding
> > > the const: removing a forward definition fixed it ;)
> > > 
> > > Built for all archs, I tested i386, amd64 and sparc64.
> > > 
> > > Ok?
> > 
> > No, RAMDISK build fails.
> > 
> > /usr/src/sys/dev/rd.c:87:15: error: assigning to 'struct cfattach *' from 
> > 'const struct cfattach *' discards qualifiers 
> > [-Werror,-Wincompatible-pointer-types-discards-qualifiers]
> > cf.cf_attach = &rd_ca;
> >  ^ ~~
> 
> Removing const from rd(4) fixed it here in my tree.  Thanks!
>  
> > I wonder if this constification should also be reflected in the output
> > of config(8).
> 
> What do you mean?

The program generates file ioconf.c that contains lines like these:

extern struct cfattach nsphy_ca;
extern struct cfattach nsphyter_ca;
extern struct cfattach inphy_ca;
extern struct cfattach iophy_ca;
extern struct cfattach rlphy_ca;
extern struct cfattach lxtphy_ca;
extern struct cfattach ukphy_ca;
extern struct cfattach brgphy_ca;
extern struct cfattach rgephy_ca;
extern struct cfattach ciphy_ca;
extern struct cfattach scsibus_ca;
extern struct cfattach cd_ca;
extern struct cfattach sd_ca;

These are non-const even though the actual struct instances are
read-only. Later in the same file, the struct cfdata array references
these structs through non-const pointers. This has worked so far, but
the situation is not ideal because the compiler makes wrong assumptions.



Re: const*ify cfattach

2020-02-13 Thread Visa Hankala
On Thu, Feb 13, 2020 at 12:00:35PM +0100, Martin Pieuchot wrote:
> These structures are only used by autoconf(9) and don't need to be
> modified.  Some subsystems already define most of them as 'const'.
> Diff below turn all the remaining one as such.
> 
> Only a single driver, de(4), needed a modification apart from adding
> the const: removing a forward definition fixed it ;)
> 
> Built for all archs, I tested i386, amd64 and sparc64.
> 
> Ok?

No, RAMDISK build fails.

/usr/src/sys/dev/rd.c:87:15: error: assigning to 'struct cfattach *' from 
'const struct cfattach *' discards qualifiers 
[-Werror,-Wincompatible-pointer-types-discards-qualifiers]
cf.cf_attach = &rd_ca;
 ^ ~~

I wonder if this constification should also be reflected in the output
of config(8).



Re: Push KERNEL_LOCK() down in pgsigio() and selwakeup()

2020-02-09 Thread Visa Hankala
On Tue, Feb 04, 2020 at 02:21:02PM +0100, Martin Pieuchot wrote:
>  void
>  selwakeup(struct selinfo *sip)
>  {
> + KERNEL_LOCK();
> + KNOTE(&sip->si_note, NOTE_SUBMIT);
> + doselwakeup(sip);
> + KERNEL_UNLOCK();
> +}

There is a problem with audio code. Audio interrupt handlers generally
run without the kernel lock. Acquiring the kernel lock is not safe at
this level because the audio_lock mutex might be locked. In addition,
IPL_AUDIO is above IPL_MPFLOOR on some architectures.



Remove non-__STDC__ asserts from <lib/libkern/libkern.h>

2020-02-08 Thread Visa Hankala
The header <lib/libkern/libkern.h> defines two sets of assert macros,
one for standard C and another for the traditional cpp. As the header
requires the use of a C compiler (that is, no inclusion from an assembly
file), it looks like the non-__STDC__ parts could be removed.

OK?

Index: lib/libkern/libkern.h
===
RCS file: src/sys/lib/libkern/libkern.h,v
retrieving revision 1.35
diff -u -p -r1.35 libkern.h
--- lib/libkern/libkern.h   25 Apr 2018 11:15:58 -  1.35
+++ lib/libkern/libkern.h   8 Feb 2020 16:41:00 -
@@ -105,13 +105,8 @@ abs(int j)
 #ifdef NDEBUG  /* tradition! */
 #defineassert(e)   ((void)0)
 #else
-#ifdef __STDC__
 #defineassert(e)   ((e) ? (void)0 :
\
__assert("", __FILE__, __LINE__, #e))
-#else
-#defineassert(e)   ((e) ? (void)0 :
\
-   __assert("", __FILE__, __LINE__, "e"))
-#endif
 #endif
 
 #define__KASSERTSTR"kernel %sassertion \"%s\" failed: file \"%s\", 
line %d"
@@ -120,38 +115,22 @@ abs(int j)
 #defineKASSERTMSG(e, msg, ...) ((void)0)
 #defineKASSERT(e)  ((void)0)
 #else
-#ifdef __STDC__
 #defineKASSERTMSG(e, msg, ...) ((e) ? (void)0 :
\
panic(__KASSERTSTR " " msg, "diagnostic ", #e,  \
__FILE__, __LINE__, ## __VA_ARGS__))
 #defineKASSERT(e)  ((e) ? (void)0 :
\
__assert("diagnostic ", __FILE__, __LINE__, #e))
-#else
-#defineKASSERTMSG(e, msg, ...) ((e) ? (void)0 :
\
-   panic(__KASSERTSTR " " msg, "diagnostic ", "e", \
-   __FILE__, __LINE__, ## __VA_ARGS__))
-#defineKASSERT(e)  ((e) ? (void)0 :
\
-   __assert("diagnostic ", __FILE__, __LINE__, "e"))
-#endif
 #endif
 
 #ifndef DEBUG
 #defineKDASSERTMSG(e, msg, ...)((void)0)
 #defineKDASSERT(e) ((void)0)
 #else
-#ifdef __STDC__
 #defineKDASSERTMSG(e, msg, ...)((e) ? (void)0 :
\
panic(__KASSERTSTR " " msg, "debugging ", #e,   \
__FILE__, __LINE__, ## __VA_ARGS__))
 #defineKDASSERT(e) ((e) ? (void)0 :
\
__assert("debugging ", __FILE__, __LINE__, #e))
-#else
-#defineKDASSERTMSG(e, msg, ...)((e) ? (void)0 :
\
-   panic(__KASSERTSTR " " msg, "debugging ", "e",  \
-   __FILE__, __LINE__, ## __VA_ARGS__))
-#defineKDASSERT(e) ((e) ? (void)0 :
\
-   __assert("debugging ", __FILE__, __LINE__, "e"))
-#endif
 #endif
 
 #defineCTASSERT(x) extern char  _ctassert[(x) ? 1 : -1 ]   \



Zero knotes on allocation

2020-02-08 Thread Visa Hankala
Zero knotes on allocation. Parts of struct knote, such as kn_tqe, can
retain their old value quite long, which is not good in this complex
piece of code.

OK?

Index: kern/kern_event.c
===
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.122
diff -u -p -r1.122 kern_event.c
--- kern/kern_event.c   5 Feb 2020 17:03:13 -   1.122
+++ kern/kern_event.c   8 Feb 2020 16:41:00 -
@@ -686,7 +686,7 @@ kqueue_register(struct kqueue *kq, struc
}
 
if (kev->flags & EV_ADD)
-   newkn = pool_get(&knote_pool, PR_WAITOK);
+   newkn = pool_get(&knote_pool, PR_WAITOK | PR_ZERO);
 
 again:
if (fops->f_isfd) {



Re: Push KERNEL_LOCK() down in pgsigio() and selwakeup()

2020-02-08 Thread Visa Hankala
On Tue, Feb 04, 2020 at 02:21:02PM +0100, Martin Pieuchot wrote:
> As recently profiled, the softnet thread which runs mostly without
> KERNEL_LOCK() is still somehow serialized with the rest of the kernel.
> This is because the various subsystems to notify userland, either via
> poll(2)/select(2), kqueue(2) or pgsignal(9) still require this lock. 
> 
> We worked around this in bpf(4) by deferring the notification.  However
> such approach isn't trivially applied everywhere and doesn't help with
> getting the subsystems out of the KERNEL_LOCK().
> 
> Thanks to many people working on MP, the kernel now has multiple consumers
> of the notification hooks that can be used to test & expose possible bugs
> related to the removal of the KERNEL_LOCK() in these areas.
> 
> Diff below takes the first step of pushing the KERNEL_LOCK() in the
> subsystems.  That means the lock might be taken & released twice if an
> application asked for being notified via signal and via poll/kevent.

I think this is fine. Use of signal-based notification is quite unusual
in modern software, and use of both signals and poll/kevent with the same
file is all the more so.

> It is unclear to me if this change of behavior degrades latency in a 
> noticeable way.  If it does and people report it, we should consider
> keeping a single KERNEL_LOCK/UNLOCK() dance in the affected subsystem.
> 
> Comments?  Oks?

OK visa@



Re: Replace ttkqflush() with klist_invalidate()

2020-02-04 Thread Visa Hankala
On Tue, Feb 04, 2020 at 01:04:24PM +0100, Martin Pieuchot wrote:
> On 02/02/20(Sun) 09:58, Visa Hankala wrote:
> > tty(4) uses custom code for revoking knotes. It should be changed to use
> > klist_invalidate() to handle the revocation in one place. The following
> > diff does that.
> 
> Great, your diff is ok mpi@
> 
> > In addition, the patch makes the code store the tty context pointer
> > in kn_hook directly. This simplifies the code.
> 
> Does that mean we'll need some reference counting when getting kqueue
> out of the KERNEL_LOCK()?  Who is the owner of `kn_hook'?

I think the proper way is to invalidate knotes before the associated
event source object or kqueue is destroyed. That way there is no need
for reference counting. klist_invalidate() does the job when the object
is destroyed kind of involuntarily, like when a device is detached.
Typically, knotes are freed by knote_fdclose() when the file descriptor
is closed, which happens before the object is freed. kqueue_close() 
takes care of freeing kqueue's registered knotes when the kqueue itself
is being freed.

I would say the main owner of kn_hook and the knote itself is the object
that owns the kn_selnext list. klist_invalidate() dissolves that
ownership (well, any knotes on the list get freed either when the EOF
event is delivered to userspace or when the kqueue is destroyed).

> > Unlike ttkqflush(), klist_invalidate() can sleep. However, that should
> > not be a problem because the callers of ttyfree() call vdevgone() that
> > can sleep as well.
> 
> It took me some time to audit com_detach() because ttyfree() isn't the
> last call.  Most of the *_detach() functions have similar problems: they
> assume the execution of the code is atomic and free data structures
> before unregistering handlers.
> 
> Diff below improves that, ok?

Right, at least the soft interrupt handler dereferences sc_tty. The
code assumes that softintr_disestablish() has barrier-like behaviour,
which should be the case in this context on most architectures because
of the kernel lock.

OK visa@

> Index: dev/ic/com.c
> ===
> RCS file: /cvs/src/sys/dev/ic/com.c,v
> retrieving revision 1.170
> diff -u -p -r1.170 com.c
> --- dev/ic/com.c  19 Jul 2019 00:17:15 -  1.170
> +++ dev/ic/com.c  4 Feb 2020 11:27:06 -
> @@ -186,14 +186,14 @@ com_detach(struct device *self, int flag
>   mn |= 0x80;
>   vdevgone(maj, mn, mn, VCHR);
>  
> + timeout_del(&sc->sc_dtr_tmo);
> + timeout_del(&sc->sc_diag_tmo);
> + softintr_disestablish(sc->sc_si);
> +
>   /* Detach and free the tty. */
>   if (sc->sc_tty) {
>   ttyfree(sc->sc_tty);
>   }
> -
> - timeout_del(&sc->sc_dtr_tmo);
> - timeout_del(&sc->sc_diag_tmo);
> - softintr_disestablish(sc->sc_si);
>  
>   return (0);
>  }
> 



Replace ttkqflush() with klist_invalidate()

2020-02-02 Thread Visa Hankala
tty(4) uses custom code for revoking knotes. It should be changed to use
klist_invalidate() to handle the revocation in one place. The following
diff does that.

In addition, the patch makes the code store the tty context pointer
in kn_hook directly. This simplifies the code.

Unlike ttkqflush(), klist_invalidate() can sleep. However, that should
not be a problem because the callers of ttyfree() call vdevgone() that
can sleep as well.

It would be good to have this tested in various setups. Originally,
ttkqflush() was added to prevent crashing when a ucom(4) device is
detached while still in use by a kqueue-enabled program such as cu(1).

OK?

Index: kern/tty.c
===
RCS file: src/sys/kern/tty.c,v
retrieving revision 1.151
diff -u -p -r1.151 tty.c
--- kern/tty.c  9 Jan 2020 14:35:20 -   1.151
+++ kern/tty.c  2 Feb 2020 09:38:17 -
@@ -75,7 +75,6 @@ static void ttyblock(struct tty *);
 void ttyunblock(struct tty *);
 static void ttyecho(int, struct tty *);
 static void ttyrubo(struct tty *, int);
-void   ttkqflush(struct klist *klist);
 intfilt_ttyread(struct knote *kn, long hint);
 void   filt_ttyrdetach(struct knote *kn);
 intfilt_ttywrite(struct knote *kn, long hint);
@@ -1124,7 +1123,7 @@ ttkqfilter(dev_t dev, struct knote *kn)
return (EINVAL);
}
 
-   kn->kn_hook = (caddr_t)((u_long)dev);
+   kn->kn_hook = tp;
 
s = spltty();
SLIST_INSERT_HEAD(klist, kn, kn_selnext);
@@ -1134,29 +1133,11 @@ ttkqfilter(dev_t dev, struct knote *kn)
 }
 
 void
-ttkqflush(struct klist *klist)
-{
-   struct knote *kn, *kn1;
-
-   SLIST_FOREACH_SAFE(kn, klist, kn_selnext, kn1) {
-   SLIST_REMOVE(klist, kn, knote, kn_selnext);
-   kn->kn_hook = (caddr_t)((u_long)NODEV);
-   kn->kn_flags |= EV_EOF;
-   knote_activate(kn);
-   }
-}
-
-void
 filt_ttyrdetach(struct knote *kn)
 {
-   dev_t dev = (dev_t)((u_long)kn->kn_hook);
-   struct tty *tp;
+   struct tty *tp = kn->kn_hook;
int s;
 
-   if (dev == NODEV)
-   return;
-   tp = (*cdevsw[major(dev)].d_tty)(dev);
-
s = spltty();
SLIST_REMOVE(&tp->t_rsel.si_note, kn, knote, kn_selnext);
splx(s);
@@ -1165,16 +1146,9 @@ filt_ttyrdetach(struct knote *kn)
 int
 filt_ttyread(struct knote *kn, long hint)
 {
-   dev_t dev = (dev_t)((u_long)kn->kn_hook);
-   struct tty *tp;
+   struct tty *tp = kn->kn_hook;
int s;
 
-   if (dev == NODEV) {
-   kn->kn_flags |= EV_EOF;
-   return (1);
-   }
-   tp = (*cdevsw[major(dev)].d_tty)(dev);
-
s = spltty();
kn->kn_data = ttnread(tp);
splx(s);
@@ -1188,14 +1162,9 @@ filt_ttyread(struct knote *kn, long hint
 void
 filt_ttywdetach(struct knote *kn)
 {
-   dev_t dev = (dev_t)((u_long)kn->kn_hook);
-   struct tty *tp;
+   struct tty *tp = kn->kn_hook;
int s;
 
-   if (dev == NODEV)
-   return;
-   tp = (*cdevsw[major(dev)].d_tty)(dev);
-
s = spltty();
SLIST_REMOVE(&tp->t_wsel.si_note, kn, knote, kn_selnext);
splx(s);
@@ -1204,16 +1173,9 @@ filt_ttywdetach(struct knote *kn)
 int
 filt_ttywrite(struct knote *kn, long hint)
 {
-   dev_t dev = (dev_t)((u_long)kn->kn_hook);
-   struct tty *tp;
+   struct tty *tp = kn->kn_hook;
int canwrite, s;
 
-   if (dev == NODEV) {
-   kn->kn_flags |= EV_EOF;
-   return (1);
-   }
-   tp = (*cdevsw[major(dev)].d_tty)(dev);
-
s = spltty();
kn->kn_data = tp->t_outq.c_cn - tp->t_outq.c_cc;
canwrite = (tp->t_outq.c_cc <= tp->t_lowat);
@@ -2378,6 +2340,7 @@ ttymalloc(int baud)
 void
 ttyfree(struct tty *tp)
 {
+   int s;
 
--tty_count;
 #ifdef DIAGNOSTIC
@@ -2386,8 +2349,10 @@ ttyfree(struct tty *tp)
 #endif
TAILQ_REMOVE(&ttylist, tp, tty_link);
 
-   ttkqflush(&tp->t_rsel.si_note);
-   ttkqflush(&tp->t_wsel.si_note);
+   s = spltty();
+   klist_invalidate(&tp->t_rsel.si_note);
+   klist_invalidate(&tp->t_wsel.si_note);
+   splx(s);
 
clfree(&tp->t_rawq);
clfree(&tp->t_canq);



Re: Null pointer crash in filt_uhidrdetach

2020-01-06 Thread Visa Hankala
On Fri, Jan 03, 2020 at 11:37:22PM -0800, Greg Steuck wrote:
> While playing with chromium u2f support[1] I managed to induce kernel
> crashes in filt_uhidrdetach. It takes a few attempts of plugging/unplugging
> the fido key while trying to authenticate at demo.yubico.com/playground.
> Eventually the kernel panics with this stack trace (retyped from [2]):
> 
> filt_uhidrdetach+0x33: movq 0x8(%rcx), rcx
> kqueue_close
> drop
> closef
> fdfree
> exit1
> single_thread_check
> userret
> intr_user_exit

The kernel is not very good at revoking knotes when a device is
detached. There is already some code for doing a cleanish detachment,
but it is not used everywhere.

Does the following diff help with this problem with uhid(4)?

Index: dev/usb/uhid.c
===
RCS file: src/sys/dev/usb/uhid.c,v
retrieving revision 1.75
diff -u -p -r1.75 uhid.c
--- dev/usb/uhid.c  31 Dec 2019 13:48:31 -  1.75
+++ dev/usb/uhid.c  6 Jan 2020 17:33:17 -
@@ -168,6 +168,10 @@ uhid_detach(struct device *self, int fla
mn = self->dv_unit;
vdevgone(maj, mn, mn, VCHR);
 
+   s = splusb();
+   klist_invalidate(&sc->sc_rsel.si_note);
+   splx(s);
+
return (0);
 }
 
Index: kern/kern_event.c
===
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.117
diff -u -p -r1.117 kern_event.c
--- kern/kern_event.c   6 Jan 2020 10:28:32 -   1.117
+++ kern/kern_event.c   6 Jan 2020 17:33:17 -
@@ -464,6 +464,27 @@ seltrue_kqfilter(dev_t dev, struct knote
return (0);
 }
 
+static int
+filt_dead(struct knote *kn, long hint)
+{
+   kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+   kn->kn_data = 0;
+   return (1);
+}
+
+static void
+filt_deaddetach(struct knote *kn)
+{
+   /* Nothing to do */
+}
+
+static const struct filterops dead_filtops = {
+   .f_isfd = 1,
+   .f_attach   = NULL,
+   .f_detach   = filt_deaddetach,
+   .f_event= filt_dead,
+};
+
 int
 sys_kqueue(struct proc *p, void *v, register_t *retval)
 {
@@ -1304,8 +1325,18 @@ klist_invalidate(struct klist *list)
 {
struct knote *kn;
 
-   SLIST_FOREACH(kn, list, kn_selnext) {
-   kn->kn_status |= KN_DETACHED;
-   kn->kn_flags |= EV_EOF | EV_ONESHOT;
+   /*
+* NET_LOCK() must not be held because it can block another thread
+* in f_event with a knote acquired.
+*/
+   NET_ASSERT_UNLOCKED();
+
+   while ((kn = SLIST_FIRST(list)) != NULL) {
+   if (knote_acquire(kn) == 0)
+   continue;
+   kn->kn_fop->f_detach(kn);
+   kn->kn_fop = _filtops;
+   knote_activate(kn);
+   knote_release(kn);
}
 }



Re: SMR question

2019-12-29 Thread Visa Hankala
On Sun, Dec 29, 2019 at 05:23:43PM -0600, Amit Kulkarni wrote:
> I was preparing a diff and wanted to know if it is safe to disable the
> SMR kthread by commenting it out for testing purposes on amd64. Just a
> simple yes/no would do!

no



Re: grep -R with no path

2019-12-02 Thread Visa Hankala
On Mon, Dec 02, 2019 at 06:08:35PM +0100, Jeremie Courreges-Anglas wrote:
> On Mon, Dec 02 2019, Marc Espie  wrote:
> > On Mon, Dec 02, 2019 at 08:31:02AM +, Miod Vallat wrote:
> >> grep(1), when invoked with the -R option but no path, displays a
> >> "recursive search of stdin" warning and acts as if -R had not been
> >> specified.
> >> 
> >> GNU grep, in that case, will perform a recursive search in the current
> >> directory, i.e. uses an implicit "." path if none is given.
> >> 
> >> This is IMO a much better behaviour. What about the following diff?
> >> 
> >> Index: grep.c
> >> ===
> >> RCS file: /OpenBSD/src/usr.bin/grep/grep.c,v
> >> retrieving revision 1.62
> >> diff -u -p -r1.62 grep.c
> >> --- grep.c 7 Oct 2019 20:04:00 -   1.62
> >> +++ grep.c 2 Dec 2019 08:27:09 -
> >> @@ -473,8 +473,12 @@ main(int argc, char *argv[])
> >>++argv;
> >>}
> >>  
> >> -  if (Rflag && argc == 0)
> >> -  warnx("warning: recursive search of stdin");
> >> +  if (Rflag && argc == 0) {
> >> +  /* default to . if no path given */
> >> +  static char *dot_argv[] = { ".", NULL };
> >> +  argv = dot_argv;
> >> +  argc = 1;
> >> +  }
> >>if (Eflag)
> >>cflags |= REG_EXTENDED;
> >>if (Fflag)
> >> 
> >> 
> > I concur, I like it better as well.
> > -R isn't in POSIX anyway, so we don't have to worry about standard.
> 
> Same here, even if muscle memory helps this is just a better default IMO.
> 
> I have had a similar diff for some time but didn't push for it because
> nitpicking: GNU grep doesn't prepend "./" to file names in this case.
> I have a diff to do this but let's keep the nitpicking for later.
> 
> I'd like us to go with Miod's diff.  Any objection?  oks?
> ok jca@ if Marc or someone else wants to commit it.

OK visa@



Re: Fix regress/sys/rtable for octeon

2019-06-24 Thread Visa Hankala
On Sun, Jun 23, 2019 at 02:48:24PM +0200, Moritz Buhl wrote:
> Hi,
> while running regression tests on an octeon ERL, I noticed that
> no-macro-redefined flag was not known to the compiler.
> I added some #undefs and some hours ago a change to art_walk made
> the tests fail too.
> 
> The no-macro-redefine flag was added 3 months ago.

Committed, thank you.



Re: mtx_enter_try(9) & recursion

2019-06-02 Thread Visa Hankala
On Sun, Jun 02, 2019 at 10:29:01AM -0300, Martin Pieuchot wrote:
> On 02/06/19(Sun) 04:30, Visa Hankala wrote:
> > On Sat, Jun 01, 2019 at 07:04:23PM -0300, Martin Pieuchot wrote:
> > > On 01/06/19(Sat) 23:22, Mark Kettenis wrote:
> > > > > Date: Sat, 1 Jun 2019 17:32:52 -0300
> > > > > From: Martin Pieuchot 
> > > > > 
> > > > > Currently it isn't safe to call mtx_enter_try(9) if you're already
> > > > > holding the mutex.  That means it isn't safe to call that function
> > > > > in hardclock(9), like with `windup_mtx'.  That's why the mutex needs
> > > > > to be initialized as IPL_CLOCK.
> > > > > 
> > > > > I'm working on removing the SCHED_LOCK() from inside hardclock(9).
> > > > > That leads me to wonder if I should initialize all mutexes to 
> > > > > IPL_SCHED,
> > > > > possibly blocking clock interrupts, or if we should change the mutex 
> > > > > API
> > > > > to allow mtx_enter_try(9) to deal with recursion.
> > > > > 
> > > > > The diff below removes the recursion check for mtx_enter_try(9).
> > > > > 
> > > > > Comments?  Oks?
> > > > 
> > > > My initial reaction is that if you're trying to lock when you already
> > > > have the lock, there is something wrong with your locking strategy and
> > > > that this is something we don't want.
> > > 
> > > Could you elaborate?  Are you saying that preventing hardclock(9) to run
> > > is the way to move forward to unlock its internals?  Why isn't that
> > > strategy wrong?
> > > 
> > > In the `windup_mtx' case, does it matter if the mutex is taken by
> > > another CPU or by myself?  What's the problem when CPU0 is one holding
> > > the lock?
> > 
> > mutex(9) is not and should not become recursive. Recursive locking
> > works when it is voluntary. If recursion was allowed with interrupts,
> > the CPU could re-enter the critical section at any moment, possibly
> > seeing inconsistent state or breaking assumptions made by the original
> > entry.
> 
> I don't understand how your answer is related to my diff :(
> 
> Why allowing mtx_enter_try(9) to fail and not to panic(9) when the CPU
> executing the code already owns the mutex is wrong?
> 
> Is that what you call recursion?  Not entering the critical section in
> the interrupt instead of blocking the interrupt?
> 
> How is that different from not entering the critical section because
> another CPU is holding the mutex?

To me, it looks very suspicious if a CPU tries to lock a mutex that it
already holds because it indicates that there is a potential recursion.
My stance is that such a locking behaviour is a design error in general
and the current way of checking recursion in mtx_enter_try() is sound.

If a mutex can be taken on multiple interrupt priority levels, the lock
has to be configured to block the highest level, and mutexes used
in hardclock() are no exception.
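
For reference, the check in question looks roughly like this in the MI mutex
code (a simplified sketch, not the exact source; the spl handling is left
out):

	int
	mtx_enter_try(struct mutex *mtx)
	{
		struct cpu_info *owner, *ci = curcpu();

		owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
	#ifdef DIAGNOSTIC
		if (__predict_false(owner == ci))
			panic("mtx %p: locking against myself", mtx);
	#endif
		if (owner == NULL)
			return (1);	/* lock acquired */
		return (0);		/* contended, caller must cope */
	}

Removing the DIAGNOSTIC panic is what would turn self-locking into an
ordinary failure return instead of a crash.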



Re: mtx_enter_try(9) & recursion

2019-06-01 Thread Visa Hankala
On Sat, Jun 01, 2019 at 07:04:23PM -0300, Martin Pieuchot wrote:
> On 01/06/19(Sat) 23:22, Mark Kettenis wrote:
> > > Date: Sat, 1 Jun 2019 17:32:52 -0300
> > > From: Martin Pieuchot 
> > > 
> > > Currently it isn't safe to call mtx_enter_try(9) if you're already
> > > holding the mutex.  That means it isn't safe to call that function
> > > in hardclock(9), like with `windup_mtx'.  That's why the mutex needs
> > > to be initialized as IPL_CLOCK.
> > > 
> > > I'm working on removing the SCHED_LOCK() from inside hardclock(9).
> > > That leads me to wonder if I should initialize all mutexes to IPL_SCHED,
> > > possibly blocking clock interrupts, or if we should change the mutex API
> > > to allow mtx_enter_try(9) to deal with recursion.
> > > 
> > > The diff below removes the recursion check for mtx_enter_try(9).
> > > 
> > > Comments?  Oks?
> > 
> > My initial reaction is that if you're trying to lock when you already
> > have the lock, there is something wrong with your locking strategy and
> > that this is something we don't want.
> 
> Could you elaborate?  Are you saying that preventing hardclock(9) to run
> is the way to move forward to unlock its internals?  Why isn't that
> strategy wrong?
> 
> In the `windup_mtx' case, does it matter if the mutex is taken by
> another CPU or by myself?  What's the problem when CPU0 is one holding
> the lock?

mutex(9) is not and should not become recursive. Recursive locking
works when it is voluntary. If recursion was allowed with interrupts,
the CPU could re-enter the critical section at any moment, possibly
seeing inconsistent state or breaking assumptions made by the original
entry.



Re: Patch to enable building of Octeon kernel with clang

2019-01-03 Thread Visa Hankala
On Thu, Jan 03, 2019 at 04:06:25PM +0100, Janne Johansson wrote:
> On Thu, Jan 3, 2019 at 15:25 Visa Hankala  wrote:
> >
> > On Thu, Jan 03, 2019 at 01:16:05PM +0300, Mikhael Skvortsov wrote:
> > > Tested by running GENERIC.MP built by
> > > make CC=clang COMPILER_VERSION=clang
> > > on a CN6120 device.
> [...]
> > > Index: sys/arch/octeon/dev/octcrypto_asm.S
> > > ===
> > > RCS file: /cvs/src/sys/arch/octeon/dev/octcrypto_asm.S,v
> > > retrieving revision 1.1
> > > diff -u -p -u -r1.1 octcrypto_asm.S
> > > --- sys/arch/octeon/dev/octcrypto_asm.S9 Apr 2018 13:46:15 -
> > > 1.1
> > > +++ sys/arch/octeon/dev/octcrypto_asm.S3 Jan 2019 09:34:57 -
> > > @@ -648,7 +648,7 @@ LEAF(octcrypto_ghash_init, 0)
> > >  dmtc2t0, MT_GFM_RESINP
> > >  jrra
> > >   dmtc2t1, MT_GFM_RESINP+1
> > > -END(octcrypto_ghash_set_state)
> > > +END(octcrypto_ghash_init)
> 
> This at least seem like a regular mistake in the existing code,
> regardless of the rest of the changes/fixups for clang.

You are right. I have committed that part.



Re: Patch to enable building of Octeon kernel with clang

2019-01-03 Thread Visa Hankala
On Thu, Jan 03, 2019 at 01:16:05PM +0300, Mikhael Skvortsov wrote:
> Tested by running GENERIC.MP built by
> make CC=clang COMPILER_VERSION=clang
> on a CN6120 device.

Thank you. I have something similar pending and will commit soonish.
Certain things have to be done in concert with loongson and sgi.

> Index: sys/arch/mips64/mips64/fp_emulate.c
> ===
> RCS file: /cvs/src/sys/arch/mips64/mips64/fp_emulate.c,v
> retrieving revision 1.20
> diff -u -p -u -r1.20 fp_emulate.c
> --- sys/arch/mips64/mips64/fp_emulate.c22 Oct 2018 17:31:25 -1.20
> +++ sys/arch/mips64/mips64/fp_emulate.c3 Jan 2019 09:34:57 -
> @@ -158,8 +158,14 @@ MipsFPTrap(struct trapframe *tf)
>  sr = getsr();
>  setsr(sr | SR_COP_1_BIT);
> 
> -__asm__ volatile ("cfc1 %0, $31" : "=r" (fsr));
> -__asm__ volatile ("cfc1 %0, $31" : "=r" (fsr));
> +__asm__ volatile (".set push\n\t"
> +  ".set hardfloat\n\t"
> +  "cfc1 %0, $31\n\t"
> +  ".set pop" : "=r" (fsr));
> +__asm__ volatile (".set push\n\t"
> +  ".set hardfloat\n\t"
> +  "cfc1 %0, $31\n\t"
> +  ".set pop" : "=r" (fsr));
> 
>  /*
>   * If this is not an unimplemented operation, but a genuine
> @@ -399,7 +405,10 @@ deliver:
>  tf->fsr = fsr;
> 
>  if (CPU_HAS_FPU(ci)) {
> -__asm__ volatile ("ctc1 %0, $31" :: "r" (fsr));
> +__asm__ volatile (".set push\n\t"
> +  ".set hardfloat\n\t"
> +  "ctc1 %0, $31\n\t"
> +  ".set pop" :: "r" (fsr));
>  /* disable fpu before returning to trap() */
>  setsr(sr);
>  }
> Index: sys/arch/mips64/mips64/lcore_float.S
> ===
> RCS file: /cvs/src/sys/arch/mips64/mips64/lcore_float.S,v
> retrieving revision 1.22
> diff -u -p -u -r1.22 lcore_float.S
> --- sys/arch/mips64/mips64/lcore_float.S3 Oct 2012 11:18:23 -1.22
> +++ sys/arch/mips64/mips64/lcore_float.S3 Jan 2019 09:34:57 -
> @@ -39,6 +39,7 @@
> 
>  .setmips3
>  .setnoreorder# Noreorder is default style!
> +.sethardfloat
> 
>  
> /*
>   *
> Index: sys/arch/mips64/mips64/tlbhandler.S
> ===
> RCS file: /cvs/src/sys/arch/mips64/mips64/tlbhandler.S,v
> retrieving revision 1.48
> diff -u -p -u -r1.48 tlbhandler.S
> --- sys/arch/mips64/mips64/tlbhandler.S13 Dec 2018 16:35:07 -1.48
> +++ sys/arch/mips64/mips64/tlbhandler.S3 Jan 2019 09:34:57 -
> @@ -337,7 +337,7 @@ sys_stk_chk:
>  GET_CPU_INFO(k1, k0)
>  PTR_Lk1, CI_CURPROCPADDR(k1)
>  PTR_SUBU k0, sp, k1# check to see if we have a
> -sltiuk0, 2048#  valid kernel stack
> +sltiuk0, k0, 2048#  valid kernel stack
>  beqzk0, go_k_general# yes, handle.
>  nop
> 
> @@ -414,7 +414,8 @@ LEAF(tlb_flush, 0)
>  mtc0ta1, COP_0_TLB_INDEX# Set the index register.
>  #ifdef CPU_OCTEON
>  dmtc0v0, COP_0_TLB_HI# Mark entry high as invalid
> -PTR_ADDU v0, v0, 2 * PAGE_SIZE
> +lita2, 2 * PAGE_SIZE
> +PTR_ADDU v0, v0, ta2
>  #endif
>  adduta1, ta1, 1# Increment index.
>  TLB_HAZARD
> Index: sys/arch/octeon/conf/Makefile.octeon
> ===
> RCS file: /cvs/src/sys/arch/octeon/conf/Makefile.octeon,v
> retrieving revision 1.51
> diff -u -p -u -r1.51 Makefile.octeon
> --- sys/arch/octeon/conf/Makefile.octeon31 Oct 2018 10:15:47 -1.51
> +++ sys/arch/octeon/conf/Makefile.octeon3 Jan 2019 09:34:57 -
> @@ -43,6 +43,10 @@ CMACHFLAGS+=-fno-stack-protector
>  .if ${IDENT:M-DSMALL_KERNEL}
>  SORTR=cat
>  .endif
> +.if ${COMPILER_VERSION:Mclang}
> +NO_INTEGR_AS=   -no-integrated-as
> +CWARNFLAGS+=-Wno-address-of-packed-member -Wno-constant-conversion
> +.endif
> 
>  DEBUG?=-g
>  COPTS?=-O2
> @@ -100,7 +104,7 @@ LINKFLAGS+=-S
>  assym.h: $S/kern/genassym.sh Makefile \
>   ${_archdir}/${_arch}/genassym.cf ${_machdir}/${_mach}/genassym.cf
>  cat ${_archdir}/${_arch}/genassym.cf ${_machdir}/${_mach}/genassym.cf | \
> -sh $S/kern/genassym.sh ${CC} ${CFLAGS} ${CPPFLAGS} -MF
> assym.P > assym.h.tmp
> +sh $S/kern/genassym.sh ${CC} ${NO_INTEGR_AS} ${CFLAGS}
> ${CPPFLAGS} -MF assym.P > assym.h.tmp
>  sed '1s/.*/assym.h: \\/' assym.P > assym.d
>  sort -u assym.h.tmp > assym.h
> 
> Index: sys/arch/octeon/dev/octcrypto_asm.S
> ===
> RCS file: /cvs/src/sys/arch/octeon/dev/octcrypto_asm.S,v
> retrieving revision 1.1
> diff -u -p -u -r1.1 octcrypto_asm.S
> --- 

Re: Patch for install64.octeon : EdgeRouter 6 info

2018-12-21 Thread Visa Hankala
On Thu, Dec 20, 2018 at 07:39:44PM -0500, Chris McGee wrote:
> That looks much better to me, though it does get machine-specific
> 
> I feel like it would be more clear if the examples uniformly used your
> new variable
> convention, as some do and some do not. This allows us to eliminate one 
> example
> too. (I assume the ER Lite does OK if you specify numcores=1 ? )
> 
> I left in the ER Lite machine-specific example about resetting the USB
> controller.
> I only booted my new ER6 from USB a few times before switching to MMC,
> but it did not seem to have any issues detecting the USB controller or 
> devices.
> (It WAS picky about what USB sticks it would work correctly with.)
> 
> This is a diff from -current's /usr/src/distrib/notes/octeon/install
> 
> me@box> diff -u install install.ori

Please use cvs to generate diffs, so that the output would be applicable
to the source code tree in the version control system.

I have committed an adapted version of the patch.



Re: Patch for install64.octeon : EdgeRouter 6 info

2018-12-18 Thread Visa Hankala
On Mon, Dec 17, 2018 at 11:22:40PM -0500, Chris McGee wrote:
> Hi:
> 
>   I would like to add some info for Edgerouter 6
> (and presumably ER4, and maybe also ER12?) to install64.octeon.
> The document is great but it won't get a new user booting on the new
> 4-core machines with MMC drives.
> 
> I tried to make it as brief as possible while pointing the user in the right
> direction, so for example it mentions that you're going to need to drop
> bsd.mp into the msdos kernel loader partition but doesn't explain how
> to do that. Seemed to be the right level of detail for this document.
> 
> Here is a diff with my additions. Diff is from
> /OpenBSD/6.4/INSTALL.octeon.
> 
> me@box> diff INSTALL.octeon INSTALL.octeon.er6
> 690a691,692
> > For the EdgeRouter Lite:
> >
> 702a705,710
> > For the EdgeRouter 6, installing to the internal MMC drive:
> >
> >   # setenv bootcmd 'fatload mmc 0 ${loadaddr} bsd;bootoctlinux 
> > coremask=0xf rootdev=/dev/sd0'
> >   # setenv bootdelay 5
> >   # saveenv
> >
> 707c715
> < On multi-core systems, the numcores parameter enables the secondary CPUs.
> ---
> > On multi-core systems, the numcores parameter enables multiple cores.
> 708a717,719
> > Note that this boot command does not actually put a multiprocessor kernel in
> > place; you will also need to copy the bsd.mp kernel to the octeon MS-DOS
> > partition (disklabel i by default) on your boot drive for multicore support.
> 709a721
> > Example booting from USB on the Edgerouter Lite:
> 711a724,726
> > Example booting from USB on the EdgeRouter 6:
> >   fatload usb 0 ${loadaddr} bsd; bootoctlinux rootdev=sd0 numcores=4
> >
> 716a732,736
> > If you installed from a USB stick to the MMC on an EdgeRouter 4/6/8:
> > The machine assigns sd0 to USB first if present, then to MMC if present.
> > If you leave the USB install stick in, the machine will try to boot it.
> > Removing the USB device will cause sd0 to be assigned to mmc0 next boot,
> > allowing the machine to boot your newly-installed OpenBSD drive.

Good points. However, I would like to keep the text general and avoid
listing machine specifics if possible. Does the patch below make the
text any clearer?

In principle, the installer could show an example bootoctlinux command
with the correct parameters for the system.

Future snapshots built after today should handle the copying of bsd.mp
automatically.

Index: notes/octeon/install
===
RCS file: src/distrib/notes/octeon/install,v
retrieving revision 1.16
diff -u -p -r1.16 install
--- notes/octeon/install30 Nov 2017 15:25:37 -  1.16
+++ notes/octeon/install18 Dec 2018 16:21:22 -
@@ -56,8 +56,8 @@ restore it later if needed:
 
 ${bootcmd} is run by U-Boot when ${autoload} is enabled. Now create a new
 ${bootcmd} which will load an ELF file called 'bsd' from the first active FAT
-partition on the first CF card or USB device. The FAT partition has been 
created
-by the installer.
+partition on the first CF card. The FAT partition has been created by the
+installer.
 
# setenv bootcmd 'fatload ide 0:1 ${loadaddr} bsd;bootoctlinux 
rootdev=/dev/octcf0'
# setenv bootdelay 5
@@ -71,9 +71,19 @@ by the installer.
Protected 1 sectors
#
 
-If you have installed onto USB use the following bootcmd instead:
+If you have installed onto eMMC, SATA or USB, use the following
+bootcmd instead:
 
-  fatload usb 0 ${loadaddr} bsd; bootoctlinux rootdev=sd0
+  fatload <device> 0 ${loadaddr} bsd; bootoctlinux rootdev=sd0
+
+where you replace ``<device>'' with ``mmc'', ``sata'' or ``usb''.
+
+For stable root disk selection, you can specify the disk
+by disklabel(8) UID (DUID):
+
+  fatload usb 0 ${loadaddr} bsd; bootoctlinux rootdev=<duid>
+
+where ``<duid>'' is the DUID of your root disk.
 
 On multi-core systems, the numcores parameter enables the secondary CPUs.
 Use the total number of cores on your system as the value of the parameter.



Re: SGI O2 mec(4) cold boot issue (and workaround)

2018-12-09 Thread Visa Hankala
On Sat, Dec 08, 2018 at 05:32:41PM +, Miod Vallat wrote:
> I have noticed, for a while, that my O2 systems were horribly slow
> during installs or upgrades, when fetching sets from the network (28
> *minutes* to fetch base64.tgz).
> 
> At first, I thought this was a bsd.rd specific bug, but couldn't find
> anything obvious. After gathering enough data, I found out that the
> problem only occurs on a cold boot. After a reboot, the network
> performance is as good as it can be. That would explain why I would only
> notice it during upgrades.
> 
> I also noticed that, on a warm boot, the dmesg would show:
> 
> mec0 at macebus0 base 0x0028 irq 3: MAC-110 rev 1, address 
> 08:00:69:0e:bf:a1
> nsphy0 at mec0 phy 8: DP83840 10/100 PHY, rev. 1
> 
> but on cold boots, it would show:
> 
> mec0 at macebus0 base 0x0028 irq 3: MAC-110 rev 1, address 
> 08:00:69:0e:bf:a1
> nsphy0 at mec0 phy 10: DP83840 10/100 PHY, rev. 1
> 
> Note that, in these cases, the phy seems to attach to a different
> address. In these cases, after booting, "ifconfig mec0" would show:
> 
> mec0: flags=8843 mtu 1500
> lladdr 08:00:69:0e:bf:a1
> llprio 3
>   media: Ethernet autoselect
> status: active
> inet 10.0.1.193 netmask 0xff00 broadcast 10.255.255.255
> 
> while one would expect the "media" line to be similar to:
> 
> media: Ethernet autoselect (100baseTX full-duplex)
> 
> Investigating further, it seems that, after a cold boot, the MII bus
> takes some time to initialize; the phy does not answer to address 8 but
> to a larger address (10 or 11), then, after being reset, to its correct
> address of 8.
> 
> So the kernel would discover the phy at a wrong address, attach it, and
> after it gets reset, reading from the phy at the wrong address would
> return either all bits clear or all bits set, confusing the link speed
> logic without any way to recover.
> 
> What I tried but did not work:
> - invoking mii_attach() twice in the mec driver. This would attach nsphy
>   twice, once at the wrong address, then once at the correct address,
>   but the first (wrong) attachment would be preferred.
> - adding a one second delay between the Ethernet interface reset and
>   mii_attach(). This would work most of the time, but not always.
> 
> What I tried and works:
> - the first time the interface is reset, the mii bus is walked and all
>   phys found on it are reset. Thus, by the time mii_attach() runs and
>   walks the bus again, the phy will answer at the right address.
> 
> The diff below implements this (last chunk of if_mec.c), and also cleans
> the mii read/write routines a bit (all the other chunks).
> 
> Tested on three different R5K family O2 systems, which have all been
> exposing that problem on cold boot.

Committed. Thank you!
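
The workaround boils down to something like the following in the interface
reset path (a sketch with assumed names; sc_phy_reset_done is hypothetical,
see if_mec.c for the committed code):

	/* On the first reset, kick every phy so it settles at its address. */
	if (!sc->sc_phy_reset_done) {
		int phy;

		for (phy = 0; phy < MII_NPHY; phy++) {
			if (mec_mii_readreg(&sc->sc_dev, phy, MII_BMSR) == 0)
				continue;
			mec_mii_writereg(&sc->sc_dev, phy, MII_BMCR,
			    BMCR_RESET);
		}
		sc->sc_phy_reset_done = 1;
	}

By the time mii_attach() walks the bus afterwards, the phy answers at its
correct address.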



Re: Unlock sendit-based syscalls

2018-06-27 Thread Visa Hankala
Here is a new version of the f_count atomic update diff. Now all
updating of the `f_count' field should use atomic operations, and
the `fhdlk' is only used for serializing access to the list of open
files. In addition, the role of the `fd_fplock' should be clearer now:
its only purpose is to let fd_getfile() instantiate a file reference
safely. The `fd_lock' has to be locked when changing a file descriptor
table.

When the unlocking of system calls proceeds further, it will become
necessary to protect the `fd_ofileflags' field with a lock because
of fdexpand() and to maintain coherency with `fd_ofiles'. The patch
includes two small, logically separate, pieces towards that: fdp
locking for fcntl(F_GETFD), and fdp locking for unp_internalize().
The former merely ensures that the `fd_ofileflags' array is valid,
while the latter also makes sure the `fd_ofileflags[fd]' entry matches
with the file instance.
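
With `f_count' updated atomically everywhere, taking and dropping a
reference reduces to roughly the following (a sketch of the intent, not the
exact macros in the diff):

	/* Take an extra reference; the caller already holds one. */
	atomic_inc_int(&fp->f_count);

	/* Drop a reference; the last one frees the file. */
	if (atomic_dec_int_nv(&fp->f_count) == 0)
		fdrop(fp, p);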

OK?

Index: kern/kern_descrip.c
===
RCS file: src/sys/kern/kern_descrip.c,v
retrieving revision 1.171
diff -u -p -r1.171 kern_descrip.c
--- kern/kern_descrip.c 26 Jun 2018 14:43:01 -  1.171
+++ kern/kern_descrip.c 27 Jun 2018 12:52:25 -
@@ -198,6 +198,7 @@ struct file *
 fd_iterfile(struct file *fp, struct proc *p)
 {
struct file *nfp;
+   unsigned int count;
 
	mtx_enter(&fhdlk);
if (fp == NULL)
@@ -206,10 +207,15 @@ fd_iterfile(struct file *fp, struct proc
nfp = LIST_NEXT(fp, f_list);
 
/* don't refcount when f_count == 0 to avoid race in fdrop() */
-   while (nfp != NULL && nfp->f_count == 0)
-   nfp = LIST_NEXT(nfp, f_list);
-   if (nfp != NULL)
-   nfp->f_count++;
+   while (nfp != NULL) {
+   count = nfp->f_count;
+   if (count == 0) {
+   nfp = LIST_NEXT(nfp, f_list);
+   continue;
+   }
+   if (atomic_cas_uint(&nfp->f_count, count, count + 1) == count)
+   break;
+   }
	mtx_leave(&fhdlk);
 
if (fp != NULL)
@@ -228,11 +234,11 @@ fd_getfile(struct filedesc *fdp, int fd)
if ((u_int)fd >= fdp->fd_nfiles)
return (NULL);
 
-   mtx_enter(&fhdlk);
+   mtx_enter(&fdp->fd_fplock);
fp = fdp->fd_ofiles[fd];
if (fp != NULL)
-   fp->f_count++;
-   mtx_leave(&fhdlk);
+   atomic_inc_int(&fp->f_count);
+   mtx_leave(&fdp->fd_fplock);
 
return (fp);
 }
@@ -433,7 +439,9 @@ restart:
return (error);
 
case F_GETFD:
+   fdplock(fdp);
*retval = fdp->fd_ofileflags[fd] & UF_EXCLOSE ? 1 : 0;
+   fdpunlock(fdp);
break;
 
case F_SETFD:
@@ -652,7 +660,7 @@ finishdup(struct proc *p, struct file *f
fdpassertlocked(fdp);
KASSERT(fp->f_iflags & FIF_INSERTED);
 
-   if (fp->f_count == LONG_MAX-2) {
+   if (fp->f_count >= FDUP_MAX_COUNT) {
FRELE(fp, p);
return (EDEADLK);
}
@@ -666,7 +674,14 @@ finishdup(struct proc *p, struct file *f
fd_used(fdp, new);
}
 
+   /*
+* Use `fd_fplock' to synchronize with fd_getfile() so that
+* the function no longer creates a new reference to the old file.
+*/
+   mtx_enter(&fdp->fd_fplock);
fdp->fd_ofiles[new] = fp;
+   mtx_leave(&fdp->fd_fplock);
+
fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
*retval = new;
 
@@ -694,18 +709,31 @@ fdinsert(struct filedesc *fdp, int fd, i
	LIST_INSERT_HEAD(&filehead, fp, f_list);
}
}
+   mtx_leave(&fhdlk);
+
+   mtx_enter(&fdp->fd_fplock);
KASSERT(fdp->fd_ofiles[fd] == NULL);
fdp->fd_ofiles[fd] = fp;
+   mtx_leave(&fdp->fd_fplock);
+
fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE);
-   mtx_leave(&fhdlk);
 }
 
 void
 fdremove(struct filedesc *fdp, int fd)
 {
fdpassertlocked(fdp);
+
+   /*
+* Use `fd_fplock' to synchronize with fd_getfile() so that
+* the function no longer creates a new reference to the file.
+*/
+   mtx_enter(&fdp->fd_fplock);
fdp->fd_ofiles[fd] = NULL;
+   mtx_leave(&fdp->fd_fplock);
+
fdp->fd_ofileflags[fd] = 0;
+
fd_unused(fdp, fd);
 }
 
@@ -839,6 +867,8 @@ fdalloc(struct proc *p, int want, int *r
int lim, last, i;
u_int new, off;
 
+   fdpassertlocked(fdp);
+
/*
 * Search for a free descriptor starting at the higher
 * of want or fd_freefile.  If that fails, consider
@@ -886,14 +916,17 @@ void
 fdexpand(struct proc *p)
 {
struct filedesc *fdp = p->p_fd;
-   int nfiles;
+   int nfiles, oldnfiles;
size_t copylen;
-   struct file **newofile;
+   struct file **newofile, **oldofile;
char *newofileflags;
u_int *newhimap, *newlomap;
 
fdpassertlocked(fdp);
 
+   oldnfiles 

Re: Unlock sendit-based syscalls

2018-06-21 Thread Visa Hankala
On Thu, Jun 21, 2018 at 04:49:48PM +, Visa Hankala wrote:
> Here is an updated diff that has been adjusted for the current tree.

Hmm, the diff is not right yet. Please ignore.



Re: Unlock sendit-based syscalls

2018-06-21 Thread Visa Hankala
On Tue, Jun 19, 2018 at 03:36:57PM +, Visa Hankala wrote:
> On Tue, Jun 19, 2018 at 03:58:51PM +0200, Mark Kettenis wrote:
> > > Date: Tue, 19 Jun 2018 15:38:01 +0200
> > > From: Martin Pieuchot 
> > > 
> > > On 19/06/18(Tue) 14:55, Mark Kettenis wrote:
> > > > > To avoid races with another thread that might be clearing our pointer
> > > > > in `fd_ofiles', we need more than atomic operations.  For that we need
> > > > > to serialize the threads.  The most simple way to do so is with a 
> > > > > mutex
> > > > > on a different data structure.  Either global, like in my diff, or as
> > > > > visa@ suggested in the corresponding `fdp'.
> > > > 
> > > > Right.  Another case of trying to take a reference without holding one
> > > > already.  The trick here is to use the fact that as long as there is a
> > > > file descriptor allocated for this open file the reference count is at
> > > > least 1.  So in that case taking a reference is safe.  Your global
> > > > mutex does indeed do the trick.  But why aren't you using the file
> > > > descriptor table rwlock for this purpose?  Is that because there is a
> > > > lock ordering problem between the kernel lock and that rwlock?
> > > 
> > > I have two reasons.  First I don't want to introduce new sleeping points,
> > > secondly I want this mutex to disappear.  IMHO extending the scope of a
> > > lock is going in the wrong direction because then we'll want to split it.
> > > That's why I'm happy that our discussion made visa@ look at improving 
> > > this.
> > 
> > I wouldn't call this extending the scope of the lock.  But yes, it
> > might cause unnecessary sleeps if a write lock is held for the purpose
> > of opening a file.  The mutex that visa@ proposes trades that in for
> > potential contention in the multiple-readers case.
> 
> Below is a revised version of the diff. Like mpi@'s diff, it uses
> a `fhdlk' mutex for serializing access to the file list. However, that
> lock is not used for anything else. Each file descriptor table has a
> dedicated mutex that allows fd_getfile() acquire a file reference
> safely.
> 
> fd_iterfile() uses a compare-and-swap sequence for reference
> acquisition. The sequence ensures that once `f_count' has become zero
> the value is never raised.

Here is an updated diff that has been adjusted for the current tree.

OK?

Index: kern/kern_descrip.c
===
RCS file: src/sys/kern/kern_descrip.c,v
retrieving revision 1.167
diff -u -p -r1.167 kern_descrip.c
--- kern/kern_descrip.c 20 Jun 2018 10:52:49 -  1.167
+++ kern/kern_descrip.c 21 Jun 2018 16:10:39 -
@@ -198,6 +198,7 @@ struct file *
 fd_iterfile(struct file *fp, struct proc *p)
 {
struct file *nfp;
+   unsigned int count;
 
	mtx_enter(&fhdlk);
if (fp == NULL)
@@ -206,10 +207,15 @@ fd_iterfile(struct file *fp, struct proc
nfp = LIST_NEXT(fp, f_list);
 
/* don't refcount when f_count == 0 to avoid race in fdrop() */
-   while (nfp != NULL && nfp->f_count == 0)
-   nfp = LIST_NEXT(nfp, f_list);
-   if (nfp != NULL)
-   nfp->f_count++;
+   while (nfp != NULL) {
+   count = nfp->f_count;
+   if (count == 0) {
+   nfp = LIST_NEXT(nfp, f_list);
+   continue;
+   }
+   if (atomic_cas_uint(&nfp->f_count, count, count + 1) == count)
+   break;
+   }
	mtx_leave(&fhdlk);
 
if (fp != NULL)
@@ -228,11 +234,11 @@ fd_getfile(struct filedesc *fdp, int fd)
if ((u_int)fd >= fdp->fd_nfiles)
return (NULL);
 
-   mtx_enter(&fhdlk);
+   mtx_enter(&fdp->fd_fplock);
fp = fdp->fd_ofiles[fd];
if (fp != NULL)
-   fp->f_count++;
-   mtx_leave(&fhdlk);
+   atomic_inc_int(&fp->f_count);
+   mtx_leave(&fdp->fd_fplock);
 
return (fp);
 }
@@ -652,7 +658,7 @@ finishdup(struct proc *p, struct file *f
fdpassertlocked(fdp);
KASSERT(fp->f_iflags & FIF_INSERTED);
 
-   if (fp->f_count == LONG_MAX-2) {
+   if (fp->f_count >= FDUP_MAX_COUNT) {
FRELE(fp, p);
return (EDEADLK);
}
@@ -666,8 +672,10 @@ finishdup(struct proc *p, struct file *f
fd_used(fdp, new);
}
 
+   mtx_enter(&fdp->fd_fplock);
fdp->fd_ofiles[new] = fp;
fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
+   mtx_leave(&fdp->fd_fplock);
*retval = new;
 
if (oldfp != NULL) {
@@ -686,2

Re: Unlock sendit-based syscalls

2018-06-19 Thread Visa Hankala
On Tue, Jun 19, 2018 at 03:58:51PM +0200, Mark Kettenis wrote:
> > Date: Tue, 19 Jun 2018 15:38:01 +0200
> > From: Martin Pieuchot 
> > 
> > On 19/06/18(Tue) 14:55, Mark Kettenis wrote:
> > > > To avoid races with another thread that might be clearing our pointer
> > > > in `fd_ofiles', we need more than atomic operations.  For that we need
> > > > to serialize the threads.  The most simple way to do so is with a mutex
> > > > on a different data structure.  Either global, like in my diff, or as
> > > > visa@ suggested in the corresponding `fdp'.
> > > 
> > > Right.  Another case of trying to take a reference without holding one
> > > already.  The trick here is to use the fact that as long as there is a
> > > file descriptor allocated for this open file the reference count is at
> > > least 1.  So in that case taking a reference is safe.  Your global
> > > mutex does indeed do the trick.  But why aren't you using the file
> > > descriptor table rwlock for this purpose?  Is that because there is a
> > > lock ordering problem between the kernel lock and that rwlock?
> > 
> > I have two reasons.  First I don't want to introduce new sleeping points,
> > secondly I want this mutex to disappear.  IMHO extending the scope of a
> > lock is going in the wrong direction because then we'll want to split it.
> > That's why I'm happy that our discussion made visa@ look at improving this.
> 
> I wouldn't call this extending the scope of the lock.  But yes, it
> might cause unnecessary sleeps if a write lock is held for the purpose
> of opening a file.  The mutex that visa@ proposes trades that in for
> potential contention in the multiple-readers case.

Below is a revised version of the diff. Like mpi@'s diff, it uses
a `fhdlk' mutex for serializing access to the file list. However, that
lock is not used for anything else. Each file descriptor table has a
dedicated mutex that allows fd_getfile() to acquire a file reference
safely.

fd_iterfile() uses a compare-and-swap sequence for reference
acquisition. The sequence ensures that once `f_count' has become zero
the value is never raised.

This diff does not unlock any system calls.


Index: kern/kern_descrip.c
===
RCS file: src/sys/kern/kern_descrip.c,v
retrieving revision 1.166
diff -u -p -r1.166 kern_descrip.c
--- kern/kern_descrip.c 18 Jun 2018 09:15:05 -  1.166
+++ kern/kern_descrip.c 19 Jun 2018 15:13:05 -
@@ -66,7 +66,11 @@
 
 /*
  * Descriptor management.
+ *
+ * We need to block interrupts as long as `fhdlk' is being taken
+ * with and without the KERNEL_LOCK().
  */
+struct mutex fhdlk = MUTEX_INITIALIZER(IPL_MPFLOOR);
 struct filelist filehead;  /* head of list of open files */
 int numfiles;  /* actual number of open files */
 
@@ -194,17 +198,25 @@ struct file *
 fd_iterfile(struct file *fp, struct proc *p)
 {
struct file *nfp;
+   unsigned int count;
 
+   mtx_enter(&fhdlk);
if (fp == NULL)
	nfp = LIST_FIRST(&filehead);
else
nfp = LIST_NEXT(fp, f_list);
 
/* don't FREF when f_count == 0 to avoid race in fdrop() */
-   while (nfp != NULL && nfp->f_count == 0)
-   nfp = LIST_NEXT(nfp, f_list);
-   if (nfp != NULL)
-   FREF(nfp);
+   while (nfp != NULL) {
+   count = nfp->f_count;
+   if (count == 0) {
+   nfp = LIST_NEXT(nfp, f_list);
+   continue;
+   }
+   if (atomic_cas_uint(&nfp->f_count, count, count + 1) == count)
+   break;
+   }
+   mtx_leave(&fhdlk);
 
if (fp != NULL)
FRELE(fp, p);
@@ -217,10 +229,17 @@ fd_getfile(struct filedesc *fdp, int fd)
 {
struct file *fp;
 
-   if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL)
+   vfs_stall_barrier();
+
+   if ((u_int)fd >= fdp->fd_nfiles)
return (NULL);
 
-   FREF(fp);
+   mtx_enter(&fdp->fd_fplock);
+   fp = fdp->fd_ofiles[fd];
+   if (fp != NULL)
+   atomic_inc_int(&fp->f_count);
+   mtx_leave(&fdp->fd_fplock);
+
return (fp);
 }
 
@@ -639,7 +658,7 @@ finishdup(struct proc *p, struct file *f
fdpassertlocked(fdp);
KASSERT(fp->f_iflags & FIF_INSERTED);
 
-   if (fp->f_count == LONG_MAX-2) {
+   if (fp->f_count >= FDUP_MAX_COUNT) {
FRELE(fp, p);
return (EDEADLK);
}
@@ -653,8 +672,10 @@ finishdup(struct proc *p, struct file *f
fd_used(fdp, new);
}
 
+   mtx_enter(&fdp->fd_fplock);
fdp->fd_ofiles[new] = fp;
fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
+   mtx_leave(&fdp->fd_fplock);
*retval = new;
 
if (oldfp != NULL) {
@@ -671,22 +692,34 @@ fdinsert(struct filedesc *fdp, int fd, i
struct file *fq;
 
fdpassertlocked(fdp);
+
+   mtx_enter(&fhdlk);
+   fp->f_iflags |= 

Re: Make witness(4) ready for UP systems

2018-06-14 Thread Visa Hankala
On Wed, Jun 13, 2018 at 10:45:08PM +0200, Christian Ludwig wrote:
> It makes sense to have witness(4) on uniprocessor systems, too. Lock-order
> violations are not an MP-only thing. Since UP kernels do not have the kernel
> lock, wrap the code in appropriate ifdefs.

Committed. Thank you.



Re: make octeon kernels compile with DEBUG.

2018-06-13 Thread Visa Hankala
On Wed, Jun 13, 2018 at 08:34:46AM +0200, Janne Johansson wrote:
> The unconditional #define DEBUG in octeon/machdep.c is somewhat weird.
> 
> Should we just keep the whole block and remove the #ifdefs, move it to
> #if 1 for later easy removal? Dunno, but it won't compile with DEBUG
> unless something is done or it will complain about DEBUG getting redefined.

I think it is best to make the block unconditional. The output contains
interesting information after all.

I committed the patch with minor tweaks and added two changes to cover
the RAMDISK kernel.

Thank you!



Locking problem with open(2)

2018-06-02 Thread Visa Hankala
The open(2) system call uses a problematic locking pattern that can
cause problems when the process has more than one thread. The system
call keeps the file descriptor table locked when calling vn_open(9).
If vn_open(9) blocks, the other threads get blocked too if they try
to modify the descriptor table. The issue is explained by art@ on
the following page (mpi@ brought this to my attention):

https://stackoverflow.com/questions/23440216/race-condition-when-using-dup2/24012015#24012015

The following patch makes open(2) use a locking pattern similar to
accept(2). The descriptor table lock is released immediately after
allocating a file descriptor, so other threads are not directly
affected if vn_open(9) blocks. Once vn_open(9) has finished and the
code is done with the vnode, the descriptor table lock is reacquired
for inserting the file descriptor.

The change of the locking pattern has an effect on dup2(2).
If vn_open(9) has blocked and another thread tries to dup2() to the
already allocated but not yet inserted file descriptor, dup2()
fails with error EBUSY. accept(2) already has that behaviour.

OK?  Tests are welcome too.
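
A minimal userland sketch of the scenario (hypothetical FIFO path; before
this change the dup2() stalls for as long as the open() blocks, afterwards
it completes immediately):

	#include <fcntl.h>
	#include <pthread.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Blocks until some other process opens the FIFO for writing. */
	static void *
	opener(void *arg)
	{
		open("/tmp/testfifo", O_RDONLY);
		return (NULL);
	}

	int
	main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, opener, NULL);
		sleep(1);			/* let the open() block */
		if (dup2(0, 50) == 50)		/* unrelated descriptor slot */
			printf("dup2 completed\n");
		return (0);
	}

Create the FIFO with mkfifo(1) beforehand and link with -lpthread.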

Index: lib/libc/sys/dup.2
===
RCS file: src/lib/libc/sys/dup.2,v
retrieving revision 1.19
diff -u -p -r1.19 dup.2
--- lib/libc/sys/dup.2  28 May 2018 08:55:11 -  1.19
+++ lib/libc/sys/dup.2  2 Jun 2018 10:51:48 -
@@ -160,6 +160,8 @@ limit.
 .It Bq Er EBUSY
 A race condition with
 .Xr accept 2
+or
+.Xr open 2
 has been detected.
 .It Bq Er EINTR
 An interrupt was received.
Index: sys/kern/vfs_syscalls.c
===
RCS file: src/sys/kern/vfs_syscalls.c,v
retrieving revision 1.284
diff -u -p -r1.284 vfs_syscalls.c
--- sys/kern/vfs_syscalls.c 2 Jun 2018 10:27:43 -   1.284
+++ sys/kern/vfs_syscalls.c 2 Jun 2018 10:51:48 -
@@ -916,6 +916,8 @@ doopenat(struct proc *p, int fd, const c
fdplock(fdp);
	if ((error = falloc(p, &fp, &indx)) != 0)
goto out;
+   fdpunlock(fdp);
+
flags = FFLAGS(oflags);
if (flags & FREAD)
ni_pledge |= PLEDGE_RPATH;
@@ -935,6 +937,7 @@ doopenat(struct proc *p, int fd, const c
flags &= ~O_TRUNC;  /* Must do truncate ourselves */
}
	if ((error = vn_open(&nd, flags, cmode)) != 0) {
+   fdplock(fdp);
if (error == ENODEV &&
p->p_dupfd >= 0 &&  /* XXX from fdopen */
(error =
@@ -969,6 +972,7 @@ doopenat(struct proc *p, int fd, const c
VOP_UNLOCK(vp);
	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
if (error) {
+   fdplock(fdp);
/* closef will vn_close the file for us. */
fdremove(fdp, indx);
closef(fp, p);
@@ -991,6 +995,7 @@ doopenat(struct proc *p, int fd, const c
}
if (error) {
VOP_UNLOCK(vp);
+   fdplock(fdp);
/* closef will close the file for us. */
fdremove(fdp, indx);
closef(fp, p);
@@ -999,6 +1004,7 @@ doopenat(struct proc *p, int fd, const c
}
VOP_UNLOCK(vp);
*retval = indx;
+   fdplock(fdp);
fdinsert(fdp, indx, cloexec, fp);
FRELE(fp, p);
 out:



Re: Remove unused C3 values (VIA)

2018-06-02 Thread Visa Hankala
On Sat, Jun 02, 2018 at 01:00:05PM +0200, Frederic Cambus wrote:
> Hi tech@,
> 
> Here is a diff to remove unused C3 values.
> 
> Comments? OK?

I would keep the definitions because they can at least serve as a weak
form of documentation about the hardware. For example,
C3_CRYPT_CWLO_ALG_M indicates what bits specify the algorithm, which
in turn helps one to understand the value of C3_CRYPT_CWLO_ALG_AES.
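
For instance, building a control word for AES-128 encryption with the
hardware key schedule would look roughly like this (illustrative sketch
only):

	uint32_t cw;

	/* Algorithm bits (masked by C3_CRYPT_CWLO_ALG_M) select AES. */
	cw = (C3_CRYPT_CWLO_ALG_AES & C3_CRYPT_CWLO_ALG_M) |
	    C3_CRYPT_CWLO_KEYGEN_HW | C3_CRYPT_CWLO_ENCRYPT |
	    C3_CRYPT_CWLO_KEY128;

Dropping the _M and _HW definitions would leave that intent implicit.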

> Index: sys/arch/amd64/include/specialreg.h
> ===
> RCS file: /cvs/src/sys/arch/amd64/include/specialreg.h,v
> retrieving revision 1.72
> diff -u -p -r1.72 specialreg.h
> --- sys/arch/amd64/include/specialreg.h   23 May 2018 05:37:01 -  
> 1.72
> +++ sys/arch/amd64/include/specialreg.h   2 Jun 2018 09:14:32 -
> @@ -873,14 +873,9 @@
>  #define C3_CPUID_DO_PMM  0x002000
>  
>  /* VIA C3 xcrypt-* instruction context control options */
> -#define  C3_CRYPT_CWLO_ROUND_M   0x000f
> -#define  C3_CRYPT_CWLO_ALG_M 0x0070
>  #define  C3_CRYPT_CWLO_ALG_AES   0x
> -#define  C3_CRYPT_CWLO_KEYGEN_M  0x0080
> -#define  C3_CRYPT_CWLO_KEYGEN_HW 0x
>  #define  C3_CRYPT_CWLO_KEYGEN_SW 0x0080
>  #define  C3_CRYPT_CWLO_NORMAL0x
> -#define  C3_CRYPT_CWLO_INTERMEDIATE  0x0100
>  #define  C3_CRYPT_CWLO_ENCRYPT   0x
>  #define  C3_CRYPT_CWLO_DECRYPT   0x0200
>  #define  C3_CRYPT_CWLO_KEY1280x000a  /* 128bit, 10 
> rds */
> Index: sys/arch/i386/include/specialreg.h
> ===
> RCS file: /cvs/src/sys/arch/i386/include/specialreg.h,v
> retrieving revision 1.66
> diff -u -p -r1.66 specialreg.h
> --- sys/arch/i386/include/specialreg.h28 May 2018 20:52:44 -  
> 1.66
> +++ sys/arch/i386/include/specialreg.h2 Jun 2018 09:14:32 -
> @@ -724,14 +724,9 @@
>  #define C3_CPUID_DO_PMM  0x002000
>  
>  /* VIA C3 xcrypt-* instruction context control options */
> -#define  C3_CRYPT_CWLO_ROUND_M   0x000f
> -#define  C3_CRYPT_CWLO_ALG_M 0x0070
>  #define  C3_CRYPT_CWLO_ALG_AES   0x
> -#define  C3_CRYPT_CWLO_KEYGEN_M  0x0080
> -#define  C3_CRYPT_CWLO_KEYGEN_HW 0x
>  #define  C3_CRYPT_CWLO_KEYGEN_SW 0x0080
>  #define  C3_CRYPT_CWLO_NORMAL0x
> -#define  C3_CRYPT_CWLO_INTERMEDIATE  0x0100
>  #define  C3_CRYPT_CWLO_ENCRYPT   0x
>  #define  C3_CRYPT_CWLO_DECRYPT   0x0200
>  #define  C3_CRYPT_CWLO_KEY1280x000a  /* 128bit, 10 
> rds */
> 



Re: Move FREF() inside fd_getfile()

2018-04-26 Thread Visa Hankala
On Thu, Apr 26, 2018 at 08:34:08AM +0200, Martin Pieuchot wrote:
> On 25/04/18(Wed) 17:07, Visa Hankala wrote:
> > On Wed, Apr 25, 2018 at 12:12:29PM +0200, Martin Pieuchot wrote:
> > > The goal is to avoid races between fd_getfile() and FREF().  So we want
> > > a properly refcounted 'struct file *' as soon as possible.
> > 
> > Boot hangs with this patch. The last line on the console is
> > "setting tty flags".
> > 
> > Two issues spotted so far:
> > 
> > > @@ -201,9 +202,10 @@ fd_getfile_mode(struct filedesc *fdp, in
> > >   KASSERT(mode != 0);
> > >  
> > >   fp = fd_getfile(fdp, fd);
> > > -
> > > - if (fp == NULL || (fp->f_flag & mode) == 0)
> > > + if (fp == NULL || (fp->f_flag & mode) == 0) {
> > > + FRELE(fp, curproc);
> > >   return (NULL);
> > > + }
> > >  
> > >   return (fp);
> > >  }
> > 
> > * The FRELE() above can dereference a NULL pointer.
> > 
> > * sys_close() lacks an FRELE().
> 
> Thanks, updated diff below.

OK visa@



Re: Move FREF() inside fd_getfile()

2018-04-25 Thread Visa Hankala
On Wed, Apr 25, 2018 at 12:12:29PM +0200, Martin Pieuchot wrote:
> The goal is to avoid races between fd_getfile() and FREF().  So we want
> a properly refcounted 'struct file *' as soon as possible.

Boot hangs with this patch. The last line on the console is
"setting tty flags".

Two issues spotted so far:

> @@ -201,9 +202,10 @@ fd_getfile_mode(struct filedesc *fdp, in
>   KASSERT(mode != 0);
>  
>   fp = fd_getfile(fdp, fd);
> -
> - if (fp == NULL || (fp->f_flag & mode) == 0)
> + if (fp == NULL || (fp->f_flag & mode) == 0) {
> + FRELE(fp, curproc);
>   return (NULL);
> + }
>  
>   return (fp);
>  }

* The FRELE() above can dereference a NULL pointer.

* sys_close() lacks an FRELE().



Re: vput(9) for NFS

2018-04-11 Thread Visa Hankala
On Wed, Apr 11, 2018 at 12:34:03PM +0200, Martin Pieuchot wrote:
> On 09/04/18(Mon) 16:10, Martin Pieuchot wrote:
> > Diff below implements most of the missing locking goo for NFS without
> > enabling it.  It does the following:
> > 
> > - Add a missing PDIRUNLOCK in nfs_lookup()
> > 
> > - Change vrele(9) into vput(9) where necessary.  nfs_nget() will in
> >   future return a locked NFSnode.  Just like ffs_vget() returns a
> >   locked vnode.  So every time it is called we need a corresponding
> >   vput(9).
> > 
> > - Make sure VN_KNOTE() is always called with a valid reference, before
> >   vrele(9) or vput(9).
> > 
> > - Substitute an XXX by an assert in nfs_removeit().
> > 
> > However as long as nfs_lock/unlock are noops, this diff should not
> > introduce any change in behavior.  But please do test this diff :)
> 
> New version of the diff after a review from visa@.  It includes:
> 
> - Stop calling vput(9) inside nfs_mknodrpc() to be able to call
>   VN_KNOTE() with a proper reference in nfs_mknod().
> 
> - Do not unconditionally unlock a node in an error case in
>   nfs_lookitup().
> 
> - Fix a dangling reference in the case of 'goto again' in nfs_create().

OK visa@, with one more tweak:

The KASSERT(VOP_ISLOCKED(sp->s_dvp)) in nfs_removeit() should be
commented out until the locking is enabled. Currently, VOP_ISLOCKED()
returns zero with NFS vnodes.



Re: NFS vs NET_LOCK()

2018-03-29 Thread Visa Hankala
On Thu, Mar 29, 2018 at 11:57:42AM +0200, Martin Pieuchot wrote:
> On 28/03/18(Wed) 16:09, Visa Hankala wrote:
> > On Wed, Mar 28, 2018 at 11:59:46AM +0200, Martin Pieuchot wrote:
> > > Index: uvm/uvm_vnode.c
> > > ===
> > > RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
> > > retrieving revision 1.99
> > > diff -u -p -r1.99 uvm_vnode.c
> > > --- uvm/uvm_vnode.c   8 Mar 2018 22:04:18 -   1.99
> > > +++ uvm/uvm_vnode.c   28 Mar 2018 09:56:51 -
> > > @@ -1105,6 +1105,7 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
> > >   off_t file_offset;
> > >   int waitf, result, mapinflags;
> > >   size_t got, wanted;
> > > + int netunlocked = 0;
> > >  
> > >   /* init values */
> > >   waitf = (flags & PGO_SYNCIO) ? M_WAITOK : M_NOWAIT;
> > > @@ -1174,18 +1175,24 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
> > >* Ideally, this kind of operation *should* work.
> > >*/
> > >   result = 0;
> > > - if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
> > > - result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL, curproc);
> > > -
> > > - if (result == 0) {
> > > - int netlocked = (rw_status(&netlock) == RW_WRITE);
> > > -
> > > + if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0) {
> > >   /*
> > >* This process may already have the NET_LOCK(), if we
> > >* faulted in copyin() or copyout() in the network stack.
> > >*/
> > > - if (netlocked)
> > > + if (rw_status(&netlock) == RW_WRITE) {
> > > + netunlocked = 1;
> > > + NET_UNLOCK();
> > > + }
> > > +
> > > + result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL, curproc);
> > > + }
> > > +
> > > + if (result == 0) {
> > > + if (!netunlocked && (rw_status(&netlock) == RW_WRITE)) {
> > > + netunlocked = 1;
> > >   NET_UNLOCK();
> > > + }
> > 
> > Is it necessary to have two unlock cases? It looks that a single
> > check of netlock + unlock before
> > the ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0) condition would
> > be enough.
> > 
> > if (rw_status(&netlock) == RW_WRITE) {
> > netunlocked = 1;
> > NET_UNLOCK();
> > }
> > 
> > result = 0;
> > if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
> > result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL, curproc);
> > 
> > if (result == 0) {
> > /* NOTE: vnode now locked! */
> > ...
> 
> You're completely right!  Updated diff below.

OK visa@

> Index: uvm/uvm_vnode.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
> retrieving revision 1.99
> diff -u -p -r1.99 uvm_vnode.c
> --- uvm/uvm_vnode.c   8 Mar 2018 22:04:18 -   1.99
> +++ uvm/uvm_vnode.c   29 Mar 2018 09:56:30 -
> @@ -1105,6 +1105,7 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
>   off_t file_offset;
>   int waitf, result, mapinflags;
>   size_t got, wanted;
> + int netunlocked = 0;
>  
>   /* init values */
>   waitf = (flags & PGO_SYNCIO) ? M_WAITOK : M_NOWAIT;
> @@ -1163,6 +1164,15 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
>   uio.uio_resid = wanted;
>   uio.uio_procp = curproc;
>  
> + /*
> +  * This process may already have the NET_LOCK(), if we
> +  * faulted in copyin() or copyout() in the network stack.
> +  */
> + if (rw_status(&netlock) == RW_WRITE) {
> + NET_UNLOCK();
> + netunlocked = 1;
> + }
> +
>   /* do the I/O!  (XXX: curproc?) */
>   /*
>* This process may already have this vnode locked, if we faulted in
> @@ -1178,15 +1188,6 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
>   result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL, curproc);
>  
>   if (result == 0) {
> - int netlocked = (rw_status(&netlock) == RW_WRITE);
> -
> - /*
> -  * This process may already have the NET_LOCK(), if we
> -  * faulted in copyin() or copyout() in the network stack.
> -  */
> - if (netlocked)
> - NET_UNLOCK();
> -
>   /* NOTE: vnode now locked! */
>   if (rw == UIO_READ)
>   result = VOP_READ(vn, , 0, curproc->p_ucred);
> @@ -1195,12 +1196,14 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
>   (flags & PGO_PDFREECLUST) ? IO_NOCACHE : 0,
>   curproc->p_ucred);
>  
> - if (netlocked)
> - NET_LOCK();
> -
>   if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
>   VOP_UNLOCK(vn, curproc);
> +
>   }
> +
> + if (netunlocked)
> + NET_LOCK();
> +
>  
>   /* NOTE: vnode now unlocked (unless vnislocked) */
>   /*
> 



Re: NFS vs NET_LOCK()

2018-03-28 Thread Visa Hankala
On Wed, Mar 28, 2018 at 11:59:46AM +0200, Martin Pieuchot wrote:
> Index: uvm/uvm_vnode.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
> retrieving revision 1.99
> diff -u -p -r1.99 uvm_vnode.c
> --- uvm/uvm_vnode.c   8 Mar 2018 22:04:18 -   1.99
> +++ uvm/uvm_vnode.c   28 Mar 2018 09:56:51 -
> @@ -1105,6 +1105,7 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
>   off_t file_offset;
>   int waitf, result, mapinflags;
>   size_t got, wanted;
> + int netunlocked = 0;
>  
>   /* init values */
>   waitf = (flags & PGO_SYNCIO) ? M_WAITOK : M_NOWAIT;
> @@ -1174,18 +1175,24 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
>* Ideally, this kind of operation *should* work.
>*/
>   result = 0;
> - if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
> - result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL, curproc);
> -
> - if (result == 0) {
> - int netlocked = (rw_status(&netlock) == RW_WRITE);
> -
> + if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0) {
>   /*
>* This process may already have the NET_LOCK(), if we
>* faulted in copyin() or copyout() in the network stack.
>*/
> - if (netlocked)
> + if (rw_status(&netlock) == RW_WRITE) {
> + netunlocked = 1;
> + NET_UNLOCK();
> + }
> +
> + result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL, curproc);
> + }
> +
> + if (result == 0) {
> + if (!netunlocked && (rw_status(&netlock) == RW_WRITE)) {
> + netunlocked = 1;
>   NET_UNLOCK();
> + }

Is it necessary to have two unlock cases? It looks that a single
check of netlock + unlock before
the ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0) condition would
be enough.

if (rw_status(&netlock) == RW_WRITE) {
netunlocked = 1;
NET_UNLOCK();
}

result = 0;
if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL, curproc);

if (result == 0) {
/* NOTE: vnode now locked! */
...



Re: NFS vs NET_LOCK()

2018-03-27 Thread Visa Hankala
On Tue, Mar 27, 2018 at 11:00:20AM +0200, Martin Pieuchot wrote:
> Diff below prevents a future lock ordering problem between NFSnode
> locks (similar to Inode locks) and the NET_LOCK().
> 
> It ensures the NET_LOCK() is always locked *after* any NFSnode lock
> by fixing the UVM fault case.  So we always have:
>   VFS -> NFS -> NFSnode lock -> socket -> NET_LOCK(). 
> 
> Ok?
> 
> Index: uvm/uvm_vnode.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
> retrieving revision 1.99
> diff -u -p -r1.99 uvm_vnode.c
> --- uvm/uvm_vnode.c   8 Mar 2018 22:04:18 -   1.99
> +++ uvm/uvm_vnode.c   20 Mar 2018 12:58:17 -
> @@ -1105,6 +1105,7 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
>   off_t file_offset;
>   int waitf, result, mapinflags;
>   size_t got, wanted;
> + int netlocked = 0;
>  
>   /* init values */
>   waitf = (flags & PGO_SYNCIO) ? M_WAITOK : M_NOWAIT;
> @@ -1174,18 +1175,24 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
>* Ideally, this kind of operation *should* work.
>*/
>   result = 0;
> - if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
> - result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL, curproc);
> -
> - if (result == 0) {
> - int netlocked = (rw_status(&netlock) == RW_WRITE);
> -
> + if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0) {
>   /*
>* This process may already have the NET_LOCK(), if we
>* faulted in copyin() or copyout() in the network stack.
>*/
> - if (netlocked)
> + if (rw_status(&netlock) == RW_WRITE) {
> + netlocked = 1;
> + NET_UNLOCK();
> + }
> +
> + result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL, curproc);
> + }
> +
> + if (result == 0) {
> + if (!netlocked && (rw_status(&netlock) == RW_WRITE)) {
> + netlocked = 1;
>   NET_UNLOCK();
> + }
>  
>   /* NOTE: vnode now locked! */
>   if (rw == UIO_READ)
> @@ -1195,11 +1202,12 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
>   (flags & PGO_PDFREECLUST) ? IO_NOCACHE : 0,
>   curproc->p_ucred);
>  
> + if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
> + VOP_UNLOCK(vn, curproc);
> +
>   if (netlocked)
>   NET_LOCK();
>  
> - if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
> - VOP_UNLOCK(vn, curproc);
>   }

This can leave NET_LOCK() unrestored if vn_lock() returns non-zero.



Re: sdhc(4) base clock

2018-03-18 Thread Visa Hankala
On Sun, Mar 18, 2018 at 06:33:59PM +0100, Mark Kettenis wrote:
> > Date: Sun, 18 Mar 2018 12:17:09 +
> > From: Visa Hankala <v...@openbsd.org>
> > 
> > On Sat, Mar 17, 2018 at 07:41:39PM +0100, Mark Kettenis wrote:
> > > Index: dev/sdmmc/sdhc.c
> > > ===
> > > RCS file: /cvs/src/sys/dev/sdmmc/sdhc.c,v
> > > retrieving revision 1.56
> > > diff -u -p -r1.56 sdhc.c
> > > --- dev/sdmmc/sdhc.c  10 Feb 2018 05:21:13 -  1.56
> > > +++ dev/sdmmc/sdhc.c  17 Mar 2018 18:34:08 -
> > > @@ -203,6 +203,11 @@ sdhc_host_found(struct sdhc_softc *sc, b
> > >   hp->clkbase = SDHC_BASE_FREQ_KHZ(caps);
> > >   }
> > >   if (hp->clkbase == 0) {
> > > + /* Make sure we can clock down to 400 kHz. */
> > > + max_clock = 400 * 2046;
> > > + hp->clkbase = sc->sc_clkbase;
> > > + }
> > 
> > The above looks strange. Did you mean "up to 400 MHz"?
> > max_clock is in kHz.
> 
> Right.  The maximum clock divisor for SDHC 3.0 is 2046.  So if the
> base clock is 400 * 2046 kHz, using the maximum divisor yields 400
> kHz.

Ah, now I understand. Please use a macro for the value 2046,
say SDHC_CLK_DIV_MAX_V3, to help the next code reader.
With that tweak, OK visa@
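
The suggested tweak would look roughly like this (a sketch; the macro name
is only a proposal):

	#define SDHC_CLK_DIV_MAX_V3	2046	/* SDHC 3.0 maximum divisor */

	/* 400 kHz * 2046 = 818400 kHz; dividing by 2046 gets back to 400 kHz. */
	max_clock = 400 * SDHC_CLK_DIV_MAX_V3;
	hp->clkbase = sc->sc_clkbase;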



Re: sdhc(4) base clock

2018-03-18 Thread Visa Hankala
On Sat, Mar 17, 2018 at 07:41:39PM +0100, Mark Kettenis wrote:
> Index: dev/sdmmc/sdhc.c
> ===
> RCS file: /cvs/src/sys/dev/sdmmc/sdhc.c,v
> retrieving revision 1.56
> diff -u -p -r1.56 sdhc.c
> --- dev/sdmmc/sdhc.c  10 Feb 2018 05:21:13 -  1.56
> +++ dev/sdmmc/sdhc.c  17 Mar 2018 18:34:08 -
> @@ -203,6 +203,11 @@ sdhc_host_found(struct sdhc_softc *sc, b
>   hp->clkbase = SDHC_BASE_FREQ_KHZ(caps);
>   }
>   if (hp->clkbase == 0) {
> + /* Make sure we can clock down to 400 kHz. */
> + max_clock = 400 * 2046;
> + hp->clkbase = sc->sc_clkbase;
> + }

The above looks strange. Did you mean "up to 400 MHz"?
max_clock is in kHz.



Re: MI mutex

2018-01-23 Thread Visa Hankala
On Tue, Jan 23, 2018 at 03:23:24PM +0100, Martin Pieuchot wrote:
> On 23/01/18(Tue) 14:06, Visa Hankala wrote:
> > In addition, you should put the common mutex code into kern_mutex.c.
> > Lets keep kern_lock.c for the mplock only.
> 
> I'm more in favor of putting everything into kern_lock.c.  We're talking
> about less than 400 lines of code.

To me this seems a bit odd at the moment, but fine.

> +#ifdef MP_LOCKDEBUG
> +#ifndef DDB
> +#error "MP_LOCKDEBUG requires DDB"
> +#endif
> +
> +/* CPU-dependent timing, needs this to be settable from ddb. */
> +extern int __mp_lock_spinout;
> +#endif

The above should be left out. There already is a similar block
at the start of the file. Otherwise OK visa@.



Re: MI mutex

2018-01-23 Thread Visa Hankala
On Mon, Jan 22, 2018 at 11:12:13AM +0100, Martin Pieuchot wrote:
> Diff below moves the common mutex implementation to kern/ and enable it for
> alpha, amd64, arm64, i386, mips64, powerpc.

Your diff seems to miss the necessary bits in the header.
In addition, you should put the common mutex code into kern_mutex.c.
Let's keep kern_lock.c for the mplock only.



Re: ddb(4) userland trace and SMAP

2017-12-07 Thread Visa Hankala
On Thu, Dec 07, 2017 at 11:43:09AM +0100, Martin Pieuchot wrote:
> On 05/12/17(Tue) 14:52, Visa Hankala wrote:
> > On Tue, Dec 05, 2017 at 11:32:53AM +0100, Martin Pieuchot wrote:
> > > On 04/12/17(Mon) 12:24, Martin Pieuchot wrote:
> > > > Since SMAP is enabled ddb(4)'s 'trace /u' and 'trace /p' for a userland
> > > > processes result, as expected, in page faults.
> > > > 
> > > > Diff below disable SMAP for the duration of the command.  This allows us
> > > > to see any possible frame corruption.
> > > 
> > > Updated version that:
> > > 
> > >  - Removes the goto by shuffling parameter tests
> > >  - Initializes cr4save to limit the effect of this gadget. 
> > >  - Skip lcr4() completely if the CPU doesn't support SMAP.
> > 
> > On i386, it might be necessary to make the rcr4() conditional to a CPU
> > feature flag because olden x86 processors do not have the CR4 register.
> > Condition curcpu()->ci_feature_sefflags_ebx & SEFF0EBX_SMAP might be
> > good enough in this case.

> Good point, I'd like to commit the diff below then.

OK visa@



Re: ddb(4) userland trace and SMAP

2017-12-05 Thread Visa Hankala
On Tue, Dec 05, 2017 at 11:32:53AM +0100, Martin Pieuchot wrote:
> On 04/12/17(Mon) 12:24, Martin Pieuchot wrote:
> > Since SMAP is enabled ddb(4)'s 'trace /u' and 'trace /p' for a userland
> > processes result, as expected, in page faults.
> > 
> > Diff below disable SMAP for the duration of the command.  This allows us
> > to see any possible frame corruption.
> 
> Updated version that:
> 
>  - Removes the goto by shuffling parameter tests
>  - Initializes cr4save to limit the effect of this gadget. 
>  - Skip lcr4() completely if the CPU doesn't support SMAP.

On i386, it might be necessary to make the rcr4() conditional to a CPU
feature flag because olden x86 processors do not have the CR4 register.
Condition curcpu()->ci_feature_sefflags_ebx & SEFF0EBX_SMAP might be
good enough in this case.
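
Roughly like this, as a sketch of the idea (fragment from the ddb trace
command; surrounding variable names are assumed):

	u_int32_t cr4save = 0;

	/* Only touch CR4 on CPUs that actually have SMAP. */
	if (curcpu()->ci_feature_sefflags_ebx & SEFF0EBX_SMAP) {
		cr4save = rcr4();
		lcr4(cr4save & ~CR4_SMAP);
	}

	db_stack_trace_print(addr, have_addr, count, modif, db_printf);

	if (curcpu()->ci_feature_sefflags_ebx & SEFF0EBX_SMAP)
		lcr4(cr4save);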

With that issue fixed, OK visa@



Re: Fix a typo in INSTALL.octeon

2017-11-30 Thread Visa Hankala
On Thu, Nov 30, 2017 at 01:05:58AM +0100, Rafael Neves wrote:
> Hi,
> 
> The patch below replaces one "$loadaddr" with "${loadaddr}" in the U-Boot 
> instructions of INSTALL.octeon.
> 
> Regards,
> Rafael Neves

Committed, thanks.

> Index: install
> ===
> RCS file: /cvs/src/distrib/notes/octeon/install,v
> retrieving revision 1.14
> diff -u -p -r1.14 install
> --- install   2 Mar 2017 15:31:15 -   1.14
> +++ install   30 Nov 2017 01:56:04 -
> @@ -82,7 +82,7 @@ On dual-core systems, the coremask param
>  On the EdgeRouter Lite, bootcmd may also reset the USB controller for
>  more reliable USB device detection:
>  
> -  usb reset; fatload usb 0 $loadaddr bsd; bootoctlinux rootdev=sd0 
> coremask=0x3
> +  usb reset; fatload usb 0 ${loadaddr} bsd; bootoctlinux rootdev=sd0 
> coremask=0x3
>  
>  OpenBSDCongratulations
>  
> 



Re: un-KERNEL_LOCK() TCP/UDP input & co

2017-11-14 Thread Visa Hankala
On Tue, Nov 14, 2017 at 10:37:16AM +0100, Martin Pieuchot wrote:
> Note that the diff below still includes a false positive.  If the
> current thread doesn't have the read-lock but another one has it,
> then RW_READ will still be returned.

That's true if witness(4) has not been enabled.



Re: un-KERNEL_LOCK() TCP/UDP input & co

2017-11-13 Thread Visa Hankala
On Mon, Nov 13, 2017 at 01:25:42PM +0100, Martin Pieuchot wrote:
> I'd love is somebody could do the changes in the rwlock API to let us
> check if we're holding a lock.  Maybe this is already present in
> witness(4), visa?

witness(4) does maintain data about lock ownership. The patch below
might do the trick for rwlocks without adding extra state tracking.

The patch also fixes the witness-side initialization of statically
initialized locks, in functions witness_checkorder() and
witness_lock(). The fix belongs to a separate commit though.
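
To illustrate the intended use of the new function, a caller could assert
lock ownership like this (just a sketch; the netlock check is a hypothetical
example, not part of the diff):

	/* Require that the current thread holds the write lock. */
	if (rw_status_curproc(&netlock) != RW_WRITE)
		panic("netlock not held by curproc");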

Index: share/man/man9/rwlock.9
===
RCS file: src/share/man/man9/rwlock.9,v
retrieving revision 1.20
diff -u -p -r1.20 rwlock.9
--- share/man/man9/rwlock.9 30 Oct 2017 13:33:36 -  1.20
+++ share/man/man9/rwlock.9 13 Nov 2017 17:05:51 -
@@ -33,6 +33,7 @@
 .Nm rw_assert_anylock ,
 .Nm rw_assert_unlocked ,
 .Nm rw_status ,
+.Nm rw_status_curproc ,
 .Nm RWLOCK_INITIALIZER ,
 .Nm rrw_init ,
 .Nm rrw_init_flags ,
@@ -68,6 +69,8 @@
 .Fn rw_assert_unlocked "struct rwlock *rwl"
 .Ft int
 .Fn rw_status "struct rwlock *rwl"
+.Ft int
+.Fn rw_status_curproc "struct rwlock *rwl"
 .Fn RWLOCK_INITIALIZER "const char *name"
 .Ft void
 .Fn rrw_init "struct rrwlock *rrwl" "const char *name"
@@ -181,7 +184,7 @@ functions check the status
 .Fa rwl ,
 panicking if it is not write-, read-, any-, or unlocked, respectively.
 .Pp
-.Nm rw_status
+.Fn rw_status
 returns the current state of the lock:
 .Pp
 .Bl -tag -width "RW_WRITE_OTHER" -offset indent -compact
@@ -194,6 +197,22 @@ Lock is read locked.
 The current thread may be one of the threads that has it locked.
 .It 0
 Lock is not locked.
+.El
+.Pp
+.Fn rw_status_curproc
+returns the current state of the lock relative to the calling thread:
+.Pp
+.Bl -tag -width "RW_WRITE_OTHER" -offset indent -compact
+.It Dv RW_WRITE
+Lock is write locked by the calling thread.
+.It Dv RW_READ
+Lock is read locked.
+The current thread may be one of the threads that has it locked.
+If the kernel has been compiled with
+.Xr witness 4 ,
+the status is exact, and the lock is read locked by the current thread.
+.It 0
+Lock is not locked by the calling thread.
 .El
 .Pp
 A lock declaration may be initialised with the
Index: sys/kern/kern_rwlock.c
===
RCS file: src/sys/kern/kern_rwlock.c,v
retrieving revision 1.32
diff -u -p -r1.32 kern_rwlock.c
--- sys/kern/kern_rwlock.c  24 Oct 2017 08:53:15 -  1.32
+++ sys/kern/kern_rwlock.c  13 Nov 2017 17:05:51 -
@@ -325,6 +325,34 @@ rw_status(struct rwlock *rwl)
return (0);
 }
 
+int
+rw_status_curproc(struct rwlock *rwl)
+{
+   unsigned long owner;
+#ifdef WITNESS
+   int status = witness_status(&rwl->rwl_lock_obj);
+
+   if (status != -1) {
+   if (status & LA_XLOCKED)
+   return (RW_WRITE);
+   if (status & LA_SLOCKED)
+   return (RW_READ);
+   return (0);
+   }
+#endif
+
+   owner = rwl->rwl_owner;
+   if (owner & RWLOCK_WRLOCK) {
+   if (RW_PROC(curproc) == RW_PROC(owner))
+   return (RW_WRITE);
+   else
+   return (0);
+   }
+   if (owner)
+   return RW_READ;
+   return (0);
+}
+
 #ifdef DIAGNOSTIC
 void
 rw_assert_wrlock(struct rwlock *rwl)
Index: sys/kern/subr_witness.c
===
RCS file: src/sys/kern/subr_witness.c,v
retrieving revision 1.4
diff -u -p -r1.4 subr_witness.c
--- sys/kern/subr_witness.c 12 Aug 2017 03:13:23 -  1.4
+++ sys/kern/subr_witness.c 13 Nov 2017 17:05:51 -
@@ -744,8 +744,8 @@ witness_checkorder(struct lock_object *l
struct witness *w, *w1;
int i, j, s;
 
-   if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL ||
-   panicstr != NULL)
+   if (witness_cold || witness_watch < 1 || panicstr != NULL ||
+   (lock->lo_flags & LO_WITNESS) == 0)
return;
 
w = lock->lo_witness;
@@ -1078,8 +1078,8 @@ witness_lock(struct lock_object *lock, i
struct witness *w;
int s;
 
-   if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL ||
-   panicstr != NULL)
+   if (witness_cold || witness_watch == -1 || panicstr != NULL ||
+   (lock->lo_flags & LO_WITNESS) == 0)
return;
 
w = lock->lo_witness;
@@ -2004,6 +2004,42 @@ witness_assert(const struct lock_object 
 
}
 #endif /* INVARIANT_SUPPORT */
+}
+
+int
+witness_status(struct lock_object *lock)
+{
+   struct lock_instance *instance;
+   struct lock_class *class;
+   int status = 0;
+
+   if (witness_cold || witness_watch < 1 || panicstr != NULL ||
+   (lock->lo_flags & LO_WITNESS) == 0)
+   return -1;
+
+   class = LOCK_CLASS(lock);

Re: pow() returns a negative result on loongson

2017-10-10 Thread Visa Hankala
On Mon, Oct 09, 2017 at 10:39:47PM +0200, Juan Francisco Cantero Hurtado wrote:
> Marc Feeley (Gambit Scheme) has been helping me with a bug on Gambit on
> Loongson. Apparently the bug is on our side.
> 
> I've created this little test based on his code:
> 
> #include <stdio.h>
> #include <math.h>
> 
> int main()
> {
> double x = 0.5;
> double y = 1074.0;
> printf("x=%.20g y=%.20g pow(x,y)=%.20g\n",x,y,pow(x,y));
> return 0;
> }
> 
> On OpenBSD/amd64, Linux/amd64, Linux/aarch64 and Linux/mips64:
> x=0.5 y=1074 pow(x,y)=4.9406564584124654418e-324
> 
> On OpenBSD/loongson:
> x=0.5 y=1074 pow(x,y)=-4.9406564584124654418e-324
> 
> Is the negative result expected?

No. On mips64, ldexp(3) seems to use a garbage value when figuring out
the sign of a denormal value. This applies to infinity values as well.

The patch below should fix the bug. The t3 register contains the binary
representation of the floating-point `x' parameter. I can also make
a regression test for the patch.

OK?
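
For reference, a regression check along these lines (hypothetical, not part
of this diff) would catch the sign error for both denormal and infinity
results:

	#include <assert.h>
	#include <math.h>

	int
	main(void)
	{
		/* Positive denormal and infinity must keep a clear sign bit. */
		assert(signbit(ldexp(1.0, -1074)) == 0);
		assert(signbit(ldexp(1.0, 100000)) == 0);
		/* Negative inputs must keep the sign bit set. */
		assert(signbit(ldexp(-1.0, -1074)) != 0);
		assert(signbit(ldexp(-1.0, 100000)) != 0);
		return 0;
	}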

Index: arch/mips64/gen/ldexp.S
===
RCS file: src/lib/libc/arch/mips64/gen/ldexp.S,v
retrieving revision 1.7
diff -u -p -r1.7 ldexp.S
--- arch/mips64/gen/ldexp.S 27 Oct 2015 05:54:49 -  1.7
+++ arch/mips64/gen/ldexp.S 10 Oct 2017 11:52:56 -
@@ -143,20 +143,20 @@ LEAF(ldexp, 0)
xori	t2, 1
 1:
dmtc1   t2, $f0 # save denormalized result (LSW)
-   bge v1, zero, 1f# should result be negative?
+   bge t3, zero, 1f# should result be negative?
neg.d   $f0, $f0# negate result
 1:
j   ra
 7:
dmtc1   zero, $f0   # result is zero
-   beq t0, zero, 1f# is result positive?
+   bge t3, zero, 1f# is result positive?
neg.d   $f0, $f0# negate result
 1:
j   ra
 8:
dli t1, 0x7ff0000000000000  # result is infinity (MSW)
dmtc1   t1, $f0 
-   bge v1, zero, 1f# should result be negative infinity?
+   bge t3, zero, 1f# should result be negative infinity?
neg.d   $f0, $f0# result is negative infinity
 1:
add.d   $f0, $f0# cause overflow faults if enabled



Reduce pool cache items when there is no contention

2017-07-05 Thread Visa Hankala
The current pool cache code increases the number of items that can be
cached locally in response to lock contention. This patch adds a tweak
that lowers the number when contention does not occur. The idea is to
let resources be returned to the common pool when pressure on the cache
has decreased.

This change alone does not move elements from the cache to the common
pool. The cache has to have some activity for the moving to happen.

It is likely that `pr_cache_items' does not settle on a single value
but oscillates within some range. In my testing, I have not seen any
wild swings that would indicate positive feedback in the process.
The adjustments are constant per step and happen relatively rarely.

OK?
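
After the patch, the adjustment logic in pool_cache_gc() reads roughly as
follows (reconstructed from the hunk below, so treat it as a sketch):

	delta = contention - pp->pr_cache_contention_prev;
	if (delta > 8 /* magic */) {
		/* Contended: let each CPU cache more items. */
		if ((ncpusfound * 8 * 2) <= pp->pr_cache_nitems)
			pp->pr_cache_items += 8;
	} else if (delta == 0) {
		/* No contention at all: shrink slowly, one item per GC pass. */
		if (pp->pr_cache_items > 8)
			pp->pr_cache_items--;
	}
	pp->pr_cache_contention_prev = contention;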

Index: kern/subr_pool.c
===
RCS file: src/sys/kern/subr_pool.c,v
retrieving revision 1.217
diff -u -p -r1.217 subr_pool.c
--- kern/subr_pool.c23 Jun 2017 01:21:55 -  1.217
+++ kern/subr_pool.c5 Jul 2017 09:27:40 -
@@ -1957,6 +1957,9 @@ pool_cache_gc(struct pool *pp)
if ((contention - pp->pr_cache_contention_prev) > 8 /* magic */) {
if ((ncpusfound * 8 * 2) <= pp->pr_cache_nitems)
pp->pr_cache_items += 8;
+   } else if ((contention - pp->pr_cache_contention_prev) == 0) {
+   if (pp->pr_cache_items > 8)
+   pp->pr_cache_items--;
}
pp->pr_cache_contention_prev = contention;
 }



Re: Properly serialize pflow's sc_outputqueue

2017-05-30 Thread Visa Hankala
On Wed, May 31, 2017 at 01:52:31AM +1000, Jonathan Matthew wrote:
> On Tue, May 30, 2017 at 01:04:07PM +0000, Visa Hankala wrote:
> > Index: net/if_pflow.c
> > ===
> > RCS file: src/sys/net/if_pflow.c,v
> > retrieving revision 1.78
> > diff -u -p -r1.78 if_pflow.c
> > --- net/if_pflow.c  27 May 2017 21:44:22 -  1.78
> > +++ net/if_pflow.c  30 May 2017 12:40:26 -
> > @@ -132,7 +132,7 @@ pflow_output_process(void *arg)
> > struct mbuf *m;
> >  
> > KERNEL_LOCK();
> > -   while ((m = ml_dequeue(&sc->sc_outputqueue)) != NULL) {
> > +   while ((m = mq_dequeue(>sc_outputqueue)) != NULL) {
> > pflow_sendout_mbuf(sc, m);
> > }
> > KERNEL_UNLOCK();
> 
> I'd suggest using mq_delist here rather than dequeueing each mbuf
> individually, so you only take the mbuf_queue mutex once per call.

Right. Updated patch below.

Index: net/if_pflow.c
===
RCS file: src/sys/net/if_pflow.c,v
retrieving revision 1.78
diff -u -p -r1.78 if_pflow.c
--- net/if_pflow.c  27 May 2017 21:44:22 -  1.78
+++ net/if_pflow.c  30 May 2017 15:57:44 -
@@ -128,11 +128,13 @@ pflow_output(struct ifnet *ifp, struct m
 void
 pflow_output_process(void *arg)
 {
+   struct mbuf_list ml;
struct pflow_softc *sc = arg;
struct mbuf *m;
 
+   mq_delist(&sc->sc_outputqueue, &ml);
KERNEL_LOCK();
-   while ((m = ml_dequeue(&sc->sc_outputqueue)) != NULL) {
+   while ((m = ml_dequeue(&ml)) != NULL) {
pflow_sendout_mbuf(sc, m);
}
KERNEL_UNLOCK();
@@ -256,7 +258,7 @@ pflow_clone_create(struct if_clone *ifc,
ifp->if_hdrlen = PFLOW_HDRLEN;
ifp->if_flags = IFF_UP;
ifp->if_flags &= ~IFF_RUNNING;  /* not running, need receiver */
-   ml_init(&pflowif->sc_outputqueue);
+   mq_init(&pflowif->sc_outputqueue, 8192, IPL_SOFTNET);
pflow_setmtu(pflowif, ETHERMTU);
pflow_init_timeouts(pflowif);
if_attach(ifp);
@@ -288,7 +290,7 @@ pflow_clone_destroy(struct ifnet *ifp)
timeout_del(&sc->sc_tmo_tmpl);
pflow_flush(sc);
task_del(softnettq, &sc->sc_outputtask);
-   ml_purge(&sc->sc_outputqueue);
+   mq_purge(&sc->sc_outputqueue);
m_freem(sc->send_nam);
if (sc->so != NULL) {
error = soclose(sc->so);
@@ -1089,8 +1091,8 @@ pflow_sendout_v5(struct pflow_softc *sc)
getnanotime(&tv);
h->time_sec = htonl(tv.tv_sec); /* XXX 2038 */
h->time_nanosec = htonl(tv.tv_nsec);
-   ml_enqueue(&sc->sc_outputqueue, m);
-   task_add(softnettq, &sc->sc_outputtask);
+   if (mq_enqueue(&sc->sc_outputqueue, m) == 0)
+   task_add(softnettq, &sc->sc_outputtask);
return (0);
 }
 
@@ -1151,8 +1153,8 @@ pflow_sendout_ipfix(struct pflow_softc *
h10->flow_sequence = htonl(sc->sc_sequence);
sc->sc_sequence += count;
h10->observation_dom = htonl(PFLOW_ENGINE_TYPE);
-   ml_enqueue(&sc->sc_outputqueue, m);
-   task_add(softnettq, &sc->sc_outputtask);
+   if (mq_enqueue(&sc->sc_outputqueue, m) == 0)
+   task_add(softnettq, &sc->sc_outputtask);
return (0);
 }
 
@@ -1193,8 +1195,8 @@ pflow_sendout_ipfix_tmpl(struct pflow_so
h10->observation_dom = htonl(PFLOW_ENGINE_TYPE);
 
timeout_add_sec(&sc->sc_tmo_tmpl, PFLOW_TMPL_TIMEOUT);
-   ml_enqueue(&sc->sc_outputqueue, m);
-   task_add(softnettq, &sc->sc_outputtask);
+   if (mq_enqueue(&sc->sc_outputqueue, m) == 0)
+   task_add(softnettq, &sc->sc_outputtask);
return (0);
 }
 
Index: net/if_pflow.h
===
RCS file: src/sys/net/if_pflow.h,v
retrieving revision 1.16
diff -u -p -r1.16 if_pflow.h
--- net/if_pflow.h  27 May 2017 21:06:06 -  1.16
+++ net/if_pflow.h  30 May 2017 15:57:44 -
@@ -184,7 +184,7 @@ struct pflow_softc {
struct timeout   sc_tmo;
struct timeout   sc_tmo6;
struct timeout   sc_tmo_tmpl;
-   struct mbuf_list sc_outputqueue;
+   struct mbuf_queue sc_outputqueue;
struct task  sc_outputtask;
struct socket   *so;
struct mbuf *send_nam;



Re: Properly serialize pflow's sc_outputqueue

2017-05-30 Thread Visa Hankala
On Mon, May 29, 2017 at 03:33:35PM +, Visa Hankala wrote:
> Currently, access to pflow's sc_outputqueue is not serialized properly.
> The producer has the NET_LOCK(), while the consumer does not.
> mpi@ suggested using mbuf_queue to solve the issue.

mpi@ commented that the pflow output task should be scheduled only
if mq_enqueue succeeds.

Another issue is that mbuf_queue has a size limit. If a lot of flows
get exported at once, the queue might become full, dropping pflow data.
To make this less likely, I tweaked the limit from 256 (IFQ_MAXLEN) to
8192. Of course, packet loss can happen in many other places as well.

Updated patch below. OK?

Index: net/if_pflow.c
===
RCS file: src/sys/net/if_pflow.c,v
retrieving revision 1.78
diff -u -p -r1.78 if_pflow.c
--- net/if_pflow.c  27 May 2017 21:44:22 -  1.78
+++ net/if_pflow.c  30 May 2017 12:40:26 -
@@ -132,7 +132,7 @@ pflow_output_process(void *arg)
struct mbuf *m;
 
KERNEL_LOCK();
-   while ((m = ml_dequeue(&sc->sc_outputqueue)) != NULL) {
+   while ((m = mq_dequeue(&sc->sc_outputqueue)) != NULL) {
pflow_sendout_mbuf(sc, m);
}
KERNEL_UNLOCK();
@@ -256,7 +256,7 @@ pflow_clone_create(struct if_clone *ifc,
ifp->if_hdrlen = PFLOW_HDRLEN;
ifp->if_flags = IFF_UP;
ifp->if_flags &= ~IFF_RUNNING;  /* not running, need receiver */
-   ml_init(&pflowif->sc_outputqueue);
+   mq_init(&pflowif->sc_outputqueue, 8192, IPL_SOFTNET);
pflow_setmtu(pflowif, ETHERMTU);
pflow_init_timeouts(pflowif);
if_attach(ifp);
@@ -288,7 +288,7 @@ pflow_clone_destroy(struct ifnet *ifp)
timeout_del(&sc->sc_tmo_tmpl);
pflow_flush(sc);
task_del(softnettq, &sc->sc_outputtask);
-   ml_purge(&sc->sc_outputqueue);
+   mq_purge(&sc->sc_outputqueue);
m_freem(sc->send_nam);
if (sc->so != NULL) {
error = soclose(sc->so);
@@ -1089,8 +1089,8 @@ pflow_sendout_v5(struct pflow_softc *sc)
getnanotime(&tv);
h->time_sec = htonl(tv.tv_sec); /* XXX 2038 */
h->time_nanosec = htonl(tv.tv_nsec);
-   ml_enqueue(&sc->sc_outputqueue, m);
-   task_add(softnettq, &sc->sc_outputtask);
+   if (mq_enqueue(&sc->sc_outputqueue, m) == 0)
+   task_add(softnettq, &sc->sc_outputtask);
return (0);
 }
 
@@ -1151,8 +1151,8 @@ pflow_sendout_ipfix(struct pflow_softc *
h10->flow_sequence = htonl(sc->sc_sequence);
sc->sc_sequence += count;
h10->observation_dom = htonl(PFLOW_ENGINE_TYPE);
-   ml_enqueue(&sc->sc_outputqueue, m);
-   task_add(softnettq, &sc->sc_outputtask);
+   if (mq_enqueue(&sc->sc_outputqueue, m) == 0)
+   task_add(softnettq, &sc->sc_outputtask);
return (0);
 }
 
@@ -1193,8 +1193,8 @@ pflow_sendout_ipfix_tmpl(struct pflow_so
h10->observation_dom = htonl(PFLOW_ENGINE_TYPE);
 
timeout_add_sec(&sc->sc_tmo_tmpl, PFLOW_TMPL_TIMEOUT);
-   ml_enqueue(&sc->sc_outputqueue, m);
-   task_add(softnettq, &sc->sc_outputtask);
+   if (mq_enqueue(&sc->sc_outputqueue, m) == 0)
+   task_add(softnettq, &sc->sc_outputtask);
return (0);
 }
 
Index: net/if_pflow.h
===
RCS file: src/sys/net/if_pflow.h,v
retrieving revision 1.16
diff -u -p -r1.16 if_pflow.h
--- net/if_pflow.h  27 May 2017 21:06:06 -  1.16
+++ net/if_pflow.h  30 May 2017 12:40:26 -
@@ -184,7 +184,7 @@ struct pflow_softc {
struct timeout   sc_tmo;
struct timeout   sc_tmo6;
struct timeout   sc_tmo_tmpl;
-   struct mbuf_list sc_outputqueue;
+   struct mbuf_queue sc_outputqueue;
struct task  sc_outputtask;
struct socket   *so;
struct mbuf *send_nam;



Properly serialize pflow's sc_outputqueue

2017-05-29 Thread Visa Hankala
Currently, access to pflow's sc_outputqueue is not serialized properly.
The producer has the NET_LOCK(), while the consumer does not.
mpi@ suggested using mbuf_queue to solve the issue.

OK?

Index: net/if_pflow.c
===
RCS file: src/sys/net/if_pflow.c,v
retrieving revision 1.78
diff -u -p -r1.78 if_pflow.c
--- net/if_pflow.c  27 May 2017 21:44:22 -  1.78
+++ net/if_pflow.c  29 May 2017 15:26:58 -
@@ -132,7 +132,7 @@ pflow_output_process(void *arg)
struct mbuf *m;
 
KERNEL_LOCK();
-   while ((m = ml_dequeue(&sc->sc_outputqueue)) != NULL) {
+   while ((m = mq_dequeue(&sc->sc_outputqueue)) != NULL) {
pflow_sendout_mbuf(sc, m);
}
KERNEL_UNLOCK();
@@ -256,7 +256,7 @@ pflow_clone_create(struct if_clone *ifc,
ifp->if_hdrlen = PFLOW_HDRLEN;
ifp->if_flags = IFF_UP;
ifp->if_flags &= ~IFF_RUNNING;  /* not running, need receiver */
-   ml_init(&pflowif->sc_outputqueue);
+   mq_init(&pflowif->sc_outputqueue, IFQ_MAXLEN, IPL_SOFTNET);
pflow_setmtu(pflowif, ETHERMTU);
pflow_init_timeouts(pflowif);
if_attach(ifp);
@@ -288,7 +288,7 @@ pflow_clone_destroy(struct ifnet *ifp)
timeout_del(&sc->sc_tmo_tmpl);
pflow_flush(sc);
task_del(softnettq, &sc->sc_outputtask);
-   ml_purge(&sc->sc_outputqueue);
+   mq_purge(&sc->sc_outputqueue);
m_freem(sc->send_nam);
if (sc->so != NULL) {
error = soclose(sc->so);
@@ -1089,7 +1089,7 @@ pflow_sendout_v5(struct pflow_softc *sc)
getnanotime(&tv);
h->time_sec = htonl(tv.tv_sec); /* XXX 2038 */
h->time_nanosec = htonl(tv.tv_nsec);
-   ml_enqueue(&sc->sc_outputqueue, m);
+   mq_enqueue(&sc->sc_outputqueue, m);
task_add(softnettq, &sc->sc_outputtask);
return (0);
 }
@@ -1151,7 +1151,7 @@ pflow_sendout_ipfix(struct pflow_softc *
h10->flow_sequence = htonl(sc->sc_sequence);
sc->sc_sequence += count;
h10->observation_dom = htonl(PFLOW_ENGINE_TYPE);
-   ml_enqueue(&sc->sc_outputqueue, m);
+   mq_enqueue(&sc->sc_outputqueue, m);
task_add(softnettq, &sc->sc_outputtask);
return (0);
 }
@@ -1193,7 +1193,7 @@ pflow_sendout_ipfix_tmpl(struct pflow_so
h10->observation_dom = htonl(PFLOW_ENGINE_TYPE);
 
timeout_add_sec(&sc->sc_tmo_tmpl, PFLOW_TMPL_TIMEOUT);
-   ml_enqueue(&sc->sc_outputqueue, m);
+   mq_enqueue(&sc->sc_outputqueue, m);
task_add(softnettq, &sc->sc_outputtask);
return (0);
 }
Index: net/if_pflow.h
===
RCS file: src/sys/net/if_pflow.h,v
retrieving revision 1.16
diff -u -p -r1.16 if_pflow.h
--- net/if_pflow.h  27 May 2017 21:06:06 -  1.16
+++ net/if_pflow.h  29 May 2017 15:26:58 -
@@ -184,7 +184,7 @@ struct pflow_softc {
struct timeout   sc_tmo;
struct timeout   sc_tmo6;
struct timeout   sc_tmo_tmpl;
-   struct mbuf_list sc_outputqueue;
+   struct mbuf_queue sc_outputqueue;
struct task  sc_outputtask;
struct socket   *so;
struct mbuf *send_nam;



mips64 copyin32(9)

2017-05-18 Thread Visa Hankala
Here is a copyin32(9) for mips64. With futex(2) using it, the regress
tests pass on loongson, octeon and sgi. Source addresses with wrong
alignment are handled by the on-fault logic in itsa() trap handler.

OK?
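
As a usage sketch (hypothetical caller, roughly how futex(2) would use the
routine; uaddr stands for the user-space pointer):

	uint32_t val;
	int error;

	/* Atomically fetch the 32-bit futex word from user space. */
	error = copyin32(uaddr, &val);
	if (error)
		return (error);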

Index: arch/mips64/mips64/lcore_access.S
===
RCS file: src/sys/arch/mips64/mips64/lcore_access.S,v
retrieving revision 1.26
diff -u -p -r1.26 lcore_access.S
--- arch/mips64/mips64/lcore_access.S   23 May 2016 20:11:49 -  1.26
+++ arch/mips64/mips64/lcore_access.S   18 May 2017 13:06:41 -
@@ -255,6 +255,31 @@ NON_LEAF(copyout, FRAMESZ(CF_SZ), ra)
 move   v0, zero
 END(copyout)
 
+/*
+ * Copy an aligned 32-bit word atomically from user space into the kernel
+ * copyin32(from, to)
+ * uint32_t *from; (user source address)
+ * uint32_t *to;   (kernel destination address)
+ */
+NON_LEAF(copyin32, FRAMESZ(CF_SZ), ra)
+   PTR_SUBU sp, sp, FRAMESZ(CF_SZ)
+   .mask   0x80000000, (CF_RA_OFFS - FRAMESZ(CF_SZ))
+   PTR_S   ra, CF_RA_OFFS(sp)
+   blt a0, zero, _copyerr  # make sure address is in user space
+li v0, KT_COPYERR
+   GET_CPU_INFO(t1, t0)
+   PTR_L   t3, CI_CURPROCPADDR(t1)
+   sw  v0, PCB_ONFAULT(t3)
+
+   lw  v1, 0(a0)
+   sw  v1, 0(a1)
+
+   sw  zero, PCB_ONFAULT(t3)
+   PTR_ADDU sp, sp, FRAMESZ(CF_SZ)
+   j   ra
+move   v0, zero
+END(copyin32)
+
 _copyerr:
PTR_L   ra, CF_RA_OFFS(sp)
GET_CPU_INFO(t1, t0)



Re: Atomic copyin(9)/copyout(9) for amd64

2017-05-15 Thread Visa Hankala
On Sun, May 14, 2017 at 10:05:28PM +0200, Mark Kettenis wrote:
> I tried to convert a few more architectures, and things get a bit
> messier than I envisioned on architectures that have an optimized copy
> function and care about alignment.  Especially when copyin(9) is
> implemented in assembly.  So I implemented a new function called
> copyin_futex(9), which is all we really need.  The implementations for
> powerpc and sparc64 below have been (lightly) tested.  The regression
> test that mpi@ created passes and doesn't crash the kernel!  I have an
> (untested) implementation for hppa as well.  The amd64 and i386
> implementation can simply be a wrapper around copyin(9).  The same
> holds for all out non-MULTIPROCESSOR architectures.  So my idea is to
> let kern/sys_futex.c provide that wrapper if MULTIPROCESSOR isn't
> defined and add an #ifdef MULTIPROCESSOR around all MD copyin_futex()
> implementations.
> 
> Thoughts?

This direction is much better than the previous one in my opinion.
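
The !MULTIPROCESSOR wrapper could then be as small as this (a sketch of the
arrangement you describe, assuming copyin_futex(9) keeps copyin(9)'s
signature):

#ifndef MULTIPROCESSOR
/* On uniprocessor kernels a plain copyin(9) is already atomic enough. */
int
copyin_futex(const void *uaddr, void *kaddr, size_t len)
{
	return copyin(uaddr, kaddr, len);
}
#endif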



Re: Atomic copyin(9)/copyout(9) for amd64

2017-05-12 Thread Visa Hankala
On Mon, May 01, 2017 at 06:02:24PM +0200, Mark Kettenis wrote:
> The futex(2) syscall needs to be able to atomically copy the futex in
> and out of userland.  The current implementation uses copyin(9) and
> copyout(9) for that.  The futex is a 32-bit integer, and currently our
> copyin(9) and copyout(9) don't guarantee an atomic 32-bit access.
> Previously mpi@ and I discussed implementing new interfaces that do
> guarantee the required atomicity.  However, it occurred to me that we
> could simply change our copyin implementations such that they
> guarantee atomicity of a properly aligned 32-bit copyin and copyout.
> 
> The i386 version of these calls uses "rep movsl", which means it is
> already atomic.  At least that is how I interpret 8.2.4 in Volume 3A
> of the Intel SDM.  The diff below makes the amd64 version safe as
> well.  This does introduce a few additional instructions in the loop.
> Apparently modern Intel CPUs optimize the string loops.  If we can
> rely on the hardware to turn 32-bit moves into 64-bit moves, we could
> simplify the code by using "rep movsl" instead of "rep movsq".

I submit to this approach. OK visa@

> Index: arch/amd64/amd64/copy.S
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/copy.S,v
> retrieving revision 1.7
> diff -u -p -r1.7 copy.S
> --- arch/amd64/amd64/copy.S   25 Apr 2015 21:31:24 -  1.7
> +++ arch/amd64/amd64/copy.S   1 May 2017 15:32:17 -
> @@ -138,7 +138,12 @@ ENTRY(copyout)
>   rep
>   movsq
>   movb	%al,%cl
> - andb	$7,%cl
> + shrb	$2,%cl
> + andb	$1,%cl
> + rep
> + movsl
> + movb	%al,%cl
> + andb	$3,%cl
>   rep
>   movsb
>   SMAP_CLAC
> @@ -168,7 +173,12 @@ ENTRY(copyin)
>   rep
>   movsq
>   movb	%al,%cl
> - andb	$7,%cl
> + shrb	$2,%cl
> + andb	$1,%cl
> + rep
> + movsl
> + movb	%al,%cl
> + andb	$3,%cl
>   rep
>   movsb
>  
> 



Re: Atomic copyin(9)/copyout(9) for amd64

2017-05-02 Thread Visa Hankala
On Mon, May 01, 2017 at 06:02:24PM +0200, Mark Kettenis wrote:
> The futex(2) syscall needs to be able to atomically copy the futex in
> and out of userland.  The current implementation uses copyin(9) and
> copyout(9) for that.  The futex is a 32-bit integer, and currently our
> copyin(9) and copyout(9) don't guarantee an atomic 32-bit access.
> Previously mpi@ and I discussed implementing new interfaces that do
> guarantee the required atomicity.  However, it occurred to me that we
> could simply change our copyin implementations such that they
> guarantee atomicity of a properly aligned 32-bit copyin and copyout.
> 
> The i386 version of these calls uses "rep movsl", which means it is
> already atomic.  At least that is how I interpret 8.2.4 in Volume 3A
> of the Intel SDM.  The diff below makes the amd64 version safe as
> well.  This does introduce a few additional instructions in the loop.
> Apparently modern Intel CPUs optimize the string loops.  If we can
> rely on the hardware to turn 32-bit moves into 64-bit moves, we could
> simplify the code by using "rep movsl" instead of "rep movsq".
> 
> Thoughts?

I would add separate routines for atomic copying because they should
fail if atomicity cannot be guaranteed. copyin(9) and copyout(9) do
not catch memory alignment issues, for example. Forcing them to handle
a special case like this does not look nice.
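
What I have in mind is a dedicated interface along these lines (only a
sketch; the names and the exact error returned on misalignment are open):

/*
 * Copy a naturally aligned 32-bit word to or from user space in one
 * atomic access.  Fail instead of falling back to a byte-wise copy
 * when atomicity cannot be guaranteed.
 */
int	copyin32(const uint32_t *uaddr, uint32_t *kaddr);
int	copyout32(const uint32_t *kaddr, uint32_t *uaddr);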



Re: Break pluart(4)

2017-04-11 Thread Visa Hankala
On Tue, Apr 11, 2017 at 04:55:45PM +0200, Mark Kettenis wrote:
> Se here is an updated diff; thanks for catching this.
> 
> ok?

OK visa@

> Index: arch/arm64/dev/pluart.c
> ===
> RCS file: /cvs/src/sys/arch/arm64/dev/pluart.c,v
> retrieving revision 1.2
> diff -u -p -r1.2 pluart.c
> --- arch/arm64/dev/pluart.c   20 Jan 2017 08:03:21 -  1.2
> +++ arch/arm64/dev/pluart.c   11 Apr 2017 14:54:22 -
> @@ -256,6 +256,7 @@ pluartattach(struct device *parent, stru
>   cn_tab->cn_dev = makedev(maj, sc->sc_dev.dv_unit);
>  
>   printf(": console");
> + SET(sc->sc_hwflags, COM_HW_CONSOLE);
>   }
>  
>   timeout_set(&sc->sc_diag_tmo, pluart_diag, sc);
> @@ -304,8 +305,18 @@ pluart_intr(void *arg)
>  
>   p = sc->sc_ibufp;
>  
> - while(ISSET(bus_space_read_4(iot, ioh, UART_FR), UART_FR_RXFF)) {
> - c = bus_space_read_1(iot, ioh, UART_DR);
> + while (ISSET(bus_space_read_4(iot, ioh, UART_FR), UART_FR_RXFF)) {
> + c = bus_space_read_2(iot, ioh, UART_DR);
> + if (c & UART_DR_BE) {
> +#ifdef DDB
> + if (ISSET(sc->sc_hwflags, COM_HW_CONSOLE)) {
> + if (db_console)
> + Debugger();
> + continue;
> + }
> +#endif
> + c = 0;
> + }
>   if (p >= sc->sc_ibufend) {
>   sc->sc_floods++;
>   if (sc->sc_errors++ == 0)
> Index: arch/armv7/dev/pluart.c
> ===
> RCS file: /cvs/src/sys/arch/armv7/dev/pluart.c,v
> retrieving revision 1.1
> diff -u -p -r1.1 pluart.c
> --- arch/armv7/dev/pluart.c   31 Aug 2016 16:19:40 -  1.1
> +++ arch/armv7/dev/pluart.c   11 Apr 2017 14:54:22 -
> @@ -260,6 +260,7 @@ pluartattach(struct device *parent, stru
>   cn_tab->cn_dev = makedev(maj, sc->sc_dev.dv_unit);
>  
>   printf(": console");
> + SET(sc->sc_hwflags, COM_HW_CONSOLE);
>   }
>  
>   timeout_set(&sc->sc_diag_tmo, pluart_diag, sc);
> @@ -308,8 +309,18 @@ pluart_intr(void *arg)
>  
>   p = sc->sc_ibufp;
>  
> - while(ISSET(bus_space_read_4(iot, ioh, UART_FR), UART_FR_RXFF)) {
> - c = bus_space_read_1(iot, ioh, UART_DR);
> + while (ISSET(bus_space_read_4(iot, ioh, UART_FR), UART_FR_RXFF)) {
> + c = bus_space_read_2(iot, ioh, UART_DR);
> + if (c & UART_DR_BE) {
> +#ifdef DDB
> + if (ISSET(sc->sc_hwflags, COM_HW_CONSOLE)) {
> + if (db_console)
> + Debugger();
> + continue;
> + }
> +#endif
> + c = 0;
> + }
>   if (p >= sc->sc_ibufend) {
>   sc->sc_floods++;
>   if (sc->sc_errors++ == 0)
> 



Re: Break pluart(4)

2017-04-11 Thread Visa Hankala
On Tue, Apr 11, 2017 at 03:51:51PM +0200, Mark Kettenis wrote:
> + if (ISSET(sc->sc_hwflags, COM_HW_CONSOLE)) {
> + if (db_console)
> + Debugger();
> + continue;
> + }

Should these be inside #ifdef DDB like in com(4)?



Refactor SIMPLEQ out of sk(4)

2017-01-15 Thread Visa Hankala
sk(4) uses a queue for managing Tx DMA maps. Because the queue would
require synchronization if the driver was unlocked, it might be best
to refactor the code a bit. This patch removes the queue by making
Tx and Rx DMA maps part of struct sk_chain.

OK?
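
The header side of the change is small: struct sk_chain simply gains a
per-slot DMA map (a sketch of the if_skreg.h part, which the diff below does
not show):

struct sk_chain {
	void			*sk_desc;
	struct mbuf		*sk_mbuf;
	bus_dmamap_t		 sk_map;	/* per-descriptor DMA map */
	struct sk_chain		*sk_next;
};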

Index: dev/pci/if_sk.c
===
RCS file: src/sys/dev/pci/if_sk.c,v
retrieving revision 1.185
diff -u -p -r1.185 if_sk.c
--- dev/pci/if_sk.c 8 Jan 2017 18:08:14 -   1.185
+++ dev/pci/if_sk.c 15 Jan 2017 13:56:06 -
@@ -97,7 +97,6 @@
 #include 
 #include 
 #include 
-#include <sys/queue.h>
 
 #include 
 
@@ -569,14 +568,11 @@ sk_init_tx_ring(struct sk_if_softc *sc_i
struct sk_softc *sc = sc_if->sk_softc;
struct sk_chain_data *cd = &sc_if->sk_cdata;
struct sk_ring_data *rd = sc_if->sk_rdata;
-   bus_dmamap_t dmamap;
-   struct sk_txmap_entry   *entry;
int i, nexti;
 
bzero(sc_if->sk_rdata->sk_tx_ring,
sizeof(struct sk_tx_desc) * SK_TX_RING_CNT);
 
-   SIMPLEQ_INIT(&sc_if->sk_txmap_head);
for (i = 0; i < SK_TX_RING_CNT; i++) {
cd->sk_tx_chain[i].sk_desc = &rd->sk_tx_ring[i];
if (i == (SK_TX_RING_CNT - 1))
@@ -584,19 +580,12 @@ sk_init_tx_ring(struct sk_if_softc *sc_i
else
nexti = i + 1;
cd->sk_tx_chain[i].sk_next = &cd->sk_tx_chain[nexti];
-   rd->sk_tx_ring[i].sk_next = htole32(SK_TX_RING_ADDR(sc_if, 
nexti));
+   rd->sk_tx_ring[i].sk_next = htole32(SK_TX_RING_ADDR(sc_if,
+   nexti));
 
if (bus_dmamap_create(sc->sc_dmatag, SK_JLEN, SK_NTXSEG,
-  SK_JLEN, 0, BUS_DMA_NOWAIT, &dmamap))
-   return (ENOBUFS);
-
-   entry = malloc(sizeof(*entry), M_DEVBUF, M_NOWAIT);
-   if (!entry) {
-   bus_dmamap_destroy(sc->sc_dmatag, dmamap);
+  SK_JLEN, 0, BUS_DMA_NOWAIT, &cd->sk_tx_chain[i].sk_map))
return (ENOBUFS);
-   }
-   entry->dmamap = dmamap;
-   SIMPLEQ_INSERT_HEAD(&sc_if->sk_txmap_head, entry, link);
}
 
sc_if->sk_cdata.sk_tx_prod = 0;
@@ -627,7 +616,8 @@ sk_newbuf(struct sk_if_softc *sc_if)
m_adj(m, ETHER_ALIGN);
 
prod = sc_if->sk_cdata.sk_rx_prod;
-   dmamap = sc_if->sk_cdata.sk_rx_map[prod];
+   c = &sc_if->sk_cdata.sk_rx_chain[prod];
+   dmamap = c->sk_map;
 
error = bus_dmamap_load_mbuf(sc_if->sk_softc->sc_dmatag, dmamap, m,
BUS_DMA_READ|BUS_DMA_NOWAIT);
@@ -639,7 +629,6 @@ sk_newbuf(struct sk_if_softc *sc_if)
bus_dmamap_sync(sc_if->sk_softc->sc_dmatag, dmamap, 0,
dmamap->dm_mapsize, BUS_DMASYNC_PREREAD);
 
-   c = &sc_if->sk_cdata.sk_rx_chain[prod];
c->sk_mbuf = m;
 
r = c->sk_desc;
@@ -979,7 +968,7 @@ sk_attach(struct device *parent, struct 
 
for (i = 0; i < SK_RX_RING_CNT; i++) {
error = bus_dmamap_create(sc->sc_dmatag, SK_JLEN, 1,
-   SK_JLEN, 0, 0, &sc_if->sk_cdata.sk_rx_map[i]);
+   SK_JLEN, 0, 0, &sc_if->sk_cdata.sk_rx_chain[i].sk_map);
if (error != 0) {
printf(": unable to create rx DMA map %d, "
"error = %d\n", i, error);
@@ -1051,10 +1040,11 @@ sk_attach(struct device *parent, struct 
return;
 fail_4:
for (i = 0; i < SK_RX_RING_CNT; i++) {
-   if (sc_if->sk_cdata.sk_rx_map[i] == NULL)
+   if (sc_if->sk_cdata.sk_rx_chain[i].sk_map == NULL)
continue;
 
-   bus_dmamap_destroy(sc->sc_dmatag, sc_if->sk_cdata.sk_rx_map[i]);
+   bus_dmamap_destroy(sc->sc_dmatag,
+   sc_if->sk_cdata.sk_rx_chain[i].sk_map);
}
 fail_3:
bus_dmamem_unmap(sc->sc_dmatag, kva, sizeof(struct sk_ring_data));
@@ -1391,19 +1381,12 @@ sk_encap(struct sk_if_softc *sc_if, stru
struct sk_tx_desc   *f = NULL;
u_int32_t   frag, cur, sk_ctl;
int i;
-   struct sk_txmap_entry   *entry;
bus_dmamap_t txmap;
 
DPRINTFN(2, ("sk_encap\n"));
 
-   entry = SIMPLEQ_FIRST(&sc_if->sk_txmap_head);
-   if (entry == NULL) {
-   DPRINTFN(2, ("sk_encap: no txmap available\n"));
-   return (ENOBUFS);
-   }
-   txmap = entry->dmamap;
-
cur = frag = *txidx;
+   txmap = sc_if->sk_cdata.sk_tx_chain[*txidx].sk_map;
 
 #ifdef SK_DEBUG
if (skdebug >= 2)
@@ -1446,10 +1429,12 @@ sk_encap(struct sk_if_softc *sc_if, stru
SK_INC(frag, SK_TX_RING_CNT);
}
 
+   sc_if->sk_cdata.sk_tx_chain[*txidx].sk_map =
+   sc_if->sk_cdata.sk_tx_chain[cur].sk_map;
+   sc_if->sk_cdata.sk_tx_chain[cur].sk_map = txmap;
+
sc_if->sk_cdata.sk_tx_chain[cur].sk_mbuf = 

Re: {ah,esp}_input_cb() & NET_LOCK()

2017-01-09 Thread Visa Hankala
On Mon, Jan 09, 2017 at 04:10:48PM +0100, Martin Pieuchot wrote:
> As reported by Hrvoje Popovski, these two callbacks also need the
> NET_LOCK():
> 
>   splassert: ip_output: want 1 have 0
>   Starting stack trace...
>   ip_output() at ip_output+0x7d
>   pfsync_sendout() at pfsync_sendout+0x499
>   pfsync_update_tdb() at pfsync_update_tdb+0x13a
>   esp_input_cb() at esp_input_cb+0x234
>   taskq_thread() at taskq_thread+0x6c
>   end trace frame: 0x0, count: 252
>   End of stack trace.
> 
> ok?

ok visa@

I guess ipcomp needs similar treatment:

Index: netinet/ip_ipcomp.c
===
RCS file: src/sys/netinet/ip_ipcomp.c,v
retrieving revision 1.49
diff -u -p -r1.49 ip_ipcomp.c
--- netinet/ip_ipcomp.c 24 Dec 2016 11:17:35 -  1.49
+++ netinet/ip_ipcomp.c 9 Jan 2017 16:39:16 -
@@ -217,7 +217,7 @@ ipcomp_input_cb(struct cryptop *crp)
return (EINVAL);
}
 
-   s = splsoftnet();
+   NET_LOCK(s);
 
tdb = gettdb(tc->tc_rdomain, tc->tc_spi, &tc->tc_dst, tc->tc_proto);
if (tdb == NULL) {
@@ -254,7 +254,7 @@ ipcomp_input_cb(struct cryptop *crp)
/* Reset the session ID */
if (tdb->tdb_cryptoid != 0)
tdb->tdb_cryptoid = crp->crp_sid;
-   splx(s);
+   NET_UNLOCK(s);
return crypto_dispatch(crp);
}
free(tc, M_XDATA, 0);
@@ -336,11 +336,11 @@ ipcomp_input_cb(struct cryptop *crp)
 
/* Back to generic IPsec input processing */
error = ipsec_common_input_cb(m, tdb, skip, protoff);
-   splx(s);
+   NET_UNLOCK(s);
return error;
 
 baddone:
-   splx(s);
+   NET_UNLOCK(s);
 
m_freem(m);



Re: rasops(9): Use proper RGB values for the ANSI color palette

2017-01-07 Thread Visa Hankala
On Sat, Jan 07, 2017 at 12:23:31AM +0100, Frederic Cambus wrote:
> Hi tech@,
> 
> Here is a diff to use proper RGB values for the ANSI color palette in
> rasops(9).
> 
> Comments? OK?

I prefer the old palette because its contrast is higher.

> Index: sys/dev/rasops/rasops.c
> ===
> RCS file: /cvs/src/sys/dev/rasops/rasops.c,v
> retrieving revision 1.44
> diff -u -p -r1.44 rasops.c
> --- sys/dev/rasops/rasops.c   15 Dec 2016 19:18:41 -  1.44
> +++ sys/dev/rasops/rasops.c   6 Jan 2017 22:11:02 -
> @@ -48,21 +48,21 @@
>  /* ANSI colormap (R,G,B) */
>  
>  #define	NORMAL_BLACK	0x000000
> -#define	NORMAL_RED	0x7f0000
> -#define	NORMAL_GREEN	0x007f00
> -#define	NORMAL_BROWN	0x7f7f00
> -#define	NORMAL_BLUE	0x00007f
> -#define	NORMAL_MAGENTA	0x7f007f
> -#define	NORMAL_CYAN	0x007f7f
> -#define	NORMAL_WHITE	0xc7c7c7	/* XXX too dim? */
> +#define	NORMAL_RED	0xaa0000
> +#define	NORMAL_GREEN	0x00aa00
> +#define	NORMAL_BROWN	0xaa5500
> +#define	NORMAL_BLUE	0x0000aa
> +#define	NORMAL_MAGENTA	0xaa00aa
> +#define	NORMAL_CYAN	0x00aaaa
> +#define	NORMAL_WHITE	0xaaaaaa
>  
> -#define	HILITE_BLACK	0x7f7f7f
> -#define	HILITE_RED	0xff0000
> -#define	HILITE_GREEN	0x00ff00
> -#define	HILITE_BROWN	0xffff00
> -#define	HILITE_BLUE	0x0000ff
> -#define	HILITE_MAGENTA	0xff00ff
> -#define	HILITE_CYAN	0x00ffff
> +#define	HILITE_BLACK	0x555555
> +#define	HILITE_RED	0xff5555
> +#define	HILITE_GREEN	0x55ff55
> +#define	HILITE_BROWN	0xffff55
> +#define	HILITE_BLUE	0x5555ff
> +#define	HILITE_MAGENTA	0xff55ff
> +#define	HILITE_CYAN	0x55ffff
>  #define	HILITE_WHITE	0xffffff
>  
>  const u_char rasops_cmap[256 * 3] = {
> 



Interrupt race in NET_LOCK/NET_UNLOCK

2016-12-22 Thread Visa Hankala
NET_LOCK() should raise IPL before acquiring the lock, and NET_UNLOCK()
should restore the level after releasing the lock. Otherwise, lock
recursion can occur, most likely right after the splx(). An example:

nd6_slowtimo  <- NET_LOCK()
timeout_run
softclock
softintr_dispatch
dosoftint
interrupt
k_intr
if_netisr  <- NET_LOCK()
taskq_thread

OK?

Index: sys/systm.h
===
RCS file: src/sys/sys/systm.h,v
retrieving revision 1.120
diff -u -p -r1.120 systm.h
--- sys/systm.h 19 Dec 2016 08:36:50 -  1.120
+++ sys/systm.h 23 Dec 2016 05:59:26 -
@@ -297,14 +297,14 @@ extern struct rwlock netlock;
 
 #define	NET_LOCK(s) 
\
 do {   \
-   rw_enter_write(&netlock);   \
s = splsoftnet();   \
+   rw_enter_write(&netlock);   \
 } while (0)
 
-#define	NET_UNLOCK(s)   \
+#define	NET_UNLOCK(s)   
\
 do {   \
-   splx(s);\
    rw_exit_write(&netlock);\
+   splx(s);\
 } while (0)
 
 #define	NET_ASSERT_LOCKED() 
\



NET_LOCK() recursion during netboot

2016-12-21 Thread Visa Hankala
Network boot triggers a netlock recursion:

panic
rw_enter
sosend  <- NET_LOCK()
nfs_send
nfs_request
nfs_lookup
VOP_LOOKUP
vfs_lookup
namei
unp_connect
uipc_usrreq
soconnect  <- NET_LOCK()
sys_connect

An XXXSMP workaround seems appropriate here.

OK?

Index: kern/uipc_usrreq.c
===
RCS file: src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.104
diff -u -p -r1.104 uipc_usrreq.c
--- kern/uipc_usrreq.c  19 Dec 2016 08:36:49 -  1.104
+++ kern/uipc_usrreq.c  21 Dec 2016 16:44:03 -
@@ -143,7 +143,10 @@ uipc_usrreq(struct socket *so, int req, 
break;
 
case PRU_CONNECT:
+   /* XXXSMP breaks atomicity */
+   rw_exit_write(&netlock);
error = unp_connect(so, nam, p);
+   rw_enter_write(&netlock);
break;
 
case PRU_CONNECT2:



Outdated remarks about partitioning on sgi

2016-10-05 Thread Visa Hankala
With the disklabel read logic in the sgi bootblocks, the 'a' partition
no longer has to be the first partition on a disk, so the positioning
remarks in the install notes and the installer seem outdated. The patch
below removes them.

OK?

Index: distrib/notes/sgi/install
===
RCS file: src/distrib/notes/sgi/install,v
retrieving revision 1.28
diff -u -p -r1.28 install
--- distrib/notes/sgi/install   12 Aug 2015 06:47:16 -  1.28
+++ distrib/notes/sgi/install   5 Oct 2016 12:03:16 -
@@ -173,14 +173,7 @@ OpenBSDInstallPart4
 OpenBSDInstallPart5
 
No partitions should overlap with the SGI Volume Header, which by
-   default will use the first 3134 sectors. Additionally, the 'a'
-   partition must be the first partition on the disk, immediately
-   following the SGI Volume Header. If the default Volume Header size is
-   used, the 'a' partition should be located at offset 3135. If the 'a'
-   partition is not located immediately after the Volume Header the boot
-   loader will not be able to locate and load the kernel.
-dnl XXX Note that this is a #$%@ boot blocks limitation which should be fixed
-dnl XXX by reading the real label in the boot blocks.
+   default will use the first 3134 sectors.
 
 OpenBSDInstallPart6({:-CD-ROM, NFS -:})
 
Index: distrib/sgi/ramdisk/install.md
===
RCS file: src/distrib/sgi/ramdisk/install.md,v
retrieving revision 1.39
diff -u -p -r1.39 install.md
--- distrib/sgi/ramdisk/install.md  1 Jan 2016 00:47:51 -   1.39
+++ distrib/sgi/ramdisk/install.md  5 Oct 2016 12:03:16 -
@@ -110,12 +110,6 @@ The 'p' partition must be retained since
 this in turn contains the boot loader. No other partitions should overlap
 with the SGI Volume Header, which by default will use the first 3134 sectors.
 
-Additionally, the 'a' partition must be the first partition on the disk,
-immediately following the SGI Volume Header. If the default SGI Volume Header
-size is used, the 'a' partition should be located at offset 3135. If the
-'a' partition is not located immediately after the SGI Volume Header the
-boot loader will not be able to locate and load the kernel.
-
 Do not change any parameters except the partition layout and the label name.
 
 __EOT



Re: Smarter OpenBSD/sgi boot blocks

2016-10-05 Thread Visa Hankala
On Tue, Oct 04, 2016 at 08:08:32PM +, Miod Vallat wrote:
> The sgi boot blocks use the PROM (ARCBios or ARCS) for its I/O routines.
> When using a disk-based path, these routines are using the partition
> table found in the ``volume header''.
> 
> In order to be able to use 16 partitions per disk, the OpenBSD port only
> claims one volume header partition, #0, as the OpenBSD area, and puts
> its own label in there.
> 
> Unfortunately, this means that `sd0a', the OpenBSD partition on which
> the kernel is found, may not actually start at the same location as the
> volume header partition #0, even though this is the case in most setups.
> 
> When reinstalling an O2 some time ago, I did not pay attention to this
> during install:
> 
>   Use (A)uto layout, (E)dit auto layout, or create (C)ustom layout? [a] c
>   [...]
>   Label editor (enter '?' for help at any prompt)
>   > p
>   OpenBSD area: 0-17773524; size: 17773524; free: 0
>   #size   offset  fstype [fsize bsize   cpg]
> a: 17770389 3135  4.2BSD   1024  819216
> c: 177735240  unused
> p: 31350 unknown
>   > d a
>   > a
>   partition: [a]
>   offset: [3135]
>   size: [17770389] 400M
>   Rounding size to cylinder (3360 sectors): 816705
>   FS type: [4.2BSD]
>   mount point: [none] /
>   Rounding offset to bsize (32 sectors): 3136
>   Rounding size to bsize (32 sectors): 816704
> 
> And `sd0a' ended up starting one sector after the beginning of the
> volume header partition #0.
> 
> Of course, rebooting afterwards did not work as expected:
> 
>   OpenBSD/sgi-IP32 ARCBios boot version 1.7
>   arg 0: pci(0)scsi(0)disk(1)rdisk(0)partition(8)/boot
>   arg 1: OSLoadOptions=auto
>   arg 2: ConsoleIn=serial(0)
>   arg 3: ConsoleOut=serial(0)
>   arg 4: SystemPartition=pci(0)scsi(0)disk(1)rdisk(0)partition(8)
>   arg 5: OSLoader=boot
>   arg 6: OSLoadPartition=pci(0)scsi(0)disk(1)rdisk(0)partition(0)
>   arg 7: OSLoadFilename=bsd
>   Boot: pci(0)scsi(0)disk(1)rdisk(0)partition(0)bsd
>   cannot open pci(0)scsi(0)disk(1)rdisk(0)partition(0)/etc/random.seed: 
> Invalid argument
>   open pci(0)scsi(0)disk(1)rdisk(0)partition(0)bsd: Invalid argument
>   Boot FAILED!
> 
> The proper way to fix this is to make the boot blocks read the real
> OpenBSD label instead of assuming sd0a spans volume header partition #0
> in all cases.
> 
> This is achieved by the diff below, which will attempt to use the raw
> disk device (volume header partition #10, equivalent to sd0c) for disk
> accesses and will read both the volume header (to know where the OpenBSD
> disklabel lies) and then the OpenBSD label (to know where sd0a really
> is).
> 
> Of course, this has to cope with the two valid disk syntaxes (true
> ARCBios used on older sgi systems, and ARCS dksc() syntax used on Origin
> and later systems).
> 
> This diff has been tested on O2 (IP32) and Origin 350 (IP27). It is
> written in a conservative way, in order to revert to the existing
> behaviour if anything fails (invalid volume header, no OpenBSD label
> yet...) and should not cause any regression.
> 
> Note that network boots are not affected by these changes.

I tested this also on Octane (IP30), and it works fine. The diff is now
committed. Thanks!

> Index: Makefile32.inc
> ===
> RCS file: /OpenBSD/src/sys/arch/sgi/stand/Makefile32.inc,v
> retrieving revision 1.5
> diff -u -p -r1.5 Makefile32.inc
> --- Makefile32.inc19 Oct 2012 13:51:59 -  1.5
> +++ Makefile32.inc30 Sep 2016 09:19:15 -
> @@ -18,6 +18,7 @@ AS+=-32
>  LD?= ld
>  LD+= -m elf32btsmip
>  LIBSA_CPPFLAGS=
> +CFLAGS+= -DLIBSA_LONGLONG_PRINTF
>  .endif
>  
>  ### Figure out what to use for libsa and libz
> Index: boot/Makefile
> ===
> RCS file: /OpenBSD/src/sys/arch/sgi/stand/boot/Makefile,v
> retrieving revision 1.16
> diff -u -p -r1.16 Makefile
> --- boot/Makefile 30 Jul 2016 03:25:49 -  1.16
> +++ boot/Makefile 30 Sep 2016 09:19:15 -
> @@ -13,14 +13,14 @@ AFLAGS+=  ${SAABI}
>  
>  S=   ${.CURDIR}/../../../..
>  SRCS=start.S arcbios.c boot.c conf.c diskio.c filesystem.c \
> - netfs.c netio.c strchr.c strstr.c
> + netfs.c netio.c strstr.c
>  
>  .PATH:   ${S}/lib/libsa
>  SRCS+=   loadfile.c
>  
>  .PATH:   ${S}/lib/libkern/arch/mips64 ${S}/lib/libkern
> -SRCS+=   strlcpy.c memcpy.c strlen.c strrchr.c strlcat.c 
> strncmp.c \
> - strcmp.S
> +SRCS+=   memcpy.c strchr.c strcmp.S strlcat.c strlcpy.c strlen.c 
> \
> + strncmp.c strrchr.c
>  
>  CLEANFILES+= machine mips64
>  
> Index: boot/diskio.c
> ===
> RCS file: 

Re: Manpage for octuctl(4)

2016-09-18 Thread Visa Hankala
On Sun, Sep 18, 2016 at 03:33:00PM +0200, Ingo Schwarze wrote:
> So, here is a cleaned-up version:
> 
>  - Move the new page to the proper directory.
>  - Mention it in the Makefile.
>  - Put the correct manual page author into the Copyright notice.
>  - Add the architecture to the .Dt line.
>  - Remove the needless .Pq from the .Cd line
>(it might be useful in ehci(4), but not here).
>  - Remove the argument from .Nm in the DESCRIPTION.
>  - Append an AUTHORS section.
> 
> In long and complicated manuals, the AUTHORS section can also contain
> a sentence like:  This manual page was written by Rafael Neves.
> But i don't think that's interesting for such a short manual.
> 
> OK?
>   Ingo

ok visa@


