Re: add in6 multicast support to vxlan(4) ; question on mbufs

2016-10-05 Thread YASUOKA Masahiko

On Wed, 5 Oct 2016 14:30:27 +0200
Mike Belopuhov  wrote:
> On Wed, Oct 05, 2016 at 10:58 +0900, YASUOKA Masahiko wrote:
>> On Tue, 4 Oct 2016 17:27:12 +0200
>> Mike Belopuhov  wrote:
>> > On Tue, Oct 04, 2016 at 01:07 +0200, Vincent Gross wrote:
>> >> On Sat, 24 Sep 2016 10:58:10 +0200
>> >> Vincent Gross  wrote:
>> >> 
>> >> > Hi,
>> >> > 
>> >> [snip]
>> >> > 
>> >> > Aside from the mbuf issue, is this Ok ?
>> >> 
>> >> I will go back on the mbuff stuff later.
>> >> 
>> >> Diff rebased, ok anyone ?
>> >> 
>> >> Index: net/if_vxlan.c
>> >> ===
>> >> RCS file: /cvs/src/sys/net/if_vxlan.c,v
>> >> retrieving revision 1.48
>> >> diff -u -p -r1.48 if_vxlan.c
>> >> --- net/if_vxlan.c30 Sep 2016 10:22:05 -  1.48
>> >> +++ net/if_vxlan.c3 Oct 2016 23:12:42 -
>> >> @@ -638,7 +749,9 @@ vxlan_lookup(struct mbuf *m, struct udph
>> >>   if (m->m_pkthdr.len < skip + sizeof(struct ether_header))
>> >>   return (EINVAL);
>> >>  
>> >> - m_adj(m, skip);
>> >> + m_adj(m, skip - ETHER_ALIGN);
>> >> + m = m_pullup(m, ETHER_HDR_LEN + ETHER_ALIGN);
>> >> + m_adj(m, ETHER_ALIGN);
>> >>   ifp = >sc_ac.ac_if;
>> >>  
>> >>  #if NBRIDGE > 0
>> > 
>> > I think this chunk is correct.  First you ensure that m->m_data
>> > points to a contiguous and well-aligned ethernet header and then
>> > you trim the alignment so that mtod() points directly at it.
>> 
>> Isn't it possible that m_pullup() may return non aligned pointer?
>> 
> 
> Do you mean if m->m_data pointer can be not aligned properly after
> an m_pullup?

Sorry.  Yes, I meant that.

> Of course, if the header layout is such that m_pullup doesn't need
> to move data around and at the same time the header that we're going
> to m_adj[ust] to is not aligned properly, this is going to be a
> problem.

The diff I sent to hackers@ is including my lastest proposal

@@ -657,6 +658,16 @@ vxlan_lookup(struct mbuf *m, struct udph
 #if NPF > 0
pf_pkt_addr_changed(m);
 #endif
+   if ((m = m_pullup(m, sizeof(struct ether_header))) == NULL)
+   return (ENOBUFS);
+
+   if (!ALIGNED_POINTER(mtod(m, caddr_t) + ETHER_HDR_LEN, uint32_t)) {
+   m0 = m;
+   m = m_dup_pkt(m0, ETHER_ALIGN, M_NOWAIT);
+   m_freem(m0);
+   if (m == NULL)
+   return (ENOBUFS);
+   }
 
ml_enqueue(, m);
if_input(ifp, );


We might have a better way, but I think above diff does what we need
to do here simply.

  1. ensure entire ethernet header is on first mbuf for ether_input()
  2. align 32bit at the ethernet payload (eg. IP)

--yasuoka



Re: syslogd milliseconds timestamp

2016-10-05 Thread David Gwynne

> On 6 Oct 2016, at 07:07, Alexander Bluhm  wrote:
> 
> Hi,
> 
> RFC 5424 says you should add 1 to 6 digits fractions of a second
> to each syslog timestamp.  As we do not measure the time in syslog(3),
> it takes 100 microseconds on my laptop before syslogd(8) adds the
> timestamp.  So I have decided to use only 3 digits with millisecond
> precision.  Note that this is only visible with syslogd -Z.
> 
> Here is the difference between the old an new format:
> 2016-10-05T20:55:26Z t430s syslogd: exiting on signal 15
> 2016-10-05T20:55:26.160Z t430s syslogd: start
> 
> ok?

ok

> 
> bluhm
> 
> Index: usr.sbin/syslogd/syslogd.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/usr.sbin/syslogd/syslogd.c,v
> retrieving revision 1.216
> diff -u -p -r1.216 syslogd.c
> --- usr.sbin/syslogd/syslogd.c4 Oct 2016 22:09:21 -   1.216
> +++ usr.sbin/syslogd/syslogd.c5 Oct 2016 20:12:02 -
> @@ -1577,7 +1577,7 @@ printsys(char *msg)
>   }
> }
> 
> -time_t   now;
> +struct timeval   now;
> 
> /*
>  * Log a message to the appropriate log files, users, etc. based on
> @@ -1587,7 +1587,6 @@ void
> logmsg(int pri, char *msg, char *from, int flags)
> {
>   struct filed *f;
> - struct tm *tm;
>   int fac, msglen, prilev, i;
>   char timestamp[33];
>   char prog[NAME_MAX+1];
> @@ -1665,13 +1664,18 @@ logmsg(int pri, char *msg, char *from, i
>   flags |= ADDDATE;
>   }
> 
> - (void)time();
> + (void)gettimeofday(, NULL);
>   if (flags & ADDDATE) {
>   if (ZuluTime) {
> - tm = gmtime();
> - strftime(timestamp, sizeof(timestamp), "%FT%TZ", tm);
> + struct tm *tm;
> + size_t l;
> +
> + tm = gmtime(_sec);
> + l = strftime(timestamp, sizeof(timestamp), "%FT%T", tm);
> + snprintf(timestamp + l, sizeof(timestamp) -l, ".%03ldZ",
> + now.tv_usec / 1000);
>   } else
> - strlcpy(timestamp, ctime() + 4, 16);
> + strlcpy(timestamp, ctime(_sec) + 4, 16);
>   }
> 
>   /* extract facility and priority level */
> @@ -1724,7 +1728,8 @@ logmsg(int pri, char *msg, char *from, i
>   continue;
> 
>   /* don't output marks to recently written files */
> - if ((flags & MARK) && (now - f->f_time) < MarkInterval / 2)
> + if ((flags & MARK) &&
> + (now.tv_sec - f->f_time) < MarkInterval / 2)
>   continue;
> 
>   /*
> @@ -1737,7 +1742,7 @@ logmsg(int pri, char *msg, char *from, i
>   sizeof(f->f_lasttime));
>   f->f_prevcount++;
>   logdebug("msg repeated %d times, %ld sec of %d\n",
> - f->f_prevcount, (long)(now - f->f_time),
> + f->f_prevcount, (long)(now.tv_sec - f->f_time),
>   repeatinterval[f->f_repeatcount]);
>   /*
>* If domark would have logged this by now,
> @@ -1745,7 +1750,7 @@ logmsg(int pri, char *msg, char *from, i
>* but back off so we'll flush less often
>* in the future.
>*/
> - if (now > REPEATTIME(f)) {
> + if (now.tv_sec > REPEATTIME(f)) {
>   fprintlog(f, flags, (char *)NULL);
>   BACKOFF(f);
>   }
> @@ -1787,7 +1792,7 @@ fprintlog(struct filed *f, int flags, ch
>   if (f->f_type == F_WALL) {
>   l = snprintf(greetings, sizeof(greetings),
>   "\r\n\7Message from syslogd@%s at %.24s ...\r\n",
> - f->f_prevhost, ctime());
> + f->f_prevhost, ctime(_sec));
>   if (l < 0 || (size_t)l >= sizeof(greetings))
>   l = strlen(greetings);
>   v->iov_base = greetings;
> @@ -1844,7 +1849,7 @@ fprintlog(struct filed *f, int flags, ch
>   v++;
> 
>   logdebug("Logging to %s", TypeNames[f->f_type]);
> - f->f_time = now;
> + f->f_time = now.tv_sec;
> 
>   switch (f->f_type) {
>   case F_UNUSED:
> @@ -1946,8 +1951,8 @@ fprintlog(struct filed *f, int flags, ch
> 
>   /* pipe is non-blocking. log and drop message if full */
>   if (e == EAGAIN && f->f_type == F_PIPE) {
> - if (now - f->f_lasterrtime > 120) {
> - f->f_lasterrtime = now;
> + if (now.tv_sec - f->f_lasterrtime > 120) {
> + f->f_lasterrtime = now.tv_sec;
>   logerror(f->f_un.f_fname);
>  

syslogd milliseconds timestamp

2016-10-05 Thread Alexander Bluhm
Hi,

RFC 5424 says you should add 1 to 6 digits fractions of a second
to each syslog timestamp.  As we do not measure the time in syslog(3),
it takes 100 microseconds on my laptop before syslogd(8) adds the
timestamp.  So I have decided to use only 3 digits with millisecond
precision.  Note that this is only visible with syslogd -Z.

Here is the difference between the old an new format:
2016-10-05T20:55:26Z t430s syslogd: exiting on signal 15
2016-10-05T20:55:26.160Z t430s syslogd: start

ok?

bluhm

Index: usr.sbin/syslogd/syslogd.c
===
RCS file: /data/mirror/openbsd/cvs/src/usr.sbin/syslogd/syslogd.c,v
retrieving revision 1.216
diff -u -p -r1.216 syslogd.c
--- usr.sbin/syslogd/syslogd.c  4 Oct 2016 22:09:21 -   1.216
+++ usr.sbin/syslogd/syslogd.c  5 Oct 2016 20:12:02 -
@@ -1577,7 +1577,7 @@ printsys(char *msg)
}
 }
 
-time_t now;
+struct timeval now;
 
 /*
  * Log a message to the appropriate log files, users, etc. based on
@@ -1587,7 +1587,6 @@ void
 logmsg(int pri, char *msg, char *from, int flags)
 {
struct filed *f;
-   struct tm *tm;
int fac, msglen, prilev, i;
char timestamp[33];
char prog[NAME_MAX+1];
@@ -1665,13 +1664,18 @@ logmsg(int pri, char *msg, char *from, i
flags |= ADDDATE;
}
 
-   (void)time();
+   (void)gettimeofday(, NULL);
if (flags & ADDDATE) {
if (ZuluTime) {
-   tm = gmtime();
-   strftime(timestamp, sizeof(timestamp), "%FT%TZ", tm);
+   struct tm *tm;
+   size_t l;
+
+   tm = gmtime(_sec);
+   l = strftime(timestamp, sizeof(timestamp), "%FT%T", tm);
+   snprintf(timestamp + l, sizeof(timestamp) -l, ".%03ldZ",
+   now.tv_usec / 1000);
} else
-   strlcpy(timestamp, ctime() + 4, 16);
+   strlcpy(timestamp, ctime(_sec) + 4, 16);
}
 
/* extract facility and priority level */
@@ -1724,7 +1728,8 @@ logmsg(int pri, char *msg, char *from, i
continue;
 
/* don't output marks to recently written files */
-   if ((flags & MARK) && (now - f->f_time) < MarkInterval / 2)
+   if ((flags & MARK) &&
+   (now.tv_sec - f->f_time) < MarkInterval / 2)
continue;
 
/*
@@ -1737,7 +1742,7 @@ logmsg(int pri, char *msg, char *from, i
sizeof(f->f_lasttime));
f->f_prevcount++;
logdebug("msg repeated %d times, %ld sec of %d\n",
-   f->f_prevcount, (long)(now - f->f_time),
+   f->f_prevcount, (long)(now.tv_sec - f->f_time),
repeatinterval[f->f_repeatcount]);
/*
 * If domark would have logged this by now,
@@ -1745,7 +1750,7 @@ logmsg(int pri, char *msg, char *from, i
 * but back off so we'll flush less often
 * in the future.
 */
-   if (now > REPEATTIME(f)) {
+   if (now.tv_sec > REPEATTIME(f)) {
fprintlog(f, flags, (char *)NULL);
BACKOFF(f);
}
@@ -1787,7 +1792,7 @@ fprintlog(struct filed *f, int flags, ch
if (f->f_type == F_WALL) {
l = snprintf(greetings, sizeof(greetings),
"\r\n\7Message from syslogd@%s at %.24s ...\r\n",
-   f->f_prevhost, ctime());
+   f->f_prevhost, ctime(_sec));
if (l < 0 || (size_t)l >= sizeof(greetings))
l = strlen(greetings);
v->iov_base = greetings;
@@ -1844,7 +1849,7 @@ fprintlog(struct filed *f, int flags, ch
v++;
 
logdebug("Logging to %s", TypeNames[f->f_type]);
-   f->f_time = now;
+   f->f_time = now.tv_sec;
 
switch (f->f_type) {
case F_UNUSED:
@@ -1946,8 +1951,8 @@ fprintlog(struct filed *f, int flags, ch
 
/* pipe is non-blocking. log and drop message if full */
if (e == EAGAIN && f->f_type == F_PIPE) {
-   if (now - f->f_lasterrtime > 120) {
-   f->f_lasterrtime = now;
+   if (now.tv_sec - f->f_lasterrtime > 120) {
+   f->f_lasterrtime = now.tv_sec;
logerror(f->f_un.f_fname);
}
break;
@@ -2884,7 +2889,7 @@ markit(void)
 {
struct filed *f;
 
-   now = time(NULL);
+   (void)gettimeofday(, NULL);

Re: hide wpi firmware error log

2016-10-05 Thread Theo Buehler
On Wed, Oct 05, 2016 at 10:35:50PM +0200, Stefan Sperling wrote:
> I don't think any normal human being understands the lines
> printed by the wpi(4) driver after 'fatal firmware error'.
> 
> wpi0: fatal firmware error
> firmware error log (count=1):
>   error type = "UNKNOWN" (0x0013)
>   error data  = 0x0070
>   branch link = 0x08B60274
>   interrupt link  = 0x03206AA8
>   time= 3028342675
> driver status:
>   tx ring  0: qid=0  cur=178 queued=21 
>   tx ring  1: qid=1  cur=0   queued=0  
>   tx ring  2: qid=2  cur=0   queued=0  
>   tx ring  3: qid=3  cur=0   queued=0  
>   tx ring  4: qid=4  cur=21  queued=0  
>   tx ring  5: qid=5  cur=0   queued=0  
>   rx ring: cur=0
>   802.11 state 4
> 
> Anyone seriously looking at this is hacking the driver anyway.
> For everyone else it's just dmesg spam.
> Let's ifdef this away into WPI_DEBUG like it's done in iwm(4).
> 
> ok?
> 

Yes please. I saw this today as I dusted off and used an Acer and was
wondering what this would mean without having the time to investigate. I
was surprised that the net still worked.  Please get rid of it.  It's
confusing.

> Index: if_wpi.c
> ===
> RCS file: /cvs/src/sys/dev/pci/if_wpi.c,v
> retrieving revision 1.135
> diff -u -p -r1.135 if_wpi.c
> --- if_wpi.c  5 Sep 2016 08:18:40 -   1.135
> +++ if_wpi.c  5 Oct 2016 20:27:12 -
> @@ -1526,6 +1526,7 @@ wpi_notif_intr(struct wpi_softc *sc)
>   WPI_WRITE(sc, WPI_FH_RX_WPTR, hw & ~7);
>  }
>  
> +#ifdef WPI_DEBUG
>  /*
>   * Dump the error log of the firmware when a firmware panic occurs.  Although
>   * we can't debug the firmware because it is neither open source nor free, it
> @@ -1593,6 +1594,7 @@ wpi_fatal_intr(struct wpi_softc *sc)
>   printf("  802.11 state %d\n", sc->sc_ic.ic_state);
>  #undef N
>  }
> +#endif
>  
>  int
>  wpi_intr(void *arg)
> @@ -1622,7 +1624,9 @@ wpi_intr(void *arg)
>   if (r1 & (WPI_INT_SW_ERR | WPI_INT_HW_ERR)) {
>   printf("%s: fatal firmware error\n", sc->sc_dev.dv_xname);
>   /* Dump firmware error log and stop. */
> +#ifdef WPI_DEBUG
>   wpi_fatal_intr(sc);
> +#endif
>   wpi_stop(ifp, 1);
>   task_add(systq, >init_task);
>   return 1;
> 



hide wpi firmware error log

2016-10-05 Thread Stefan Sperling
I don't think any normal human being understands the lines
printed by the wpi(4) driver after 'fatal firmware error'.

wpi0: fatal firmware error
firmware error log (count=1):
  error type = "UNKNOWN" (0x0013)
  error data  = 0x0070
  branch link = 0x08B60274
  interrupt link  = 0x03206AA8
  time= 3028342675
driver status:
  tx ring  0: qid=0  cur=178 queued=21 
  tx ring  1: qid=1  cur=0   queued=0  
  tx ring  2: qid=2  cur=0   queued=0  
  tx ring  3: qid=3  cur=0   queued=0  
  tx ring  4: qid=4  cur=21  queued=0  
  tx ring  5: qid=5  cur=0   queued=0  
  rx ring: cur=0
  802.11 state 4

Anyone seriously looking at this is hacking the driver anyway.
For everyone else it's just dmesg spam.
Let's ifdef this away into WPI_DEBUG like it's done in iwm(4).

ok?

Index: if_wpi.c
===
RCS file: /cvs/src/sys/dev/pci/if_wpi.c,v
retrieving revision 1.135
diff -u -p -r1.135 if_wpi.c
--- if_wpi.c5 Sep 2016 08:18:40 -   1.135
+++ if_wpi.c5 Oct 2016 20:27:12 -
@@ -1526,6 +1526,7 @@ wpi_notif_intr(struct wpi_softc *sc)
WPI_WRITE(sc, WPI_FH_RX_WPTR, hw & ~7);
 }
 
+#ifdef WPI_DEBUG
 /*
  * Dump the error log of the firmware when a firmware panic occurs.  Although
  * we can't debug the firmware because it is neither open source nor free, it
@@ -1593,6 +1594,7 @@ wpi_fatal_intr(struct wpi_softc *sc)
printf("  802.11 state %d\n", sc->sc_ic.ic_state);
 #undef N
 }
+#endif
 
 int
 wpi_intr(void *arg)
@@ -1622,7 +1624,9 @@ wpi_intr(void *arg)
if (r1 & (WPI_INT_SW_ERR | WPI_INT_HW_ERR)) {
printf("%s: fatal firmware error\n", sc->sc_dev.dv_xname);
/* Dump firmware error log and stop. */
+#ifdef WPI_DEBUG
wpi_fatal_intr(sc);
+#endif
wpi_stop(ifp, 1);
task_add(systq, >init_task);
return 1;



Can we get rid of _SUBDIRUSE for both "all" and "regress"?

2016-10-05 Thread Ingo Schwarze
Hi,

a Makefile including bsd.regress.mk cannot reasonably set both
SUBDIR and REGRESS_TARGETS right now, or "make all" will first
run "make regress" in all subdirs, and then run "make all" in
all subdirs, because "all" depends on "regress", and both depend
on _SUBDIRUSE.  So all the SUBDIR tests will get run twice.

That's bad because if a test suite is growing organically -
which is a good thing! - the need to have both in the top level
directory arises naturally:  You first put some basic tests there,
and later realize that more refined tests for specific subsystems
would be worthwhile too, such that you then need SUBDIR as well.

A prominent example exhibiting the problem is:
  /usr/src/regress/usr.bin/ssh/Makefile

The following patch fixes the problem by avoiding the descent for
"regress" if we are going to descend for "all" later.
I have an OK from bluhm@ for it.

But before fiddling with /usr/share/mk/, i decided to be cautious
and show the patch here, such that you have a chance to speak up
if you see any potential problems.

Unless somebody raises any concerns, i'm going to commit.

Yours,
  Ingo

P.S.
clean and cleandir show the same problem as regress and all;
of course, it's less important for them, but why not fix them, too.


Index: bsd.subdir.mk
===
RCS file: /cvs/src/share/mk/bsd.subdir.mk,v
retrieving revision 1.21
diff -u -p -r1.21 bsd.subdir.mk
--- bsd.subdir.mk   8 Mar 2015 21:59:48 -   1.21
+++ bsd.subdir.mk   29 Sep 2016 11:37:40 -
@@ -76,11 +76,17 @@ realinstall: beforeinstall _SUBDIRUSE
 .endif
 
 
-.for t in all clean cleandir includes depend obj tags regress manlint
+.for t in all cleandir includes depend obj tags manlint
 .  if !target($t)
 $t: _SUBDIRUSE
 .  endif
 .endfor
+.if !target(regress) && empty(.TARGETS:Mall)
+regress: _SUBDIRUSE
+.endif
+.if !target(clean) && empty(.TARGETS:Mcleandir)
+clean: _SUBDIRUSE
+.endif
 
 .if !defined(BSD_OWN_MK)
 .  include 



Re: add in6 multicast support to vxlan(4) ; question on mbufs

2016-10-05 Thread Mike Belopuhov
On Wed, Oct 05, 2016 at 20:36 +0200, Mike Belopuhov wrote:
> On Wed, Oct 05, 2016 at 14:30 +0200, Mike Belopuhov wrote:
> > On Wed, Oct 05, 2016 at 10:58 +0900, YASUOKA Masahiko wrote:
> > > On Tue, 4 Oct 2016 17:27:12 +0200
> > > Mike Belopuhov  wrote:
> > > > On Tue, Oct 04, 2016 at 01:07 +0200, Vincent Gross wrote:
> > > >> On Sat, 24 Sep 2016 10:58:10 +0200
> > > >> Vincent Gross  wrote:
> > > >> 
> > > >> > Hi,
> > > >> > 
> > > >> [snip]
> > > >> > 
> > > >> > Aside from the mbuf issue, is this Ok ?
> > > >> 
> > > >> I will go back on the mbuff stuff later.
> > > >> 
> > > >> Diff rebased, ok anyone ?
> > > >> 
> > > >> Index: net/if_vxlan.c
> > > >> ===
> > > >> RCS file: /cvs/src/sys/net/if_vxlan.c,v
> > > >> retrieving revision 1.48
> > > >> diff -u -p -r1.48 if_vxlan.c
> > > >> --- net/if_vxlan.c 30 Sep 2016 10:22:05 -  1.48
> > > >> +++ net/if_vxlan.c 3 Oct 2016 23:12:42 -
> > > >> @@ -638,7 +749,9 @@ vxlan_lookup(struct mbuf *m, struct udph
> > > >>if (m->m_pkthdr.len < skip + sizeof(struct ether_header))
> > > >>return (EINVAL);
> > > >>  
> > > >> -  m_adj(m, skip);
> > > >> +  m_adj(m, skip - ETHER_ALIGN);
> > > >> +  m = m_pullup(m, ETHER_HDR_LEN + ETHER_ALIGN);
> > > >> +  m_adj(m, ETHER_ALIGN);
> > > >>ifp = >sc_ac.ac_if;
> > > >>  
> > > >>  #if NBRIDGE > 0
> > > > 
> > > > I think this chunk is correct.  First you ensure that m->m_data
> > > > points to a contiguous and well-aligned ethernet header and then
> > > > you trim the alignment so that mtod() points directly at it.
> > > 
> > > Isn't it possible that m_pullup() may return non aligned pointer?
> > > 
> > 
> > Do you mean if m->m_data pointer can be not aligned properly after
> > an m_pullup?  Of course, if the header layout is such that m_pullup
> > doesn't need to move data around and at the same time the header
> > that we're going to m_adj[ust] to is not aligned properly, this is
> > going to be a problem.
> > 
> > I wrote a test program (attached) that illustrates the change in
> > behavior when compiled w/o any options versus when compiled with
> > -DTEST3 (cc -DTEST3 -o mbuf mbuf.c)
> > 
> > Meaning that m_pullup might not be enough in a generic case.
> 
> I have come up with a code that looks like this:
> 
> struct mbuf *n;
> int off;
> 
> n = m_getptr(m_head, skip + ETHER_HDR_LEN, );
> if ((m_head->m_len == m_head->m_pkthdr.len) &&
> ((n = m_getptr(m_head, skip + ETHER_HDR_LEN, )) == m_head) &&
> !ALIGNED_POINTER(mtod(n, char *) + off, uint32_t)) {
> m_adj(m_head, skip - ETHER_ALIGN);
> memmove(mtod(m_head, char *) - ETHER_ALIGN,
> mtod(m_head, char *), m_head->m_len);
> m_head->m_len -= ETHER_ALIGN;
> } else if ((n = m_getptr(m_head, skip, )) != m_head) {
> /* Move Ethernet header to the first mbuf */
> m_adj(m_head, skip - ETHER_HDR_LEN - ETHER_ALIGN);
> m_copydata(n, off, ETHER_HDR_LEN, mtod(m_head, char *) +
> ETHER_ALIGN);
> m_adj(m_head, ETHER_ALIGN);
> /* See if we need to shift the data */
> if (!ALIGNED_POINTER(mtod(n, char *) + ETHER_HDR_LEN, uint32_t)) {
> memmove(mtod(n, char *), mtod(n, char *) + ETHER_HDR_LEN,
> n->m_len - ETHER_HDR_LEN);
> m_adj(n, -ETHER_HDR_LEN);
> } else {
> m_adj(n, ETHER_HDR_LEN);
> }
> } else {
> m_adj(m_head, skip);
> }
> 
> But then I've realised that ether_input simply does m_adj(m, ETHER_HDR_LEN)
> and passes the mbuf on to the ipv4_input where we do mtod(m, struct ip *).
> Meaning that neither this funny code nor a simple m_pullup of the Ethernet
> header will work.  Ethernet header as well as an IP header must reside in
> a contiguous space and be well aligned.  Sigh.

Heh, rzalamena@ has noticed that ipv4_input does in fact do a pullup so
that should prevent problems where IP header is in the different mbuf.
However this doesn't solve the IP header alignment issue which is solved
by my code above.  So it's either that or a full copy of the packet via
m_dup_pkt.  I'm attaching an improved testing framework that I'm using.

#include 
#include 

#include 
#include 
#include 
#include 
#include 
#include 

/* sparc64 version */
#define XALIGNED_POINTER(p, t)  unsigned long)(p)) & (sizeof(t) - 1)) == 0)
/* amd64 version */
#define ALIGNED_POINTER(p, t)   1

#define panic(x...) errx(1, x)

unsigned int
min(unsigned int a, unsigned int b)
{
return (a < b ? a : b);
}

void
m_extfree(struct mbuf *m)
{
free(m->m_ext.ext_buf);

m->m_flags &= ~(M_EXT|M_EXTWR);
}

struct mbuf *
m_free(struct mbuf *m)
{
struct mbuf *n;

if (m == NULL)
return (NULL);

n = m->m_next;

if (m->m_flags & M_EXT)
m_extfree(m);

 

Re: Help me testing the netlock

2016-10-05 Thread Lampshade
panic: rw_enter: netlock locking against myself

 TIDPID UIDPRFLAGS
 12705 12705 1041 0x23
*62630 62630 1000 0x32
   
PFLAGS CPU COMMAND
0  1chrome
0  0 Xorg

panic()
rw_enter()
rw_enter_write()
rw_enter_timer()
timeout_run()
softlock()
softintr_dispatch()
Xsoftlock()
---interrupt---
param.c at 0x8
Bad frame pointer: 0x800032d78 b14

Above was similar with ddbcpu 0
Following machine
ddbcpu 1

x86_ipi_handler()
Xresume_lapic_ipi_
---interrupt---
__mp_lock()
syscall()
---syscall (number 27)---
end of kernel



Re: add in6 multicast support to vxlan(4) ; question on mbufs

2016-10-05 Thread Mike Belopuhov
On Wed, Oct 05, 2016 at 14:30 +0200, Mike Belopuhov wrote:
> On Wed, Oct 05, 2016 at 10:58 +0900, YASUOKA Masahiko wrote:
> > On Tue, 4 Oct 2016 17:27:12 +0200
> > Mike Belopuhov  wrote:
> > > On Tue, Oct 04, 2016 at 01:07 +0200, Vincent Gross wrote:
> > >> On Sat, 24 Sep 2016 10:58:10 +0200
> > >> Vincent Gross  wrote:
> > >> 
> > >> > Hi,
> > >> > 
> > >> [snip]
> > >> > 
> > >> > Aside from the mbuf issue, is this Ok ?
> > >> 
> > >> I will go back on the mbuff stuff later.
> > >> 
> > >> Diff rebased, ok anyone ?
> > >> 
> > >> Index: net/if_vxlan.c
> > >> ===
> > >> RCS file: /cvs/src/sys/net/if_vxlan.c,v
> > >> retrieving revision 1.48
> > >> diff -u -p -r1.48 if_vxlan.c
> > >> --- net/if_vxlan.c   30 Sep 2016 10:22:05 -  1.48
> > >> +++ net/if_vxlan.c   3 Oct 2016 23:12:42 -
> > >> @@ -638,7 +749,9 @@ vxlan_lookup(struct mbuf *m, struct udph
> > >>  if (m->m_pkthdr.len < skip + sizeof(struct ether_header))
> > >>  return (EINVAL);
> > >>  
> > >> -m_adj(m, skip);
> > >> +m_adj(m, skip - ETHER_ALIGN);
> > >> +m = m_pullup(m, ETHER_HDR_LEN + ETHER_ALIGN);
> > >> +m_adj(m, ETHER_ALIGN);
> > >>  ifp = >sc_ac.ac_if;
> > >>  
> > >>  #if NBRIDGE > 0
> > > 
> > > I think this chunk is correct.  First you ensure that m->m_data
> > > points to a contiguous and well-aligned ethernet header and then
> > > you trim the alignment so that mtod() points directly at it.
> > 
> > Isn't it possible that m_pullup() may return non aligned pointer?
> > 
> 
> Do you mean if m->m_data pointer can be not aligned properly after
> an m_pullup?  Of course, if the header layout is such that m_pullup
> doesn't need to move data around and at the same time the header
> that we're going to m_adj[ust] to is not aligned properly, this is
> going to be a problem.
> 
> I wrote a test program (attached) that illustrates the change in
> behavior when compiled w/o any options versus when compiled with
> -DTEST3 (cc -DTEST3 -o mbuf mbuf.c)
> 
> Meaning that m_pullup might not be enough in a generic case.

I have come up with a code that looks like this:

struct mbuf *n;
int off;

n = m_getptr(m_head, skip + ETHER_HDR_LEN, );
if ((m_head->m_len == m_head->m_pkthdr.len) &&
((n = m_getptr(m_head, skip + ETHER_HDR_LEN, )) == m_head) &&
!ALIGNED_POINTER(mtod(n, char *) + off, uint32_t)) {
m_adj(m_head, skip - ETHER_ALIGN);
memmove(mtod(m_head, char *) - ETHER_ALIGN,
mtod(m_head, char *), m_head->m_len);
m_head->m_len -= ETHER_ALIGN;
} else if ((n = m_getptr(m_head, skip, )) != m_head) {
/* Move Ethernet header to the first mbuf */
m_adj(m_head, skip - ETHER_HDR_LEN - ETHER_ALIGN);
m_copydata(n, off, ETHER_HDR_LEN, mtod(m_head, char *) +
ETHER_ALIGN);
m_adj(m_head, ETHER_ALIGN);
/* See if we need to shift the data */
if (!ALIGNED_POINTER(mtod(n, char *) + ETHER_HDR_LEN, uint32_t)) {
memmove(mtod(n, char *), mtod(n, char *) + ETHER_HDR_LEN,
n->m_len - ETHER_HDR_LEN);
m_adj(n, -ETHER_HDR_LEN);
} else {
m_adj(n, ETHER_HDR_LEN);
}
} else {
m_adj(m_head, skip);
}

But then I've realised that ether_input simply does m_adj(m, ETHER_HDR_LEN)
and passes the mbuf on to the ipv4_input where we do mtod(m, struct ip *).
Meaning that neither this funny code nor a simple m_pullup of the Ethernet
header will work.  Ethernet header as well as an IP header must reside in
a contiguous space and be well aligned.  Sigh.



Re: httpd(8): dup2() fix for proc.c

2016-10-05 Thread Reyk Floeter
On Wed, Oct 05, 2016 at 06:56:54PM +0200, Rafael Zalamena wrote:
> This diff fixes the same problem ntpd(8) had with the dup2() when oldd == 
> newd.
> 
> Quick background:
> when you dup2(oldd, newd) and oldd == newd the CLOEXEC flag won't be removed
> by the descriptor. We could use dup3() to detect this, but it is easier/faster
> just to compare the fds and do the fcntl() ourselves.
> 
> ok?
> 

OK (needs a little merge with the setsid() commit).

Please sync it to {httpd,switchd,relayd,vmd}/proc.c
(all the new fork+exec style variants)

Reyk

> 
> Index: proc.c
> ===
> RCS file: /home/obsdcvs/src/usr.sbin/httpd/proc.c,v
> retrieving revision 1.27
> diff -u -p -r1.27 proc.c
> --- proc.c28 Sep 2016 12:01:04 -  1.27
> +++ proc.c5 Oct 2016 16:50:40 -
> @@ -22,6 +22,7 @@
>  #include 
>  #include 
>  
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -131,7 +132,12 @@ proc_exec(struct privsep *ps, struct pri
>   break;
>   case 0:
>   /* Prepare parent socket. */
> - dup2(fd, PROC_PARENT_SOCK_FILENO);
> + if (fd != PROC_PARENT_SOCK_FILENO) {
> + if (dup2(fd, PROC_PARENT_SOCK_FILENO)
> + == -1)
> + fatal("dup2");
> + } else if (fcntl(fd, F_SETFD, 0) == -1)
> + fatal("fcntl");
>  
>   execvp(argv[0], nargv);
>   fatal("%s: execvp", __func__);
> 

-- 
Esdenera Networks GmbH
Konkordiastr. 14b, 30449 Hannover, Germany
mobile: +49-151-24018199, reyk.floe...@esdenera.com
http://www.esdenera.com/, twitter: @esdenera

Managing Director: Reyk Floeter
Jurisdiction: HRB 209963, Hannover
USt-IdNr. (VAT-ID): DE289693407



Re: syslogd fork+exec

2016-10-05 Thread Alexander Bluhm
On Sun, Oct 02, 2016 at 11:07:21PM +0200, Alexander Bluhm wrote:
> On Sat, Oct 01, 2016 at 07:41:13PM +0200, Rafael Zalamena wrote:
> > This could be replaced with "closefrom(4);".
> 
> Updated diff:
> - use closefrom(2)
> - use execvp(3) to allow starting syslogd(8) without full path
> - add a debug message to make testing easier

Merged with -current.  Any ok?

bluhm

Index: privsep.c
===
RCS file: /data/mirror/openbsd/cvs/src/usr.sbin/syslogd/privsep.c,v
retrieving revision 1.61
diff -u -p -r1.61 privsep.c
--- privsep.c   28 Jun 2016 18:22:50 -  1.61
+++ privsep.c   5 Oct 2016 17:11:32 -
@@ -2,6 +2,7 @@
 
 /*
  * Copyright (c) 2003 Anil Madhavapeddy 
+ * Copyright (c) 2016 Alexander Bluhm 
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -71,9 +72,6 @@ enum cmd_types {
 
 static int priv_fd = -1;
 static volatile pid_t child_pid = -1;
-static char config_file[PATH_MAX];
-static struct stat cf_info;
-static int allow_getnameinfo = 0;
 static volatile sig_atomic_t cur_state = STATE_INIT;
 
 /* Queue for the allowed logfiles */
@@ -94,25 +92,12 @@ static void must_read(int, void *, size_
 static void must_write(int, void *, size_t);
 static int  may_read(int, void *, size_t);
 
-int
-priv_init(char *conf, int numeric, int lockfd, int nullfd, char *argv[])
+void
+priv_init(int lockfd, int nullfd, int argc, char *argv[])
 {
-   int i, fd, socks[2], cmd, addr_len, result, restart;
-   size_t path_len, protoname_len, hostname_len, servname_len;
-   char path[PATH_MAX], protoname[5];
-   char hostname[NI_MAXHOST], servname[NI_MAXSERV];
-   struct sockaddr_storage addr;
-   struct stat cf_stat;
+   int i, socks[2];
struct passwd *pw;
-   struct addrinfo hints, *res0;
-   struct sigaction sa;
-
-   memset(, 0, sizeof(sa));
-   sigemptyset(_mask);
-   sa.sa_flags = SA_RESTART;
-   sa.sa_handler = SIG_DFL;
-   for (i = 1; i < _NSIG; i++)
-   sigaction(i, , NULL);
+   char childnum[11], **privargv;
 
/* Create sockets */
if (socketpair(AF_LOCAL, SOCK_STREAM, PF_UNSPEC, socks) == -1)
@@ -141,12 +126,9 @@ priv_init(char *conf, int numeric, int l
err(1, "setresuid() failed");
close(socks[0]);
priv_fd = socks[1];
-   return 0;
+   return;
}
-
-   if (pledge("stdio rpath wpath cpath dns getpw sendfd id proc exec",
-   NULL) == -1)
-   err(1, "pledge priv");
+   close(socks[1]);
 
if (!Debug) {
close(lockfd);
@@ -157,20 +139,6 @@ priv_init(char *conf, int numeric, int l
if (nullfd > 2)
close(nullfd);
 
-   /* Father */
-   /* Pass TERM/HUP/INT/QUIT through to child, and accept CHLD */
-   sa.sa_handler = sig_pass_to_chld;
-   sigaction(SIGTERM, , NULL);
-   sigaction(SIGHUP, , NULL);
-   sigaction(SIGINT, , NULL);
-   sigaction(SIGQUIT, , NULL);
-   sa.sa_handler = sig_got_chld;
-   sa.sa_flags |= SA_NOCLDSTOP;
-   sigaction(SIGCHLD, , NULL);
-
-   setproctitle("[priv]");
-   close(socks[1]);
-
/* Close descriptors that only the unpriv child needs */
if (fd_ctlconn != -1)
close(fd_ctlconn);
@@ -194,38 +162,87 @@ priv_init(char *conf, int numeric, int l
if (fd_unix[i] != -1)
close(fd_unix[i]);
 
-   /* Save the config file specified by the child process */
-   if (strlcpy(config_file, conf, sizeof config_file) >= 
sizeof(config_file))
-   errx(1, "config_file truncation");
+   if (dup3(socks[0], 3, 0) == -1)
+   err(1, "dup3 priv sock failed");
+   snprintf(childnum, sizeof(childnum), "%d", child_pid);
+   if ((privargv = reallocarray(NULL, argc + 3, sizeof(char *))) == NULL)
+   err(1, "alloc priv argv failed");
+   for (i = 0; i < argc; i++)
+   privargv[i] = argv[i];
+   privargv[i++] = "-P";
+   privargv[i++] = childnum;
+   privargv[i++] = NULL;
+   execvp(privargv[0], privargv);
+   err(1, "exec priv '%s' failed", privargv[0]);
+}
 
-   if (stat(config_file, _info) < 0)
-   err(1, "stat config file failed");
+__dead void
+priv_exec(char *conf, int numeric, int child, int argc, char *argv[])
+{
+   int i, fd, sock, cmd, addr_len, result, restart;
+   size_t path_len, protoname_len, hostname_len, servname_len;
+   char path[PATH_MAX], protoname[5];
+   char hostname[NI_MAXHOST], servname[NI_MAXSERV];
+   struct sockaddr_storage addr;
+   struct stat cf_info, cf_stat;
+   struct addrinfo hints, *res0;
+   struct sigaction sa;
+
+   if (pledge("stdio rpath wpath cpath dns getpw sendfd id 

Re: httpd(8): dup2() fix for proc.c

2016-10-05 Thread Alexander Bluhm
On Wed, Oct 05, 2016 at 06:56:54PM +0200, Rafael Zalamena wrote:
> This diff fixes the same problem ntpd(8) had with the dup2() when oldd == 
> newd.
> 
> Quick background:
> when you dup2(oldd, newd) and oldd == newd the CLOEXEC flag won't be removed
> by the descriptor. We could use dup3() to detect this, but it is easier/faster
> just to compare the fds and do the fcntl() ourselves.
> 
> ok?

OK bluhm@

> 
> 
> Index: proc.c
> ===
> RCS file: /home/obsdcvs/src/usr.sbin/httpd/proc.c,v
> retrieving revision 1.27
> diff -u -p -r1.27 proc.c
> --- proc.c28 Sep 2016 12:01:04 -  1.27
> +++ proc.c5 Oct 2016 16:50:40 -
> @@ -22,6 +22,7 @@
>  #include 
>  #include 
>  
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -131,7 +132,12 @@ proc_exec(struct privsep *ps, struct pri
>   break;
>   case 0:
>   /* Prepare parent socket. */
> - dup2(fd, PROC_PARENT_SOCK_FILENO);
> + if (fd != PROC_PARENT_SOCK_FILENO) {
> + if (dup2(fd, PROC_PARENT_SOCK_FILENO)
> + == -1)
> + fatal("dup2");
> + } else if (fcntl(fd, F_SETFD, 0) == -1)
> + fatal("fcntl");
>  
>   execvp(argv[0], nargv);
>   fatal("%s: execvp", __func__);



httpd(8): dup2() fix for proc.c

2016-10-05 Thread Rafael Zalamena
This diff fixes the same problem ntpd(8) had with the dup2() when oldd == newd.

Quick background:
when you dup2(oldd, newd) and oldd == newd the CLOEXEC flag won't be removed
by the descriptor. We could use dup3() to detect this, but it is easier/faster
just to compare the fds and do the fcntl() ourselves.

ok?


Index: proc.c
===
RCS file: /home/obsdcvs/src/usr.sbin/httpd/proc.c,v
retrieving revision 1.27
diff -u -p -r1.27 proc.c
--- proc.c  28 Sep 2016 12:01:04 -  1.27
+++ proc.c  5 Oct 2016 16:50:40 -
@@ -22,6 +22,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
@@ -131,7 +132,12 @@ proc_exec(struct privsep *ps, struct pri
break;
case 0:
/* Prepare parent socket. */
-   dup2(fd, PROC_PARENT_SOCK_FILENO);
+   if (fd != PROC_PARENT_SOCK_FILENO) {
+   if (dup2(fd, PROC_PARENT_SOCK_FILENO)
+   == -1)
+   fatal("dup2");
+   } else if (fcntl(fd, F_SETFD, 0) == -1)
+   fatal("fcntl");
 
execvp(argv[0], nargv);
fatal("%s: execvp", __func__);



Re: timeout_set_proc(9)

2016-10-05 Thread Christiano F. Haesbaert
On 5 October 2016 at 18:26, Ted Unangst  wrote:
> Christiano F. Haesbaert wrote:
>> There is another bug, the thread runs outside of IPL_SOFTCLOCK, the
>> interrupt handler already runs at IPL_SOFTCLOCK so it did not need to raise
>> it, but the thread does.
>>
>> The mutex is not enough as it will drop before running the handler, this
>> can cause intr timeouts to interrupt proc timeouts.
>
> Is this a real bug? A proc timeout can, by definition, sleep, so it shouldn't
> be making any assumptions about running atomically. If it needs to block
> timeouts, the handler should use spl.

I agree, you can't assume it is atomic, but at this point, in this
transition phase I'd raise the ipl, since the code written under is
assumed to be running without any timeouts triggering.
The code that actually does sleep (say on a rwlock), is "new" code
that should address losing the atomicity, but again, at this point,
this does not happen and it breaks an old invariant.
If this does not hold, then we shouldn't pin the thread to cpu0
either, it should run freely.



Re: timeout_set_proc(9)

2016-10-05 Thread Ted Unangst
Christiano F. Haesbaert wrote:
> There is another bug, the thread runs outside of IPL_SOFTCLOCK, the
> interrupt handler already runs at IPL_SOFTCLOCK so it did not need to raise
> it, but the thread does.
> 
> The mutex is not enough as it will drop before running the handler, this
> can cause intr timeouts to interrupt proc timeouts.

Is this a real bug? A proc timeout can, by definition, sleep, so it shouldn't
be making any assumptions about running atomically. If it needs to block
timeouts, the handler should use spl.



wsfont.c: remove reference to non existing font

2016-10-05 Thread Frederic Cambus
Hi tech@,

Include file with font data (courier11x18.h) was removed from NetBSD
due to licensing concerns [1] before wsfont was imported into OpenBSD.

I renumbered the cookie values for consistency, and verified that no
code in tree used hardcoded cookie values.

Comments? OK?

[1] http://cvsweb.netbsd.org/bsdweb.cgi/src/sys/dev/wsfont/Attic/courier11x18.h

Index: sys/dev/wsfont/wsfont.c
===
RCS file: /cvs/src/sys/dev/wsfont/wsfont.c,v
retrieving revision 1.44
diff -u -p -r1.44 wsfont.c
--- sys/dev/wsfont/wsfont.c 14 Sep 2016 03:25:51 -  1.44
+++ sys/dev/wsfont/wsfont.c 4 Oct 2016 21:56:23 -
@@ -139,32 +139,29 @@ static struct font builtin_fonts[] = {
 #ifdef FONT_BOLD8x16_ISO1
BUILTIN_FONT(bold8x16_iso1, 2),
 #endif
-#ifdef FONT_COURIER11x18
-   BUILTIN_FONT(courier11x18, 3),
-#endif
 #ifdef FONT_GALLANT12x22
-   BUILTIN_FONT(gallant12x22, 4),
+   BUILTIN_FONT(gallant12x22, 3),
 #endif
 #ifdef FONT_LUCIDA16x29
-   BUILTIN_FONT(lucida16x29, 5),
+   BUILTIN_FONT(lucida16x29, 4),
 #endif
 #ifdef FONT_QVSS8x15
-   BUILTIN_FONT(qvss8x15, 6),
+   BUILTIN_FONT(qvss8x15, 5),
 #endif
 #ifdef FONT_VT220L8x8
-   BUILTIN_FONT(vt220l8x8, 7),
+   BUILTIN_FONT(vt220l8x8, 6),
 #endif
 #ifdef FONT_VT220L8x10
-   BUILTIN_FONT(vt220l8x10, 8),
+   BUILTIN_FONT(vt220l8x10, 7),
 #endif
 #ifdef FONT_SONY8x16
-   BUILTIN_FONT(sony8x16, 9),
+   BUILTIN_FONT(sony8x16, 8),
 #endif
 #ifdef FONT_SONY12x24
-   BUILTIN_FONT(sony12x24, 10),
+   BUILTIN_FONT(sony12x24, 9),
 #endif
 #ifdef FONT_OMRON12x20
-   BUILTIN_FONT(omron12x20, 11),
+   BUILTIN_FONT(omron12x20, 10),
 #endif
 #undef BUILTIN_FONT
 };



Re: timeout_set_proc(9)

2016-10-05 Thread Christiano F. Haesbaert
Am Montag, 26. September 2016 schrieb David Gwynne :

>
> > On 26 Sep 2016, at 13:36, Ted Unangst  > wrote:
> >
> > David Gwynne wrote:
> >> +mtx_enter(_mutex);
> >> +while (!CIRCQ_EMPTY(_proc)) {
> >> +to = timeout_from_circq(CIRCQ_
> FIRST(_proc));
> >> +CIRCQ_REMOVE(>to_list);
> >   leave();
> >> +timeout_run(to);
> >   enter();
> >> +}
> >> +mtx_leave(_mutex);
> >
> > you need to drop the mutex when running the timeout. with those changes,
> looks
> > pretty good.
>
> timeout_run drops the mutex.



There is another bug, the thread runs outside of IPL_SOFTCLOCK, the
interrupt handler already runs at IPL_SOFTCLOCK so it did not need to raise
it, but the thread does.

The mutex is not enough as it will drop before running the handler, this
can cause intr timeouts to interrupt proc timeouts.

I suggest raising it on the thread startup and leaving it always up. That's
why my diff had added IPL support for tasks btw.


Re: regress for etherip and vxlan

2016-10-05 Thread Alexander Bluhm
On Wed, Oct 05, 2016 at 04:48:55PM +0900, YASUOKA Masahiko wrote:
> The diff add regress scripts for vxlan(4) and etherip(4).
> 
> This will be my first commit to regress/.  ok?

I does not run with an obj directory.

root@ot1:.../etherip# make obj
/usr/src/regress/sys/net/etherip/obj -> /usr/obj/regress/sys/net/etherip
root@ot1:.../etherip# make
ksh etherip_1.sh
ksh: etherip_1.sh: No such file or directory
*** Error 1 in . (Makefile:7 'etherip_1')
FAILED
*** Error 1 in target 'regress' (ignored)

> +# rdomains
> +RD1=11
> +RD2=12
> +
> +# interface minor numbers
> +IFNO1=11
> +IFNO2=12

I would prefer to see these variables in the Makefile.  There you
should tune the test resources.  And a little longer names perhaps.

> +if [ $VAL -eq 0 ]; then
> + echo "Aborted.  Disabled etherip by sysctl net.inet.etherip.allow" >&2
> + exit 255
> +fi

Could you print SKIPPED here.  My test framework checks for this
to find tests that cannot run in the current environment.

bluhm



Re: add in6 multicast support to vxlan(4) ; question on mbufs

2016-10-05 Thread Mike Belopuhov
On Wed, Oct 05, 2016 at 10:58 +0900, YASUOKA Masahiko wrote:
> On Tue, 4 Oct 2016 17:27:12 +0200
> Mike Belopuhov  wrote:
> > On Tue, Oct 04, 2016 at 01:07 +0200, Vincent Gross wrote:
> >> On Sat, 24 Sep 2016 10:58:10 +0200
> >> Vincent Gross  wrote:
> >> 
> >> > Hi,
> >> > 
> >> [snip]
> >> > 
> >> > Aside from the mbuf issue, is this Ok ?
> >> 
> >> I will go back on the mbuff stuff later.
> >> 
> >> Diff rebased, ok anyone ?
> >> 
> >> Index: net/if_vxlan.c
> >> ===
> >> RCS file: /cvs/src/sys/net/if_vxlan.c,v
> >> retrieving revision 1.48
> >> diff -u -p -r1.48 if_vxlan.c
> >> --- net/if_vxlan.c 30 Sep 2016 10:22:05 -  1.48
> >> +++ net/if_vxlan.c 3 Oct 2016 23:12:42 -
> >> @@ -638,7 +749,9 @@ vxlan_lookup(struct mbuf *m, struct udph
> >>if (m->m_pkthdr.len < skip + sizeof(struct ether_header))
> >>return (EINVAL);
> >>  
> >> -  m_adj(m, skip);
> >> +  m_adj(m, skip - ETHER_ALIGN);
> >> +  m = m_pullup(m, ETHER_HDR_LEN + ETHER_ALIGN);
> >> +  m_adj(m, ETHER_ALIGN);
> >>ifp = >sc_ac.ac_if;
> >>  
> >>  #if NBRIDGE > 0
> > 
> > I think this chunk is correct.  First you ensure that m->m_data
> > points to a contiguous and well-aligned ethernet header and then
> > you trim the alignment so that mtod() points directly at it.
> 
> Isn't it possible that m_pullup() may return non aligned pointer?
> 

Do you mean if m->m_data pointer can be not aligned properly after
an m_pullup?  Of course, if the header layout is such that m_pullup
doesn't need to move data around and at the same time the header
that we're going to m_adj[ust] to is not aligned properly, this is
going to be a problem.

I wrote a test program (attached) that illustrates the change in
behavior when compiled w/o any options versus when compiled with
-DTEST3 (cc -DTEST3 -o mbuf mbuf.c)

Meaning that m_pullup might not be enough in a generic case.
#include 
#include 

#include 
#include 
#include 
#include 
#include 

#define panic(x...) errx(1, x)

unsigned int
min(unsigned int a, unsigned int b)
{
return (a < b ? a : b);
}

void
m_extfree(struct mbuf *m)
{
free(m->m_ext.ext_buf);

m->m_flags &= ~(M_EXT|M_EXTWR);
}

struct mbuf *
m_free(struct mbuf *m)
{
struct mbuf *n;

if (m == NULL)
return (NULL);

n = m->m_next;

if (m->m_flags & M_EXT)
m_extfree(m);

free(m);

return (n);
}

struct mbuf *
m_freem(struct mbuf *m)
{
struct mbuf *n;

if (m == NULL)
return (NULL);

n = m->m_nextpkt;

do
m = m_free(m);
while (m != NULL);

return (n);
}

struct mbuf *
m_get(int nowait, int type)
{
struct mbuf *m;

m = calloc(1, sizeof(*m));
if (m == NULL)
return (NULL);

m->m_type = type;
m->m_next = NULL;
m->m_nextpkt = NULL;
m->m_data = m->m_dat;
m->m_flags = 0;

return (m);
}

struct mbuf *
m_inithdr(struct mbuf *m)
{
/* keep in sync with m_gethdr */
m->m_next = NULL;
m->m_nextpkt = NULL;
m->m_data = m->m_pktdat;
m->m_flags = M_PKTHDR;
memset(>m_pkthdr, 0, sizeof(m->m_pkthdr));

return (m);
}

struct mbuf *
m_gethdr(int nowait, int type)
{
struct mbuf *m;

m = calloc(1, sizeof(*m));
if (m == NULL)
return (NULL);

m->m_type = type;

return (m_inithdr(m));
}

struct mbuf *
m_clget(struct mbuf *m, int how, u_int pktlen)
{
struct mbuf *m0 = NULL;
caddr_t buf;

if (m == NULL) {
m0 = m_gethdr(how, MT_DATA);
if (m0 == NULL)
return (NULL);

m = m0;
}
buf = calloc(1, pktlen);
if (buf == NULL) {
if (m0)
m_freem(m0);
return (NULL);
}

MEXTADD(m, buf, pktlen, M_EXTWR, MEXTFREE_POOL, NULL);
return (m);
}

struct mbuf *
m_getptr(struct mbuf *m, int loc, int *off)
{
while (loc >= 0) {
/* Normal end of search */
if (m->m_len > loc) {
*off = loc;
return (m);
} else {
loc -= m->m_len;

if (m->m_next == NULL) {
if (loc == 0) {
/* Point at the end of valid data */
*off = m->m_len;
return (m);
} else {
return (NULL);
}
} else {
m = m->m_next;
}
}
}


Outdated remarks about partitioning on sgi

2016-10-05 Thread Visa Hankala
With the disklabel read logic in the sgi bootblocks, the 'a' partition
no longer has to be the first partition on a disk, so the positioning
remarks in the install notes and the installer seem outdated. The patch
below removes them.

OK?

Index: distrib/notes/sgi/install
===
RCS file: src/distrib/notes/sgi/install,v
retrieving revision 1.28
diff -u -p -r1.28 install
--- distrib/notes/sgi/install   12 Aug 2015 06:47:16 -  1.28
+++ distrib/notes/sgi/install   5 Oct 2016 12:03:16 -
@@ -173,14 +173,7 @@ OpenBSDInstallPart4
 OpenBSDInstallPart5
 
No partitions should overlap with the SGI Volume Header, which by
-   default will use the first 3134 sectors. Additionally, the 'a'
-   partition must be the first partition on the disk, immediately
-   following the SGI Volume Header. If the default Volume Header size is
-   used, the 'a' partition should be located at offset 3135. If the 'a'
-   partition is not located immediately after the Volume Header the boot
-   loader will not be able to locate and load the kernel.
-dnl XXX Note that this is a #$%@ boot blocks limitation which should be fixed
-dnl XXX by reading the real label in the boot blocks.
+   default will use the first 3134 sectors.
 
 OpenBSDInstallPart6({:-CD-ROM, NFS -:})
 
Index: distrib/sgi/ramdisk/install.md
===
RCS file: src/distrib/sgi/ramdisk/install.md,v
retrieving revision 1.39
diff -u -p -r1.39 install.md
--- distrib/sgi/ramdisk/install.md  1 Jan 2016 00:47:51 -   1.39
+++ distrib/sgi/ramdisk/install.md  5 Oct 2016 12:03:16 -
@@ -110,12 +110,6 @@ The 'p' partition must be retained since
 this in turn contains the boot loader. No other partitions should overlap
 with the SGI Volume Header, which by default will use the first 3134 sectors.
 
-Additionally, the 'a' partition must be the first partition on the disk,
-immediately following the SGI Volume Header. If the default SGI Volume Header
-size is used, the 'a' partition should be located at offset 3135. If the
-'a' partition is not located immediately after the SGI Volume Header the
-boot loader will not be able to locate and load the kernel.
-
 Do not change any parameters except the partition layout and the label name.
 
 __EOT



Re: Smarter OpenBSD/sgi boot blocks

2016-10-05 Thread Visa Hankala
On Tue, Oct 04, 2016 at 08:08:32PM +, Miod Vallat wrote:
> The sgi boot blocks use the PROM (ARCBios or ARCS) for its I/O routines.
> When using a disk-based path, these routines are using the partition
> table found in the ``volume header''.
> 
> In order to be able to use 16 partitions per disk, the OpenBSD port only
> claims one volume header partition, #0, as the OpenBSD area, and puts
> its own label in there.
> 
> Unfortunately, this means that `sd0a', the OpenBSD partition on which
> the kernel is found, may not actually start at the same location as the
> volume header partition #0, even though this is the case in most setups.
> 
> When reinstalling an O2 some time ago, I did not pay attention to this
> during install:
> 
>   Use (A)uto layout, (E)dit auto layout, or create (C)ustom layout? [a] c
>   [...]
>   Label editor (enter '?' for help at any prompt)
>   > p
>   OpenBSD area: 0-17773524; size: 17773524; free: 0
>   #size   offset  fstype [fsize bsize   cpg]
> a: 17770389 3135  4.2BSD   1024  819216
> c: 177735240  unused
> p: 31350 unknown
>   > d a
>   > a
>   partition: [a]
>   offset: [3135]
>   size: [17770389] 400M
>   Rounding size to cylinder (3360 sectors): 816705
>   FS type: [4.2BSD]
>   mount point: [none] /
>   Rounding offset to bsize (32 sectors): 3136
>   Rounding size to bsize (32 sectors): 816704
> 
> And `sd0a' ended up starting one sector after the beginning of the
> volume header partition #0.
> 
> Of course, rebooting afterwards did not work as expected:
> 
>   OpenBSD/sgi-IP32 ARCBios boot version 1.7
>   arg 0: pci(0)scsi(0)disk(1)rdisk(0)partition(8)/boot
>   arg 1: OSLoadOptions=auto
>   arg 2: ConsoleIn=serial(0)
>   arg 3: ConsoleOut=serial(0)
>   arg 4: SystemPartition=pci(0)scsi(0)disk(1)rdisk(0)partition(8)
>   arg 5: OSLoader=boot
>   arg 6: OSLoadPartition=pci(0)scsi(0)disk(1)rdisk(0)partition(0)
>   arg 7: OSLoadFilename=bsd
>   Boot: pci(0)scsi(0)disk(1)rdisk(0)partition(0)bsd
>   cannot open pci(0)scsi(0)disk(1)rdisk(0)partition(0)/etc/random.seed: 
> Invalid argument
>   open pci(0)scsi(0)disk(1)rdisk(0)partition(0)bsd: Invalid argument
>   Boot FAILED!
> 
> The proper way to fix this is to make the boot blocks read the real
> OpenBSD label instead of assuming sd0a spans volume header partition #0
> in all cases.
> 
> This is achieved by the diff below, which will attempt to use the raw
> disk device (volume header partition #10, equivalent to sd0c) for disk
> accesses and will read both the volume header (to know where the OpenBSD
> disklabel lies) and then the OpenBSD label (to know where sd0a really
> is).
> 
> Of course, this has to cope with the two valid disk syntaxes (true
> ARCBios used on older sgi systems, and ARCS dksc() syntax used on Origin
> and later systems).
> 
> This diff has been tested on O2 (IP32) and Origin 350 (IP27). It is
> written in a conservative way, in order to revert to the existing
> behaviour if anything fails (invalid volume header, no OpenBSD label
> yet...) and should not cause any regression.
> 
> Note that network boots are not affected by these changes.

I tested this also on Octane (IP30), and it works fine. The diff is now
committed. Thanks!

> Index: Makefile32.inc
> ===
> RCS file: /OpenBSD/src/sys/arch/sgi/stand/Makefile32.inc,v
> retrieving revision 1.5
> diff -u -p -r1.5 Makefile32.inc
> --- Makefile32.inc19 Oct 2012 13:51:59 -  1.5
> +++ Makefile32.inc30 Sep 2016 09:19:15 -
> @@ -18,6 +18,7 @@ AS+=-32
>  LD?= ld
>  LD+= -m elf32btsmip
>  LIBSA_CPPFLAGS=
> +CFLAGS+= -DLIBSA_LONGLONG_PRINTF
>  .endif
>  
>  ### Figure out what to use for libsa and libz
> Index: boot/Makefile
> ===
> RCS file: /OpenBSD/src/sys/arch/sgi/stand/boot/Makefile,v
> retrieving revision 1.16
> diff -u -p -r1.16 Makefile
> --- boot/Makefile 30 Jul 2016 03:25:49 -  1.16
> +++ boot/Makefile 30 Sep 2016 09:19:15 -
> @@ -13,14 +13,14 @@ AFLAGS+=  ${SAABI}
>  
>  S=   ${.CURDIR}/../../../..
>  SRCS=start.S arcbios.c boot.c conf.c diskio.c filesystem.c \
> - netfs.c netio.c strchr.c strstr.c
> + netfs.c netio.c strstr.c
>  
>  .PATH:   ${S}/lib/libsa
>  SRCS+=   loadfile.c
>  
>  .PATH:   ${S}/lib/libkern/arch/mips64 ${S}/lib/libkern
> -SRCS+=   strlcpy.c memcpy.c strlen.c strrchr.c strlcat.c 
> strncmp.c \
> - strcmp.S
> +SRCS+=   memcpy.c strchr.c strcmp.S strlcat.c strlcpy.c strlen.c 
> \
> + strncmp.c strrchr.c
>  
>  CLEANFILES+= machine mips64
>  
> Index: boot/diskio.c
> ===
> RCS file: 

vmd: virtual switches and enhanced interface configuration

2016-10-05 Thread Reyk Floeter
Hi,

the following diff enhances vmd(8)'s network configuration.

Before, you could only set "interfaces N" or -i N for the number of
desired interfaces.

The diff introduces the concept of virtual switches: bridge(4) or
switch(4) devices that are partially managed by vmd(8) with the
following features:

- Add VM interfaces to virtual switches automatically
- Allow to set the lladdr of each guest interface (zero/random by default)
- Set the interface or switch interface to up or down automatically

The idea is that each VM specifies its interfaces and assigns them to
pre-configured networks via the switches.  This pretty much matches
what I would expect from a VMM.  And, of course, this might interact
with switchd later.

See the examples and manpage below.  It is still compatible to the
previous "interfaces" or -i option, so you can just add interfaces
without any additional configuration like before.  All tap(4)
interfaces are now UP by default.  vmctl does not support the enhanced
configuration yet, but I will probably add subset to it later.

OK?

Reyk

Index: etc/examples/vm.conf
===
RCS file: /cvs/src/etc/examples/vm.conf,v
retrieving revision 1.4
diff -u -p -u -p -r1.4 vm.conf
--- etc/examples/vm.conf6 Jan 2016 09:59:30 -   1.4
+++ etc/examples/vm.conf5 Oct 2016 09:07:34 -
@@ -9,6 +9,19 @@ sets="/var/www/htdocs/pub/OpenBSD/snapsh
 # Virtual machines
 #
 
+switch "wired" {
+   # This interface will default to bridge0, but switch(4) is supported
+   #interface switch0
+
+   # Add additional members
+   add em0
+   down
+}
+
+switch "wireless" {
+   add iwm0
+}
+
 # OpenBSD snapshot install test
 vm "openbsd.vm" {
memory 512M
@@ -18,17 +31,23 @@ vm "openbsd.vm" {
disk "/home/vm/OpenBSD.img"
 
# Second disk from OpenBSD contains the install sets
-   disk $sets "install59.fs"
+   disk $sets "install60.fs"
 
# Interface will show up as tap(4) on the host and as vio(4) in the VM
-   interfaces 1
+   interface { switch "wireless" }
+   interface { switch "wired" }
 }
 
 # Another VM that is disabled on startup
 vm "vm1.example.com" {
disable
memory 1G
-   interfaces 2
kernel "/bsd"
disk "/home/vm/vm1-disk.img"
+
+   # Use a specific tap(4) interface with a hardcoded MAC address
+   interface tap3 {
+   lladdr 00:11:22:aa:bb:cc
+   down
+   }
 }
Index: usr.sbin/vmd/config.c
===
RCS file: /cvs/src/usr.sbin/vmd/config.c,v
retrieving revision 1.13
diff -u -p -u -p -r1.13 config.c
--- usr.sbin/vmd/config.c   4 Oct 2016 17:17:30 -   1.13
+++ usr.sbin/vmd/config.c   5 Oct 2016 09:07:34 -
@@ -38,6 +38,9 @@
 #include "proc.h"
 #include "vmd.h"
 
+/* Supported bridge types */
+const char *vmd_descsw[] = { "switch", "bridge", NULL };
+
 int
 config_init(struct vmd *env)
 {
@@ -56,6 +59,12 @@ config_init(struct vmd *env)
return (-1);
TAILQ_INIT(env->vmd_vms);
}
+   if (what & CONFIG_SWITCHES) {
+   if ((env->vmd_switches = calloc(1,
+   sizeof(*env->vmd_switches))) == NULL)
+   return (-1);
+   TAILQ_INIT(env->vmd_switches);
+   }
 
return (0);
 }
@@ -65,13 +74,19 @@ config_purge(struct vmd *env, unsigned i
 {
struct privsep  *ps = >vmd_ps;
struct vmd_vm   *vm;
+   struct vmd_switch   *vsw;
unsigned int what;
 
what = ps->ps_what[privsep_process] & reset;
if (what & CONFIG_VMS && env->vmd_vms != NULL) {
while ((vm = TAILQ_FIRST(env->vmd_vms)) != NULL)
vm_remove(vm);
-   env->vmd_vmcount = 0;
+   env->vmd_nvm = 0;
+   }
+   if (what & CONFIG_SWITCHES && env->vmd_switches != NULL) {
+   while ((vsw = TAILQ_FIRST(env->vmd_switches)) != NULL)
+   switch_remove(vsw);
+   env->vmd_nswitches = 0;
}
 }
 
@@ -105,17 +120,21 @@ config_getreset(struct vmd *env, struct 
 }
 
 int
-config_getvm(struct privsep *ps, struct vm_create_params *vcp,
+config_getvm(struct privsep *ps, struct vmop_create_params *vmc,
 int kernel_fd, uint32_t peerid)
 {
struct vmd  *env = ps->ps_env;
struct vmd_vm   *vm = NULL;
+   struct vmd_if   *vif;
+   struct vm_create_params *vcp = >vmc_params;
unsigned int i;
int  fd, ttys_fd;
int  kernfd = -1, *diskfds = NULL, *tapfds = NULL;
int  saved_errno = 0;
char ptyname[VM_TTYNAME_MAX];
-   char ifname[IF_NAMESIZE];
+   char 

Re: Help me testing the netlock

2016-10-05 Thread Martin Pieuchot

On 10/04/16 16:44, Martin Pieuchot wrote:

On 10/03/16 16:43, Martin Pieuchot wrote:

Diff below introduces a single write lock that will be used to serialize
access to ip_output().

This lock will be then split in multiple readers and writers to allow
multiple forwarding paths to run in parallel of each others but still
serialized with the socket layer.

I'm currently looking for people wanting to run this diff and try to
break it.  In other words, your machine might panic with it and if it
does report the panic to me so the diff can be improved.

I tested NFS v2 and v3 so I'm quite confident, but I might have missed
some obvious stuff.


Updated diff attaced including a fix for syn_cache_timer(), problem
reported by Chris Jackman.


Thanks to all testers!

Here's a newer version that includes a fix for rt_timer_timer() also
found by Chris Jackman.

Index: kern/kern_rwlock.c
===
RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
retrieving revision 1.27
diff -u -p -r1.27 kern_rwlock.c
--- kern/kern_rwlock.c	14 Mar 2015 07:33:42 -	1.27
+++ kern/kern_rwlock.c	5 Oct 2016 08:11:09 -
@@ -98,6 +98,12 @@ rw_enter_read(struct rwlock *rwl)
 		membar_enter();
 }
 
+#if 1
+#include 
+#include 
+#include 
+#endif
+
 void
 rw_enter_write(struct rwlock *rwl)
 {
@@ -108,6 +114,15 @@ rw_enter_write(struct rwlock *rwl)
 		rw_enter(rwl, RW_WRITE);
 	else
 		membar_enter();
+
+#if 1
+	if ((rwl == ) && (splassert_ctl == 3)) {
+		printf("ENTER::%d::", cpu_number());
+		db_stack_trace_print(
+		(db_expr_t)__builtin_frame_address(1),
+		TRUE, 1, "", printf);
+	}
+#endif
 }
 
 void
@@ -129,6 +144,15 @@ rw_exit_write(struct rwlock *rwl)
 	unsigned long owner = rwl->rwl_owner;
 
 	rw_assert_wrlock(rwl);
+
+#if 1
+	if ((rwl == ) && (splassert_ctl == 3)) {
+		printf("EXIT::%d::", cpu_number());
+		db_stack_trace_print(
+		(db_expr_t)__builtin_frame_address(1),
+		TRUE, 1, "", printf);
+	}
+#endif
 
 	membar_exit();
 	if (__predict_false((owner & RWLOCK_WAIT) ||
Index: kern/sys_socket.c
===
RCS file: /cvs/src/sys/kern/sys_socket.c,v
retrieving revision 1.21
diff -u -p -r1.21 sys_socket.c
--- kern/sys_socket.c	5 Dec 2015 10:11:53 -	1.21
+++ kern/sys_socket.c	5 Oct 2016 08:11:09 -
@@ -131,8 +131,10 @@ soo_poll(struct file *fp, int events, st
 {
 	struct socket *so = fp->f_data;
 	int revents = 0;
-	int s = splsoftnet();
+	int s;
 
+	rw_enter_write();
+	s = splsoftnet();
 	if (events & (POLLIN | POLLRDNORM)) {
 		if (soreadable(so))
 			revents |= events & (POLLIN | POLLRDNORM);
@@ -159,6 +161,7 @@ soo_poll(struct file *fp, int events, st
 		}
 	}
 	splx(s);
+	rw_exit_write();
 	return (revents);
 }
 
Index: kern/uipc_socket.c
===
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.161
diff -u -p -r1.161 uipc_socket.c
--- kern/uipc_socket.c	20 Sep 2016 14:27:43 -	1.161
+++ kern/uipc_socket.c	5 Oct 2016 08:11:10 -
@@ -123,6 +123,7 @@ socreate(int dom, struct socket **aso, i
 		return (EPROTONOSUPPORT);
 	if (prp->pr_type != type)
 		return (EPROTOTYPE);
+	rw_enter_write();
 	s = splsoftnet();
 	so = pool_get(_pool, PR_WAITOK | PR_ZERO);
 	TAILQ_INIT(>so_q0);
@@ -142,9 +143,11 @@ socreate(int dom, struct socket **aso, i
 		so->so_state |= SS_NOFDREF;
 		sofree(so);
 		splx(s);
+		rw_exit_write();
 		return (error);
 	}
 	splx(s);
+	rw_exit_write();
 	*aso = so;
 	return (0);
 }
@@ -152,11 +155,13 @@ socreate(int dom, struct socket **aso, i
 int
 sobind(struct socket *so, struct mbuf *nam, struct proc *p)
 {
-	int s = splsoftnet();
-	int error;
+	int s, error;
 
+	rw_enter_write();
+	s = splsoftnet();
 	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, p);
 	splx(s);
+	rw_exit_write();
 	return (error);
 }
 
@@ -171,11 +176,13 @@ solisten(struct socket *so, int backlog)
 	if (isspliced(so) || issplicedback(so))
 		return (EOPNOTSUPP);
 #endif /* SOCKET_SPLICE */
+	rw_enter_write();
 	s = splsoftnet();
 	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL,
 	curproc);
 	if (error) {
 		splx(s);
+		rw_exit_write();
 		return (error);
 	}
 	if (TAILQ_FIRST(>so_q) == NULL)
@@ -186,6 +193,7 @@ solisten(struct socket *so, int backlog)
 		backlog = sominconn;
 	so->so_qlimit = backlog;
 	splx(s);
+	rw_exit_write();
 	return (0);
 }
 
@@ -196,6 +204,7 @@ solisten(struct socket *so, int backlog)
 void
 sofree(struct socket *so)
 {
+	rw_assert_wrlock();
 	splsoftassert(IPL_SOFTNET);
 
 	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
@@ -234,9 +243,10 @@ int
 soclose(struct socket *so)
 {
 	struct socket *so2;
-	int s = splsoftnet();		/* conservative */
-	int error = 0;
+	int s, error = 0;
 
+	rw_enter_write();
+	s = splsoftnet();		/* conservative */
 	if (so->so_options & SO_ACCEPTCONN) {
 		while ((so2 = TAILQ_FIRST(>so_q0)) != NULL) {
 			(void) soqremque(so2, 0);
@@ 

regress for etherip and vxlan

2016-10-05 Thread YASUOKA Masahiko
The diff add regress scripts for vxlan(4) and etherip(4).

This will be my first commit to regress/.  ok?

diff --git a/regress/sys/net/Makefile b/regress/sys/net/Makefile
index 40f49cc..f46c3dd 100644
--- a/regress/sys/net/Makefile
+++ b/regress/sys/net/Makefile
@@ -1,5 +1,6 @@
 #  $OpenBSD: Makefile,v 1.9 2016/09/21 10:40:39 mpi Exp $
 
-SUBDIR +=  pf_divert pf_forward pf_fragment pf_print rdomains rtable
+SUBDIR +=  etherip pf_divert pf_forward pf_fragment pf_print rdomains
+SUBDIR +=  rtable
 
 .include 
diff --git a/regress/sys/net/etherip/Makefile b/regress/sys/net/etherip/Makefile
new file mode 100644
index 000..dd18c5f
--- /dev/null
+++ b/regress/sys/net/etherip/Makefile
@@ -0,0 +1,9 @@
+#  $OpenBSD$
+
+REGRESS_TARGETS=   etherip_1
+REGRESS_ROOT_TARGETS=  etherip_1
+
+etherip_1:
+   ${SUDO} ksh $@.sh
+
+.include 
diff --git a/regress/sys/net/etherip/etherip_1.sh 
b/regress/sys/net/etherip/etherip_1.sh
new file mode 100644
index 000..8c08129
--- /dev/null
+++ b/regress/sys/net/etherip/etherip_1.sh
@@ -0,0 +1,78 @@
+#!/bin/ksh
+#  $OpenBSD$
+
+
+cleanup()
+{
+   for if in $ALL_IFS; do
+   ifconfig $if destroy 2>/dev/null
+   done
+}
+
+. ./etherip_subr
+
+# rdomains
+RD1=11
+RD2=12
+
+# interface minor numbers
+IFNO1=11
+IFNO2=12
+
+ALL_IFS="bridge$IFNO2 bridge$IFNO1 vether$IFNO2 vether$IFNO1 etherip$IFNO2
+etherip$IFNO1 pair$IFNO2 pair$IFNO1"
+
+#
+# Check pre-conditions
+#
+# etherip is enabled by sysctl?
+VAL=$(sysctl -n net.inet.etherip.allow)
+VAL=${VAL:-0}
+if [ $VAL -eq 0 ]; then
+   echo "Aborted.  Disabled etherip by sysctl net.inet.etherip.allow" >&2
+   exit 255
+fi
+# interfaces are busy?
+for if in $ALL_IFS; do
+   if iface_exists $if; then
+   echo "Aborted.  interface \`$if' is used already." >&2
+   exit 255
+   fi
+done
+# rdomains are busy?
+for rt in $RD1 $RD2; do
+   if ! rdomain_is_used $rt; then
+   echo "Aborted.  rdomain \`$rt' is used already." >&2
+   exit 255
+   fi
+done
+
+#
+# Prepeare the test
+#
+[ $VERBOSE -gt 0 ] && set -x
+ifconfig pair$IFNO1rdomain $RD1 172.31.0.1/24
+ifconfig pair$IFNO2rdomain $RD2 172.31.0.2/24 patch pair$IFNO1
+ifconfig vether$IFNO1  rdomain $RD1 192.168.0.1
+ifconfig vether$IFNO2  rdomain $RD2 192.168.0.2
+ifconfig etherip$IFNO1 rdomain $RD1 tunneldomain $RD1 || abort_test
+ifconfig etherip$IFNO2 rdomain $RD2 tunneldomain $RD2 || abort_test
+ifconfig bridge$IFNO1  rdomain $RD1 add vether$IFNO1 add etherip$IFNO1 up
+ifconfig bridge$IFNO2  rdomain $RD2 add vether$IFNO2 add etherip$IFNO2 up
+
+#
+# Test config
+#
+ifconfig etherip$IFNO1 tunnel 172.31.0.1 172.31.0.2 up || abort_test
+ifconfig etherip$IFNO2 tunnel 172.31.0.2 172.31.0.1 up || abort_test
+
+#
+# Test behavior
+#
+test ping -w 1 -c 1 -V $RD1 192.168.0.2
+test ping -w 1 -c 1 -V $RD2 192.168.0.1
+set +x
+
+# Done
+cleanup
+exit $FAILS
diff --git a/regress/sys/net/etherip/etherip_subr 
b/regress/sys/net/etherip/etherip_subr
new file mode 100644
index 000..82e03ef
--- /dev/null
+++ b/regress/sys/net/etherip/etherip_subr
@@ -0,0 +1,67 @@
+#
+# Copyright (c) 2015 Vincent Gross 
+#
+# Permission to use, copy, modify, and distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+VERBOSE=0
+FAILS=0
+
+iface_exists()
+{
+   ifconfig_out=`ifconfig "$1" 2>&1`
+   [ "${ifconfig_out}" != "$1: no such interface" ]
+}
+
+rdomain_is_used()
+{
+   _rdomains=$(ifconfig | sed -n '/^[a-z].* rdomain \([0-9]*\).*/s//\1/p' \
+   | sort | uniq)
+   for _r in $_rdomains; do
+   if [ $_r = $1 ]; then
+   return 1
+   fi
+   done
+   return 0
+}
+
+abort_test()
+{
+   echo "** Aborted" >&2
+   [ $# -ge 0 ] && echo "$1" >&2
+   cleanup
+   exit 1
+}
+
+test()
+{
+   if [ $VERBOSE -gt 0 ]; then
+   "$@"
+   else
+   "$@" > /dev/null 2>&1
+   fi
+   if [ $? -ne 0 ]; then
+   FAILS=$((FAILS + 1))
+   fi
+}
+
+while getopts 'v' ch $@; do
+   case $ch in
+   v)
+   VERBOSE=$((VERBOSE + 1))
+   ;;
+   *)
+   echo "usage: $(basename $0) [-v]"
+   exit 64
+   ;;
+   esac
+done
diff --git 

Re: Help me testing the netlock

2016-10-05 Thread Mike Larkin
On Tue, Oct 04, 2016 at 04:44:29PM +0200, Martin Pieuchot wrote:
> On 10/03/16 16:43, Martin Pieuchot wrote:
> > Diff below introduces a single write lock that will be used to serialize
> > access to ip_output().
> > 
> > This lock will be then split in multiple readers and writers to allow
> > multiple forwarding paths to run in parallel of each others but still
> > serialized with the socket layer.
> > 
> > I'm currently looking for people wanting to run this diff and try to
> > break it.  In other words, your machine might panic with it and if it
> > does report the panic to me so the diff can be improved.
> > 
> > I tested NFS v2 and v3 so I'm quite confident, but I might have missed
> > some obvious stuff.
> 
> Updated diff attaced including a fix for syn_cache_timer(), problem
> reported by Chris Jackman.
> 

So far, so good, on i386 and amd64 vmm(4) VMs. booted, did a pkg_add 
upgrade, and cvsync.

No issues seen so far.

-ml

> Index: kern/kern_rwlock.c
> ===
> RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
> retrieving revision 1.27
> diff -u -p -r1.27 kern_rwlock.c
> --- kern/kern_rwlock.c14 Mar 2015 07:33:42 -  1.27
> +++ kern/kern_rwlock.c4 Oct 2016 14:40:29 -
> @@ -98,6 +98,12 @@ rw_enter_read(struct rwlock *rwl)
>   membar_enter();
>  }
>  
> +#if 1
> +#include 
> +#include 
> +#include 
> +#endif
> +
>  void
>  rw_enter_write(struct rwlock *rwl)
>  {
> @@ -108,6 +114,15 @@ rw_enter_write(struct rwlock *rwl)
>   rw_enter(rwl, RW_WRITE);
>   else
>   membar_enter();
> +
> +#if 1
> + if ((rwl == ) && (splassert_ctl == 3)) {
> + printf("ENTER::%d::", cpu_number());
> + db_stack_trace_print(
> + (db_expr_t)__builtin_frame_address(1),
> + TRUE, 1, "", printf);
> + }
> +#endif
>  }
>  
>  void
> @@ -129,6 +144,15 @@ rw_exit_write(struct rwlock *rwl)
>   unsigned long owner = rwl->rwl_owner;
>  
>   rw_assert_wrlock(rwl);
> +
> +#if 1
> + if ((rwl == ) && (splassert_ctl == 3)) {
> + printf("EXIT::%d::", cpu_number());
> + db_stack_trace_print(
> + (db_expr_t)__builtin_frame_address(1),
> + TRUE, 1, "", printf);
> + }
> +#endif
>  
>   membar_exit();
>   if (__predict_false((owner & RWLOCK_WAIT) ||
> Index: kern/sys_socket.c
> ===
> RCS file: /cvs/src/sys/kern/sys_socket.c,v
> retrieving revision 1.21
> diff -u -p -r1.21 sys_socket.c
> --- kern/sys_socket.c 5 Dec 2015 10:11:53 -   1.21
> +++ kern/sys_socket.c 4 Oct 2016 14:40:29 -
> @@ -131,8 +131,10 @@ soo_poll(struct file *fp, int events, st
>  {
>   struct socket *so = fp->f_data;
>   int revents = 0;
> - int s = splsoftnet();
> + int s;
>  
> + rw_enter_write();
> + s = splsoftnet();
>   if (events & (POLLIN | POLLRDNORM)) {
>   if (soreadable(so))
>   revents |= events & (POLLIN | POLLRDNORM);
> @@ -159,6 +161,7 @@ soo_poll(struct file *fp, int events, st
>   }
>   }
>   splx(s);
> + rw_exit_write();
>   return (revents);
>  }
>  
> Index: kern/uipc_socket.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.161
> diff -u -p -r1.161 uipc_socket.c
> --- kern/uipc_socket.c20 Sep 2016 14:27:43 -  1.161
> +++ kern/uipc_socket.c4 Oct 2016 14:40:29 -
> @@ -123,6 +123,7 @@ socreate(int dom, struct socket **aso, i
>   return (EPROTONOSUPPORT);
>   if (prp->pr_type != type)
>   return (EPROTOTYPE);
> + rw_enter_write();
>   s = splsoftnet();
>   so = pool_get(_pool, PR_WAITOK | PR_ZERO);
>   TAILQ_INIT(>so_q0);
> @@ -142,9 +143,11 @@ socreate(int dom, struct socket **aso, i
>   so->so_state |= SS_NOFDREF;
>   sofree(so);
>   splx(s);
> + rw_exit_write();
>   return (error);
>   }
>   splx(s);
> + rw_exit_write();
>   *aso = so;
>   return (0);
>  }
> @@ -152,11 +155,13 @@ socreate(int dom, struct socket **aso, i
>  int
>  sobind(struct socket *so, struct mbuf *nam, struct proc *p)
>  {
> - int s = splsoftnet();
> - int error;
> + int s, error;
>  
> + rw_enter_write();
> + s = splsoftnet();
>   error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, p);
>   splx(s);
> + rw_exit_write();
>   return (error);
>  }
>  
> @@ -171,11 +176,13 @@ solisten(struct socket *so, int backlog)
>   if (isspliced(so) || issplicedback(so))
>   return (EOPNOTSUPP);
>  #endif /* SOCKET_SPLICE */
> + rw_enter_write();
>   s = splsoftnet();
>   error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL,
>   curproc);
>