from:"David Gwynne"

rewrite dhcpv6 handling in tcpdump(8)

2019-12-02 Thread David Gwynne

milliseconds */
-#define DH6T_XID_TIMEOUT   15  /* milliseconds */
-#define DH6T_RECONF_MULTICAST_REQUEST_WAIT 16  /* milliseconds */
-
-#if 0
-extern struct dhcp6_opt *dh6o_pad;
-extern struct dhcp6_opt *dh6o_end;
-extern int dhcp6_param[];
-extern void dhcp6opttab_init(void);
-extern struct dhcp6_opt *dhcp6opttab_byname(char *);
-extern struct dhcp6_opt *dhcp6opttab_bycode(u_int);
-#endif
-
-#endif /*__DHCP6OPT_H_DEFINED*/
Index: interface.h
===
RCS file: /cvs/src/usr.sbin/tcpdump/interface.h,v
retrieving revision 1.81
diff -u -p -r1.81 interface.h
--- interface.h 26 May 2019 22:42:42 -  1.81
+++ interface.h 2 Dec 2019 11:51:36 -
@@ -296,7 +296,7 @@ extern void icmp6_print(const u_char *, 
 extern void ripng_print(const u_char *, int);
 extern int rt6_print(const u_char *, const u_char *);
 extern void ospf6_print(const u_char *, u_int);
-extern void dhcp6_print(const u_char *, u_int, u_short, u_short);
+extern void dhcp6_print(const u_char *, u_int);
 
 extern uint32_t in_cksum_add(const void *, size_t, uint32_t);
 extern uint16_t in_cksum_fini(uint32_t);
Index: print-dhcp6.c
===
RCS file: /cvs/src/usr.sbin/tcpdump/print-dhcp6.c,v
retrieving revision 1.11
diff -u -p -r1.11 print-dhcp6.c
--- print-dhcp6.c   22 Oct 2018 16:12:45 -  1.11
+++ print-dhcp6.c   2 Dec 2019 11:51:36 -
@@ -1,32 +1,19 @@
 /* $OpenBSD: print-dhcp6.c,v 1.11 2018/10/22 16:12:45 kn Exp $ */
 
 /*
- * Copyright (C) 1998 and 1999 WIDE Project.
- * All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *notice, this list of conditions and the following disclaimer in the
- *documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the project nor the names of its contributors
- *may be used to endorse or promote products derived from this software
- *without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * Copyright (c) 2019 David Gwynne 
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
 #include 
@@ -44,286 +31,179 @@ struct rtentry;
 #include 
 
 #include "interface.h"
+#include "extract.h"
 #include "addrtoname.h"
-#include "dhcp6.h"
-#include "dhcp6opt.h"
 
-#if 0
-static void dhcp6opttab_init(void);
-static struct dhcp6_opt *dhcp6opttab_byname(char *);
-#endif
-static struct dhcp6_opt *dhcp6opttab_bycode(u_int);
-
-static char tstr[] = " [|dhcp6]";
-
-static struct dhcp6_opt dh6opttab[] = {
-   /* IP Address Extension */
-   { 1, OL6_N, "IP Address",   OT6_NONE, },
-
-   /* General Extension */
-   { 2, 4, "Time Offset",  OT6_NUM, },
-   { 3, OL6_N, "IEEE 1003.1 POSIX Timezone",   OT6_STR, },
-   { 6, OL6_16N,   "Domain Name Server",   OT6_V6, },
-   { 10, OL6_N,"Domain Name",  OT6_STR, },
-
-   /* Application and Service Parameters */
-   { 16, OL6_N,"Directory Agent",

use tasks and a task_list to manage if_addrhooks

2019-11-07 Thread David Gwynne

this converts the interface address hooks to tasks on a task_list.
it's like the detach and linkstate hook changes, except it seems more
places run the address hooks than register them, and i can't tell
whether the places that run the hooks hold the NET_LOCK or not. not by
casual reading anyway.

to cope with if_addrhooks_run possibly being called without the
NET_LOCK held, i made it safe to call the hook runner multiple times
concurrently.

one of the users of address hooks is pf, via the pfi_kif struct. that
struct is part of the ABI (pfctl and snmpd use it), so i kept it using
a void * and had it allocate the task separately. it should be as
robust as it was before.

everything else was pretty straightforward.

tests? ok?

Index: kern/kern_task.c
===
RCS file: /cvs/src/sys/kern/kern_task.c,v
retrieving revision 1.26
diff -u -p -r1.26 kern_task.c
--- kern/kern_task.c23 Jun 2019 12:56:10 -  1.26
+++ kern/kern_task.c7 Nov 2019 11:21:00 -
@@ -258,6 +258,8 @@ taskq_barrier_task(void *p)
 void
 task_set(struct task *t, void (*fn)(void *), void *arg)
 {
+   KASSERT(fn != NULL);
+
t->t_func = fn;
t->t_arg = arg;
t->t_flags = 0;
Index: net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.591
diff -u -p -r1.591 if.c
--- net/if.c7 Nov 2019 08:03:18 -   1.591
+++ net/if.c7 Nov 2019 11:21:00 -
@@ -630,9 +630,7 @@ if_attach_common(struct ifnet *ifp)
ifp->if_iqs = ifp->if_rcv.ifiq_ifiqs;
ifp->if_niqs = 1;
 
-   ifp->if_addrhooks = malloc(sizeof(*ifp->if_addrhooks),
-   M_TEMP, M_WAITOK);
-   TAILQ_INIT(ifp->if_addrhooks);
+   TAILQ_INIT(&ifp->if_addrhooks);
TAILQ_INIT(&ifp->if_linkstatehooks);
TAILQ_INIT(&ifp->if_detachhooks);
 
@@ -1046,19 +1044,18 @@ if_netisr(void *unused)
 void
 if_hooks_run(struct task_list *hooks)
 {
-   struct task *t, *nt, cursor;
+   struct task *t, *nt;
+   struct task cursor = { .t_func = NULL };
void (*func)(void *);
void *arg;
 
-   /*
-* holding the NET_LOCK guarantees that concurrent if_hooks_run
-* calls can't happen, and they therefore can't try and call
-* each others cursors as actual hooks.
-*/
-   NET_ASSERT_LOCKED();
-
mtx_enter(&if_hooks_mtx);
for (t = TAILQ_FIRST(hooks); t != NULL; t = nt) {
+   while (t->t_func == NULL) { /* skip cursors */
+   t = TAILQ_NEXT(t, t_entry);
+   if (t == NULL)
+   break;
+   }
func = t->t_func;
arg = t->t_arg;
 
@@ -1177,7 +1174,7 @@ if_detach(struct ifnet *ifp)
}
}
 
-   free(ifp->if_addrhooks, M_TEMP, sizeof(*ifp->if_addrhooks));
+   KASSERT(TAILQ_EMPTY(&ifp->if_addrhooks));
KASSERT(TAILQ_EMPTY(&ifp->if_linkstatehooks));
KASSERT(TAILQ_EMPTY(&ifp->if_detachhooks));
 
@@ -3100,7 +3097,7 @@ ifnewlladdr(struct ifnet *ifp)
ifa = _ifpforlinklocal(ifp, 0)->ia_ifa;
if (ifa) {
in6_purgeaddr(ifa);
-   dohooks(ifp->if_addrhooks, 0);
+   if_hooks_run(&ifp->if_addrhooks);
in6_ifattach(ifp);
}
}
@@ -3112,6 +3109,28 @@ ifnewlladdr(struct ifnet *ifp)
(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t));
}
splx(s);
+}
+
+void
+if_addrhook_add(struct ifnet *ifp, struct task *t)
+{
+   mtx_enter(&if_hooks_mtx);
+   TAILQ_INSERT_TAIL(&ifp->if_addrhooks, t, t_entry);
+   mtx_leave(&if_hooks_mtx);
+}
+
+void
+if_addrhook_del(struct ifnet *ifp, struct task *t)
+{
+   mtx_enter(&if_hooks_mtx);
+   TAILQ_REMOVE(&ifp->if_addrhooks, t, t_entry);
+   mtx_leave(&if_hooks_mtx);
+}
+
+void
+if_addrhooks_run(struct ifnet *ifp)
+{
+   if_hooks_run(&ifp->if_addrhooks);
 }
 
 int net_ticks;
Index: net/if_pppx.c
===
RCS file: /cvs/src/sys/net/if_pppx.c,v
retrieving revision 1.68
diff -u -p -r1.68 if_pppx.c
--- net/if_pppx.c   24 Jun 2019 13:43:19 -  1.68
+++ net/if_pppx.c   7 Nov 2019 11:21:00 -
@@ -919,7 +919,7 @@ pppx_add_session(struct pppx_dev *pxd, s
printf("pppx: unable to set addresses for %s, error=%d\n",
ifp->if_xname, error);
} else {
-   dohooks(ifp->if_addrhooks, 0);
+   if_addrhooks_run(ifp);
}
rw_enter_write(_ifs_lk);
pxi->pxi_ready = 1;
Index: net/if_spppsubr.c
===
RCS file: /cvs/src/sys/net/if_spppsubr.c,v
retrieving revision 1.179
diff -u -p -r1.179 if_spppsubr.c
--- net/if_spppsubr.c   24 Jun 2019 21:36:53 -  1.179
+++ net/if_spppsubr.c   7 Nov 2019 11:21:00 -
@@ -4230,7 +4230,7 @@

Re: use tasks and a task_list to manage if_linkstatehooks

2019-11-06 Thread David Gwynne

On Wed, Nov 06, 2019 at 03:12:27PM +1000, David Gwynne wrote:
> this is a follow-up to "use tasks and a task_list to manage
> if_detachhooks" and converts the link state hooks to a task_list
> with pre-allocated tasks.
> 
> it's mostly mechanical, except for carp. each carp interface has a link
> state hook it registers on its parent, but each hook then runs against
> every carp device on that parent. the diff changes it so the link state
> hook is only run for the relevant carp interface.
> 
> it also reorders the addition of the carp link state hook till after the
> old link state hook is removed.
> 
> unless anyone objects i am going to commit this around 10am GMT+10
> tomorrow (7th nov).

hrvoje points out i missed a file in my diff.

Index: net/bridgestp.c
===
RCS file: /cvs/src/sys/net/bridgestp.c,v
retrieving revision 1.72
diff -u -p -r1.72 bridgestp.c
--- net/bridgestp.c 13 May 2019 18:20:13 -  1.72
+++ net/bridgestp.c 6 Nov 2019 11:13:24 -
@@ -1986,9 +1986,8 @@ bstp_add(struct bstp_state *bs, struct i
bstp_update_roles(bs, bp);
 
/* Register callback for physical link state changes */
-   if (ifp->if_linkstatehooks != NULL)
-   bp->bp_lhcookie = hook_establish(ifp->if_linkstatehooks, 1,
-   bstp_ifstate, ifp);
+   task_set(>bp_ltask, bstp_ifstate, ifp);
+   if_linkstatehook_add(ifp, >bp_ltask);
 
return (bp);
 }
@@ -2002,8 +2001,7 @@ bstp_delete(struct bstp_port *bp)
if (!bp->bp_active)
panic("not a bstp member");
 
-   if (ifp != NULL && ifp->if_linkstatehooks != NULL)
-   hook_disestablish(ifp->if_linkstatehooks, bp->bp_lhcookie);
+   if_linkstatehook_del(ifp, >bp_ltask);
 
LIST_REMOVE(bp, bp_next);
free(bp, M_DEVBUF, sizeof *bp);
Index: net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.589
diff -u -p -r1.589 if.c
--- net/if.c6 Nov 2019 03:51:26 -   1.589
+++ net/if.c6 Nov 2019 11:13:24 -
@@ -629,9 +629,7 @@ if_attach_common(struct ifnet *ifp)
ifp->if_addrhooks = malloc(sizeof(*ifp->if_addrhooks),
M_TEMP, M_WAITOK);
TAILQ_INIT(ifp->if_addrhooks);
-   ifp->if_linkstatehooks = malloc(sizeof(*ifp->if_linkstatehooks),
-   M_TEMP, M_WAITOK);
-   TAILQ_INIT(ifp->if_linkstatehooks);
+   TAILQ_INIT(>if_linkstatehooks);
TAILQ_INIT(>if_detachhooks);
 
if (ifp->if_rtrequest == NULL)
@@ -1055,8 +1053,6 @@ if_deactivate(struct ifnet *ifp)
NET_LOCK();
TAILQ_FOREACH_SAFE(t, >if_detachhooks, t_entry, nt)
(*t->t_func)(t->t_arg);
-
-   KASSERT(TAILQ_EMPTY(>if_detachhooks));
NET_UNLOCK();
 }
 
@@ -1148,7 +1144,8 @@ if_detach(struct ifnet *ifp)
}
 
free(ifp->if_addrhooks, M_TEMP, sizeof(*ifp->if_addrhooks));
-   free(ifp->if_linkstatehooks, M_TEMP, sizeof(*ifp->if_linkstatehooks));
+   KASSERT(TAILQ_EMPTY(>if_linkstatehooks));
+   KASSERT(TAILQ_EMPTY(>if_detachhooks));
 
for (i = 0; (dp = domains[i]) != NULL; i++) {
if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
@@ -1646,11 +1643,29 @@ if_linkstate_task(void *xifidx)
 void
 if_linkstate(struct ifnet *ifp)
 {
+   struct task *t, *nt;
+
NET_ASSERT_LOCKED();
 
rtm_ifchg(ifp);
rt_if_track(ifp);
-   dohooks(ifp->if_linkstatehooks, 0);
+
+   TAILQ_FOREACH_SAFE(t, >if_linkstatehooks, t_entry, nt)
+   (*t->t_func)(t->t_arg);
+}
+
+void
+if_linkstatehook_add(struct ifnet *ifp, struct task *t)
+{
+   NET_ASSERT_LOCKED();
+   TAILQ_INSERT_TAIL(&ifp->if_linkstatehooks, t, t_entry);
+}
+
+void
+if_linkstatehook_del(struct ifnet *ifp, struct task *t)
+{
+   NET_ASSERT_LOCKED();
+   TAILQ_REMOVE(&ifp->if_linkstatehooks, t, t_entry);
 }
 
 /*
Index: net/if_aggr.c
===
RCS file: /cvs/src/sys/net/if_aggr.c,v
retrieving revision 1.20
diff -u -p -r1.20 if_aggr.c
--- net/if_aggr.c   6 Nov 2019 03:51:26 -   1.20
+++ net/if_aggr.c   6 Nov 2019 11:13:24 -
@@ -335,7 +335,7 @@ struct aggr_port {
int (*p_output)(struct ifnet *, struct mbuf *, struct sockaddr *,
struct rtentry *);
 
-   void*p_lcookie;
+   struct task  p_lhook;
struct task  p_dhook;
 
struct aggr_softc   *p_aggr;
@@ -1135,8 +1135,8 @@ aggr_add_port(struct aggr_softc *sc, con
}
}
 
-   p->p_lcookie = hook_establish(ifp0->if_linkstatehooks, 1,
-   aggr_p_linkch, p);
+   task_set(>p_lhook, aggr_p

use tasks and a task_list to manage if_linkstatehooks

2019-11-05 Thread David Gwynne

this is a follow-up to "use tasks and a task_list to manage
if_detachhooks" and converts the link state hooks to a task_list
with pre-allocated tasks.

it's mostly mechanical, except for carp. each carp interface has a link
state hook it registers on its parent, but each hook then runs against
every carp device on that parent. the diff changes it so the link state
hook is only run for the relevant carp interface.

it also reorders the addition of the carp link state hook till after the
old link state hook is removed.

unless anyone objects i am going to commit this around 10am GMT+10
tomorrow (7th nov).

Index: net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.589
diff -u -p -r1.589 if.c
--- net/if.c6 Nov 2019 03:51:26 -   1.589
+++ net/if.c6 Nov 2019 05:04:03 -
@@ -629,9 +629,7 @@ if_attach_common(struct ifnet *ifp)
ifp->if_addrhooks = malloc(sizeof(*ifp->if_addrhooks),
M_TEMP, M_WAITOK);
TAILQ_INIT(ifp->if_addrhooks);
-   ifp->if_linkstatehooks = malloc(sizeof(*ifp->if_linkstatehooks),
-   M_TEMP, M_WAITOK);
-   TAILQ_INIT(ifp->if_linkstatehooks);
+   TAILQ_INIT(>if_linkstatehooks);
TAILQ_INIT(>if_detachhooks);
 
if (ifp->if_rtrequest == NULL)
@@ -1055,8 +1053,6 @@ if_deactivate(struct ifnet *ifp)
NET_LOCK();
TAILQ_FOREACH_SAFE(t, >if_detachhooks, t_entry, nt)
(*t->t_func)(t->t_arg);
-
-   KASSERT(TAILQ_EMPTY(>if_detachhooks));
NET_UNLOCK();
 }
 
@@ -1148,7 +1144,8 @@ if_detach(struct ifnet *ifp)
}
 
free(ifp->if_addrhooks, M_TEMP, sizeof(*ifp->if_addrhooks));
-   free(ifp->if_linkstatehooks, M_TEMP, sizeof(*ifp->if_linkstatehooks));
+   KASSERT(TAILQ_EMPTY(>if_linkstatehooks));
+   KASSERT(TAILQ_EMPTY(>if_detachhooks));
 
for (i = 0; (dp = domains[i]) != NULL; i++) {
if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
@@ -1646,11 +1643,29 @@ if_linkstate_task(void *xifidx)
 void
 if_linkstate(struct ifnet *ifp)
 {
+   struct task *t, *nt;
+
NET_ASSERT_LOCKED();
 
rtm_ifchg(ifp);
rt_if_track(ifp);
-   dohooks(ifp->if_linkstatehooks, 0);
+
+   TAILQ_FOREACH_SAFE(t, >if_linkstatehooks, t_entry, nt)
+   (*t->t_func)(t->t_arg);
+}
+
+void
+if_linkstatehook_add(struct ifnet *ifp, struct task *t)
+{
+   NET_ASSERT_LOCKED();
+   TAILQ_INSERT_TAIL(&ifp->if_linkstatehooks, t, t_entry);
+}
+
+void
+if_linkstatehook_del(struct ifnet *ifp, struct task *t)
+{
+   NET_ASSERT_LOCKED();
+   TAILQ_REMOVE(&ifp->if_linkstatehooks, t, t_entry);
 }
 
 /*
Index: net/if_aggr.c
===
RCS file: /cvs/src/sys/net/if_aggr.c,v
retrieving revision 1.20
diff -u -p -r1.20 if_aggr.c
--- net/if_aggr.c   6 Nov 2019 03:51:26 -   1.20
+++ net/if_aggr.c   6 Nov 2019 05:04:03 -
@@ -335,7 +335,7 @@ struct aggr_port {
int (*p_output)(struct ifnet *, struct mbuf *, struct sockaddr *,
struct rtentry *);
 
-   void*p_lcookie;
+   struct task  p_lhook;
struct task  p_dhook;
 
struct aggr_softc   *p_aggr;
@@ -1135,8 +1135,8 @@ aggr_add_port(struct aggr_softc *sc, con
}
}
 
-   p->p_lcookie = hook_establish(ifp0->if_linkstatehooks, 1,
-   aggr_p_linkch, p);
+   task_set(>p_lhook, aggr_p_linkch, p);
+   if_linkstatehook_add(ifp0, >p_lhook);
 
task_set(>p_dhook, aggr_p_detach, p);
if_detachhook_add(ifp0, >p_dhook);
@@ -1428,7 +1428,7 @@ aggr_p_dtor(struct aggr_softc *sc, struc
}
 
if_detachhook_del(ifp0, >p_dhook);
-   hook_disestablish(ifp0->if_linkstatehooks, p->p_lcookie);
+   if_linkstatehook_del(ifp0, >p_lhook);
 
if_put(ifp0);
free(p, M_DEVBUF, sizeof(*p));
Index: net/if_bpe.c
===
RCS file: /cvs/src/sys/net/if_bpe.c,v
retrieving revision 1.9
diff -u -p -r1.9 if_bpe.c
--- net/if_bpe.c6 Nov 2019 03:51:26 -   1.9
+++ net/if_bpe.c6 Nov 2019 05:04:03 -
@@ -102,7 +102,7 @@ struct bpe_softc {
int sc_rxhprio;
uint8_t sc_group[ETHER_ADDR_LEN];
 
-   void *  sc_lh_cookie;
+   struct task sc_ltask;
struct task sc_dtask;
 
struct bpe_map  sc_bridge_map;
@@ -174,6 +174,7 @@ bpe_clone_create(struct if_clone *ifc, i
sc->sc_txhprio = IF_HDRPRIO_PACKET;
sc->sc_rxhprio = IF_HDRPRIO_OUTER;
 
+   task_set(>sc_ltask, bpe_link_hook, sc);
task_set(>sc_dtask, bpe_detach_hook, sc);
 
rw_init(>sc_bridge_lock, "bpebr");
@@ -634,8 +635,7 @@ bpe_up(struct bpe_softc *sc)
}
 
/* Register callback for physical link state changes

use designators for array initialiser in src/sys/netinet/in.h CTL_IPPROTO_NAMES

2019-11-04 Thread David Gwynne

this makes it harder to mess up the assignment of a protocol to the
right slot in the CTL_IPPROTO_NAMES initialiser. it also shrinks the
code a lot, and i think it makes what the array index means a lot more
explicit.

this gets used in sysctl(8), which still works as expected after
this change. we use designators for init in other places, so i don't
see this one as being a problem, even on older compilers.

ok?

Index: in.h
===
RCS file: /cvs/src/sys/netinet/in.h,v
retrieving revision 1.137
diff -u -p -r1.137 in.h
--- in.h4 Nov 2019 23:52:28 -   1.137
+++ in.h5 Nov 2019 01:16:56 -
@@ -386,265 +386,23 @@ struct ip_mreq {
 #defineIPPROTO_MAXID   (IPPROTO_DIVERT + 1)/* don't list to 
IPPROTO_MAX */
 
 #defineCTL_IPPROTO_NAMES { \
-   { "ip", CTLTYPE_NODE }, \
-   { "icmp", CTLTYPE_NODE }, \
-   { "igmp", CTLTYPE_NODE }, \
-   { "ggp", CTLTYPE_NODE }, \
-   { "ipip", CTLTYPE_NODE }, \
-   { 0, 0 }, \
-   { "tcp", CTLTYPE_NODE }, \
-   { 0, 0 }, \
-   { "egp", CTLTYPE_NODE }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { "pup", CTLTYPE_NODE }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { "udp", CTLTYPE_NODE }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { "gre", CTLTYPE_NODE }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { "esp", CTLTYPE_NODE }, \
-   { "ah", CTLTYPE_NODE }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { "etherip", CTLTYPE_NODE }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { "ipcomp", CTLTYPE_NODE }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { "carp", CTLTYPE_NODE }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0 }, \
-   { 0, 0

use tasks and a task_list to manage if_detachhooks

2019-11-04 Thread David Gwynne

hook_establish can fail, but drivers are inconsistent about checking for
that. apparently there's also a requirement that detach hooks are
run in opposite order to the one they were established in, but that
is also applied inconsistently by drivers.

this replaces if_detachhooks with a task_list, and has users of it
allocate and set a struct task to put on it. this means the users of it
allocate the task up front as part of their per port or softc structure,
so adding the task to the if_detachhooks list cannot fail. to enforce
the required ordering, i've added wrappers around the list operations.

i've had this as a long-standing itch to scratch, but i was forced into
action after looking at a report by markus@ about the order of these
hook establishment/disestablishments in carp. if you reparent a carp
interface, you've established detach hooks on multiple interfaces, but
if you need to roll back you can lose one of them. this reorders the ops
so you only add the detach hook after you've been put on the new
interface, and because adding the detach hook is reliable there's no
rollback or unwinding to implement.

i would love some testing. i've only kicked a few of the tyres on this
one.

ok?

if people are ok with this, i'll go through and do the same for the link
state and address change hooks.

Index: net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.588
diff -u -p -r1.588 if.c
--- net/if.c21 Aug 2019 15:32:18 -  1.588
+++ net/if.c4 Nov 2019 22:51:52 -
@@ -632,9 +623,7 @@ if_attach_common(struct ifnet *ifp)
ifp->if_linkstatehooks = malloc(sizeof(*ifp->if_linkstatehooks),
M_TEMP, M_WAITOK);
TAILQ_INIT(ifp->if_linkstatehooks);
-   ifp->if_detachhooks = malloc(sizeof(*ifp->if_detachhooks),
-   M_TEMP, M_WAITOK);
-   TAILQ_INIT(ifp->if_detachhooks);
+   TAILQ_INIT(>if_detachhooks);
 
if (ifp->if_rtrequest == NULL)
ifp->if_rtrequest = if_rtrequest_dummy;
@@ -1046,17 +1035,36 @@ if_netisr(void *unused)
 void
 if_deactivate(struct ifnet *ifp)
 {
-   NET_LOCK();
+   struct task *t, *nt;
+
/*
 * Call detach hooks from head to tail.  To make sure detach
 * hooks are executed in the reverse order they were added, all
 * the hooks have to be added to the head!
 */
-   dohooks(ifp->if_detachhooks, HOOK_REMOVE | HOOK_FREE);
 
+   NET_LOCK();
+   TAILQ_FOREACH_SAFE(t, >if_detachhooks, t_entry, nt)
+   (*t->t_func)(t->t_arg);
+
+   KASSERT(TAILQ_EMPTY(>if_detachhooks));
NET_UNLOCK();
 }
 
+void
+if_detachhook_add(struct ifnet *ifp, struct task *t)
+{
+   NET_ASSERT_LOCKED();
+   TAILQ_INSERT_HEAD(&ifp->if_detachhooks, t, t_entry);
+}
+
+void
+if_detachhook_del(struct ifnet *ifp, struct task *t)
+{
+   NET_ASSERT_LOCKED();
+   TAILQ_REMOVE(&ifp->if_detachhooks, t, t_entry);
+}
+
 /*
  * Detach an interface from everything in the kernel.  Also deallocate
  * private resources.
@@ -1132,7 +1140,6 @@ if_detach(struct ifnet *ifp)
 
free(ifp->if_addrhooks, M_TEMP, sizeof(*ifp->if_addrhooks));
free(ifp->if_linkstatehooks, M_TEMP, sizeof(*ifp->if_linkstatehooks));
-   free(ifp->if_detachhooks, M_TEMP, sizeof(*ifp->if_detachhooks));
 
for (i = 0; (dp = domains[i]) != NULL; i++) {
if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
Index: net/if_aggr.c
===
RCS file: /cvs/src/sys/net/if_aggr.c,v
retrieving revision 1.19
diff -u -p -r1.19 if_aggr.c
--- net/if_aggr.c   5 Aug 2019 10:42:51 -   1.19
+++ net/if_aggr.c   4 Nov 2019 22:51:52 -
@@ -336,7 +336,7 @@ struct aggr_port {
struct rtentry *);
 
void*p_lcookie;
-   void*p_dcookie;
+   struct task  p_dtask;
 
struct aggr_softc   *p_aggr;
TAILQ_ENTRY(aggr_port)   p_entry;
@@ -1138,8 +1138,9 @@ aggr_add_port(struct aggr_softc *sc, con
 
p->p_lcookie = hook_establish(ifp0->if_linkstatehooks, 1,
aggr_p_linkch, p);
-   p->p_dcookie = hook_establish(ifp0->if_detachhooks, 0,
-   aggr_p_detach, p);
+
+   task_set(>p_dtask, aggr_p_detach, p);
+   if_detachhook_add(ifp0, >p_dtask);
 
task_set(>p_rxm_task, aggr_rx, p);
mq_init(>p_rxm_mq, 3, IPL_NET);
@@ -1427,7 +1428,7 @@ aggr_p_dtor(struct aggr_softc *sc, struc
ifp->if_xname, op, ifp0->if_xname);
}
 
-   hook_disestablish(ifp0->if_detachhooks, p->p_dcookie);
+   if_detachhook_del(ifp0, >p_dtask);
hook_disestablish(ifp0->if_linkstatehooks, p->p_lcookie);
 
if_put(ifp0);
Index: net/if_bpe.c
===
RCS file: /cvs/src/sys/net/if_bpe.c,v
retrieving revision 1.8
diff -u -p -r1.8 if_bpe.c
---

GRE datagram socket support

2019-10-29 Thread David Gwynne

 ip->ip_dst;
key.t_dst4 = ip->ip_src;
 
-   if (gre_input_key(mp, offp, type, af, ip->ip_tos, ) == -1)
-   return (rip_input(mp, offp, type, af));
-
-   return (IPPROTO_DONE);
+   return (gre_if_input(m, hlen, ip->ip_tos, ));
 }
 
 #ifdef INET6
-int
-gre_input6(struct mbuf **mp, int *offp, int type, int af)
+struct mbuf *
+gre_if6_input(struct mbuf *m, int hlen)
 {
-   struct mbuf *m = *mp;
struct gre_tunnel key;
struct ip6_hdr *ip6;
uint32_t flow;
@@ -933,10 +905,7 @@ gre_input6(struct mbuf **mp, int *offp, 
 
flow = bemtoh32(>ip6_flow);
 
-   if (gre_input_key(mp, offp, type, af, flow >> 20, ) == -1)
-   return (rip6_input(mp, offp, type, af));
-
-   return (IPPROTO_DONE);
+   return (gre_if_input(m, hlen, flow >> 20, ));
 }
 #endif /* INET6 */
 
@@ -996,12 +965,10 @@ gre_input_1(struct gre_tunnel *key, stru
return (m);
 }
 
-static int
-gre_input_key(struct mbuf **mp, int *offp, int type, int af, uint8_t otos,
-struct gre_tunnel *key)
+static struct mbuf *
+gre_if_input(struct mbuf *m, int iphlen, uint8_t otos, struct gre_tunnel *key)
 {
-   struct mbuf *m = *mp;
-   int iphlen = *offp, hlen, rxprio;
+   int hlen, rxprio;
struct ifnet *ifp;
const struct gre_tunnel *tunnel;
caddr_t buf;
@@ -1025,7 +992,7 @@ gre_input_key(struct mbuf **mp, int *off
 
m = m_pullup(m, hlen);
if (m == NULL)
-   return (IPPROTO_DONE);
+   return (NULL);
 
buf = mtod(m, caddr_t);
gh = (struct gre_header *)(buf + iphlen);
@@ -1038,7 +1005,7 @@ gre_input_key(struct mbuf **mp, int *off
case htons(GRE_VERS_1):
m = gre_input_1(key, m, gh, otos, iphlen);
if (m == NULL)
-   return (IPPROTO_DONE);
+   return (NULL);
/* FALLTHROUGH */
default:
goto decline;
@@ -1055,7 +1022,7 @@ gre_input_key(struct mbuf **mp, int *off
 
m = m_pullup(m, hlen);
if (m == NULL)
-   return (IPPROTO_DONE);
+   return (NULL);
 
buf = mtod(m, caddr_t);
gh = (struct gre_header *)(buf + iphlen);
@@ -1071,7 +1038,7 @@ gre_input_key(struct mbuf **mp, int *off
nvgre_input(key, m, hlen, otos) == -1)
goto decline;
 
-   return (IPPROTO_DONE);
+   return (NULL);
}
 
ifp = gre_find(key);
@@ -1148,7 +1115,7 @@ gre_input_key(struct mbuf **mp, int *off
 
m_adj(m, hlen);
gre_keepalive_recv(ifp, m);
-   return (IPPROTO_DONE);
+   return (NULL);
 
default:
goto decline;
@@ -1162,7 +1129,7 @@ gre_input_key(struct mbuf **mp, int *off
 
m = (*patch)(tunnel, m, , otos);
if (m == NULL)
-   return (IPPROTO_DONE); 
+   return (NULL); 
 
if (tunnel->t_key_mask == GRE_KEY_ENTROPY) {
m->m_pkthdr.ph_flowid = M_FLOWID_VALID |
@@ -1203,10 +1170,9 @@ gre_input_key(struct mbuf **mp, int *off
 #endif
 
(*input)(ifp, m);
-   return (IPPROTO_DONE);
+   return (NULL);
 decline:
-   *mp = m;
-   return (-1);
+   return (m);
 }
 
 static struct mbuf *
Index: sys/netinet/gre_proto.h
===
RCS file: sys/netinet/gre_proto.h
diff -N sys/netinet/gre_proto.h
--- /dev/null   1 Jan 1970 00:00:00 -
+++ sys/netinet/gre_proto.h 29 Oct 2019 07:57:58 -
@@ -0,0 +1,48 @@
+/* $OpenBSD$ */
+
+/*
+ * Copyright (c) 2019 David Gwynne 
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _NETINET_GRE_H_
+#define _NETINET_GRE_H_
+
+struct gre_header {
+   uint16_tgre_flags;
+#define GRE_CP 0x8000  /* Checksum Present */
+#define GRE_KP 0x2000  /* Key Present */
+#define GRE_SP 0x1000  /* Sequence Present */
+
+#define GRE_VERS_MASK  0x0007
+#define GRE_VERS_0 0x
+#define GRE_VERS_1 0x0001
+
+   uint16_tgre_proto;
+};
+
+struct

remove mobileip(4) from the kernel

2019-10-28 Thread David Gwynne

this removes all the kernel code. if we want it back we can look in the
attic.

ok? or is this too much too fast?

Index: conf/files
===
RCS file: /cvs/src/sys/conf/files,v
retrieving revision 1.675
diff -u -p -r1.675 files
--- conf/files  5 Oct 2019 05:33:14 -   1.675
+++ conf/files  29 Oct 2019 04:37:52 -
@@ -554,7 +554,6 @@ pseudo-device carp: ifnet, ether
 pseudo-device sppp: ifnet
 pseudo-device gif: ifnet
 pseudo-device gre: ifnet
-pseudo-device mobileip: ifnet
 pseudo-device crypto: ifnet
 pseudo-device trunk: ifnet, ether, ifmedia
 pseudo-device aggr: ifnet, ether, ifmedia
@@ -810,7 +809,6 @@ file net/rtsock.c
 file net/slcompress.c  ppp
 file net/if_enc.c  enc needs-count
 file net/if_gre.c  gre needs-count
-file net/if_mobileip.c mobileipneeds-count
 file net/if_trunk.ctrunk   needs-count
 file net/trunklacp.c   trunk
 file net/if_aggr.c aggr
Index: net/if_mobileip.c
===
RCS file: net/if_mobileip.c
diff -N net/if_mobileip.c
--- net/if_mobileip.c   18 Feb 2018 23:53:17 -  1.8
+++ /dev/null   1 Jan 1970 00:00:00 -
@@ -1,659 +0,0 @@
-/* $OpenBSD: if_mobileip.c,v 1.8 2018/02/18 23:53:17 dlg Exp $ */
-
-/*
- * Copyright (c) 2016 David Gwynne 
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include "mobileip.h"
-
-#include "bpfilter.h"
-#include "pf.h"
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#include 
-#include 
-#include 
-#include 
-
-#include 
-#include 
-#include 
-
-#if NBPFILTER > 0
-#include 
-#endif
-
-#if NPF > 0
-#include 
-#endif
-
-#include 
-
-struct mobileip_tunnel {
-   unsigned intt_rtableid;
-   struct in_addr  t_src;
-   struct in_addr  t_dst;
-
-   TAILQ_ENTRY(mobileip_tunnel)
-   t_entry;
-};
-
-TAILQ_HEAD(mobileip_list, mobileip_tunnel);
-
-struct mobileip_softc {
-   struct mobileip_tunnel  sc_tunnel;
-   struct ifnetsc_if;
-};
-
-static int mobileip_clone_create(struct if_clone *, int);
-static int mobileip_clone_destroy(struct ifnet *);
-
-static struct if_clone mobileip_cloner = IF_CLONE_INITIALIZER("mobileip",
-mobileip_clone_create, mobileip_clone_destroy);
-
-static inline int
-   mobileip_cmp(const struct mobileip_tunnel *,
-   const struct mobileip_tunnel *);
-
-struct mobileip_list mobileip_list = TAILQ_HEAD_INITIALIZER(mobileip_list);
-
-#define MOBILEIPMTU(1500 - (sizeof(struct mobileip_header) +   \
-   sizeof(struct mobileip_h_src))) \
-
-static int mobileip_ioctl(struct ifnet *, u_long, caddr_t);
-static int mobileip_up(struct mobileip_softc *);
-static int mobileip_down(struct mobileip_softc *);
-static int mobileip_set_tunnel(struct mobileip_softc *,
-   struct if_laddrreq *);
-static int mobileip_get_tunnel(struct mobileip_softc *,
-   struct if_laddrreq *);
-static int mobileip_del_tunnel(struct mobileip_softc *);
-
-static int mobileip_output(struct ifnet *, struct mbuf *,
-   struct sockaddr *, struct rtentry *);
-static voidmobileip_start(struct ifnet *);
-static int mobileip_encap(struct mobileip_softc *, struct mbuf *);
-static struct mobileip_softc *
-   mobileip_find(const struct mobileip_tunnel *);
-
-/*
- * let's begin
- */
-
-intmobileip_allow = 0;
-
-void
-mobileipattach(int n)
-{
-   if_clone_attach(&mobileip_cloner);
-}
-
-int
-mobileip_clone_create(struct if_clone *ifc, int unit)
-{
-   struct mobileip_softc *sc;
-
-   sc = malloc(sizeof(*sc), M_DEVBUF, M_NOWAIT|M_ZERO);
-   if (!sc)
-   return (ENOMEM);
-
-   sc->sc_tunnel.t_rtableid = 0;
-   sc->sc_tunnel.t_src.s_addr = INADDR_ANY;
-   sc->sc_tunnel.t_dst.s_addr = INADDR_ANY;
-
-   snprintf(sc->sc_if.if_xname, sizeof sc-&

remove mobileip(4) knobs from sysctl(8)

2019-10-28 Thread David Gwynne

i disabled mobileip(4) with the idea that if no one complained, i'd
remove it. no one has complained, so i'm going to remove it.

starting at the edge, this removes the mobileip stuff from sysctl(8).

ok?

Index: sysctl.c
===
RCS file: /cvs/src/sbin/sysctl/sysctl.c,v
retrieving revision 1.247
diff -u -p -r1.247 sysctl.c
--- sysctl.c22 Sep 2019 01:16:39 -  1.247
+++ sysctl.c29 Oct 2019 04:05:17 -
@@ -1485,7 +1485,6 @@ struct ctlname espname[] = ESPCTL_NAMES;
 struct ctlname ahname[] = AHCTL_NAMES;
 struct ctlname etheripname[] = ETHERIPCTL_NAMES;
 struct ctlname grename[] = GRECTL_NAMES;
-struct ctlname mobileipname[] = MOBILEIPCTL_NAMES;
 struct ctlname ipcompname[] = IPCOMPCTL_NAMES;
 struct ctlname carpname[] = CARPCTL_NAMES;
 struct ctlname pfsyncname[] = PFSYNCCTL_NAMES;
@@ -1550,7 +1549,7 @@ struct list inetvars[] = {
{ 0, 0 },
{ 0, 0 },
{ 0, 0 },
-   { mobileipname, MOBILEIPCTL_MAXID }, /* mobileip */
+   { 0, 0 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 },

Re: mbuf limit atomic operation

2019-10-17 Thread David Gwynne

why? is it significantly faster? page allocation should be in the slow path.

dlg

> On 18 Oct 2019, at 08:33, Alexander Bluhm  wrote:
> 
> Hi,
> 
> Can we replace the mutex that protects the mbuf allocation limit
> by an atomic operation?
> 
> ok?
> 
> bluhm
> 
> Index: kern/uipc_mbuf.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_mbuf.c,v
> retrieving revision 1.272
> diff -u -p -r1.272 uipc_mbuf.c
> --- kern/uipc_mbuf.c  19 Jul 2019 09:03:03 -  1.272
> +++ kern/uipc_mbuf.c  17 Oct 2019 22:29:17 -
> @@ -133,7 +133,6 @@ structmutex m_extref_mtx = MUTEX_INITIA
> void  m_extfree(struct mbuf *);
> void  m_zero(struct mbuf *);
> 
> -struct mutex m_pool_mtx = MUTEX_INITIALIZER(IPL_NET);
> unsigned long mbuf_mem_limit; /* how much memory can be allocated */
> unsigned long mbuf_mem_alloc; /* how much memory has been allocated */
> 
> @@ -1473,30 +1472,20 @@ m_microtime(const struct mbuf *m, struct
> void *
> m_pool_alloc(struct pool *pp, int flags, int *slowdown)
> {
> - void *v = NULL;
> - int avail = 1;
> + void *v;
> + long alloc;
> 
> - if (mbuf_mem_alloc + pp->pr_pgsize > mbuf_mem_limit)
> - return (NULL);
> -
> - mtx_enter(&m_pool_mtx);
> - if (mbuf_mem_alloc + pp->pr_pgsize > mbuf_mem_limit)
> - avail = 0;
> - else
> - mbuf_mem_alloc += pp->pr_pgsize;
> - mtx_leave(&m_pool_mtx);
> + alloc = atomic_add_long_nv(&mbuf_mem_alloc, pp->pr_pgsize);
> + if (alloc > mbuf_mem_limit)
> + goto fail;
> 
> - if (avail) {
> - v = (*pool_allocator_multi.pa_alloc)(pp, flags, slowdown);
> + v = (*pool_allocator_multi.pa_alloc)(pp, flags, slowdown);
> + if (v != NULL)
> + return (v);
> 
> - if (v == NULL) {
> - mtx_enter(&m_pool_mtx);
> - mbuf_mem_alloc -= pp->pr_pgsize;
> - mtx_leave(&m_pool_mtx);
> - }
> - }
> -
> - return (v);
> + fail:
> + atomic_sub_long(&mbuf_mem_alloc, pp->pr_pgsize);
> + return (NULL);
> }
> 
> void
> @@ -1504,9 +1493,7 @@ m_pool_free(struct pool *pp, void *v)
> {
>   (*pool_allocator_multi.pa_free)(pp, v);
> 
> - mtx_enter(&m_pool_mtx);
> - mbuf_mem_alloc -= pp->pr_pgsize;
> - mtx_leave(&m_pool_mtx);
> + atomic_sub_long(&mbuf_mem_alloc, pp->pr_pgsize);
> }
> 
> void
>

in6_setsockaddr and in6_setpeeraddr can return void, like their v4 counterparts

2019-10-16 Thread David Gwynne

the ipv4 ones return void, and these can't fail, so make them return
nothing too.

ok?

Index: netinet/in_pcb.h
===
RCS file: /cvs/src/sys/netinet/in_pcb.h,v
retrieving revision 1.116
diff -u -p -r1.116 in_pcb.h
--- netinet/in_pcb.h15 Jul 2019 12:40:42 -  1.116
+++ netinet/in_pcb.h16 Oct 2019 10:09:24 -
@@ -287,8 +287,8 @@ struct inpcb *
 int in6_pcbaddrisavail(struct inpcb *, struct sockaddr_in6 *, int,
struct proc *);
 int in6_pcbconnect(struct inpcb *, struct mbuf *);
-int in6_setsockaddr(struct inpcb *, struct mbuf *);
-int in6_setpeeraddr(struct inpcb *, struct mbuf *);
+voidin6_setsockaddr(struct inpcb *, struct mbuf *);
+voidin6_setpeeraddr(struct inpcb *, struct mbuf *);
 #endif /* INET6 */
 voidin_pcbinit(struct inpcbtable *, int);
 struct inpcb *
Index: netinet6/in6_pcb.c
===
RCS file: /cvs/src/sys/netinet6/in6_pcb.c,v
retrieving revision 1.108
diff -u -p -r1.108 in6_pcb.c
--- netinet6/in6_pcb.c  4 Oct 2018 17:33:41 -   1.108
+++ netinet6/in6_pcb.c  16 Oct 2019 10:09:24 -
@@ -305,7 +305,7 @@ in6_pcbconnect(struct inpcb *inp, struct
  * Get the local address/port, and put it in a sockaddr_in6.
  * This services the getsockname(2) call.
  */
-int
+void
 in6_setsockaddr(struct inpcb *inp, struct mbuf *nam)
 {
struct sockaddr_in6 *sin6;
@@ -320,15 +320,13 @@ in6_setsockaddr(struct inpcb *inp, struc
sin6->sin6_addr = inp->inp_laddr6;
/* KAME hack: recover scopeid */
in6_recoverscope(sin6, >inp_laddr6);
-
-   return 0;
 }
 
 /*
  * Get the foreign address/port, and put it in a sockaddr_in6.
  * This services the getpeername(2) call.
  */
-int
+void
 in6_setpeeraddr(struct inpcb *inp, struct mbuf *nam)
 {
struct sockaddr_in6 *sin6;
@@ -343,8 +341,6 @@ in6_setpeeraddr(struct inpcb *inp, struc
sin6->sin6_addr = inp->inp_faddr6;
/* KAME hack: recover scopeid */
in6_recoverscope(sin6, >inp_faddr6);
-
-   return 0;
 }
 
 /*

remove src/sys/netinet/ip_ether.c because it is not used

2019-10-04 Thread David Gwynne

jca@ has already oked this. anyone else want to get on board?

Index: ip_ether.c
===
RCS file: ip_ether.c
diff -N ip_ether.c
--- ip_ether.c  10 Feb 2018 08:12:01 -  1.99
+++ /dev/null   1 Jan 1970 00:00:00 -
@@ -1,28 +0,0 @@
-/* $OpenBSD: ip_ether.c,v 1.99 2018/02/10 08:12:01 dlg Exp $  */
-/*
- * The author of this code is Angelos D. Keromytis (ker...@adk.gr)
- *
- * This code was written by Angelos D. Keromytis for OpenBSD in October 1999.
- *
- * Copyright (C) 1999-2001 Angelos D. Keromytis.
- *
- * Permission to use, copy, and modify this software with or without fee
- * is hereby granted, provided that this entire notice is included in
- * all copies of any software which is or includes a copy or
- * modification of this software.
- * You may use this code under the GNU public license if you so wish. Please
- * contribute changes back to the authors under this freer than GPL license
- * so that we may further the use of strong encryption without limitations to
- * all.
- *
- * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
- * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
- * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
- * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
- * PURPOSE.
- */
-
-/*
- * Ethernet-inside-IP processing (RFC3378).
- */
-

src/sys/netinet/ip_ether.c is empty, and also not used by gif

2019-10-03 Thread David Gwynne

so we can remove it, starting with taking it out of sys/conf/files.

ok?

Index: files
===
RCS file: /cvs/src/sys/conf/files,v
retrieving revision 1.674
diff -u -p -r1.674 files
--- files   29 Sep 2019 13:04:03 -  1.674
+++ files   4 Oct 2019 05:15:49 -
@@ -866,7 +870,6 @@ file netinet/ip_gre.c
 file netinet/ip_ipsp.c ipsec | tcp_signature
 file netinet/ip_spd.c  ipsec | tcp_signature
 file netinet/ip_ipip.c
-file netinet/ip_ether.cgif
 file netinet/ipsec_input.c ipsec
 file netinet/ipsec_output.cipsec
 file netinet/ip_esp.c  ipsec

Re: umb(4) switch bpf to DLT_LOOP

2019-10-02 Thread David Gwynne

ok.

> On 2 Oct 2019, at 18:17, Claudio Jeker  wrote:
> 
> umb(4) is currently the only user of DLT_RAW. The problem with this is
> that it only works for IPv4 packets and that is less than ideal.
> This diff switches umb(4) to DLT_LOOP like lo(4), tun(4), gif(4), ...
> 
> To make this work the driver injects the af header in umb_decap() when
> pulling the packet off the ring and pops that header in umb_input(); this
> way the common interface input handling can still be used.
> For the outbound packets umb_output() will now set ph_family header in the
> mbuf so that umb_start() can use this in bpf_mtap_af().
> 
> With this, tcpdump should also be able to show IPv6 packets.
> My provider does not do IPv6 (neither via umb MBIM_CID_IP_CONFIGURATION nor
> via slaac) but maybe someone else has a more tech competent provider than
> me and is willing to make IPv6 go on umb(4).
> -- 
> :wq Claudio
> 
> Index: dev/usb/if_umb.c
> ===
> RCS file: /cvs/src/sys/dev/usb/if_umb.c,v
> retrieving revision 1.26
> diff -u -p -r1.26 if_umb.c
> --- dev/usb/if_umb.c  29 Sep 2019 15:31:16 -  1.26
> +++ dev/usb/if_umb.c  1 Oct 2019 11:00:10 -
> @@ -516,7 +516,7 @@ umb_attach(struct device *parent, struct
>   if_alloc_sadl(ifp);
>   ifp->if_softc = sc;
> #if NBPFILTER > 0
> - bpfattach(&ifp->if_bpf, ifp, DLT_RAW, 0);
> + bpfattach(&ifp->if_bpf, ifp, DLT_LOOP, sizeof(uint32_t));
> #endif
>   /*
>* Open the device now so that we are able to query device information.
> @@ -759,19 +759,20 @@ umb_output(struct ifnet *ifp, struct mbu
>   m_freem(m);
>   return ENETDOWN;
>   }
> + m->m_pkthdr.ph_family = dst->sa_family;
>   return if_enqueue(ifp, m);
> }
> 
> int
> umb_input(struct ifnet *ifp, struct mbuf *m, void *cookie)
> {
> - uint8_t ipv;
> + uint32_t af;
> 
>   if ((ifp->if_flags & IFF_UP) == 0) {
>   m_freem(m);
>   return 1;
>   }
> - if (m->m_pkthdr.len < sizeof (struct ip)) {
> + if (m->m_pkthdr.len < sizeof (struct ip) + sizeof(af)) {
>   ifp->if_ierrors++;
>   DPRINTFN(4, "%s: dropping short packet (len %d)\n", __func__,
>   m->m_pkthdr.len);
> @@ -779,16 +780,19 @@ umb_input(struct ifnet *ifp, struct mbuf
>   return 1;
>   }
>   m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
> - m_copydata(m, 0, sizeof (ipv), );
> - ipv >>= 4;
> +
> + /* pop of DLT_LOOP header, no longer needed */
> + af = *mtod(m, uint32_t *);
> + m_adj(m, sizeof (af));
> + af = ntohl(af);
> 
>   ifp->if_ibytes += m->m_pkthdr.len;
> - switch (ipv) {
> - case 4:
> + switch (af) {
> + case AF_INET:
>   ipv4_input(ifp, m);
>   return 1;
> #ifdef INET6
> - case 6:
> + case AF_INET6:
>   ipv6_input(ifp, m);
>   return 1;
> #endif /* INET6 */
> @@ -878,7 +882,8 @@ umb_start(struct ifnet *ifp)
> 
> #if NBPFILTER > 0
>   if (ifp->if_bpf)
> - bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
> + bpf_mtap_af(ifp->if_bpf, m->m_pkthdr.ph_family, m,
> + BPF_DIRECTION_OUT);
> #endif
>   }
>   if (ml_empty(>sc_tx_ml))
> @@ -1916,7 +1921,7 @@ umb_decap(struct umb_softc *sc, struct u
>   struct ifnet *ifp = GET_IFP(sc);
>   int  s;
>   void*buf;
> - uint32_t len;
> + uint32_t len, af = 0;
>   char*dp;
>   struct ncm_header16 *hdr16;
>   struct ncm_header32 *hdr32;
> @@ -2033,12 +2038,25 @@ umb_decap(struct umb_softc *sc, struct u
> 
>   dp = buf + doff;
>   DPRINTFN(3, "%s: decap %d bytes\n", DEVNAM(sc), dlen);
> - m = m_devget(dp, dlen, 0);
> + m = m_devget(dp, dlen, sizeof(uint32_t));
>   if (m == NULL) {
>   ifp->if_iqdrops++;
>   continue;
>   }
> -
> + m = m_prepend(m, sizeof(uint32_t), M_DONTWAIT);
> + if (m == NULL) {
> + ifp->if_iqdrops++;
> + continue;
> + }
> + switch (*dp & 0xf0) {
> + case 4 << 4:
> + af = htonl(AF_INET);
> + break;
> + case 6 << 4:
> + af = htonl(AF_INET6);
> + break;
> + }
> + *mtod(m, uint32_t *) = af;
>   ml_enqueue(, m);
>   }
> done:
>

let trunk(4) in lacp mode select interfaces with if_baudrate 0

2019-10-01 Thread David Gwynne

jmatthew@ and I thought I had broken trunk(4) because we could never get
it to come up on our new boxes, while aggr(4) always worked fine. It
turns out we were only ever testing it with new drivers like ixl(4) or
mcx(4), and we hadn't added code to them to properly report baudrate.

This tickled an edge condition in the trunk lacp code where it would
effectively only select aggregators with a baudrate. This diff tweaks
the test so an interface with baudrate 0 can be selected if no other
aggregator already exists.

This should also make it more likely that things like etherip or egre
will work with trunk(4) in lacp mode, but there may still be issues with
how they report link state which might make trunk(4) unhappy.

We've also fixed ixl and mcx so they do report baudrate, but I still
think trunk(4) should cope.

ok?

Index: trunklacp.c
===
RCS file: /cvs/src/sys/net/trunklacp.c,v
retrieving revision 1.31
diff -u -p -r1.31 trunklacp.c
--- trunklacp.c 29 Apr 2019 03:54:52 -  1.31
+++ trunklacp.c 2 Oct 2019 00:28:00 -
@@ -919,7 +919,7 @@ lacp_select_active_aggregator(struct lac
 *  or, the total aggregated speed is higher
 *  or, it is already the chosen aggregator
 */
-   if ((best_la != NULL && LACP_SYS_PRI(la->la_partner) <
+   if ((best_la == NULL || LACP_SYS_PRI(la->la_partner) <
 LACP_SYS_PRI(best_la->la_partner)) ||
speed > best_speed ||
(speed == best_speed &&

Re: remove custom mbuf copy function support from bpf internals

2019-09-30 Thread David Gwynne

On Mon, Sep 30, 2019 at 08:54:14AM +0200, Claudio Jeker wrote:
> On Mon, Sep 30, 2019 at 12:06:34PM +1000, David Gwynne wrote:
> > the "public" bpf api no longer supports custom copy functions, so we can
> > remove the plumbing for it internally in the bpf code.
> > 
> > ok?
> > 
> > Index: bpf.c
> > ===
> > RCS file: /cvs/src/sys/net/bpf.c,v
> > retrieving revision 1.180
> > diff -u -p -r1.180 bpf.c
> > --- bpf.c   30 Sep 2019 01:53:05 -  1.180
> > +++ bpf.c   30 Sep 2019 02:04:37 -
> > @@ -94,8 +94,6 @@ LIST_HEAD(, bpf_d) bpf_d_list;
> >  
> >  intbpf_allocbufs(struct bpf_d *);
> >  void   bpf_ifname(struct bpf_if*, struct ifreq *);
> > -int_bpf_mtap(caddr_t, const struct mbuf *, u_int,
> > -   void (*)(const void *, void *, size_t));
> >  void   bpf_mcopy(const void *, void *, size_t);
> >  intbpf_movein(struct uio *, struct bpf_d *, struct mbuf **,
> > struct sockaddr *);
> > @@ -105,7 +103,7 @@ int bpfkqfilter(dev_t, struct knote *);
> >  void   bpf_wakeup(struct bpf_d *);
> >  void   bpf_wakeup_cb(void *);
> >  void   bpf_catchpacket(struct bpf_d *, u_char *, size_t, size_t,
> > -   void (*)(const void *, void *, size_t), struct timeval *);
> > +   struct timeval *);
> >  intbpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
> >  intbpf_setdlt(struct bpf_d *, u_int);
> >  
> > @@ -1241,12 +1239,8 @@ bpf_mcopy(const void *src_arg, void *dst
> > }
> >  }
> >  
> > -/*
> > - * like bpf_mtap, but copy fn can be given. used by various bpf_mtap*
> > - */
> >  int
> > -_bpf_mtap(caddr_t arg, const struct mbuf *m, u_int direction,
> > -void (*cpfn)(const void *, void *, size_t))
> > +bpf_mtap(caddr_t arg, const struct mbuf *m, u_int direction)
> >  {
> > struct bpf_if *bp = (struct bpf_if *)arg;
> > struct bpf_d *d;
> > @@ -1259,9 +1253,6 @@ _bpf_mtap(caddr_t arg, const struct mbuf
> > if (m == NULL)
> > return (0);
> >  
> > -   if (cpfn == NULL)
> > -   cpfn = bpf_mcopy;
> > -
> > if (bp == NULL)
> > return (0);
> >  
> > @@ -1299,8 +1290,7 @@ _bpf_mtap(caddr_t arg, const struct mbuf
> > }
> >  
> > mtx_enter(>bd_mtx);
> > -   bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn,
> > -   );
> > +   bpf_catchpacket(d, (u_char *)m, pktlen, slen, );
> > mtx_leave(>bd_mtx);
> > }
> > }
> > @@ -1345,16 +1335,7 @@ bpf_tap_hdr(caddr_t arg, const void *hdr
> > *mp = (struct mbuf *)
> > }
> >  
> > -   return _bpf_mtap(arg, m0, direction, bpf_mcopy);
> > -}
> > -
> > -/*
> > - * Incoming linkage from device drivers, when packet is in an mbuf chain.
> > - */
> > -int
> > -bpf_mtap(caddr_t arg, const struct mbuf *m, u_int direction)
> > -{
> > -   return _bpf_mtap(arg, m, direction, NULL);
> > +   return bpf_mtap(arg, m0, direction);
> >  }
> >  
> >  /*
> > @@ -1382,7 +1363,7 @@ bpf_mtap_hdr(caddr_t arg, const void *da
> > } else 
> > m0 = m;
> >  
> > -   return _bpf_mtap(arg, m0, direction, NULL);
> > +   return bpf_mtap(arg, m0, direction);
> >  }
> >  
> >  /*
> > @@ -1460,7 +1441,7 @@ bpf_mtap_ether(caddr_t arg, const struct
> >   */
> >  void
> >  bpf_catchpacket(struct bpf_d *d, u_char *pkt, size_t pktlen, size_t 
> > snaplen,
> > -void (*cpfn)(const void *, void *, size_t), struct timeval *tv)
> > +struct timeval *tv)
> >  {
> > struct bpf_hdr *hp;
> > int totlen, curlen;
> > @@ -1513,10 +1494,12 @@ bpf_catchpacket(struct bpf_d *d, u_char 
> > hp->bh_tstamp.tv_usec = tv->tv_usec;
> > hp->bh_datalen = pktlen;
> > hp->bh_hdrlen = hdrlen;
> > +
> > /*
> >  * Copy the packet data into the store buffer and update its length.
> >  */
> > -   (*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen));
> > +   bpf_mcopy(pkt, (u_char *)hp + hdrlen,
> > +   (hp->bh_caplen = totlen - hdrlen));
> 
> This new line is not really needed but also yuck on assigning the size in
> a function call argument. Maybe do the hp->bh_caplen = totlen - hdrlen before
> the call and pass hp->bh_caplen to bpf_mcopy().

ikr. i'll take it as a free second commit after this one.

> 
> > d->bd_slen = curlen + totlen;
> >  
> > if (d->bd_immediate) {
> > 
> 
> OK claudio@

cheers :)

> 
> -- 
> :wq Claudio

Re: eoip.4: document interface admin

2019-09-30 Thread David Gwynne

On Mon, Sep 30, 2019 at 06:49:14AM +0100, Jason McIntyre wrote:
> On Mon, Sep 30, 2019 at 10:28:50AM +1000, David Gwynne wrote:
> > i got an email recently asking how to configure the tunnel identifier
> > on an eoip(4) interface, and initially wanted to point the sender
> > at the manpage. unfortunately, the manpage is written for programmers
> > who have spent a lot of time in network drivers, ie, me. everyone
> > else who just wants to configure an interface with ifconfig(8) or
> > netstart(8) loses.
> > 
> > this adds a subsection to the eoip manpage on how to administer the
> > interfaces, and very slightly tweaks the example to show how the tunnel
> > id lines up between openbsd and whatever mikrotik calls their os.
> > 
> > so ok?
> > 
> > i actually like this change as it makes the documentation more useful
> > for what people do with an interface, which is operate it. if this goes
> > in i would like to update the other pseudo interface driver manpages so
> > they're more like this.
> > 
> 
> morning.
> 
> i'm not against this addition, but it does seem like we're
> setting up issues for the future: we'll be moving from having this info
> in one place to many. when it changes, it's easier to get it wrong.
> 
> what's the thinking? i mean, the stuff is already there in ifconfig(8).
> is it not clear enough? can we make it clearer? or are people just not
> working out where to look?

we (openbsd) try to provide a generic set of ioctls and therefore
command line options in ifconfig that end up configuring specific things
in a variety of drivers. on top of this, different drivers implement a
different subset of these ioctls and ifconfig arguments depending on
what the protocol is actually capable of.

in this specific situation, the problem is that eoip on a mikrotik
(where the eoip protocol was invented) requires the configuration of a
16-bit tunnel identifier, but it is not obvious what you have to
type in openbsd to set that. the tunnel-id is kind of equivalent
to a vlan tag number, and like the vlan tag number on a vlan
interface, it is configured with the vnetid argument in ifconfig.
that's not obvious from the manpage.

another example is GRE, which as a protocol has an optional key
field in its header. it's the same problem here in that it
is called one thing in the protocol but you don't type "ifconfig gre0
key 1234". again, you use vnetid to set that.

an example of the differing subsets of functionality is also around
vnetids. in GRE as a protocol, the Key is optional so you can use
ifconfig gre0 -vnetid to disable use of it. in vlan(4) there's a magic
value (0) on the wire that means the vlan tag is kind of like a default and
you only care about the priority bits, which again you use -vnetid to enable
the use of. eoip(4) is different again in that the tunnel-id is
mandatory and there are no magic values, so it doesn't support
-vnetid.

> we kind of farmed all this info into ifconfig(8). i'm not sure whether
> we want to move it out again. or if we do, do we try to reduce the
> content in ifconfig.8?

my opinion is that we should still document what things you can
pass to ifconfig, but cut out some references to specific behaviours
of various drivers. what functionality a specific driver supports
should be in the documentation for that specific driver.

alternatively we can come up with a better way to document what a driver
does with respect to things like vnetids.

> 
> jmc
> 
> > Index: eoip.4
> > ===
> > RCS file: /cvs/src/share/man/man4/eoip.4,v
> > retrieving revision 1.4
> > diff -u -p -r1.4 eoip.4
> > --- eoip.4  29 May 2019 19:37:06 -  1.4
> > +++ eoip.4  30 Sep 2019 00:11:55 -
> > @@ -83,12 +83,68 @@ route to the tunnel destination than the
> >  via the tunnel interface.
> >  Alternatively, the tunnel traffic may be configured in a separate
> >  routing table to the encapsulated traffic.
> > +.Ss Network Interface Administration
> > +.Nm
> > +interfaces may be configured by
> > +.Xr ifconfig 8
> > +or
> > +.Xr netstart 8
> > +using the following options:
> > +.Bl -tag -width indent
> > +.It Cm tunnel Ar src_address dest_address
> > +Set the unicast IPv4 or IPv6 addresses for the encapsulating IP packets.
> > +The addresses may only be configured while the interface is down.
> > +.It Fl Ns Cm tunnel
> > +Clear the addresses used for the encapsulating IP packets.
> > +The addresses may only be cleared while the interface is down.
> > +.It Cm vnetid Ar tunnel-identifier
> > +Configure the virtual network identifier to use as the Tunnel Identifier.
> > +The virtual network i

remove custom mbuf copy function support from bpf internals

2019-09-29 Thread David Gwynne

the "public" bpf api no longer supports custom copy functions, so we can
remove the plumbing for it internally in the bpf code.

ok?

Index: bpf.c
===
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.180
diff -u -p -r1.180 bpf.c
--- bpf.c   30 Sep 2019 01:53:05 -  1.180
+++ bpf.c   30 Sep 2019 02:04:37 -
@@ -94,8 +94,6 @@ LIST_HEAD(, bpf_d) bpf_d_list;
 
 intbpf_allocbufs(struct bpf_d *);
 void   bpf_ifname(struct bpf_if*, struct ifreq *);
-int_bpf_mtap(caddr_t, const struct mbuf *, u_int,
-   void (*)(const void *, void *, size_t));
 void   bpf_mcopy(const void *, void *, size_t);
 intbpf_movein(struct uio *, struct bpf_d *, struct mbuf **,
struct sockaddr *);
@@ -105,7 +103,7 @@ int bpfkqfilter(dev_t, struct knote *);
 void   bpf_wakeup(struct bpf_d *);
 void   bpf_wakeup_cb(void *);
 void   bpf_catchpacket(struct bpf_d *, u_char *, size_t, size_t,
-   void (*)(const void *, void *, size_t), struct timeval *);
+   struct timeval *);
 intbpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
 intbpf_setdlt(struct bpf_d *, u_int);
 
@@ -1241,12 +1239,8 @@ bpf_mcopy(const void *src_arg, void *dst
}
 }
 
-/*
- * like bpf_mtap, but copy fn can be given. used by various bpf_mtap*
- */
 int
-_bpf_mtap(caddr_t arg, const struct mbuf *m, u_int direction,
-void (*cpfn)(const void *, void *, size_t))
+bpf_mtap(caddr_t arg, const struct mbuf *m, u_int direction)
 {
struct bpf_if *bp = (struct bpf_if *)arg;
struct bpf_d *d;
@@ -1259,9 +1253,6 @@ _bpf_mtap(caddr_t arg, const struct mbuf
if (m == NULL)
return (0);
 
-   if (cpfn == NULL)
-   cpfn = bpf_mcopy;
-
if (bp == NULL)
return (0);
 
@@ -1299,8 +1290,7 @@ _bpf_mtap(caddr_t arg, const struct mbuf
}
 
mtx_enter(>bd_mtx);
-   bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn,
-   );
+   bpf_catchpacket(d, (u_char *)m, pktlen, slen, );
mtx_leave(>bd_mtx);
}
}
@@ -1345,16 +1335,7 @@ bpf_tap_hdr(caddr_t arg, const void *hdr
*mp = (struct mbuf *)
}
 
-   return _bpf_mtap(arg, m0, direction, bpf_mcopy);
-}
-
-/*
- * Incoming linkage from device drivers, when packet is in an mbuf chain.
- */
-int
-bpf_mtap(caddr_t arg, const struct mbuf *m, u_int direction)
-{
-   return _bpf_mtap(arg, m, direction, NULL);
+   return bpf_mtap(arg, m0, direction);
 }
 
 /*
@@ -1382,7 +1363,7 @@ bpf_mtap_hdr(caddr_t arg, const void *da
} else 
m0 = m;
 
-   return _bpf_mtap(arg, m0, direction, NULL);
+   return bpf_mtap(arg, m0, direction);
 }
 
 /*
@@ -1460,7 +1441,7 @@ bpf_mtap_ether(caddr_t arg, const struct
  */
 void
 bpf_catchpacket(struct bpf_d *d, u_char *pkt, size_t pktlen, size_t snaplen,
-void (*cpfn)(const void *, void *, size_t), struct timeval *tv)
+struct timeval *tv)
 {
struct bpf_hdr *hp;
int totlen, curlen;
@@ -1513,10 +1494,12 @@ bpf_catchpacket(struct bpf_d *d, u_char 
hp->bh_tstamp.tv_usec = tv->tv_usec;
hp->bh_datalen = pktlen;
hp->bh_hdrlen = hdrlen;
+
/*
 * Copy the packet data into the store buffer and update its length.
 */
-   (*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen));
+   bpf_mcopy(pkt, (u_char *)hp + hdrlen,
+   (hp->bh_caplen = totlen - hdrlen));
d->bd_slen = curlen + totlen;
 
if (d->bd_immediate) {

eoip.4: document interface admin

2019-09-29 Thread David Gwynne

i got an email recently asking how to configure the tunnel identifier
on an eoip(4) interface, and initially wanted to point the sender
at the manpage. unfortunately, the manpage is written for programmers
who have spent a lot of time in network drivers, ie, me. everyone
else who just wants to configure an interface with ifconfig(8) or
netstart(8) loses.

this adds a subsection to the eoip manpage on how to administer the
interfaces, and very slightly tweaks the example to show how the tunnel
id lines up between openbsd and whatever mikrotik calls their os.

so ok?

i actually like this change as it makes the documentation more useful
for what people do with an interface, which is operate it. if this goes
in i would like to update the other pseudo interface driver manpages so
they're more like this.

Index: eoip.4
===
RCS file: /cvs/src/share/man/man4/eoip.4,v
retrieving revision 1.4
diff -u -p -r1.4 eoip.4
--- eoip.4  29 May 2019 19:37:06 -  1.4
+++ eoip.4  30 Sep 2019 00:11:55 -
@@ -83,12 +83,68 @@ route to the tunnel destination than the
 via the tunnel interface.
 Alternatively, the tunnel traffic may be configured in a separate
 routing table to the encapsulated traffic.
+.Ss Network Interface Administration
+.Nm
+interfaces may be configured by
+.Xr ifconfig 8
+or
+.Xr netstart 8
+using the following options:
+.Bl -tag -width indent
+.It Cm tunnel Ar src_address dest_address
+Set the unicast IPv4 or IPv6 addresses for the encapsulating IP packets.
+The addresses may only be configured while the interface is down.
+.It Fl Ns Cm tunnel
+Clear the addresses used for the encapsulating IP packets.
+The addresses may only be cleared while the interface is down.
+.It Cm vnetid Ar tunnel-identifier
+Configure the virtual network identifier to use as the Tunnel Identifier.
+The virtual network identifier may only be configured while the
+interface is down.
+The Tunnel Identifier is a 16-bit value between 0 and 65535 inclusive.
+.It Cm tunneldomain Ar rdomain
+Set the routing table the tunnel traffic operates in.
+The routing table may only be configured while the interface is down.
+.It Cm tunnelttl Ar ttl
+Set the Time-To-Live field in IPv4 encapsulation headers, or the
+Hop Limit field in IPv6 encapsulation headers.
+.It Cm tunneldf
+Disable fragmentation of tunnel traffic by the interface.
+This sets the Don't Fragment (DF) bit on IPv4 packets,
+and disables fragmentation of IPv6 packets.
+.It Fl Ns Cm tunneldf
+Allow fragmentation of tunnel traffic.
+.\" This clears the Don't Fragment (DF) bit on IPv4 packets,
+.\" and allows fragmentation of IPv6 packets.
+.It Cm rxprio Ar prio
+Configure how the interface sets the priority of incoming packets.
+.Ar packet
+maintains the existing priority of the packet.
+Priority may be hardcoded with a number from 0 to 7.
+.Ar outer
+uses the Type of Service field in IPv4 encapsulation headers, or
+the Traffic Class field in IPv6 encapsulation headers to set the
+packet priority.
+.It Cm txprio Ar prio
+Configure which value is used in the Type of Service field in IPv4
+encapsulation headers, or the Traffic Class field in IPv6 encapsulation
+headers.
+Values may be hardcoded with a number from 0 to 7, or
+.Ar packet
+to specify that the current priority of a packet should be used.
+.It Cm keepalive Ar period count
+Enable the transmission of keepalive packets to detect tunnel failure.
+Keepalives may only be configured while the interface is down.
+.It Fl Ns Cm keepalive
+Disable the transmission of keepalive packets to detect tunnel failure.
+Keepalives may only be disabled while the interface is down.
+.El
 .Ss Programming Interface
 .Nm
 interfaces support the following
 .Xr ioctl 2
 calls for configuring tunnel options:
-.Bl -tag -width indent -offset 3n
+.Bl -tag -width indent
 .It Dv SIOCSLIFPHYADDR Fa "struct if_laddrreq *"
 Set the unicast IPv4 or IPv6 addresses for the encapsulating IP packets.
 The addresses may only be configured while the interface is down.
@@ -101,7 +157,7 @@ The addresses may only be cleared while 
 Configure a virtual network identifier for use as the Tunnel Identifier.
 The virtual network identifier may only be configured while the
 interface is down.
-The Tunnel Identifier is a 16-bit value.
+The Tunnel Identifier is a 16-bit value between 0 and 65535 inclusive.
 .It Dv SIOCGVNETID Fa "struct ifreq *"
 Get the virtual network identifer used in the GRE Key header.
 .It Dv SIOCSLIFPHYRTABLE Fa "struct ifreq *"
@@ -173,6 +229,7 @@ On Host A
 # route add default B
 # ifconfig eoipN create
 # ifconfig eoipN tunnel A D
+# ifconfig eoipN vnetid T
 # ifconfig eoipN up
 # route add E D
 .Ed
@@ -181,7 +238,7 @@ On Host D (MikroTik):
 .Bd -literal -offset indent
 [admin@MikroTik] > interface eoip
 [admin@MikroTik] /interface eoip> add name="eoipN" \e
-\e... local-address=D remote-address=A
+\e... tunnel-id=T local-address=D remote-address=A
 [admin@MikroTik]

bpf_mtap_hdr copy function is redundant

2019-09-18 Thread David Gwynne

this removes the cpfn argument from bpf_mtap_hdr since nothing uses it
anymore.

the last thing to use it was pflog, and that was reworked to avoid it
a week or so ago. this is a largely mechanical diff to remove the cpfn
argument, and adjust all the callers to cope.

there's some followup work that can be done to clean up bpf internally,
but that's for later.

ok?

Index: net/bpf.h
===
RCS file: /cvs/src/sys/net/bpf.h,v
retrieving revision 1.67
diff -u -p -r1.67 bpf.h
--- net/bpf.h   12 Sep 2019 01:17:38 -  1.67
+++ net/bpf.h   19 Sep 2019 01:21:36 -
@@ -315,8 +315,7 @@ struct mbuf;
 
 int bpf_validate(struct bpf_insn *, int);
 int bpf_mtap(caddr_t, const struct mbuf *, u_int);
-int bpf_mtap_hdr(caddr_t, const void *, u_int, const struct mbuf *, u_int,
-   void (*)(const void *, void *, size_t));
+int bpf_mtap_hdr(caddr_t, const void *, u_int, const struct mbuf *, u_int);
 int bpf_mtap_af(caddr_t, u_int32_t, const struct mbuf *, u_int);
 int bpf_mtap_ether(caddr_t, const struct mbuf *, u_int);
 int bpf_tap_hdr(caddr_t, const void *, u_int, const void *, u_int, u_int);
Index: net/bpf.c
===
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.179
diff -u -p -r1.179 bpf.c
--- net/bpf.c   12 Sep 2019 01:25:14 -  1.179
+++ net/bpf.c   19 Sep 2019 01:21:36 -
@@ -1368,7 +1368,7 @@ bpf_mtap(caddr_t arg, const struct mbuf 
  */
 int
 bpf_mtap_hdr(caddr_t arg, const void *data, u_int dlen, const struct mbuf *m,
-u_int direction, void (*cpfn)(const void *, void *, size_t))
+u_int direction)
 {
struct m_hdr mh;
const struct mbuf *m0;
@@ -1382,7 +1382,7 @@ bpf_mtap_hdr(caddr_t arg, const void *da
} else 
m0 = m;
 
-   return _bpf_mtap(arg, m0, direction, cpfn);
+   return _bpf_mtap(arg, m0, direction, NULL);
 }
 
 /*
@@ -1401,7 +1401,7 @@ bpf_mtap_af(caddr_t arg, u_int32_t af, c
 
afh = htonl(af);
 
-   return bpf_mtap_hdr(arg, , sizeof(afh), m, direction, NULL);
+   return bpf_mtap_hdr(arg, , sizeof(afh), m, direction);
 }
 
 /*
@@ -1446,7 +1446,7 @@ bpf_mtap_ether(caddr_t arg, const struct
mh.mh_next = m->m_next;
 
return bpf_mtap_hdr(arg, , sizeof(evh),
-   (struct mbuf *), direction, NULL);
+   (struct mbuf *), direction);
 #endif
 }
 
Index: dev/pci/if_iwi.c
===
RCS file: /cvs/src/sys/dev/pci/if_iwi.c,v
retrieving revision 1.142
diff -u -p -r1.142 if_iwi.c
--- dev/pci/if_iwi.c18 Sep 2019 23:52:32 -  1.142
+++ dev/pci/if_iwi.c19 Sep 2019 01:21:36 -
@@ -937,7 +937,7 @@ iwi_frame_intr(struct iwi_softc *sc, str
tap->wr_flags |= IEEE80211_RADIOTAP_F_SHORTPRE;
 
bpf_mtap_hdr(sc->sc_drvbpf, tap, sc->sc_rxtap_len,
-   m, BPF_DIRECTION_IN, NULL);
+   m, BPF_DIRECTION_IN);
}
 #endif
 
@@ -1268,7 +1268,7 @@ iwi_tx_start(struct ifnet *ifp, struct m
tap->wt_chan_flags = htole16(ic->ic_bss->ni_chan->ic_flags);
 
bpf_mtap_hdr(sc->sc_drvbpf, tap, sc->sc_txtap_len,
-   m0, BPF_DIRECTION_OUT, NULL);
+   m0, BPF_DIRECTION_OUT);
}
 #endif
 
Index: dev/pci/if_iwm.c
===
RCS file: /cvs/src/sys/dev/pci/if_iwm.c,v
retrieving revision 1.251
diff -u -p -r1.251 if_iwm.c
--- dev/pci/if_iwm.c18 Sep 2019 23:52:32 -  1.251
+++ dev/pci/if_iwm.c19 Sep 2019 01:21:36 -
@@ -3555,7 +3555,7 @@ iwm_rx_rx_mpdu(struct iwm_softc *sc, str
}
 
bpf_mtap_hdr(sc->sc_drvbpf, tap, sc->sc_rxtap_len,
-   m, BPF_DIRECTION_IN, NULL);
+   m, BPF_DIRECTION_IN);
}
 #endif
ieee80211_inputm(IC2IFP(ic), m, ni, , ml);
@@ -4262,7 +4262,7 @@ iwm_tx(struct iwm_softc *sc, struct mbuf
tap->wt_flags |= IEEE80211_RADIOTAP_F_WEP;
 
bpf_mtap_hdr(sc->sc_drvbpf, tap, sc->sc_txtap_len,
-   m, BPF_DIRECTION_OUT, NULL);
+   m, BPF_DIRECTION_OUT);
}
 #endif
 
Index: dev/pci/if_iwn.c
===
RCS file: /cvs/src/sys/dev/pci/if_iwn.c,v
retrieving revision 1.217
diff -u -p -r1.217 if_iwn.c
--- dev/pci/if_iwn.c18 Sep 2019 23:52:32 -  1.217
+++ dev/pci/if_iwn.c19 Sep 2019 01:21:36 -
@@ -2227,7 +2227,7 @@ iwn_rx_done(struct iwn_softc *sc, struct
}
 
bpf_mtap_hdr(sc->sc_drvbpf, tap, sc->sc_rxtap_len,
-   m, BPF_DIRECTION_IN, NULL);
+   m, BPF_DIRECTION_IN);
}
 #endif
 
@@ -3277,7 +3277,7 @@ iwn_tx(struct iwn_softc *sc, struct mbuf
tap->wt_flags |=

don't hand-roll bpf_mtap_hdr for radiotap in wireless drivers

2019-09-11 Thread David Gwynne

radiotap code puts an mbuf on the stack so it can build a chain that
includes a radiotap header for bpf_mtap to chew on. bpf_mtap_hdr can do
that for you though.

bpf_mtap_hdr also cheats by using an m_hdr instead of a full mbuf,
which reduces the stack usage, but i'm arguing for the diff because it
makes the calling code simpler.

this only tweaks the intel wireless drivers, there's a handful of other
ones that could be fixed too if this diff goes in.

tests? ok?

Index: if_ipw.c
===
RCS file: /cvs/src/sys/dev/pci/if_ipw.c,v
retrieving revision 1.123
diff -u -p -r1.123 if_ipw.c
--- if_ipw.c25 Jul 2019 01:46:14 -  1.123
+++ if_ipw.c12 Sep 2019 03:14:03 -
@@ -878,7 +878,6 @@ ipw_data_intr(struct ipw_softc *sc, stru
 
 #if NBPFILTER > 0
if (sc->sc_drvbpf != NULL) {
-   struct mbuf mb;
struct ipw_rx_radiotap_header *tap = >sc_rxtap;
 
tap->wr_flags = 0;
@@ -886,13 +885,8 @@ ipw_data_intr(struct ipw_softc *sc, stru
tap->wr_chan_freq = htole16(ic->ic_ibss_chan->ic_freq);
tap->wr_chan_flags = htole16(ic->ic_ibss_chan->ic_flags);
 
-   mb.m_data = (caddr_t)tap;
-   mb.m_len = sc->sc_rxtap_len;
-   mb.m_next = m;
-   mb.m_nextpkt = NULL;
-   mb.m_type = 0;
-   mb.m_flags = 0;
-   bpf_mtap(sc->sc_drvbpf, , BPF_DIRECTION_IN);
+   bpf_mtap_hdr(sc->sc_drvbpf, tap, sc->sc_rxtap_len,
+   m, BPF_DIRECTION_IN, NULL);
}
 #endif
 
@@ -1153,20 +1147,14 @@ ipw_tx_start(struct ifnet *ifp, struct m
 
 #if NBPFILTER > 0
if (sc->sc_drvbpf != NULL) {
-   struct mbuf mb;
struct ipw_tx_radiotap_header *tap = >sc_txtap;
 
tap->wt_flags = 0;
tap->wt_chan_freq = htole16(ic->ic_ibss_chan->ic_freq);
tap->wt_chan_flags = htole16(ic->ic_ibss_chan->ic_flags);
 
-   mb.m_data = (caddr_t)tap;
-   mb.m_len = sc->sc_txtap_len;
-   mb.m_next = m;
-   mb.m_nextpkt = NULL;
-   mb.m_type = 0;
-   mb.m_flags = 0;
-   bpf_mtap(sc->sc_drvbpf, , BPF_DIRECTION_OUT);
+   bpf_mtap_hdr(sc->sc_drvbpf, tap, sc->sc_txtap_len,
+   m, BPF_DIRECTION_OUT, NULL);
}
 #endif
 
Index: if_iwi.c
===
RCS file: /cvs/src/sys/dev/pci/if_iwi.c,v
retrieving revision 1.140
diff -u -p -r1.140 if_iwi.c
--- if_iwi.c25 Jul 2019 01:46:14 -  1.140
+++ if_iwi.c12 Sep 2019 03:14:03 -
@@ -923,7 +923,6 @@ iwi_frame_intr(struct iwi_softc *sc, str
 
 #if NBPFILTER > 0
if (sc->sc_drvbpf != NULL) {
-   struct mbuf mb;
struct iwi_rx_radiotap_header *tap = >sc_rxtap;
 
tap->wr_flags = 0;
@@ -937,13 +936,8 @@ iwi_frame_intr(struct iwi_softc *sc, str
if (frame->antenna & 0x40)
tap->wr_flags |= IEEE80211_RADIOTAP_F_SHORTPRE;
 
-   mb.m_data = (caddr_t)tap;
-   mb.m_len = sc->sc_rxtap_len;
-   mb.m_next = m;
-   mb.m_nextpkt = NULL;
-   mb.m_type = 0;
-   mb.m_flags = 0;
-   bpf_mtap(sc->sc_drvbpf, , BPF_DIRECTION_IN);
+   bpf_mtap_hdr(sc->sc_drvbpf, tap, sc->sc_rxtap_len,
+   m, BPF_DIRECTION_IN, NULL);
}
 #endif
 
@@ -1265,20 +1259,14 @@ iwi_tx_start(struct ifnet *ifp, struct m
 
 #if NBPFILTER > 0
if (sc->sc_drvbpf != NULL) {
-   struct mbuf mb;
struct iwi_tx_radiotap_header *tap = >sc_txtap;
 
tap->wt_flags = 0;
tap->wt_chan_freq = htole16(ic->ic_bss->ni_chan->ic_freq);
tap->wt_chan_flags = htole16(ic->ic_bss->ni_chan->ic_flags);
 
-   mb.m_data = (caddr_t)tap;
-   mb.m_len = sc->sc_txtap_len;
-   mb.m_next = m0;
-   mb.m_nextpkt = NULL;
-   mb.m_type = 0;
-   mb.m_flags = 0;
-   bpf_mtap(sc->sc_drvbpf, , BPF_DIRECTION_OUT);
+   bpf_mtap_hdr(sc->sc_drvbpf, tap, sc->sc_txtap_len,
+   m0, BPF_DIRECTION_OUT, NULL);
}
 #endif
 
Index: if_iwm.c
===
RCS file: /cvs/src/sys/dev/pci/if_iwm.c,v
retrieving revision 1.244
diff -u -p -r1.244 if_iwm.c
--- if_iwm.c8 Aug 2019 13:56:56 -   1.244
+++ if_iwm.c12 Sep 2019 03:14:03 -
@@ -3629,7 +3629,6 @@ iwm_rx_rx_mpdu(struct iwm_softc *sc, str
 
 #if NBPFILTER > 0
if (sc->sc_drvbpf != NULL) {
-   struct mbuf mb;
struct iwm_rx_radiotap_header *tap = >sc_rxtap;
uint16_t chan_flags;
 
@@ -3674,13 +3673,8 @@ iwm_rx_rx_mpdu(struct iwm_softc *sc, str

make tun(4) kq kevent data more like everything else

2019-09-11 Thread David Gwynne

kqueue stuff lets you provide some data when an event fires. generally
things that you read and write (ie, file descriptors) provide how much
data you can read or write. tun(4) provides how many packets you can
read, and how many bytes you could write.

i'm arguing that the number of packets is inconsistent with everything
else and should change. i'm also arguing we should do a better job of
calculating the maximum write userland can do.

this is a very low impact change. i would bet that people use libevent,
not kq directly, and libevent doesn't really provide a way to get
to the kevent data. i just want to stop stumbling over the current code
when im reading tun(4).

ok?

Index: if_tun.c
===
RCS file: /cvs/src/sys/net/if_tun.c,v
retrieving revision 1.189
diff -u -p -r1.189 if_tun.c
--- if_tun.c12 Sep 2019 01:28:29 -  1.189
+++ if_tun.c12 Sep 2019 01:37:36 -
@@ -1008,9 +1008,6 @@ tun_dev_poll(struct tun_softc *tp, int e
  *
  * The tun driver uses an array of tun_softc's based on the minor number
  * of the device.  kn->kn_hook gets set to the specific tun_softc.
- *
- * filt_tunread() sets kn->kn_data to the iface qsize
- * filt_tunwrite() sets kn->kn_data to the MTU size
  */
 int
 tunkqfilter(dev_t dev, struct knote *kn)
@@ -1082,7 +1079,6 @@ filt_tunread(struct knote *kn, long hint
 {
struct tun_softc*tp;
struct ifnet*ifp;
-   unsigned int len;
 
if (kn->kn_status & KN_DETACHED) {
kn->kn_data = 0;
@@ -1092,16 +1088,9 @@ filt_tunread(struct knote *kn, long hint
tp = (struct tun_softc *)kn->kn_hook;
ifp = >tun_if;
 
-   len = IFQ_LEN(>if_snd);
-   if (len > 0) {
-   kn->kn_data = len;
+   kn->kn_data = ifq_hdatalen(>if_snd);
 
-   TUNDEBUG(("%s: tunkqread q=%d\n", ifp->if_xname,
-   IFQ_LEN(>if_snd)));
-   return (1);
-   }
-   TUNDEBUG(("%s: tunkqread waiting\n", ifp->if_xname));
-   return (0);
+   return (kn->kn_data > 0);
 }
 
 void
@@ -1131,7 +1120,7 @@ filt_tunwrite(struct knote *kn, long hin
tp = (struct tun_softc *)kn->kn_hook;
ifp = >tun_if;
 
-   kn->kn_data = ifp->if_mtu;
+   kn->kn_data = ifp->if_hdrlen + ifp->if_hardmtu;
 
return (1);
 }

let bpf_mtap_hdr take a void * instead of caddr_t for the header

2019-09-10 Thread David Gwynne

this makes it easier to call at least.

it also brings it in line with bpf_tap_hdr. otherwise there's no
functional change.

ok?

Index: sys/net/bpf.c
===
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.177
diff -u -p -r1.177 bpf.c
--- sys/net/bpf.c   13 Jun 2019 21:14:53 -  1.177
+++ sys/net/bpf.c   10 Sep 2019 23:59:14 -
@@ -1367,7 +1367,7 @@ bpf_mtap(caddr_t arg, const struct mbuf 
  * it or keep a pointer to it.
  */
 int
-bpf_mtap_hdr(caddr_t arg, caddr_t data, u_int dlen, const struct mbuf *m,
+bpf_mtap_hdr(caddr_t arg, const void *data, u_int dlen, const struct mbuf *m,
 u_int direction, void (*cpfn)(const void *, void *, size_t))
 {
struct m_hdr mh;
@@ -1377,7 +1377,7 @@ bpf_mtap_hdr(caddr_t arg, caddr_t data, 
mh.mh_flags = 0;
mh.mh_next = (struct mbuf *)m;
mh.mh_len = dlen;
-   mh.mh_data = data;
+   mh.mh_data = (void *)data;
m0 = (struct mbuf *)
} else 
m0 = m;
Index: sys/net/bpf.h
===
RCS file: /cvs/src/sys/net/bpf.h,v
retrieving revision 1.66
diff -u -p -r1.66 bpf.h
--- sys/net/bpf.h   17 Mar 2019 23:57:12 -  1.66
+++ sys/net/bpf.h   10 Sep 2019 23:59:14 -
@@ -315,7 +315,7 @@ struct mbuf;
 
 int bpf_validate(struct bpf_insn *, int);
 int bpf_mtap(caddr_t, const struct mbuf *, u_int);
-int bpf_mtap_hdr(caddr_t, caddr_t, u_int, const struct mbuf *, u_int,
+int bpf_mtap_hdr(caddr_t, const void *, u_int, const struct mbuf *, u_int,
void (*)(const void *, void *, size_t));
 int bpf_mtap_af(caddr_t, u_int32_t, const struct mbuf *, u_int);
 int bpf_mtap_ether(caddr_t, const struct mbuf *, u_int);
Index: share/man/man9/bpf_mtap.9
===
RCS file: /cvs/src/share/man/man9/bpf_mtap.9,v
retrieving revision 1.14
diff -u -p -r1.14 bpf_mtap.9
--- share/man/man9/bpf_mtap.9   2 Feb 2018 10:52:44 -   1.14
+++ share/man/man9/bpf_mtap.9   10 Sep 2019 23:59:14 -
@@ -66,7 +66,7 @@
 .Ft int
 .Fo bpf_mtap_hdr
 .Fa "caddr_t bpf"
-.Fa "caddr_t hdr"
+.Fa "const void *hdr"
 .Fa "u_int hdrlen"
 .Fa "const struct mbuf *m"
 .Fa "u_int direction"

let tun(4) and tap(4) receive larger than if_mtu bytes packets

2019-08-28 Thread David Gwynne

tun_dev_write currently checks if the packet being written into the
kernel is less than the current if_mtu of the interface. we don't really
have a conflation of the mtu with the mru in any other driver, so i'd
like to remove it from tun(4).

we can also avoid a check at runtime about what type of interface
it is by using the if_hdrlen value. tun needs the 4 byte af shim,
and tap should have a whole ethernet header before ether_input gets
to look at it.

ok?

Index: if_tun.c
===
RCS file: /cvs/src/sys/net/if_tun.c,v
retrieving revision 1.187
diff -u -p -r1.187 if_tun.c
--- if_tun.c10 Jun 2019 21:55:16 -  1.187
+++ if_tun.c29 Aug 2019 02:53:35 -
@@ -838,8 +838,7 @@ tun_dev_write(struct tun_softc *tp, stru
ifp = >tun_if;
TUNDEBUG(("%s: tunwrite\n", ifp->if_xname));
 
-   if (uio->uio_resid == 0 || uio->uio_resid > ifp->if_mtu +
-   (tp->tun_flags & TUN_LAYER2 ? ETHER_HDR_LEN : sizeof(*th))) {
+   if (uio->uio_resid < ifp->if_hdrlen) {
TUNDEBUG(("%s: len=%d!\n", ifp->if_xname, uio->uio_resid));
return (EMSGSIZE);
}

Re: vio0 recover from mbuf limit

2019-08-06 Thread David Gwynne




> On 5 Aug 2019, at 22:38, Alexander Bluhm  wrote:
> 
> On Thu, Jul 18, 2019 at 11:31:26PM +1000, David Gwynne wrote:
>>> And I have to do ifconfig vio0 down and ifconfig vio0 up to receive
>>> packets on the interface again.
>> 
>> Can you look at systat mb when this happens to check what value
>> vio0 has in the ALIVE column? If ALIVE is 0, it means the ring is
>> empty. It looks like the driver does try to cope with this situation,
>> but maybe it doesn't cope well enough. Let's start with checking
>> if it is an empty ring first.
> 
> After a bit of testing it looks like the condition for adding the
> timeout is never true.  It is a consequence of converting to
> if_rxr_get().
> 
> The easiest fix is to add the rx tick timeout unconditionally.
> Running this code every second is not too much waste.  Other drivers
> also do that.
> 
> ok?

I don't like how big this hammer is, but it is better than the current 
situation.

It doesn't look like vio uses MPSAFE interrupts, so the timeout can't run 
concurrently with the rxeof stuff, so this should be fine.

ok.

> 
> bluhm
> 
> Index: dev/pv/if_vio.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pv/if_vio.c,v
> retrieving revision 1.12
> diff -u -p -r1.12 if_vio.c
> --- dev/pv/if_vio.c   26 May 2019 15:22:31 -  1.12
> +++ dev/pv/if_vio.c   5 Aug 2019 06:52:04 -
> @@ -982,10 +982,7 @@ vio_populate_rx_mbufs(struct vio_softc *
> 
>   if (done)
>   virtio_notify(vsc, vq);
> - if (vq->vq_used_idx != vq->vq_avail_idx)
> - timeout_del(>sc_rxtick);
> - else
> - timeout_add_sec(>sc_rxtick, 1);
> + timeout_add_sec(>sc_rxtick, 1);
> }
> 
> /* dequeue received packets */

Re: tpmr(4): 802.1Q Two-Port MAC Relay

2019-07-30 Thread David Gwynne




> On 30 Jul 2019, at 6:28 pm, Remi Locherer  wrote:
> 
> On Tue, Jul 30, 2019 at 01:36:59PM +1000, David Gwynne wrote:
>> a Two-Port MAC Relay is basically a cut down bridge(4). it only supports
>> two ports, and unconditionally relays packets between those ports
>> instead of doing learning or anything like that.
>> 
>> i've been trying to get a redundant pair of bridges set up between two
>> datacenters here to help me while i migrate between them. so far all my
>> efforts to make it redundant have mostly worked, until they introduced
>> loops in the layer 2 topology, which generates a broadcast storm, which
>> basically takes the net down for a few minutes at a time. it feels
>> very betraying.
>> 
>> my frustration is that switches plugged together have mechanisms to
>> prevent loops like that, more specifically they use spanning tree or
>> lacp to make appropriate use of redundant links. i got to a point where
>> i just wanted the switches to talk to each other and do their own thing
>> to negotiate use of the redundant links.
>> 
>> unfortunately the only way to get ethernet packets off a physical
>> wire and onto a tunnel over an ip network is bridge(4), and bridge(4)
>> tries to be a compliant switch from a standards point of view. this
>> means it intercepts packets that are meant to be processed by bridges,
>> because it is a bridge. these types of packets include spanning tree and
>> lacp, which means i couldnt get the physical switches at each site to
>> talk to each other. sadface.
>> 
>> so to solve my problem i hacked up a small driver that did less than
>> bridge(4). however, it turns out that what i hacked up is an actual
>> thing that already exists as something done in the real world. IEEE
>> 802.1Q describes TPMR, which is defined as intercepting far less
>> than a real bridge does. one of the appendices specifically describes
>> lacp going through one, which is exactly what i wanted. cisco does
>> something like this with their layer 2 cross-connects (search for cisco
>> xconnect for examples), juniper has l2circuits, and so on.
>> 
>> the way i'm using this is like below. i have a pair of bridges in each
>> datacenter, so 4 boxes in total. they peer directly with the ip network
>> that sits between the datacenters. each box has 4 physical network
>> ports. 2 of those ports are configured with aggr(4) and talk IP into the
>> core network. the other two ports are connected to the switches at
>> each site for use with tpmr. there's 2 etherip interfaces configured on
>> each physical box, each of which is connected to the tpmr.
>> 
>> all that together looks a bit like the following:
>> 
>> +-+ +--+  +---+ +-+
>> |d|-|ix2 <-> tpmr0 <-> etherip0|--|etherip0 <-> tpmr0 <-> ixl0|-|d|
>> |c| |  |  |   | |c|
>> |0|-|ix3 <-> tpmr1 <-> etherip1|--|etherip1 <-> tpmr1 <-> ixl1|-|1|
>> ||| +--+ \  / +---+ |||
>> |s| dc0-bridge0   \/  dc1-bridge0   |s|
>> |w|   /\|w|
>> |i| +--+ /  \ +---+ |i|
>> |t|-|ix2 <-> tpmr0 <-> etherip0|--|etherip0 <-> tpmr0 <-> ixl0|-|t|
>> |c| |  |  |   | |c|
>> |h|-|ix3 <-> tpmr1 <-> etherip1|--|etherip1 <-> tpmr1 <-> ixl1|-|h|
>> +-+ +--+  +---+ +-+
>> dc0-bridge1   dc1-bridge1
>> 
>> each switch has a 4 port port-channel (lacp aggregation) set up. because
>> each physical interface on the bridges is tied to a single tunnel, the
>> packets effectively traverse a point-to-point link, ie, a really
>> complicated wire. because lacp makes it from each point to the other
>> point, the switches make sure only active lacp ports are used, which
>> avoids layer 2 loops. lacp also means i get to use all the links when
>> they're available.
>> 
>> with the topology above i can lose a bridge at each site and should
>> still have a working link to the other side, so i get my redundancy. the
>> use of the extra links with lacp is a bonus. at this point i would have
>> been happy for spanning tree to shut links down.
>> 
>> anyway, here's the code.
>> 
>> it was originally called xcon(4) since it pr

Re: tpmr(4): 802.1Q Two-Port MAC Relay

2019-07-29 Thread David Gwynne

On Tue, Jul 30, 2019 at 01:36:59PM +1000, David Gwynne wrote:
> a Two-Port MAC Relay is basically a cut down bridge(4). it only supports
> two ports, and unconditionally relays packets between those ports
> instead of doing learning or anything like that.

i had written a manpage too:

TPMR(4)  Device Drivers Manual TPMR(4)

NAME
 tpmr - IEEE 802.1Q Two-Port MAC Relay interface

SYNOPSIS
 pseudo-device tpmr

DESCRIPTION
 The tpmr driver implements an 802.1Q (originally 802.1aj) Two-Port MAC
 Relay (TPMR), otherwise known as an Ethernet cross-connect, or layer 2
 circuit.

 A TPMR is a simplified Ethernet bridge that provides a subset of the
 functionality as found in bridge(4).  A TPMR has exactly two ports, and
 unconditionally relays Ethernet packets between the two ports.

 tpmr interfaces can be created at runtime using the ifconfig tpmrN create
 command or by setting up a hostname.if(5) configuration file for
 netstart(8).  The interface itself can be configured with ifconfig(8);
 see its manual page for more information.

 tpmr interfaces may be configured with ifconfig(8) and netstart(8) using
 the following options:

 trunkport child-iface
 Add child-iface as a port.

 -trunkport child-iface
 Remove the port child-iface.

 Other forms of Ethernet bridging are available using the bridge(4)
 driver.  Other forms of aggregation of Ethernet interfaces are available
 using the aggr(4) and trunk(4) drivers.

EXAMPLES
 tpmr can be used to cross-connect Ethernet devices that support different
 physical media.  For example, a device that supports a 100baseTX half-
 duplex connection can be connected to a switch with 1000baseSX optical
 ports by using tpmr with a pair of physical network interfaces, each of
 which supports the required media types.  If fxp(4) is used to connect to
 the 100baseTX device, and em(4) is used to connect to the 1000baseSX
 switch, the following configuration can be used:

 # ifconfig tpmr0 create
 # ifconfig tpmr0 trunkport fxp0 trunkport em0
 # ifconfig fxp0 up
 # ifconfig em0 up
 # ifconfig tpmr0 up

 Multiple TPMRs can be chained to transport Ethernet traffic for a pair of
 devices over another network.  Given two physically separate Ethernet
 switches, TPMRs can be used as follows to provide a point-to-point
 Ethernet link between them.  TPMRs allow the use of the Link Aggregation
 Control Protocol (LACP) or Spanning Tree Protocol (STP) by the switches
 to detect communication failures or connectivity loops respectively,
 which is not possible using bridge(4) as it filters those protocols.

 If Host A connected to Router B has the external IP address 192.0.2.10 on
 em0, Host D connected to Router C has the external IP address
 198.51.100.14 on em0, and both hosts have em1 connected to the switches,
 the following configuration can be used to connect the switches together.
 etherip(4) is used to transport the Ethernet packets over the IP network.

 Switch X  Host A -- tunnel --- Host D  Switch E
\/
 \  /
  + Router B  Router C +

 Create the tpmr and etherip(4) interfaces:

   # ifconfig etherip0 create
   # ifconfig tpmr0 create

 Configure the etherip interface:

   (on Host A) # ifconfig etherip0 tunnel 192.0.2.10 198.51.100.14 up
   (on Host D) # ifconfig etherip0 tunnel 198.51.100.14 192.0.2.10 up

 Add the etherip interface and physical interface to the TPMR:

   # ifconfig tpmr0 trunkport em1 trunkport etherip0 up

 An equivalent setup using MPLS pseudowires instead of IP as the transport
 can be built using mpw(4) interfaces.

SEE ALSO
 aggr(4), bridge(4), trunk(4), hostname.if(5), ifconfig(8), netstart(8)

HISTORY
 The tpmr driver first appeared in OpenBSD 6.6.

OpenBSD 6.5  July 5, 2019  OpenBSD 6.5

Index: Makefile
===
RCS file: /cvs/src/share/man/man4/Makefile,v
retrieving revision 1.716
diff -u -p -r1.716 Makefile
--- Makefile5 Jul 2019 01:41:14 -   1.716
+++ Makefile30 Jul 2019 04:10:34 -
@@ -70,8 +70,8 @@ MAN=  aac.4 abcrtc.4 ac97.4 acphy.4 acrtc
st.4 ste.4 stge.4 sti.4 stp.4 sv.4 switch.4 sxiccmu.4 sximmc.4 \
sxipio.4 sxirsb.4 sxirtc.4 sxitemp.4 sxitwi.4 sym.4 sypwr.4 syscon.4 \
tcic.4 tcp.4 termios.4 tht.4 ti.4 tipmic.4 tl.4 \
-   tlphy.4 thmc.4 tpm.4 tqphy.4 trm.4 trunk.4 tsl.4 tty.4 tun.4 tap.4 \
-   twe.4 \
+   tlphy.4 thmc.4 tpm.4 tpmr.4 tqphy.4 trm.4 trunk.4 tsl.4 tty.4 \
+   tun.4 tap.4 twe.4 \
txp.4 txphy.4

tpmr(4): 802.1Q Two-Port MAC Relay

2019-07-29 Thread David Gwynne

oad from 1Mpps: 800Kpps
tpmr(4): 1.75Mpps

1.75Mpps was lower than I was expecting, but it turns out he was hitting
limits in other parts of the system. with some tuning we got it up to
2.25Mpps. the softnet taskq was only at about 66% cpu time, but we
couldn't see any other obvious places where we were dropping load.

on a slower box that can do IP forwarding at 1Mpps, tpmr(4) can do
1.6Mpps. it's worth noting that the boxes were extremely responsive (ie,
ssh feels fine) when tpmr is under load, which is not the case when ip
forwarding or bridge are being hammered.

my point is that it might be useful having tpmr(4) just to be able to
test network driver performance improvements independently of the stack.
im probably going to be using it to monitor links as a "bump in the
wire" too.

lastly regarding the code. i made this use the trunk(4) ioctls instead
of the bridge ones, mostly because i had to fake less stuff to make
ifconfig output look ok.

ifconfig output looks like this:

xdlg@dc3-bridge1:~$ ifconfig tpmr
 
tpmr0: flags=51
description: xconnect
index 15 priority 0 llprio 7
trunk: trunkproto none
ix2 port active,collecting,distributing
etherip10 port active,collecting,distributing
groups: tpmr
status: active

anyway. thoughts? ok?

Index: net/if_tpmr.c
===
RCS file: net/if_tpmr.c
diff -N net/if_tpmr.c
--- /dev/null   1 Jan 1970 00:00:00 -
+++ net/if_tpmr.c   29 Jul 2019 09:44:26 -
@@ -0,0 +1,717 @@
+/* $OpenBSD$ */
+
+/*
+ * Copyright (c) 2019 The University of Queensland
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * This code was written by David Gwynne  as part
+ * of the Information Technology Infrastructure Group (ITIG) in the
+ * Faculty of Engineering, Architecture and Information Technology
+ * (EAIT).
+ */
+
+#include "bpfilter.h"
+#include "vlan.h"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include  /* if_trunk.h uses ifmedia bits */
+#include  /* if_trunk.h uses siphash bits */
+#include 
+
+#if NBPFILTER > 0
+#include 
+#endif
+
+#if NVLAN > 0
+#include 
+#endif
+
+/*
+ * tpmr interface
+ */
+
+#define TPMR_NUM_PORTS 2
+#define TPMR_TRUNK_PROTO   TRUNK_PROTO_NONE
+
+struct tpmr_softc;
+
+struct tpmr_port {
+   struct ifnet*p_ifp0;
+
+   int (*p_ioctl)(struct ifnet *, u_long, caddr_t);
+   int (*p_output)(struct ifnet *, struct mbuf *, struct sockaddr *,
+   struct rtentry *);
+
+   void*p_lcookie;
+   void*p_dcookie;
+
+   struct tpmr_softc   *p_tpmr;
+   unsigned int p_slot;
+};
+
+struct tpmr_softc {
+   struct ifnet sc_if;
+   unsigned int sc_dead;
+
+   struct tpmr_port*sc_ports[TPMR_NUM_PORTS];
+   unsigned int sc_nports;
+};
+
+#define DPRINTF(_sc, fmt...)   do { \
+   if (ISSET((_sc)->sc_if.if_flags, IFF_DEBUG)) \
+   printf(fmt); \
+} while (0)
+
+static int tpmr_clone_create(struct if_clone *, int);
+static int tpmr_clone_destroy(struct ifnet *);
+
+static int tpmr_ioctl(struct ifnet *, u_long, caddr_t);
+static int tpmr_enqueue(struct ifnet *, struct mbuf *);
+static int tpmr_output(struct ifnet *, struct mbuf *, struct sockaddr *,
+   struct rtentry *);
+static voidtpmr_start(struct ifqueue *);
+
+static int tpmr_up(struct tpmr_softc *);
+static int tpmr_down(struct tpmr_softc *);
+static int tpmr_iff(struct tpmr_softc *);
+
+static voidtpmr_p_linkch(void *);
+static voidtpmr_p_detach(void *);
+static int tpmr_p_ioctl(struct ifnet *, u_long, caddr_t);
+static int tpmr_p_output(struct ifnet *, struct mbuf *,
+   struct sockaddr *, struct rtentry *);
+
+static int tpmr_get_trunk(struct tpmr_softc *, struct trunk_reqall *);
+static voidtpmr_p_dtor(struct tpmr_softc *, struct tpmr_port *,
+   const char *);
+static int

shrink ifconfig trunkport output a bit

2019-07-24 Thread David Gwynne

this makes ifconfig output for a trunk/aggr look a bit smaller:

this is before the change:

aggr0: flags=8943 mtu 9000
lladdr bc:2c:55:9e:34:cb
index 7 priority 0 llprio 7
trunk: trunkproto lacp
trunk id: [(8000,bc:2c:55:9e:34:cb,0007,,),
 (8000,54:7f:ee:7b:a9:c1,0003,,)]
trunkport ix0 lacp_state actor 
activity,aggregation,sync,collecting,distributing
trunkport ix0 lacp_state actor system pri 0x8000 mac 
bc:2c:55:9e:34:cb, key 0x7, port pri 0x8000 number 0x1
trunkport ix0 lacp_state partner 
activity,aggregation,sync,collecting,distributing
trunkport ix0 lacp_state partner system pri 0x8000 mac 
54:7f:ee:7b:a9:c1, key 0x3, port pri 0x8000 number 0x109
trunkport ix0 active,collecting,distributing
trunkport ix1 lacp_state actor 
activity,aggregation,sync,collecting,distributing
trunkport ix1 lacp_state actor system pri 0x8000 mac 
bc:2c:55:9e:34:cb, key 0x7, port pri 0x8000 number 0x2
trunkport ix1 lacp_state partner 
activity,aggregation,sync,collecting,distributing
trunkport ix1 lacp_state partner system pri 0x8000 mac 
54:7f:ee:7b:a9:c1, key 0x3, port pri 0x8000 number 0x10a
trunkport ix1 active,collecting,distributing
groups: aggr
media: Ethernet autoselect
status: active

this is after:

aggr0: flags=8847 mtu 1500
lladdr fe:e1:ba:d0:35:21
index 10 priority 0 llprio 7
trunk: trunkproto lacp
trunk id: [(8000,fe:e1:ba:d0:35:21,000A,,),
 (8000,00:01:e8:d7:b0:34,0034,,)]
ixl0 lacp actor system pri 0x8000 mac fe:e1:ba:d0:35:21, key 
0xa, port pri 0x8000 number 0x4
ixl0 lacp actor state aggregation,sync,collecting,distributing
ixl0 lacp partner system pri 0x8000 mac 00:01:e8:d7:b0:34, key 
0x34, port pri 0x8000 number 0xb6
ixl0 lacp partner state 
activity,timeout,aggregation,sync,collecting,distributing
ixl0 port active,collecting,distributing
ixl1 lacp actor system pri 0x8000 mac fe:e1:ba:d0:35:21, key 
0xa, port pri 0x8000 number 0x5
ixl1 lacp actor state aggregation,sync,collecting,distributing
ixl1 lacp partner system pri 0x8000 mac 00:01:e8:d7:b0:34, key 
0x34, port pri 0x8000 number 0xb2
ixl1 lacp partner state 
activity,timeout,aggregation,sync,collecting,distributing
ixl1 port active,collecting,distributing
groups: aggr
media: Ethernet autoselect
status: active

ok?

Index: ifconfig.c
===
RCS file: /cvs/src/sbin/ifconfig/ifconfig.c,v
retrieving revision 1.405
diff -u -p -r1.405 ifconfig.c
--- ifconfig.c  24 Jul 2019 01:48:53 -  1.405
+++ ifconfig.c  24 Jul 2019 06:05:09 -
@@ -4509,12 +4509,7 @@ trunk_status(void)
for (i = 0; i < ra.ra_ports; i++) {
lp = (struct lacp_opreq *)&(rpbuf[i].rp_lacpreq);
if (ra.ra_proto == TRUNK_PROTO_LACP) {
-   printf("\t\ttrunkport %s lacp_state actor ",
-   rpbuf[i].rp_portname);
-   printb_status(lp->actor_state,
-   LACP_STATE_BITS);
-   putchar('\n');
-   printf("\t\ttrunkport %s lacp_state actor "
+   printf("\t\t%s lacp actor "
"system pri 0x%x mac %s, key 0x%x, "
"port pri 0x%x number 0x%x\n",
rpbuf[i].rp_portname,
@@ -4523,12 +4518,13 @@ trunk_status(void)
 lp->actor_mac),
lp->actor_key,
lp->actor_portprio, lp->actor_portno);
-   printf("\t\ttrunkport %s lacp_state partner ",
+   printf("\t\t%s lacp actor state ",
rpbuf[i].rp_portname);
-   printb_status(lp->partner_state,
+   printb_status(lp->actor_state,
LACP_STATE_BITS);
putchar('\n');
-   printf("\t\ttrunkport %s lacp_state partner "
+
+   printf("\t\t%s lacp partner "
"system pri 0x%x mac %s, key 0x%x, "
"port pri 0x%x number 0x%x\n",
rpbuf[i].rp_portname,
@@ -4537,9 +4533,14 @@ trunk_status(void)
 lp->partner_mac),

make ifconfig print lacp actor and partner info

2019-07-20 Thread David Gwynne

this adds a couple of lines to the lacp output so you can see what
params are used on the wire. this can help if you've put switchports in
different port-channels or such. an example of that is:

aggr0: flags=8847 mtu 1500
lladdr fe:e1:ba:d0:35:21
index 10 priority 0 llprio 7
trunk: trunkproto lacp
trunk id: [(8000,fe:e1:ba:d0:35:21,000A,,),
 (8000,00:01:e8:d7:b0:34,0034,,)]
trunkport ixl0 lacp_state actor 
aggregation,sync,collecting,distributing
trunkport ixl0 lacp_state actor system pri 0x8000 mac 
fe:e1:ba:d0:35:21, key 0xa, port pri 0x8000 number 0x4
trunkport ixl0 lacp_state partner 
activity,timeout,aggregation,sync,collecting,distributing
trunkport ixl0 lacp_state partner system pri 0x8000 mac 
00:01:e8:d7:b0:34, key 0x34, port pri 0x8000 number 0xb6
trunkport ixl0 active,collecting,distributing
trunkport ixl1 lacp_state actor aggregation
trunkport ixl1 lacp_state actor system pri 0x8000 mac 
fe:e1:ba:d0:35:21, key 0xa, port pri 0x8000 number 0x5
trunkport ixl1 lacp_state partner 
activity,timeout,aggregation,sync
trunkport ixl1 lacp_state partner system pri 0x8000 mac 
00:01:e8:d7:b0:34, key 0x30, port pri 0x8000 number 0xb2
trunkport ixl1 
groups: aggr
media: Ethernet autoselect
status: active

the keys (and system mac) must match before ports will be considered
part of the same aggregation. after fixing one of the switch ports:

aggr0: flags=8847 mtu 1500
lladdr fe:e1:ba:d0:35:21
index 10 priority 0 llprio 7
trunk: trunkproto lacp
trunk id: [(8000,fe:e1:ba:d0:35:21,000A,,),
 (8000,00:01:e8:d7:b0:34,0034,,)]
trunkport ixl0 lacp_state actor 
aggregation,sync,collecting,distributing
trunkport ixl0 lacp_state actor system pri 0x8000 mac 
fe:e1:ba:d0:35:21, key 0xa, port pri 0x8000 number 0x4
trunkport ixl0 lacp_state partner 
activity,timeout,aggregation,sync,collecting,distributing
trunkport ixl0 lacp_state partner system pri 0x8000 mac 
00:01:e8:d7:b0:34, key 0x34, port pri 0x8000 number 0xb6
trunkport ixl0 active,collecting,distributing
trunkport ixl1 lacp_state actor 
aggregation,sync,collecting,distributing
trunkport ixl1 lacp_state actor system pri 0x8000 mac 
fe:e1:ba:d0:35:21, key 0xa, port pri 0x8000 number 0x5
trunkport ixl1 lacp_state partner 
activity,timeout,aggregation,sync,collecting,distributing
trunkport ixl1 lacp_state partner system pri 0x8000 mac 
00:01:e8:d7:b0:34, key 0x34, port pri 0x8000 number 0xb2
trunkport ixl1 active,collecting,distributing
groups: aggr
media: Ethernet autoselect
status: active

i do think this output could be trimmed (a lot), but that's mostly about
the flags and should be done in the future.

ok?

Index: ifconfig.c
===
RCS file: /cvs/src/sbin/ifconfig/ifconfig.c,v
retrieving revision 1.401
diff -u -p -r1.401 ifconfig.c
--- ifconfig.c  19 Apr 2019 04:24:25 -  1.401
+++ ifconfig.c  20 Jul 2019 04:54:59 -
@@ -4515,11 +4515,29 @@ trunk_status(void)
printb_status(lp->actor_state,
LACP_STATE_BITS);
putchar('\n');
+   printf("\t\ttrunkport %s lacp_state actor "
+   "system pri 0x%x mac %s, key 0x%x, "
+   "port pri 0x%x number 0x%x\n",
+   rpbuf[i].rp_portname,
+   lp->actor_prio,
+   ether_ntoa((struct ether_addr*)
+lp->actor_mac),
+   lp->actor_key,
+   lp->actor_portprio, lp->actor_portno);
printf("\t\ttrunkport %s lacp_state partner ",
rpbuf[i].rp_portname);
printb_status(lp->partner_state,
LACP_STATE_BITS);
putchar('\n');
+   printf("\t\ttrunkport %s lacp_state partner "
+   "system pri 0x%x mac %s, key 0x%x, "
+   "port pri 0x%x number 0x%x\n",
+   rpbuf[i].rp_portname,
+   lp->partner_prio,
+   ether_ntoa((struct ether_addr*)
+lp->partner_mac),
+   lp->partner_key,
+

Re: mbuf cluster limit pool wakeup

2019-07-18 Thread David Gwynne




> On 18 Jul 2019, at 7:44 am, Alexander Bluhm  wrote:
> 
> On Tue, Jul 16, 2019 at 08:58:43PM -0300, Martin Pieuchot wrote:
>> On 16/07/19(Tue) 21:35, Alexander Bluhm wrote:
>>> Hi,
>>> 
>>> When the kernel reaches the sysctl kern.maxclusters limit, operations
>>> get stuck while holding the net lock.  Increasing the limit does
>>> not help as there is no wakeup of the pools.  So run through the
>>> mbuf pool request list when the limit changes.
>> 
>> Should you call pool_wakeup() only if the new limit is greater than the
>> current one?
> 
> Additional wakeups do not hurt, especially from the cold sysctl path.
> Note that sysctl KERN_MAXCLUSTERS has a val != nmbclust check
> already.  So the value has to change.  I think it is not worth to
> add more code to nmbclust_update.
> 
>>> There seem to be more problems when recovering from mbuf shortage, but
>>> this is the most obvious one.
>> 
>> By more problems do you mean "How to give mbuf back to the pool"?
> 
> Sorry for being so vague.
> 
> dlg@ has some ideas about what could cause starvation in pools.
> But I see problems in socket, protocol and interface layer.
> 
> The listen queue is full, inetd blocks reading from another socket.
> I guess poll/select information was not correctly reported to
> userland so it got stuck.
> 
> I see half closed TCP connections hanging on loopback forever:
> 
> tcp  0  0  127.0.0.1.9  127.0.0.1.18490
> ESTABLISHED
> tcp  0  0  127.0.0.1.18490  127.0.0.1.9
> FIN_WAIT_1
> 
> And I have to do ifconfig vio0 down and ifconfig vio0 up to receive
> packets on the interface again.

Can you look at systat mb when this happens to check what value vio0 has in the 
ALIVE column? If ALIVE is 0, it means the ring is empty. It looks like the 
driver does try to cope with this situation, but maybe it doesn't cope well 
enough. Let's start with checking if it is an empty ring first.

dlg

> 
> These things have to be investigated and fixed after this diff has
> been committed.  In general we behave poorly when hitting resource
> limits.  I think nobody is testing this regularly.
> 
> bluhm
> 
>>> Index: kern/subr_pool.c
>>> ===
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/subr_pool.c,v
>>> retrieving revision 1.227
>>> diff -u -p -r1.227 subr_pool.c
>>> --- kern/subr_pool.c23 Apr 2019 13:35:12 -  1.227
>>> +++ kern/subr_pool.c16 Jul 2019 18:02:08 -
>>> @@ -815,6 +815,12 @@ pool_put(struct pool *pp, void *v)
>>> if (freeph != NULL)
>>> pool_p_free(pp, freeph);
>>> 
>>> +   pool_wakeup(pp);
>>> +}
>>> +
>>> +void
>>> +pool_wakeup(struct pool *pp)
>>> +{
>>> if (!TAILQ_EMPTY(&pp->pr_requests)) {
>>> pl_enter(pp, &pp->pr_requests_lock);
>>> pool_runqueue(pp, PR_NOWAIT);
>>> Index: kern/uipc_mbuf.c
>>> ===
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_mbuf.c,v
>>> retrieving revision 1.270
>>> diff -u -p -r1.270 uipc_mbuf.c
>>> --- kern/uipc_mbuf.c16 Jul 2019 17:39:02 -  1.270
>>> +++ kern/uipc_mbuf.c16 Jul 2019 18:04:33 -
>>> @@ -167,8 +167,6 @@ mbinit(void)
>>> 
>>> m_pool_allocator.pa_pagesz = pool_allocator_multi.pa_pagesz;
>>> 
>>> -   error = nmbclust_update(nmbclust);
>>> -   KASSERT(error == 0);
>>> mbuf_mem_alloc = 0;
>>> 
>>> #if DIAGNOSTIC
>>> @@ -196,6 +194,9 @@ mbinit(void)
>>> m_pool_init([i], mclsizes[i], 64, mclnames[i]);
>>> }
>>> 
>>> +   error = nmbclust_update(nmbclust);
>>> +   KASSERT(error == 0);
>>> +
>>> (void)mextfree_register(m_extfree_pool);
>>> KASSERT(num_extfree_fns == 1);
>>> }
>>> @@ -217,11 +218,18 @@ mbcpuinit()
>>> int
>>> nmbclust_update(long newval)
>>> {
>>> +   int i;
>>> +
>>> if (newval < 0 || newval > LONG_MAX / MCLBYTES)
>>> return ERANGE;
>>> /* update the global mbuf memory limit */
>>> nmbclust = newval;
>>> mbuf_mem_limit = nmbclust * MCLBYTES;
>>> +
>>> +   pool_wakeup(&mbpool);
>>> +   for (i = 0; i < nitems(mclsizes); i++)
>>> +   pool_wakeup(&mclpools[i]);
>>> +
>>> return 0;
>>> }
>>> 
>>> Index: sys/pool.h
>>> ===
>>> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/pool.h,v
>>> retrieving revision 1.76
>>> diff -u -p -r1.76 pool.h
>>> --- sys/pool.h  10 Feb 2019 22:45:58 -  1.76
>>> +++ sys/pool.h  16 Jul 2019 18:02:08 -
>>> @@ -271,6 +271,7 @@ voidpool_request_init(struct pool_requ
>>> void (*)(struct pool *, void *, void *), void *);
>>> voidpool_request(struct pool *, struct pool_request *);
>>> voidpool_put(struct pool *, void *);
>>> +void   pool_wakeup(struct pool *);
>>> int pool_reclaim(struct pool *);
>>> voidpool_reclaim_all(void);
>>> int

Re: ETHER_IS_BCASTADDR() / ETHER_IS_ANYADDR()

2019-07-16 Thread David Gwynne




> On 17 Jul 2019, at 09:44, Martin Pieuchot  wrote:
> 
> On 15/07/19(Mon) 15:29, Todd C. Miller wrote:
>> On Mon, 15 Jul 2019 18:21:28 -0300, Martin Pieuchot wrote:
>> 
>>> We have many home brewed ways to check Ethernet addresses.  This diff
>>> introduces two macros similar to ETHER_IS_MULTICAST() and make use of
>>> them in bridge(4) and generic Ethernet layer.
>> 
>> I think you can also replace the ether_isbcast, ether_isequal and
>> ether_cmp macros in net/if_gre.c as well.
> 
> Updated diff:
> 
> - Introduce ETHER_IS() and convert the various ether_is_* in bpe(4) and
>  gre(4) as pointed by millert@
> 
> - Use ETHER_IS_BROADCAST() for compatibility with FreeBSD as pointed by
>  emaste@
> 
> Ok?

ok.

> 
> Index: net/if_bpe.c
> ===
> RCS file: /cvs/src/sys/net/if_bpe.c,v
> retrieving revision 1.7
> diff -u -p -r1.7 if_bpe.c
> --- net/if_bpe.c  21 May 2019 10:11:10 -  1.7
> +++ net/if_bpe.c  16 Jul 2019 23:37:52 -
> @@ -144,10 +144,6 @@ static struct bpe_tree bpe_interfaces = 
> static struct rwlock bpe_lock = RWLOCK_INITIALIZER("bpeifs");
> static struct pool bpe_entry_pool;
> 
> -#define ether_cmp(_a, _b)memcmp((_a), (_b), ETHER_ADDR_LEN)
> -#define ether_is_eq(_a, _b)  (ether_cmp((_a), (_b)) == 0)
> -#define ether_is_bcast(_a)   ether_is_eq((_a), etherbroadcastaddr)
> -
> void
> bpeattach(int count)
> {
> @@ -290,7 +286,7 @@ bpe_start(struct ifnet *ifp)
> 
>   beh = mtod(m, struct ether_header *);
> 
> - if (ether_is_bcast(ceh->ether_dhost)) {
> + if (ETHER_IS_BROADCAST(ceh->ether_dhost)) {
>   memcpy(beh->ether_dhost, sc->sc_group,
>   sizeof(beh->ether_dhost));
>   } else {
> @@ -839,7 +835,7 @@ bpe_input_map(struct bpe_softc *sc, cons
>   be->be_age = time_uptime; /* only a little bit racy */
> 
>   if (be->be_type != BPE_ENTRY_DYNAMIC ||
> - ether_is_eq(ba, >be_b_da))
> + ETHER_IS_EQ(ba, >be_b_da))
>   be = NULL;
>   else
>   refcnt_take(>be_refs);
> Index: net/if_bridge.c
> ===
> RCS file: /cvs/src/sys/net/if_bridge.c,v
> retrieving revision 1.335
> diff -u -p -r1.335 if_bridge.c
> --- net/if_bridge.c   9 Jun 2019 17:42:16 -   1.335
> +++ net/if_bridge.c   16 Jul 2019 23:28:59 -
> @@ -944,10 +944,8 @@ bridgeintr_frame(struct ifnet *brifp, st
>* is not broadcast or multicast, record its address.
>*/
>   if ((bif->bif_flags & IFBIF_LEARNING) &&
> - (eh.ether_shost[0] & 1) == 0 &&
> - !(eh.ether_shost[0] == 0 && eh.ether_shost[1] == 0 &&
> - eh.ether_shost[2] == 0 && eh.ether_shost[3] == 0 &&
> - eh.ether_shost[4] == 0 && eh.ether_shost[5] == 0))
> + !ETHER_IS_MULTICAST(eh.ether_shost) &&
> + !ETHER_IS_ANYADDR(eh.ether_shost))
>   bridge_rtupdate(sc, src, src_if, 0, IFBAF_DYNAMIC, m);
> 
>   if ((bif->bif_flags & IFBIF_STP) &&
> @@ -972,8 +970,7 @@ bridgeintr_frame(struct ifnet *brifp, st
>   return;
>   }
>   } else {
> - if (memcmp(etherbroadcastaddr, eh.ether_dhost,
> - sizeof(etherbroadcastaddr)) == 0)
> + if (ETHER_IS_BROADCAST(eh.ether_dhost))
>   m->m_flags |= M_BCAST;
>   else
>   m->m_flags |= M_MCAST;
> Index: net/if_ethersubr.c
> ===
> RCS file: /cvs/src/sys/net/if_ethersubr.c,v
> retrieving revision 1.259
> diff -u -p -r1.259 if_ethersubr.c
> --- net/if_ethersubr.c20 Feb 2019 00:03:15 -  1.259
> +++ net/if_ethersubr.c16 Jul 2019 23:29:14 -
> @@ -377,8 +377,7 @@ ether_input(struct ifnet *ifp, struct mb
>   goto dropanyway;
>   }
> 
> - if (memcmp(etherbroadcastaddr, eh->ether_dhost,
> - ETHER_ADDR_LEN) == 0)
> + if (ETHER_IS_BROADCAST(eh->ether_dhost))
>   m->m_flags |= M_BCAST;
>   else
>   m->m_flags |= M_MCAST;
> Index: net/if_gre.c
> ===
> RCS file: /cvs/src/sys/net/if_gre.c,v
> retrieving revision 1.150
> diff -u -p -r1.150 if_gre.c
> --- net/if_gre.c  23 Apr 2019 11:48:55 -  1.150
> +++ net/if_gre.c  16 Jul 2019 23:33:25 -
> @@ -353,9 +353,6 @@ struct mgre_tree mgre_tree = RBT_INITIAL
> /*
>  * Ethernet GRE tunnels
>  */
> -#define ether_cmp(_a, _b)memcmp((_a), (_b), ETHER_ADDR_LEN)
> -#define ether_isequal(_a, _b)(ether_cmp((_a), (_b)) == 0)
> -#define ether_isbcast(_e)ether_isequal((_e), etherbroadcastaddr)
> 
> static struct mbuf *
>   gre_ether_align(struct mbuf *, int);
> @@

rfc: let etherip(4) depend on another interface for link state

2019-07-16 Thread David Gwynne

this is not a request for oks, this is me backing up a hack i did to fix
a problem i needed a quick solution for.

we're in the process of moving a bunch of boxes to a new site, and are
lucky that we can take our address space with us. to let us gradually
migrate hosts i want to stretch the layer 2 network between sites, but
i only have ip connectivity between them. therefore i'm bridging vlans
over etherip.

i also care about having some redundancy during this process (which is
going to take weeks at a minimum), so i have a pair of boxes at each
site act as endpoints to the bridges. to keep it simple im using carp
on each end to elect the active bridge endpoint on each side. this means
only one of a pair rxes the etherip packets from the remote site
to be forwarded into the DC the pair sit in.

unfortunately i discovered that carp has no influence on sending etherip
to the remote DC. once i had both pairs of bridges set up i managed
to introduce a loop at layer 2 which DoSsed both DCs. in more detail,
say I have DCs A and B, and each site has 2 bridge boxes that are called
A0, A1, B0, and B1. A0 and B0 are the carp masters. say A0 rxes a
broadcast packet from a vlan on its physical interface. it will forward
that to B0, which transmits it to its physical interface. B1 will rx
the broadcast packet and send it to DC A via A0. A0 pushes the broadcast
packet to its physical interface, which is rxed by B1. B1 pushes it to
A0, and so on.

my quick and dirty hack is to make transmission over the etherip leg of
a bridge depend on the state of the carp interface. this is implemented
by hacking up etherip so it supports the configuration of a parent
interface. etherip then watches whether the parent is running and has
link.

so i have the following config in DC A:

xdlg@dca-bridge0 ~$ sudo cat /etc/hostname.carp423
carpdev vlan423
vhid 23 pass secret
inet 172.23.84.113 255.255.255.248 NONE
xdlg@dca-bridge0 ~$ sudo cat /etc/hostname.etherip0
tunnel 172.23.84.113 172.23.84.121
parent carp423
up

this sits underneath this:

xdlg@dca-bridge0 ~$ for i in vlan374 vlan10374 bridge374; do ifconfig $i; done 
vlan374: flags=8943 mtu 9000
lladdr 00:50:56:a1:d7:f7
description: labs-servers
index 29 priority 0 llprio 3
encap: vnetid 374 parent em0 txprio packet rxprio outer
groups: vlan overlay
media: Ethernet autoselect (1000baseT full-duplex,master)
status: active
vlan10374: flags=8943 mtu 9000
lladdr fe:e1:ba:d0:3f:da
description: labs-servers
index 30 priority 0 llprio 3
encap: vnetid 374 parent etherip0 txprio packet rxprio outer
groups: vlan overlay
media: Ethernet autoselect
status: active
bridge374: flags=41
description: labs-servers
index 31 llprio 3
groups: bridge
priority 32768 hellotime 2 fwddelay 15 maxage 20 holdcnt 6 proto rstp
designated: id 00:00:00:00:00:00 priority 0
vlan10374 flags=2007
port 30 ifpriority 0 ifcost 0
vlan374 flags=2007
port 29 ifpriority 0 ifcost 0
Addresses (max cache: 100, timeout: 240):

systat if on dca-bridge0 has this:

IFACE STATE DESC IPKTS IBYTES IFAILS  OPKTS OBYTES OFAILS  COLLS
em0   up:U1505 276472  0  7   1066  0  0
enc0  dn:U   0  0  0  0  0  0  0
lo0   up 0  0  0  0  0  0  0
carp423   up:D   4252  0  0  0  0  0
etherip0  up:D   0  0  0 10914 10  0
vlan374   up:U  labs-servers 0  0  0  0  0  0  0
vlan10374 up:D  labs-servers 0  0  0  0  0  0  0
bridge374 uplabs-servers 0  0  0  0  0  0  0

and dca-bridge1:

IFACE STATE DESC IPKTS IBYTES IFAILS  OPKTS OBYTES OFAILS  COLLS
em0   up:U 841 140375  0 45   6265  0  0
enc0  dn:U   0  0  0  0  0  0  0
lo0   up 0  0  0  0  0  0  0
carp423   up:U   2126  1  3126  0  0
etherip0  up:U   8   1242  0 27   2751  0  0
vlan374   up:U  labs-servers20   4755  0  8   3861  0  0
vlan10374 up:U  labs-servers10   3981  0 12   3499  0  0
bridge374 uplabs-servers26   8496  0 20   7360  0  0

is there something obvious im missing here? how else do i make the
backup bridge not transmit to the other DC?

Index: if_etherip.c
===
RCS file: /cvs/src/sys/net/if_etherip.c,v
retrieving revision 1.45
diff -u -p -r1.45 if_etherip.c
--- if_etherip.c

aggr(4): a new LACP driver

2019-07-03 Thread David Gwynne

nd this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * This driver implements 802.1AX Link Aggregation (formerly 802.3ad)
+ *
+ * The specification describes systems with multiple ports that
+ * can dynamically form aggregations. The relationship between ports
+ * and aggregations is such that arbitrary ports connected to ports
+ * on other systems may move between aggregations, and there can be
+ * as many aggregations as ports. An aggregation in this model is
+ * effectively an interface, and becomes the point that Ethernet traffic
+ * enters and leaves the system. The spec also contains a description
+ * of the Link Aggregation Control Protocol (LACP) for use on the wire,
+ * and how to process it and select ports and aggregations based on
+ * it.
+ * 
+ * This driver implements a simplified or constrained model where each
+ * aggr(4) interface is effectively an independent system, and will
+ * only support one aggregation. This supports the use of the kernel
+ * interface as a static entity that is created and configured once,
+ * and has the link "come up" when that one aggregation is selected
+ * by the LACP protocol.
+ */
+
+/*
+ * This code was written by David Gwynne  as part
+ * of the Information Technology Infrastructure Group (ITIG) in the
+ * Faculty of Engineering, Architecture and Information Technology
+ * (EAIT).
+ */
+
+/*
+ * TODO:
+ *
+ * - add locking
+ * - figure out the Ready_N and Ready logic
+ */
+
+#include "bpfilter.h"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+#include 
+
+#include  /* if_trunk.h uses siphash bits */
+#include 
+
+#if NBPFILTER > 0
+#include 
+#endif
+
+/*
+ * Link Aggregation Control Protocol (LACP)
+ */
+
+struct ether_slowproto_hdr {
+   uint8_t sph_subtype;
+   uint8_t sph_version;
+} __packed;
+
+#define SLOWPROTOCOLS_SUBTYPE_LACP 1
+#define SLOWPROTOCOLS_SUBTYPE_LACP_MARKER \
+   2
+
+#define LACP_VERSION   1
+
+#define LACP_FAST_PERIODIC_TIME1
+#define LACP_SLOW_PERIODIC_TIME30
+#define LACP_TIMEOUT_FACTOR3
+#define LACP_AGGREGATION_WAIT_TIME 2
+
+#define LACP_TX_MACHINE_RATE   3 /* per LACP_FAST_PERIODIC_TIME */
+
+#define LACP_ADDR_C_BRIDGE { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }
+#define LACP_ADDR_SLOW { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x02 }
+#define LACP_ADDR_NON_TPMR_BRIDGE  { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x03 }
+
+struct lacp_tlv_hdr {
+   uint8_t lacp_tlv_type;
+   uint8_t lacp_tlv_length;
+} __packed __aligned(2);
+
+/* LACP TLV types */
+
+#define LACP_T_TERMINATOR  0x00
+#define LACP_T_ACTOR   0x01
+#define LACP_T_PARTNER 0x02
+#define LACP_T_COLLECTOR   0x03
+
+/* LACPv2 TLV types */
+
+#define LACP_T_PORT_ALGORITHM  0x04
+#define LACP_T_PORT_CONVERSATION_ID_DIGEST \
+   0x05
+#define LACP_T_PORT_CONVERSATION_MASK  0x06
+#define LACP_T_PORT_CONVERSATION_SERVICE_MAPPING \
+   0x0a
+
+struct lacp_sysid {
+   uint16_tlacp_sysid_priority;
+   uint8_t lacp_sysid_mac[ETHER_ADDR_LEN];
+} __packed __aligned(2);
+
+struct lacp_portid {
+   uint16_tlacp_portid_priority;
+   uint16_tlacp_portid_number;
+} __packed __aligned(2);
+
+struct lacp_port_info {
+   struct lacp_sysid   lacp_sysid;
+   uint16_tlacp_key;
+   struct lacp_portid  lacp_portid;
+   uint8_t lacp_state;
+   uint8_t lacp_reserved[3];
+} __packed __aligned(2);
+
+#define LACP_STATE_ACTIVITY(1 << 0)
+#define LACP_STATE_TIMEOUT (1 << 1)
+#define LACP_STATE_AGGREGATION (1 << 2)
+#define LACP_STATE_SYNC(1 << 3)
+#define LACP_STATE_COLLECTING  (1 << 4)
+#define LACP_STATE_DISTRIBUTING(1 << 5)
+#define LACP_STATE_DEFAULTED   (1 << 6)
+#define LACP_STATE_EXPIRED (1 << 7)
+
+struct lacp_collector_info {
+   uint16_t

move to interface rx queue backpressure for if_rxr livelock detection

2019-06-30 Thread David Gwynne

interface rx queue processing includes detection of when the stack
becomes too busy to process packets.

there's three stages to this mechanism. firstly, everything is fine
and the packets are simply queued for processing. the second is the
"pressure_return" stage where the interface has queued a few times,
but the stack hasn't run to process them. ifiq_input returns 1 in
this situation to notify the nic that it should start to slow down.
the last stage is the "pressure_drop" stage where the nic has
continued to queue packets and the stack still hasnt run. in this
stage it drops the packets and returns 1.

independently, the stack looks for lost clock ticks (because the stack
traditionally blocked softclock ticks) as a livelock detection
mechanism. this no longer works that well now we're in an MP world.
firstly, the stack could be running on a different cpu to the clock and
therefore wont block it. secondly, the stack runs in a thread and doesnt
raise the spl, so it shouldnt be blocking clock interrupts even if it is
sharing a cpu now.

therefore the traditional livelock detection mechanism doesnt work and
should be moved away from. the replacement is getting nics that
implement rx ring moderation to look at the return value of the rx queue
input function and telling the rings to slow down. that is what this
diff does.

i've compiled it on amd64, which covers most of the drivers, but there's
a few in fdt that i did blind and havent tested. ive tested a couple of
the interfaces, but more testing would be appreciated.

ok?

Index: ./dev/fdt/if_dwxe.c
===
RCS file: /cvs/src/sys/dev/fdt/if_dwxe.c,v
retrieving revision 1.11
diff -u -p -r1.11 if_dwxe.c
--- ./dev/fdt/if_dwxe.c 3 Jan 2019 00:59:58 -   1.11
+++ ./dev/fdt/if_dwxe.c 1 Jul 2019 00:26:07 -
@@ -962,13 +962,14 @@ dwxe_rx_proc(struct dwxe_softc *sc)
sc->sc_rx_cons++;
}
 
+   if (ifiq_input(>if_rcv, ))
+   if_rxr_livelocked(>sc_rx_ring);
+
dwxe_fill_rx_ring(sc);
 
bus_dmamap_sync(sc->sc_dmat, DWXE_DMA_MAP(sc->sc_rxring), 0,
DWXE_DMA_LEN(sc->sc_rxring),
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
-
-   if_input(ifp, );
 }
 
 void
Index: ./dev/fdt/if_fec.c
===
RCS file: /cvs/src/sys/dev/fdt/if_fec.c,v
retrieving revision 1.8
diff -u -p -r1.8 if_fec.c
--- ./dev/fdt/if_fec.c  6 Feb 2019 22:59:06 -   1.8
+++ ./dev/fdt/if_fec.c  1 Jul 2019 00:26:07 -
@@ -1123,6 +1123,9 @@ fec_rx_proc(struct fec_softc *sc)
sc->sc_rx_cons++;
}
 
+   if (ifiq_input(>if_rcv, ))
+   if_rxr_livelocked(>sc_rx_ring);
+
fec_fill_rx_ring(sc);
 
bus_dmamap_sync(sc->sc_dmat, ENET_DMA_MAP(sc->sc_rxring), 0,
@@ -1131,8 +1134,6 @@ fec_rx_proc(struct fec_softc *sc)
 
/* rx descriptors are ready */
HWRITE4(sc, ENET_RDAR, ENET_RDAR_RDAR);
-
-   if_input(ifp, );
 }
 
 void
Index: ./dev/fdt/if_mvneta.c
===
RCS file: /cvs/src/sys/dev/fdt/if_mvneta.c,v
retrieving revision 1.7
diff -u -p -r1.7 if_mvneta.c
--- ./dev/fdt/if_mvneta.c   30 Apr 2019 20:26:02 -  1.7
+++ ./dev/fdt/if_mvneta.c   1 Jul 2019 00:26:07 -
@@ -1369,9 +1369,10 @@ mvneta_rx_proc(struct mvneta_softc *sc)
sc->sc_rx_cons = MVNETA_RX_RING_NEXT(idx);
}
 
-   mvneta_fill_rx_ring(sc);
+   if (ifiq_input(>if_rcv, ))
+   if_rxr_livelocked(>sc_rx_ring);
 
-   if_input(ifp, );
+   mvneta_fill_rx_ring(sc);
 }
 
 void
Index: ./dev/pci/if_em.c
===
RCS file: /cvs/src/sys/dev/pci/if_em.c,v
retrieving revision 1.342
diff -u -p -r1.342 if_em.c
--- ./dev/pci/if_em.c   1 Mar 2019 10:02:44 -   1.342
+++ ./dev/pci/if_em.c   1 Jul 2019 00:26:07 -
@@ -2922,7 +2922,8 @@ em_rxeof(struct em_softc *sc)
 
sc->sc_rx_desc_tail = i;
 
-   if_input(ifp, );
+   if (ifiq_input(>if_rcv, ))
+   if_rxr_livelocked(>sc_rx_ring);
 
return (rv);
 }
Index: ./dev/pci/if_bge.c
===
RCS file: /cvs/src/sys/dev/pci/if_bge.c,v
retrieving revision 1.388
diff -u -p -r1.388 if_bge.c
--- ./dev/pci/if_bge.c  9 Nov 2018 14:14:31 -   1.388
+++ ./dev/pci/if_bge.c  1 Jul 2019 00:26:07 -
@@ -3561,6 +3561,13 @@ bge_rxeof(struct bge_softc *sc)
ml_enqueue(, m);
}
 
+   if (ifiq_input(>if_rcv, )) {
+   if (stdcnt)
+   if_rxr_livelocked(>bge_std_ring);
+   if (jumbocnt)
+   if_rxr_livelocked(>bge_jumbo_ring);
+   }
+
sc->bge_rx_saved_considx = rx_cons;
bge_writembx(sc, BGE_MBX_RX_CONS0_LO, sc->bge_rx_saved_considx);
if (stdcnt) {
@@

let sysctl tweak interface rx queue backpressure cutoffs

2019-06-30 Thread David Gwynne

this lets you read and write the backpressure variables in the interface rx
queue (ifiq) handling:

dlg@ix ~$ sysctl net.link.ifrxq
net.link.ifrxq.pressure_return=6
net.link.ifrxq.pressure_drop=8

ideally this would be temporary, ie, id remove it from the tree once
everyone's happy with these numbers.  6 and 8 are the least worst
values ive come up with so far, but it is handy to move them around
for testing purposes.

Index: sys/net/ifq.c
===
RCS file: /cvs/src/sys/net/ifq.c,v
retrieving revision 1.32
diff -u -p -r1.32 ifq.c
--- sys/net/ifq.c   1 Jul 2019 00:44:29 -   1.32
+++ sys/net/ifq.c   1 Jul 2019 00:48:30 -
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -605,6 +606,43 @@ ifiq_process(void *arg)
mtx_leave(>ifiq_mtx);
 
if_input_process(ifiq->ifiq_if, );
+}
+
+int
+net_ifiq_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, 
+void *newp, size_t newlen)
+{
+   int val;
+   int error;
+
+   if (namelen != 1)
+   return (EISDIR);
+
+   switch (name[0]) {
+   case NET_LINK_IFRXQ_PRESSURE_RETURN:
+   val = ifiq_pressure_return;
+   error = sysctl_int(oldp, oldlenp, newp, newlen, );
+   if (error != 0)
+   return (error);
+   if (val < 1 || val > ifiq_pressure_drop)
+   return (EINVAL);
+   ifiq_pressure_return = val;
+   break;
+   case NET_LINK_IFRXQ_PRESSURE_DROP:
+   val = ifiq_pressure_drop;
+   error = sysctl_int(oldp, oldlenp, newp, newlen, );
+   if (error != 0)
+   return (error);
+   if (ifiq_pressure_return > val)
+   return (EINVAL);
+   ifiq_pressure_drop = val;
+   break;
+   default:
+   error = EOPNOTSUPP;
+   break;
+   }
+
+   return (error);
 }
 
 /*
Index: sbin/sysctl/sysctl.c
===
RCS file: /cvs/src/sbin/sysctl/sysctl.c,v
retrieving revision 1.241
diff -u -p -r1.241 sysctl.c
--- sbin/sysctl/sysctl.c21 Feb 2019 16:37:13 -  1.241
+++ sbin/sysctl/sysctl.c1 Jul 2019 00:48:30 -
@@ -192,6 +192,7 @@ void usage(void);
 int findname(char *, char *, char **, struct list *);
 int sysctl_inet(char *, char **, int *, int, int *);
 int sysctl_inet6(char *, char **, int *, int, int *);
+int sysctl_link(char *, char **, int *, int, int *);
 int sysctl_bpf(char *, char **, int *, int, int *);
 int sysctl_mpls(char *, char **, int *, int, int *);
 int sysctl_pipex(char *, char **, int *, int, int *);
@@ -647,6 +648,12 @@ parse(char *string, int flags)
}
break;
}
+   if (mib[1] == PF_LINK) {
+   len = sysctl_link(string, , mib, flags, );
+   if (len < 0)
+   return;
+   break;
+   }
if (mib[1] == PF_BPF) {
len = sysctl_bpf(string, , mib, flags, );
if (len < 0)
@@ -2230,6 +2237,46 @@ sysctl_inet6(char *string, char **bufpp,
*typep = lp->list[tindx].ctl_type;
return(5);
}
+   return (4);
+}
+
+/* handle net.link requests */
+struct ctlname netlinkname[] = CTL_NET_LINK_NAMES;
+struct ctlname ifrxqname[] = CTL_NET_LINK_IFRXQ_NAMES;
+struct list netlinklist = { netlinkname, NET_LINK_MAXID };
+struct list netlinkvars[] = {
+   [NET_LINK_IFRXQ] = { ifrxqname, NET_LINK_IFRXQ_MAXID },
+};
+
+int
+sysctl_link(char *string, char **bufpp, int mib[], int flags, int *typep)
+{
+   struct list *lp;
+   int indx;
+
+   if (*bufpp == NULL) {
+   listall(string, );
+   return (-1);
+   }
+   if ((indx = findname(string, "third", bufpp, )) == -1)
+   return (-1);
+   mib[2] = indx;
+   if (indx < NET_LINK_MAXID && netlinkvars[indx].list != NULL)
+   lp = [indx];
+   else if (!flags)
+   return (-1);
+   else {
+   warnx("%s: no variables defined for this protocol", string);
+   return (-1);
+   }
+   if (*bufpp == NULL) {
+   listall(string, lp);
+   return (-1);
+   }
+   if ((indx = findname(string, "fourth", bufpp, lp)) == -1)
+   return (-1);
+   mib[3] = indx;
+   *typep = lp->list[indx].ctl_type;
return (4);
 }
 
Index: sys/sys/sysctl.h
===
RCS file: /cvs/src/sys/sys/sysctl.h,v
retrieving revision 1.188
diff -u -p -r1.188 sysctl.h
--- sys/sys/sysctl.h1 Jun 2019 14:11:18 -   1.188
+++ sys/sys/sysctl.h1 Jul 2019 00:48:31 -
@@ -1012,6

Re: another go at bypass support for sparc64 iommu and BUS_DMA_64BIT

2019-06-19 Thread David Gwynne

I'm mostly concerned that nothing that currently works breaks. This changes a 
fairly fundamental chunk of code in the guts of the platform, so having 
machines still work afterward would be nice.

Glad your T5220 works, hopefully the u45 will be ok too.

dlg

> On 20 Jun 2019, at 08:52, Kaashif Hymabaccus  wrote:
> 
> This is great, thanks for the work. I have the new snapshot running on
> a T5220. I also have an Ultra 45 with some GPUs I use mainly for
> playing games and sending people bug reports about how their game
> doesn't work on sparc64, I'll try it there also. If there are any
> specific tests you want done, I am happy to do them. Here is the dmesg
> of the T5220 if there is anything that interests you:
> 
> console is /virtual-devices@100/console@1
> Copyright (c) 1982, 1986, 1989, 1991, 1993
>   The Regents of the University of California.  All rights reserved.
> Copyright (c) 1995-2019 OpenBSD. All rights reserved.  https://www.OpenBSD.org
> 
> OpenBSD 6.5-current (GENERIC.MP) #207: Tue Jun 18 13:15:53 MDT 2019
>dera...@sparc64.openbsd.org:/usr/src/sys/arch/sparc64/compile/GENERIC.MP
> real mem = 17045651456 (16256MB)
> avail mem = 16721928192 (15947MB)
> mpath0 at root
> scsibus0 at mpath0: 256 targets
> mainbus0 at root: SPARC Enterprise T5220
> cpu0 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu1 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu2 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu3 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu4 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu5 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu6 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu7 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu8 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu9 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu10 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu11 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu12 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu13 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu14 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu15 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu16 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu17 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu18 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu19 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu20 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu21 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu22 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu23 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu24 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu25 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu26 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu27 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu28 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu29 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu30 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu31 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu32 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu33 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu34 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu35 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu36 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu37 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu38 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu39 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu40 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu41 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu42 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu43 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu44 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu45 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu46 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu47 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu48 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu49 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu50 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu51 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu52 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu53 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu54 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu55 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu56 at mainbus0: SUNW,UltraSPARC-T2 (rev 0.0) @ 1165.379 MHz
> cpu57 at

try interface rx backpressure again

2019-06-12 Thread David Gwynne

i backed this out before the 6.5 release because of bad interactions
with virtual interfaces like vlan and trunk. those should now be fixed,
so we can try the other backpressure mechanism again.

the summary is that we count the number of attempts to queue packets for
the system to process rather than the number of packets. this should
result in a more responsive system by better limiting the work a network
taskq does per interface.

this interacted badly with things like vlan because they work on a
packet at a time. however, they directly dispatch processing now, so
there are no operations for them to count and get backpressure for.

ok? even if it's to throw it in the tree to be backed out when we hit a
problem again.

Index: ifq.c
===
RCS file: /cvs/src/sys/net/ifq.c,v
retrieving revision 1.31
diff -u -p -r1.31 ifq.c
--- ifq.c   16 Apr 2019 04:04:19 -  1.31
+++ ifq.c   12 Jun 2019 01:19:51 -
@@ -498,8 +498,8 @@ ifiq_destroy(struct ifiqueue *ifiq)
ml_purge(&ifiq->ifiq_ml);
 }
 
-unsigned int ifiq_maxlen_drop = 2048 * 5;
-unsigned int ifiq_maxlen_return = 2048 * 3;
+unsigned int ifiq_pressure_drop = 16;
+unsigned int ifiq_pressure_return = 2;
 
 int
 ifiq_input(struct ifiqueue *ifiq, struct mbuf_list *ml)
@@ -508,7 +508,7 @@ ifiq_input(struct ifiqueue *ifiq, struct
struct mbuf *m;
uint64_t packets;
uint64_t bytes = 0;
-   unsigned int len;
+   unsigned int pressure;
 #if NBPFILTER > 0
caddr_t if_bpf;
 #endif
@@ -552,8 +552,8 @@ ifiq_input(struct ifiqueue *ifiq, struct
ifiq->ifiq_packets += packets;
ifiq->ifiq_bytes += bytes;
 
-   len = ml_len(&ifiq->ifiq_ml);
-   if (len > ifiq_maxlen_drop)
+   pressure = ++ifiq->ifiq_pressure;
+   if (pressure > ifiq_pressure_drop)
ifiq->ifiq_qdrops += ml_len(ml);
else
ml_enlist(&ifiq->ifiq_ml, ml);
@@ -564,7 +564,7 @@ ifiq_input(struct ifiqueue *ifiq, struct
else
ml_purge(ml);
 
-   return (len > ifiq_maxlen_return);
+   return (pressure > ifiq_pressure_return);
 }
 
 void
@@ -599,6 +599,7 @@ ifiq_process(void *arg)
return;
 
mtx_enter(&ifiq->ifiq_mtx);
+   ifiq->ifiq_pressure = 0;
ml = ifiq->ifiq_ml;
ml_init(&ifiq->ifiq_ml);
mtx_leave(&ifiq->ifiq_mtx);

Re: trunk(4) shouldn't need to play with a port's if_type

2019-06-11 Thread David Gwynne

I get that trunk ports should not be able to be added to bridges or have carp 
interfaces hanging off them, but I think the value of making the if_type 
immutable outweighs this usability feature. Especially when you consider that 
you can have an interface that is already a member of a bridge (or have the 
other things on it) and then add it to trunk, and the system thinks that it is 
fine.

It gets more confusing when you think about whether things like vlan or bpe are 
"service delimiting" or not, and how they're supposed to interact with trunk. 
If trunk is enabled on a physical interface, should vlan be allowed to coexist 
with it? If the trunk is doing LACP, you could argue no, but if it's doing 
failover or broadcast then maybe you want that to operate independently for 
different vlans and the "native" vlan on the one physical interface.

If we ever want to support "independent" LACP trunk port operation like a 
variety of switches do now, then it makes sense to maintain IFT_ETHER too.

dlg

> 
> On 11 Jun 2019, at 17:32, Reyk Floeter  wrote:
> 
> Hi,
> 
> the initial intention was to differentiate a trunk port from a regular 
> Ethernet interface.
> 
> As long as an interface is a member of a trunk, it is not a fully featured 
> Ethernet interface. The changed type prevented from using it elsewhere.
> 
> I‘m not so familiar with the current network stack anymore, so maybe there 
> are other ways to do it these days, but you should test that a trunk port 
> cannot be attached to a bridge or carp or anything like this.
> 
> You also forgot a comment that mentions the type as well.
> 
> Reyk
> 
>> Am 11.06.2019 um 08:33 schrieb David Gwynne :
>> 
>> i think trunk(4) is the only thing left in the kernel that modifies an
>> interfaces if_type at runtime. this diff removes that fiddling, so
>> hopefully we can say that if_type is immutable after this.
>> 
>> however, while this diff reads well to me, i don't actually know if it
>> works. could someone kick the tyres for me?
>> 
>> cheers,
>> dlg
>> 
>> Index: if_trunk.c
>> ===
>> RCS file: /cvs/src/sys/net/if_trunk.c,v
>> retrieving revision 1.140
>> diff -u -p -r1.140 if_trunk.c
>> --- if_trunk.c11 May 2019 18:10:45 -1.140
>> +++ if_trunk.c11 Jun 2019 06:31:29 -
>> @@ -330,10 +330,7 @@ trunk_port_create(struct trunk_softc *tr
>>   }
>>   }
>> 
>> -/* Change the interface type */
>> -tp->tp_iftype = ifp->if_type;
>> -ifp->if_type = IFT_IEEE8023ADLAG;
>> -
>> +/* Change the interface methods */
>>   tp->tp_ioctl = ifp->if_ioctl;
>>   ifp->if_ioctl = trunk_port_ioctl;
>> 
>> @@ -422,9 +419,7 @@ trunk_port_destroy(struct trunk_port *tp
>>   if (tr->tr_port_destroy != NULL)
>>   (*tr->tr_port_destroy)(tp);
>> 
>> -/* Restore interface type. */
>> -ifp->if_type = tp->tp_iftype;
>> -
>> +/* Restore interface methods. */
>>   ifp->if_ioctl = tp->tp_ioctl;
>>   ifp->if_output = tp->tp_output;
>> 
>> @@ -474,8 +469,7 @@ trunk_port_ioctl(struct ifnet *ifp, u_lo
>>   int error = 0;
>> 
>>   /* Should be checked by the caller */
>> -if (ifp->if_type != IFT_IEEE8023ADLAG ||
>> -(tp = trunk_port_get(NULL, ifp)) == NULL ||
>> +if ((tp = trunk_port_get(NULL, ifp)) == NULL ||
>>   (tr = (struct trunk_softc *)tp->tp_trunk) == NULL) {
>>   error = EINVAL;
>>   goto fallback;
>> @@ -521,8 +515,7 @@ trunk_port_output(struct ifnet *ifp, str
>>struct rtentry *rt)
>> {
>>   /* restrict transmission on trunk members to bpf only */
>> -if (ifp->if_type == IFT_IEEE8023ADLAG &&
>> -(m_tag_find(m, PACKET_TAG_DLT, NULL) == NULL)) {
>> +if ((m_tag_find(m, PACKET_TAG_DLT, NULL) == NULL)) {
>>   m_freem(m);
>>   return (EBUSY);
>>   }
>> @@ -1123,10 +1116,6 @@ trunk_input(struct ifnet *ifp, struct mb
>>   eh = mtod(m, struct ether_header *);
>>   if (ETHER_IS_MULTICAST(eh->ether_dhost))
>>   ifp->if_imcasts++;
>> -
>> -/* Should be checked by the caller */
>> -if (ifp->if_type != IFT_IEEE8023ADLAG)
>> -goto bad;
>> 
>>   tp = (struct trunk_port *)cookie;
>>   if ((tr = (struct trunk_softc *)tp->tp_trunk) == NULL)
>> 
>

trunk(4) shouldn't need to play with a port's if_type

2019-06-11 Thread David Gwynne

i think trunk(4) is the only thing left in the kernel that modifies an
interface's if_type at runtime. this diff removes that fiddling, so
hopefully we can say that if_type is immutable after this.

however, while this diff reads well to me, i don't actually know if it
works. could someone kick the tyres for me?

cheers,
dlg

Index: if_trunk.c
===
RCS file: /cvs/src/sys/net/if_trunk.c,v
retrieving revision 1.140
diff -u -p -r1.140 if_trunk.c
--- if_trunk.c  11 May 2019 18:10:45 -  1.140
+++ if_trunk.c  11 Jun 2019 06:31:29 -
@@ -330,10 +330,7 @@ trunk_port_create(struct trunk_softc *tr
}
}
 
-   /* Change the interface type */
-   tp->tp_iftype = ifp->if_type;
-   ifp->if_type = IFT_IEEE8023ADLAG;
-
+   /* Change the interface methods */
tp->tp_ioctl = ifp->if_ioctl;
ifp->if_ioctl = trunk_port_ioctl;
 
@@ -422,9 +419,7 @@ trunk_port_destroy(struct trunk_port *tp
if (tr->tr_port_destroy != NULL)
(*tr->tr_port_destroy)(tp);
 
-   /* Restore interface type. */
-   ifp->if_type = tp->tp_iftype;
-
+   /* Restore interface methods. */
ifp->if_ioctl = tp->tp_ioctl;
ifp->if_output = tp->tp_output;
 
@@ -474,8 +469,7 @@ trunk_port_ioctl(struct ifnet *ifp, u_lo
int error = 0;
 
/* Should be checked by the caller */
-   if (ifp->if_type != IFT_IEEE8023ADLAG ||
-   (tp = trunk_port_get(NULL, ifp)) == NULL ||
+   if ((tp = trunk_port_get(NULL, ifp)) == NULL ||
(tr = (struct trunk_softc *)tp->tp_trunk) == NULL) {
error = EINVAL;
goto fallback;
@@ -521,8 +515,7 @@ trunk_port_output(struct ifnet *ifp, str
 struct rtentry *rt)
 {
/* restrict transmission on trunk members to bpf only */
-   if (ifp->if_type == IFT_IEEE8023ADLAG &&
-   (m_tag_find(m, PACKET_TAG_DLT, NULL) == NULL)) {
+   if ((m_tag_find(m, PACKET_TAG_DLT, NULL) == NULL)) {
m_freem(m);
return (EBUSY);
}
@@ -1123,10 +1116,6 @@ trunk_input(struct ifnet *ifp, struct mb
eh = mtod(m, struct ether_header *);
if (ETHER_IS_MULTICAST(eh->ether_dhost))
ifp->if_imcasts++;
-
-   /* Should be checked by the caller */
-   if (ifp->if_type != IFT_IEEE8023ADLAG)
-   goto bad;
 
tp = (struct trunk_port *)cookie;
if ((tr = (struct trunk_softc *)tp->tp_trunk) == NULL)

another go at bypass support for sparc64 iommu and BUS_DMA_64BIT

2019-06-10 Thread David Gwynne

this is a reposting of the diff i sent out a while back. it lets sparc64
enable iommu bypass, and then uses that bypass support for BUS_DMA_64BIT
dmamaps.

the main benefit is around performance. without this diff on an M4000,
tcpbench can do about 70Mbps before the system is CPU bound. all that
cpu time is in the iommu tte management. with this diff and an
appropriate network driver, tcpbench does 940 Mbps and the box is mostly
idle. that is a 1300 percent improvement in network transfer speed. if
you factor the cpu time benefits it would be another order of magnitude
improvement.

kernel build times are slightly improved with this diff on mpii. system
time is about 5 percent less, but there's also a lot less I/O when
you're mostly running a compiler.

i hit an edge case with my previous diff that i believe is fixed with
this one. this one properly checks boundaries, segment sizes, and number
of segments for things like mbufs. i got lucky previously with the
drivers i was using.

Index: dev/iommu.c
===
RCS file: /cvs/src/sys/arch/sparc64/dev/iommu.c,v
retrieving revision 1.75
diff -u -p -r1.75 iommu.c
--- dev/iommu.c 25 May 2017 03:19:39 -  1.75
+++ dev/iommu.c 11 Jun 2019 01:37:25 -
@@ -87,6 +87,8 @@ void iommu_dvmamap_print_map(bus_dma_tag
 bus_dmamap_t);
 int iommu_dvmamap_append_range(bus_dma_tag_t, bus_dmamap_t, paddr_t,
 bus_size_t, int, bus_size_t);
+int iommu_dvmamap_insert(bus_dma_tag_t, bus_dmamap_t, bus_addr_t,
+bus_size_t, int, bus_size_t);
 int64_t iommu_tsb_entry(struct iommu_state *, bus_addr_t);
 void strbuf_reset(struct strbuf_ctl *);
 int iommu_iomap_insert_page(struct iommu_map_state *, paddr_t);
@@ -100,6 +102,25 @@ void iommu_iomap_clear_pages(struct iomm
 void _iommu_dvmamap_sync(bus_dma_tag_t, bus_dma_tag_t, bus_dmamap_t,
 bus_addr_t, bus_size_t, int);
 
+void iommu_hw_enable(struct iommu_state *);
+
+const struct iommu_hw iommu_hw_default = {
+   .ihw_enable = iommu_hw_enable,
+
+   .ihw_dvma_pa= IOTTE_PAMASK,
+
+   .ihw_bypass = 0x3fffUL << 50,
+   .ihw_bypass_nc  = 0,
+   .ihw_bypass_ro  = 0,
+};
+
+void
+iommu_hw_enable(struct iommu_state *is)
+{
+   IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
+   IOMMUREG_WRITE(is, iommu_cr, IOMMUCR_EN | (is->is_tsbsize << 16));
+}
+
 /*
  * Initiate an STC entry flush.
  */
@@ -125,7 +146,8 @@ iommu_strbuf_flush(struct strbuf_ctl *sb
  * - create a private DVMA map.
  */
 void
-iommu_init(char *name, struct iommu_state *is, int tsbsize, u_int32_t iovabase)
+iommu_init(char *name, const struct iommu_hw *ihw, struct iommu_state *is,
+int tsbsize, u_int32_t iovabase)
 {
psize_t size;
vaddr_t va;
@@ -149,13 +171,9 @@ iommu_init(char *name, struct iommu_stat
 * be hard-wired, so we read the start and size from the PROM and
 * just use those values.
 */
-   if (strncmp(name, "pyro", 4) == 0) {
-   is->is_cr = IOMMUREG_READ(is, iommu_cr);
-   is->is_cr &= ~IOMMUCR_FIRE_BE;
-   is->is_cr |= (IOMMUCR_FIRE_SE | IOMMUCR_FIRE_CM_EN |
-   IOMMUCR_FIRE_TE);
-   } else 
-   is->is_cr = IOMMUCR_EN;
+
+   is->is_hw = ihw;
+
is->is_tsbsize = tsbsize;
if (iovabase == (u_int32_t)-1) {
is->is_dvmabase = IOTSB_VSTART(is->is_tsbsize);
@@ -237,15 +255,6 @@ iommu_init(char *name, struct iommu_stat
mtx_init(&is->is_mtx, IPL_HIGH);
 
/*
-* Set the TSB size.  The relevant bits were moved to the TSB
-* base register in the PCIe host bridges.
-*/
-   if (strncmp(name, "pyro", 4) == 0)
-   is->is_ptsb |= is->is_tsbsize;
-   else
-   is->is_cr |= (is->is_tsbsize << 16);
-
-   /*
 * Now actually start up the IOMMU.
 */
iommu_reset(is);
@@ -262,10 +271,7 @@ iommu_reset(struct iommu_state *is)
 {
int i;
 
-   IOMMUREG_WRITE(is, iommu_tsb, is->is_ptsb);
-
-   /* Enable IOMMU */
-   IOMMUREG_WRITE(is, iommu_cr, is->is_cr);
+   (*is->is_hw->ihw_enable)(is);
 
for (i = 0; i < 2; ++i) {
struct strbuf_ctl *sb = is->is_sb[i];
@@ -280,7 +286,7 @@ iommu_reset(struct iommu_state *is)
printf(", STC%d enabled", i);
}
 
-   if (is->is_flags & IOMMU_FLUSH_CACHE)
+   if (ISSET(is->is_hw->ihw_flags, IOMMU_HW_FLUSH_CACHE))
IOMMUREG_WRITE(is, iommu_cache_invalidate, -1ULL);
 }
 
@@ -433,7 +439,7 @@ iommu_extract(struct iommu_state *is, bu
if (dva >= is->is_dvmabase && dva <= is->is_dvmaend)
tte = is->is_tsb[IOTSBSLOT(dva, is->is_tsbsize)];
 
-   return (tte & IOTTE_PAMASK);
+   return (tte & is->is_hw->ihw_dvma_pa);
 }
 
 /*
@@ -601,6 +607,7 @@ iommu_dvmamap_create(bus_dma_tag_t t, bu
 {
int ret;
bus_dmamap_t map;
+   struct iommu_state *is = sb->sb_iommu;

don't allow changing the if_type of tun(4) and tap(4) interfaces from userland

2019-06-10 Thread David Gwynne

making tun(4) not IFT_TUNNEL and tap(4) not IFT_ETHER makes for some
interesting possibilities for fun with the rest of the kernel. I think
we should just not allow the if_types to be changed.

I have another diff somewhere to prevent changing the flags
(IFF_BROADCAST etc) too, but we can start here.

ok?

Index: if_tun.c
===
RCS file: /cvs/src/sys/net/if_tun.c,v
retrieving revision 1.186
diff -u -p -r1.186 if_tun.c
--- if_tun.c12 May 2019 16:38:02 -  1.186
+++ if_tun.c10 Jun 2019 06:13:22 -
@@ -623,8 +623,9 @@ tun_dev_ioctl(struct tun_softc *tp, u_lo
tunp = (struct tuninfo *)data;
if (tunp->mtu < ETHERMIN || tunp->mtu > TUNMRU)
return (EINVAL);
+   if (tunp->type != tp->tun_if.if_type)
+   return (EINVAL);
tp->tun_if.if_mtu = tunp->mtu;
-   tp->tun_if.if_type = tunp->type;
tp->tun_if.if_flags =
(tunp->flags & TUN_IFF_FLAGS) |
(tp->tun_if.if_flags & ~TUN_IFF_FLAGS);

Re: hardware packet timestamping for mbufs, then sockets and bpf

2019-06-09 Thread David Gwynne

On Fri, Jun 07, 2019 at 07:36:36PM -0500, Scott Cheloha wrote:
> On Fri, Jun 07, 2019 at 02:34:20PM +1000, David Gwynne wrote:
> > nics are starting to offer the ability to timestamp packets when
> > they're received. other systems (eg linux and freebsd) have support
> > for recording that timestamp on mbufs and then using it as the
> > backend for at least the SO_TIMESTAMP socket option instead of a
> > call to microtime().
> > 
> > this implements the above, and additionally supports using the hw
> > timestamp in bpf too. other systems may do the bpf thing too, but i
> > didn't look closely enough to find out.
> > 
> > timestamps are recorded as the uptime of the system in nanoseconds
> > in the ph_timestamp field in mbufs. this mirrors the use of
> > ph_timestamp in the fq_codel code to store the uptime in nanoseconds.
> > im using another bit in m_pkthdr.csum_flags to say whether the timestamp
> > is valid or not (M_TIMESTAMP). im arguing that it's another offloading
> > feature and therefore appropriate for the csum offload flags field.
> > 
> > this adds some inline functions to time.h for turning ns into a timeval
> > and timespec, which are ns_to_microtime and ns_to_nanotime respectively.
> > i originally wanted ns_to_timeval and ns_to_timespec, but the linux
> > compat stuff in drm already uses those names and ruined the idea.
> > especially since they return the time{val,spec}s as values.
> 
> If we're going to add more conversion stuff to sys/time.h I want it to
> resemble the stuff that's already there.  I think NSEC_TO_TIMESPEC and
> NSEC_TO_TIMEVAL would be consistent names, so in sys/time.h I'd strongly
> prefer the following:
> 
> static __inline void
> NSEC_TO_TIMEVAL(uint64_t ns, struct timeval *tv)
> {
>   tv->tv_sec = ns / 1000000000LL;
>   tv->tv_usec = (ns % 1000000000L) / 1000L;
> }
> 
> static __inline void
> NSEC_TO_TIMESPEC(uint64_t ns, struct timespec *ts)
> {
>   ts->tv_sec = ns / 1000000000LL;
>   ts->tv_nsec = ns % 1000000000L;
> }

ok. i update that, and added a nsmicrotime() to go from a uint64_t nsec
offset to a wall clock time in kern_tc.c too.

> > the ipv4 SO_TIMESTAMP and bpf code looks at whether M_TIMESTAMP is set,
> > and if so turns ph_timestamp into a timeval before adding it to boottime
> > (which is the wall clock time that uptime starts at), before using it
> > instead of microtime().
> > 
> > the mcx changes are based on what freebsd did to their driver, but
> > simplified a bit.
> > 
> > i want this because we're being asked to look at recording network
> > traffic for possible audit use. part of that is having accurate
> > timestamps on received packets, and hopefully it will mitigate against
> > chunks of packets getting reordered or delayed significantly when the
> > box is busy.
> > 
> > thoughts? ok?
> 
> Thoughts inline.

cool.

> 
> Same thoughts from before about NSEC_TO_TIMEVAL etc.
> 
> Also, unless I've missed something, I don't see you use ns_to_nanotime
> in this diff.
> 
> However I see people doing this conversion elsewhere, so maybe
> we could use it anyway.

those are my thoughts too..

> > +   struct timeval btv;
> > +   ns_to_microtime(,
> > +   m->m_pkthdr.ph_timestamp);
> > +   microboottime();
> > +   timeradd(, , );
> 
> You could probably put this dance into a function.
> 
> void
> pkthdr_microtime(const struct pkthdr *hdr, struct timeval *tv)
> {
>   struct timeval boottime, uptime;
> 
>   microboottime(&boottime);
>   NSEC_TO_TIMEVAL(hdr->ph_timestamp, &uptime);
>   timeradd(&boottime, &uptime, tv);
> }

this is nsmicrotime() in my new diff.

> 
> should this be MCX_CALIBRATE_NORMAL?
> 
> Also, maybe "mcx_start_calibration"?

now we're bikeshedding :)

i want to start hw timestamping "real soon now", which is why i schedule
the first timeout in a few seconds time. after that i don't want to
calibrate too often (to amortise the cost of calibrating), so i use a
longer timeout.

> Would you ever use a mcx_calibration struct in this function that
> wasn't in the mcx_softc?

no, i want to avoid the overhead of finding a usable set of calibration
values on every packet. once per interrupt is good enough.

> 
> >  {
> > struct mcx_slot *ms;
> > struct mbuf *m;
> > @@ -5574,10 +5663,26 @@ mcx_process_rx(struct mcx_softc *sc, str
> >  
> > m = ms->ms_m;
> > ms->ms_m = NULL;
> >

Re: OpenBSD: bridge(4), simplify IFT_ETHER checks

2019-06-09 Thread David Gwynne




> On 10 Jun 2019, at 03:14, Martin Pieuchot  wrote:
> 
> On 09/06/19(Sun) 18:41, Eygene Ryabinkin wrote:
>> Martin, good day.
>> 
>> Sun, Jun 09, 2019 at 11:35:48AM -0300, Martin Pieuchot wrote:
>>> On 07/06/19(Fri) 20:55, Eygene Ryabinkin wrote:
>>>> With the hints from Martin Pieuchot had found out that the current
>>>> handling of IFT_ETHER for bridge(4) /and nowadays, after de-introduction
>>>> of IFT_MPLSTUNNEL/mpw(4) type interfaces, only IFT_ETHER ones can
>>>> be attached to the bridge(4)/
>>> 
>>> Can't gif(4) interfaces be attached as well?
>> 
>> Reading bridge_ioctl() in /sys/net/if_bridge.c and examining
>> paths for SIOCBRDGADD/SIOCBRDGADDL and SIOCBRDGADDS I see the
>> following blocks:
>> {{{ ADD/ADDL
>>if (ifs->if_type == IFT_ETHER) {
>>error = ifpromisc(ifs, 1);
>>if (error != 0)
>>break;
>>} else {
>>error = EINVAL;
>>break;
>>}
>> }}}
>> {{{ ADDS
>>if (ifs->if_type != IFT_ETHER) {
>>error = EINVAL;
>>break;
>>}
>> }}}
>> So, ifs->if_type != IFT_ETHER will result in EINVAL.  Moreover,
>> reading gif(4) yields
>> {{{
>> Previously, gif supported RFC 3378 EtherIP tunnels over bridge(4)
>> interfaces.  This is now handled by etherip(4).
>> }}}
>> and this provides the explicit explanation that gif(4) used to work
>> with bridge(4), but now etherip(4) took over for Ethernet tunnels
>> and other gif(4) types seem to have no Ethernet-compatible MACs,
>> so they can't be used with bridge(4).

There are a couple of interfaces and interface types that can change type at 
runtime. Firstly, any ethernet interface added to a trunk will change from 
IFT_ETHER to IFT_IEEE8023ADLAG. Secondly, tun and tap can change from 
IFT_TUNNEL and IFT_ETHER to anything userland sets.

I really dislike both of these behaviours.

FreeBSD at least blocked changing the tun/tap interface types from userland for 
safety reasons. I would like to do the same.

It sort of makes sense for trunk to change the type of ports, but it also 
doesn't. Can we just let it check another thing to prevent reuse of ports?

Otherwise everything is created with a type and sticks to it at runtime, which 
is a lot safer and predictable in my opinion.

dlg

> 
> Thanks for digging that informations!  In that case I'd like to commit
> the tweaked version of your diff below.  Are you ok with it?
> 
> Index: net/if_bridge.c
> ===
> RCS file: /cvs/src/sys/net/if_bridge.c,v
> retrieving revision 1.333
> diff -u -p -r1.333 if_bridge.c
> --- net/if_bridge.c   13 May 2019 18:14:05 -  1.333
> +++ net/if_bridge.c   9 Jun 2019 17:12:22 -
> @@ -285,7 +285,10 @@ bridge_ioctl(struct ifnet *ifp, u_long c
>   error = ENOENT;
>   break;
>   }
> -
> + if (ifs->if_type != IFT_ETHER) {
> + error = EINVAL;
> + break;
> + }
>   if (ifs->if_bridgeidx != 0) {
>   if (ifs->if_bridgeidx == ifp->if_index)
>   error = EEXIST;
> @@ -304,23 +307,18 @@ bridge_ioctl(struct ifnet *ifp, u_long c
>   break;
>   }
> 
> - if (ifs->if_type == IFT_ETHER) {
> - error = ifpromisc(ifs, 1);
> - if (error != 0)
> - break;
> - } else {
> - error = EINVAL;
> - break;
> - }
> -
>   bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO);
>   if (bif == NULL) {
> - if (ifs->if_type == IFT_ETHER)
> - ifpromisc(ifs, 0);
>   error = ENOMEM;
>   break;
>   }
> 
> + error = ifpromisc(ifs, 1);
> + if (error != 0) {
> + free(bif, M_DEVBUF, sizeof(*bif));
> + break;
> + }
> +
>   bif->bridge_sc = sc;
>   bif->ifp = ifs;
>   bif->bif_flags = IFBIF_LEARNING | IFBIF_DISCOVER;
> @@ -363,7 +361,10 @@ bridge_ioctl(struct ifnet *ifp, u_long c
>   break;
>   }
>   if (ifs->if_bridgeidx != 0) {
> - error = EBUSY;
> + if (ifs->if_bridgeidx == ifp->if_index)
> + error = EEXIST;
> + else
> + error = EBUSY;
>   break;
>   }
>   SMR_SLIST_FOREACH_LOCKED(bif, &sc->sc_spanlist, bif_next) {
> @@ -1201,8 +1202,6 @@ bridge_process(struct ifnet *ifp, struct
>*/
>   bif0 = bif;
>

tweak pci address conflict line for roms

2019-06-07 Thread David Gwynne

currently mem bars and the rom address conflict lines in dmesg look
the same, which is a bit confusing. this makes rom conflicts lines say
"rom conflict" instead.

that looks like this:

dlg@r6415 pci$ dmesg | grep -A4 conflict 
129:0:0: rom address conflict 0xfffc/0x4
129:0:1: rom address conflict 0xfffc/0x4
bge0 at pci7 dev 0 function 0 "Broadcom BCM5720" rev 0x00, BCM5720 A0 
(0x572), APE firmware NCSI 1.4.14.0: msi, address d0:94:66:34:52:57
brgphy0 at bge0 phy 1: BCM5720C 10/100/1000baseT PHY, rev. 0
bge1 at pci7 dev 0 function 1 "Broadcom BCM5720" rev 0x00, BCM5720 A0 
(0x572), APE firmware NCSI 1.4.14.0: msi, address d0:94:66:34:52:58
brgphy1 at bge1 phy 2: BCM5720C 10/100/1000baseT PHY, rev. 0
132:0:0: rom address conflict 0xfff8/0x8
132:0:1: rom address conflict 0xfff8/0x8
bnxt0 at pci10 dev 0 function 0 "Broadcom BCM57416" rev 0x00: fw ver 20.6.151, 
apic 131 int 12, address d0:94:66:45:11:d0
bnxt1 at pci10 dev 0 function 1 "Broadcom BCM57416" rev 0x01: fw ver 20.6.151, 
apic 131 int 13, address d0:94:66:45:11:d1
pchb48 at pci6 dev 2 function 0 "AMD AMD64 17h PCIE" rev 0x00
pchb49 at pci6 dev 3 function 0 "AMD AMD64 17h PCIE" rev 0x00
193:0:0: rom address conflict 0xfff0/0x10
mfii0 at pci14 dev 0 function 0 "Symbios Logic MegaRAID SAS3508" rev 0x01: msi
mfii0: "PERC H740P Mini ", firmware 50.5.0-1750, 8192MB cache
scsibus2 at mfii0: 64 targets
sd0 at scsibus2 targ 0 lun 0:  SCSI3 0/direct 
fixed naa.6d09466036370500226b9c61889dfc88

ok?

Index: pci.c
===
RCS file: /cvs/src/sys/dev/pci/pci.c,v
retrieving revision 1.112
diff -u -p -r1.112 pci.c
--- pci.c   28 Jul 2018 15:28:51 -  1.112
+++ pci.c   7 Jun 2019 06:55:30 -
@@ -911,7 +911,7 @@ pci_reserve_resources(struct pci_attach_
base, size, EX_NOWAIT) &&
pa->pa_memex && extent_alloc_region(pa->pa_memex,
base, size, EX_NOWAIT)) {
-   printf("%d:%d:%d: mem address conflict 
0x%lx/0x%lx\n",
+   printf("%d:%d:%d: rom address conflict 
0x%lx/0x%lx\n",
bus, dev, func, base, size);
pci_conf_write(pc, tag, PCI_ROM_REG, 0);
}

Re: ospfd: allow specifying area by number as well as id

2019-06-06 Thread David Gwynne

yes :D

> On 29 May 2019, at 15:05, Remi Locherer  wrote:
> 
> Hi David,
> 
> are you going to commit this?
> 
> Remi
> 
> 
> On Thu, May 16, 2019 at 11:14:55PM +0200, Remi Locherer wrote:
>> On Thu, May 16, 2019 at 09:39:37AM +0200, Sebastian Benoit wrote:
>>> 
>>> 
>>> 
>>> Remi Locherer(remi.loche...@relo.ch) on 2019.05.15 23:15:03 +0200:
>>>> On Tue, Apr 30, 2019 at 11:10:37PM +0200, Remi Locherer wrote:
>>>>> On Mon, Apr 29, 2019 at 11:10:31AM +0100, Stuart Henderson wrote:
>>>>>> On 2019/04/29 11:58, Sebastian Benoit wrote:
>>>>>>> David Gwynne(da...@gwynne.id.au) on 2019.04.29 19:36:51 +1000:
>>>>>>>> 
>>>>>>>> 
>>>>>>>>> On 29 Apr 2019, at 4:59 pm, Remi Locherer  
>>>>>>>>> wrote:
>>>>>>>>> 
>>>>>>>>> Hi David
>>>>>>>>> 
>>>>>>>>> On Mon, Apr 29, 2019 at 11:53:27AM +1000, David Gwynne wrote:
>>>>>>>>>> it's always bothered me that i config areas on a crisco using a 
>>>>>>>>>> number,
>>>>>>>>>> but then have to think hard to convert that number to an address for 
>>>>>>>>>> use
>>>>>>>>>> in openbsd. eg, i was given area 700 in one place, which is 0.0.2.188
>>>>>>>>>> as an address. super annoying.
>>>>>>>>>> 
>>>>>>>>>> so this changes the ospfd parser so it accepts both a number or 
>>>>>>>>>> address.
>>>>>>>>>> i also changed it so it prints the number by default, which may be
>>>>>>>>>> contentious. the manpage is slightly tweaked too.
>>>>>>>>>> 
>>>>>>>>>> thoughts?
>>>>>>>>> 
>>>>>>>>> I like it to be able to use a number instead of an address!
>>>>>>>>> 
>>>>>>>>> It worked fine in my short test I performed.
>>>>>>>>> 
>>>>>>>>> The output with the comment looks a bit strange to me.
>>>>>>>> 
>>>>>>>> Are you sure it doesn't look... awesome?
>>>>>>> 
>>>>>>> I like it!
>>>>>> 
>>>>>> I don't really, but if we change this it needs to be displayed somehow
>>>>>> and I don't have an idea to make it look nicer than this (cisco's method
>>>>>> seems pretty horrible and wouldn't work for us anyway - looks like they
>>>>>> remember which format was used to configure an area and use that as
>>>>>> the output format...)
>>>>>> 
>>>>> 
>>>>> Maybe it's better when we just allow both input formats but don't change
>>>>> any output.
>>>> 
>>>> Any opinions or comments on this? I think this would be a valuable addition
>>>> to ospfd.
>>> 
>>> Yes, and diff is ok benno@
>>> 
>> 
>> David: ok remi@ for your diff without the printconf part.
>> 
>>> What about ospf6d?
>> 
>> I'll handle that.
>> 
>>> 
>>>>> 
>>>>> Below diff changes ospfctl to accept the address and number format for
>>>>> "ospfct show database area XXX".
>>>>> 
>>>>> 
>>>>> Index: parser.c
>>>>> ===
>>>>> RCS file: /cvs/src/usr.sbin/ospfctl/parser.c,v
>>>>> retrieving revision 1.20
>>>>> diff -u -p -r1.20 parser.c
>>>>> --- parser.c  9 May 2011 12:25:35 -   1.20
>>>>> +++ parser.c  30 Apr 2019 20:28:18 -
>>>>> @@ -39,7 +39,8 @@ enum token_type {
>>>>>   ADDRESS,
>>>>>   FLAG,
>>>>>   PREFIX,
>>>>> - IFNAME
>>>>> + IFNAME,
>>>>> + AREA
>>>>> };
>>>>> 
>>>>> struct token {
>>>>> @@ -107,7 +108,7 @@ static const struct token t_show_db[] = 
>>>>> };
>>>>> 
>>>>> static const struct token t_show_area[] = {
>>>>> - {ADDRESS,   "", NONE,   NULL},
>>>>&g

hardware packet timestamping for mbufs, then sockets and bpf

2019-06-06 Thread David Gwynne

nics are starting to offer the ability to timestamp packets when
they're received. other systems (eg linux and freebsd) have support
for recording that timestamp on mbufs and then using it as the
backend for at least the SO_TIMESTAMP socket option instead of a
call to microtime().

this implements the above, and additionally supports using the hw
timestamp in bpf too. other systems may do the bpf thing too, but i
didn't look closely enough to find out.

timestamps are recorded as the uptime of the system in nanoseconds
in the ph_timestamp field in mbufs. this mirrors the use of
ph_timestamp in the fq_codel code to store the uptime in nanoseconds.
im using another bit in m_pkthdr.csum_flags to say whether the timestamp
is valid or not (M_TIMESTAMP). im arguing that it's another offloading
feature and therefore appropriate for the csum offload flags field.

this adds some inline functions to time.h for turning ns into a timeval
and timespec, which are ns_to_microtime and ns_to_nanotime respectively.
i originally wanted ns_to_timeval and ns_to_timespec, but the linux
compat stuff in drm already uses those names and ruined the idea.
especially since they return the time{val,spec}s as values.

the ipv4 SO_TIMESTAMP and bpf code looks at whether M_TIMESTAMP is set,
and if so turns ph_timestamp into a timeval before adding it to boottime
(which is the wall clock time that uptime starts at), before using it
instead of microtime().

the mcx changes are based on what freebsd did to their driver, but
simplified a bit.

i want this because we're being asked to look at recording network
traffic for possible audit use. part of that is having accurate
timestamps on received packets, and hopefully it will mitigate against
chunks of packets getting reordered or delayed significantly when the
box is busy.

thoughts? ok?

Index: sys/mbuf.h
===
RCS file: /cvs/src/sys/sys/mbuf.h,v
retrieving revision 1.242
diff -u -p -r1.242 mbuf.h
--- sys/mbuf.h  11 Feb 2019 00:25:33 -  1.242
+++ sys/mbuf.h  7 Jun 2019 03:27:41 -
@@ -226,13 +226,14 @@ struct mbuf {
 #defineM_ICMP_CSUM_IN_OK   0x0400  /* ICMP/ICMPv6 checksum 
verified */
 #defineM_ICMP_CSUM_IN_BAD  0x0800  /* ICMP/ICMPv6 checksum bad */
 #defineM_IPV6_DF_OUT   0x1000  /* don't fragment outgoing IPv6 
*/
+#defineM_TIMESTAMP 0x2000  /* ph_timestamp is set */
 
 #ifdef _KERNEL
 #define MCS_BITS \
 ("\20\1IPV4_CSUM_OUT\2TCP_CSUM_OUT\3UDP_CSUM_OUT\4IPV4_CSUM_IN_OK" \
 "\5IPV4_CSUM_IN_BAD\6TCP_CSUM_IN_OK\7TCP_CSUM_IN_BAD\10UDP_CSUM_IN_OK" \
 "\11UDP_CSUM_IN_BAD\12ICMP_CSUM_OUT\13ICMP_CSUM_IN_OK\14ICMP_CSUM_IN_BAD" \
-"\15IPV6_NODF_OUT")
+"\15IPV6_NODF_OUT" "\16TIMESTAMP")
 #endif
 
 /* mbuf types */
Index: sys/time.h
===
RCS file: /cvs/src/sys/sys/time.h,v
retrieving revision 1.41
diff -u -p -r1.41 time.h
--- sys/time.h  3 Jun 2019 01:27:30 -   1.41
+++ sys/time.h  7 Jun 2019 03:27:41 -
@@ -333,6 +333,20 @@ void clock_secs_to_ymdhms(time_t, struct
 /* Traditional POSIX base year */
 #define POSIX_BASE_YEAR 1970
 
+static __inline void
+ns_to_microtime(struct timeval *tv, uint64_t ns)
+{
+   tv->tv_sec = ns / 10L;
+   tv->tv_usec = (ns % 10L) / 1000;
+}
+
+static __inline void
+ns_to_nanotime(struct timespec *tv, uint64_t ns)
+{
+   tv->tv_sec = ns / 10L;
+   tv->tv_nsec = ns % 10L;
+}
+
 #else /* !_KERNEL */
 #include 
 
Index: net/bpf.c
===
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.175
diff -u -p -r1.175 bpf.c
--- net/bpf.c   18 May 2019 12:59:32 -  1.175
+++ net/bpf.c   7 Jun 2019 03:27:41 -
@@ -1284,13 +1284,25 @@ _bpf_mtap(caddr_t arg, const struct mbuf
fcode = bps->bps_bf.bf_insns;
slen = bpf_mfilter(fcode, m, pktlen);
 
-   if (slen == 0)
+   if (slen == 0)
continue;
if (d->bd_fildrop != BPF_FILDROP_PASS)
drop = 1;
if (d->bd_fildrop != BPF_FILDROP_DROP) {
-   if (!gottime++)
-   microtime();
+   if (!gottime) {
+   if (ISSET(m->m_flags, M_PKTHDR) &&
+   ISSET(m->m_pkthdr.csum_flags,
+M_TIMESTAMP)) {
+   struct timeval btv;
+   ns_to_microtime(,
+   m->m_pkthdr.ph_timestamp);
+   microboottime();
+   timeradd(, , );
+   } else
+   microtime();
+
+

tcpdump -T erspan

2019-05-20 Thread David Gwynne

-T erspan lets you force parsing a GRE packet as ERSPAN

Devices supporting ERSPAN type I allow arbitrary GRE protocol numbers to
be specified for encapsulating the spanned Ethernet packets. This lets
tcpdump cope with that by letting the user force erspan packet
processing.

This follows the mechanism used for IP and UDP packet processing. It
might be nice to extend the -T argument processing so you can do
something like tcpdump -T erspan=111 or -T erspan=0x88be so only
specific protocols are forced to erspan instead of all of them.

Anyway, if you're using a recentish Dell (or late Force 10 switch) with
ftos^Wdnos 9, it basically supports ERSPAN Type I without actually
saying that. The "monitor session X type erpm" encapsulates Ethernet
packets in GRE and sends them to a remote IP, and defaults to the same
GRE protocol identifier that ERSPAN uses. It also supports changing the
GRE protocol id, as per the ERSPAN draft specs:

If I configure this:

monitor session 10 type erpm
 source twentyFiveGigE 1/1 direction both
 erpm source-ip 10.138.79.17 dest-ip 10.138.79.2
 no disable

Then tcpdump on 10.138.79.2 shows this:

xdlg@bastion:~/src/usr.sbin/tcpdump$ sudo ./obj/tcpdump -vei vmx2 -B capture ip 
proto gre
tcpdump: listening on vmx2, link-type EN10MB
tcpdump: WARNING: compensating for unaligned libpcap packets
13:55:56.812909 54:bf:64:d9:07:42 00:50:56:a1:c1:4a ip 180: 
eait-42-dc2-c5-2.mgmt.eait.uq.edu.au > bastion.eait.uq.edu.au: gre [] 88be 
erspan I: 00:24:51:5d:84:00 01:00:5e:00:00:05 ip 142: 172.16.163.249 > 
ospf-all.mcast.net: OSPFv2-hello  56[92]: rtrid secret area 0.0.2.188 auth MD5 
key-id 1 seq 1558233217 [|ospf] [tos 0xc0] [ttl 1] (id 63484, len 128) (ttl 
255, id 0, len 166)

If I reconfigure the monitor session with the following erpm line:

 erpm source-ip 10.138.79.17 dest-ip 10.138.79.2 gre-protocol 111

I see this:

xdlg@bastion:~/src/usr.sbin/tcpdump$ sudo ./obj/tcpdump -nvei vmx2 -B capture 
ip proto gre   
tcpdump: listening on vmx2, link-type EN10MB
14:00:30.584863 54:bf:64:d9:07:42 00:50:56:a1:c1:4a 0800 168: 10.138.79.17 > 
10.138.79.2: gre [] 006f unknown-proto-006f (ttl 255, id 0, len 154)
14:00:30.585046 54:bf:64:d9:07:42 00:50:56:a1:c1:4a 0800 104: 10.138.79.17 > 
10.138.79.2: gre [] 006f unknown-proto-006f (ttl 255, id 0, len 90)

now with -T erspan:

xdlg@bastion:~/src/usr.sbin/tcpdump$ sudo ./obj/tcpdump -vei vmx2 -B
capture -T erspan ip proto gre  
tcpdump: listening on vmx2, link-type EN10MB
tcpdump: WARNING: compensating for unaligned libpcap packets
13:55:56.812909 54:bf:64:d9:07:42 00:50:56:a1:c1:4a ip 180: 
eait-42-dc2-c5-2.mgmt.eait.uq.edu.au > bastion.eait.uq.edu.au: gre [] 006f 
erspan I: 00:24:51:5d:84:00 01:00:5e:00:00:05 ip 142: 172.16.163.249 > 
ospf-all.mcast.net: OSPFv2-hello  56[92]: rtrid secret area 0.0.2.188 auth MD5 
key-id 1 seq 1558233217 [|ospf] [tos 0xc0] [ttl 1] (id 63484, len 128) (ttl 
255, id 0, len 166)

ok?

Index: interface.h
===
RCS file: /cvs/src/usr.sbin/tcpdump/interface.h,v
retrieving revision 1.80
diff -u -p -r1.80 interface.h
--- interface.h 5 Apr 2019 00:57:59 -   1.80
+++ interface.h 21 May 2019 03:46:35 -
@@ -64,6 +64,7 @@ extern char *device;  /* as specified by
 #define PT_MPLS10  /* MPLS (over UDP) */
 #define PT_TFTP11  /* Trivial File Transfer Protocol */
 #define PT_VXLAN   12  /* Virtual eXtensible Local Area Network */
+#define PT_ERSPAN  13  /* GRE ERSPAN Type I or II */
 
 #ifndef min
 #define min(a,b) ((a)>(b)?(b):(a))
Index: print-gre.c
===
RCS file: /cvs/src/usr.sbin/tcpdump/print-gre.c,v
retrieving revision 1.26
diff -u -p -r1.26 print-gre.c
--- print-gre.c 17 May 2019 06:47:10 -  1.26
+++ print-gre.c 21 May 2019 03:46:35 -
@@ -223,6 +223,14 @@ gre_print_0(const u_char *p, u_int lengt
 
printf(" ");
 
+   switch (packettype) {
+   case PT_ERSPAN:
+   gre_print_erspan(flags, p, length);
+   return;
+   default:
+   break;
+   }
+
switch (proto) {
case 0:
printf("keep-alive");
Index: tcpdump.8
===
RCS file: /cvs/src/usr.sbin/tcpdump/tcpdump.8,v
retrieving revision 1.101
diff -u -p -r1.101 tcpdump.8
--- tcpdump.8   18 Mar 2019 06:41:52 -  1.101
+++ tcpdump.8   21 May 2019 03:46:35 -
@@ -230,9 +230,11 @@ to be interpreted as the specified
 .Ar type .
 Currently known types are:
 .Pp
-.Bl -tag -width "vxlan" -offset indent -compact
+.Bl -tag -width "erspan" -offset indent -compact
 .It Cm cnfp
 Cisco NetFlow protocol
+.It Cm erspan
+Cisco Encapsulated Remote Switch Port Analyzer (ERSPAN) over GRE
 .It Cm gre
 Generic Routing Encapsulation over UDP
 .It Cm mpls
Index: tcpdump.c

make kevent(2) (a bit) mpsafe

2019-05-01 Thread David Gwynne

i originally came at this from the other side, where i wanted to run
kqueue_enqueue and _dequeue without the KERNEL_LOCK, but that implied
making kqueue_scan use the mutex too, which allowed the syscall to
become less locked.

it assumes that the existing locking in kqueue_scan is in the right
place, it just turns it into a mutex instead of KERNEL_LOCK with
splhigh. it leaves the kqueue_register code under KERNEL_LOCK, but if
you're not making changes with kevent then this should be a win.

there's an extra rwlock around the kqueue_scan call. this protects the
kq_head list from having multiple marker structs attached to it. that is
an extremely rare situation, ie, you'd have to have two threads execute
kevent on the same kq fd concurrently, but that never happens. right?

it seems to work ok, but i haven't tested it extensively.

thoughts?

Index: sys/eventvar.h
===
RCS file: /cvs/src/sys/sys/eventvar.h,v
retrieving revision 1.5
diff -u -p -r1.5 eventvar.h
--- sys/eventvar.h  17 Jun 2018 08:22:02 -  1.5
+++ sys/eventvar.h  1 May 2019 06:29:43 -
@@ -35,6 +35,8 @@
 #define KQEXTENT   256 /* linear growth by this amount */
 
 struct kqueue {
+   struct rwlock   kq_kevent;  /* serialise kevent syscall */
+   struct mutexkq_mtx;
TAILQ_HEAD(kqlist, knote) kq_head;  /* list of pending event */
int kq_count;   /* number of pending events */
int kq_refs;/* number of references */
Index: kern/kern_event.c
===
RCS file: /cvs/src/sys/kern/kern_event.c,v
retrieving revision 1.102
diff -u -p -r1.102 kern_event.c
--- kern/kern_event.c   1 May 2019 06:22:39 -   1.102
+++ kern/kern_event.c   1 May 2019 06:29:43 -
@@ -455,6 +455,8 @@ sys_kqueue(struct proc *p, void *v, regi
fp->f_type = DTYPE_KQUEUE;
fp->f_ops = 
kq = pool_get(_pool, PR_WAITOK|PR_ZERO);
+   rw_init(>kq_kevent, "kevent");
+   mtx_init(>kq_mtx, IPL_HIGH);
TAILQ_INIT(>kq_head);
fp->f_data = kq;
KQREF(kq);
@@ -509,37 +511,42 @@ sys_kevent(struct proc *p, void *v, regi
kq = fp->f_data;
nerrors = 0;
 
-   while (SCARG(uap, nchanges) > 0) {
-   n = SCARG(uap, nchanges) > KQ_NEVENTS ?
-   KQ_NEVENTS : SCARG(uap, nchanges);
-   error = copyin(SCARG(uap, changelist), kev,
-   n * sizeof(struct kevent));
-   if (error)
-   goto done;
+   if (SCARG(uap, nchanges) > 0) {
+   KERNEL_LOCK();
+   do {
+   n = SCARG(uap, nchanges) > KQ_NEVENTS ?
+   KQ_NEVENTS : SCARG(uap, nchanges);
+   error = copyin(SCARG(uap, changelist), kev,
+   n * sizeof(struct kevent));
+   if (error)
+   goto done;
 #ifdef KTRACE
-   if (KTRPOINT(p, KTR_STRUCT))
-   ktrevent(p, kev, n);
+   if (KTRPOINT(p, KTR_STRUCT))
+   ktrevent(p, kev, n);
 #endif
-   for (i = 0; i < n; i++) {
-   kevp = [i];
-   kevp->flags &= ~EV_SYSFLAGS;
-   error = kqueue_register(kq, kevp, p);
-   if (error || (kevp->flags & EV_RECEIPT)) {
-   if (SCARG(uap, nevents) != 0) {
-   kevp->flags = EV_ERROR;
-   kevp->data = error;
-   copyout(kevp, SCARG(uap, eventlist),
-   sizeof(*kevp));
-   SCARG(uap, eventlist)++;
-   SCARG(uap, nevents)--;
-   nerrors++;
-   } else {
-   goto done;
+   for (i = 0; i < n; i++) {
+   kevp = [i];
+   kevp->flags &= ~EV_SYSFLAGS;
+   error = kqueue_register(kq, kevp, p);
+   if (error || (kevp->flags & EV_RECEIPT)) {
+   if (SCARG(uap, nevents) != 0) {
+   kevp->flags = EV_ERROR;
+   kevp->data = error;
+   copyout(kevp,
+   SCARG(uap, eventlist),
+   sizeof(*kevp));
+   SCARG(uap, eventlist)++;
+   SCARG(uap,

Re: OpenBGPD 6.5p0 released

2019-04-30 Thread David Gwynne

Nice work

On Tue., 30 Apr. 2019, 22:27 Claudio Jeker,  wrote:

> We have released OpenBGPD 6.5p0, which will be arriving in the
> OpenBGPD directory of your local OpenBSD mirror soon.
>
> OpenBGPD is a free, functional, and secure implementation of the
> Border Gateway Protocol (BGP-4). It is a fairly complete BGP
> implementation and praised for ease of use, as well as its
> reliability. This is the first release of OpenBGPD-portable in
> quite some time. Since the last portable release a significant
> amount of features and performance improvements have been added,
> if you haven't used openbgpd-portable in a while, you'll be positively
> amazed what has changed!
>
> Caveats: currently there is no support to update FIB / kernel
> routing tables in the portable version of OpenBGPD.  This means
> OpenBGPD-portable is suitable for IXP route servers, route collectors
> and route reflectors but not (yet) for use as a traditional router.
> Some OpenBSD specific features have been turned off as well.
>
> OpenBGPD-portable is known to compile and run on FreeBSD 12.x, and
> the Linux distributions Debian 9, and Ubuntu 14.04. It is our hope
> that packagers take interest and help adapt OpenBGPD-portable to
> more distributions.
>
> We welcome feedback and improvements from the broader community.
> Thanks to all of the contributors who helped make this release
> possible.
>
>

Re: ospfd: allow specifying area by number as well as id

2019-04-29 Thread David Gwynne




> On 29 Apr 2019, at 4:59 pm, Remi Locherer  wrote:
> 
> Hi David
> 
> On Mon, Apr 29, 2019 at 11:53:27AM +1000, David Gwynne wrote:
>> it's always bothered me that i config areas on a crisco using a number,
>> but then have to think hard to convert that number to an address for use
>> in openbsd. eg, i was given area 700 in one place, which is 0.0.2.188
>> as an address. super annoying.
>> 
>> so this changes the ospfd parser so it accepts both a number or address.
>> i also changed it so it prints the number by default, which may be
>> contentious. the manpage is slightly tweaked too.
>> 
>> thoughts?
> 
> I like it to be able to use a number instead of an address!
> 
> It worked fine in my short test I performed.
> 
> The output with the comment looks a bit strange to me.

Are you sure it doesn't look... awesome?

> typhoon ..sbin/ospfd$ doas obj/ospfd -nv 
> 
> router-id 0.0.0.7
> fib-update yes
> fib-priority 32
> rfc1583compat no
> spf-delay msec 1000
> spf-holdtime msec 5000
> 
> area 7 { # 0.0.0.7
> ^
>interface pair7:10.77.77.1 {
>metric 10
>retransmit-interval 5
>router-dead-time 40
> 
> 
> I'd prefer if we settle for one output format and then use only that. The
> number format is more common but that would be a change for the users. I'm
> fine with either format for outputs.

I lean toward the number too. I don't think it would hurt to change it so only 
one is output, so long as input works either way.

> There is also "ospfctl show database area 0.0.0.0" and ospf6d. ;-)

Are you offering to help with the implementation of those?

dlg

> 
> Regards,
> Remi
> 
> 
>> 
>> with this diff, i can do the following and things keep
>> working:
>> 
>> --- /etc/ospfd.conf  Mon Apr 29 11:29:56 2019
>> +++ /etc/ospfd.conf.new  Mon Apr 29 11:39:45 2019
>> @@ -7,5 +7,5 @@
>> redistribute rtlabel "backup" set metric 65535
>> 
>> -area 0.0.2.188 {
>> +area 700 {
>>  router-dead-time minimal
>>  fast-hello-interval msec 300
>> 
>> Index: ospfd.conf.5
>> ===
>> RCS file: /cvs/src/usr.sbin/ospfd/ospfd.conf.5,v
>> retrieving revision 1.55
>> diff -u -p -r1.55 ospfd.conf.5
>> --- ospfd.conf.5 28 Dec 2018 19:25:10 -  1.55
>> +++ ospfd.conf.5 29 Apr 2019 01:45:40 -
>> @@ -68,7 +68,7 @@ Macros are not expanded inside quotes.
>> For example:
>> .Bd -literal -offset indent
>> hi="5"
>> -area 0.0.0.0 {
>> +area 0 {
>>  interface em0 {
>>  hello-interval $hi
>>  }
>> @@ -257,10 +257,10 @@ Areas are used for grouping interfaces.
>> All interface-specific parameters can
>> be configured per area, overruling the global settings.
>> .Bl -tag -width Ds
>> -.It Ic area Ar address
>> +.It Ic area Ar id Ns | Ns Ar address
>> Specify an area section, grouping one or more interfaces.
>> .Bd -literal -offset indent
>> -area 0.0.0.0 {
>> +area 0 {
>>  interface em0
>>  interface em1 {
>>  metric 10
>> Index: parse.y
>> ===
>> RCS file: /cvs/src/usr.sbin/ospfd/parse.y,v
>> retrieving revision 1.95
>> diff -u -p -r1.95 parse.y
>> --- parse.y  13 Feb 2019 22:57:08 -  1.95
>> +++ parse.y  29 Apr 2019 01:45:40 -
>> @@ -120,6 +120,7 @@ typedef struct {
>>  int64_t  number;
>>  char*string;
>>  struct redistribute *redist;
>> +struct in_addr   id;
>>  } v;
>>  int lineno;
>> } YYSTYPE;
>> @@ -145,6 +146,7 @@ typedef struct {
>> %type  deadtime
>> %type  string dependon
>> %type  redistribute
>> +%type areaid
>> 
>> %%
>> 
>> @@ -588,15 +590,8 @@ comma   : ','
>>  | /*empty*/
>>  ;
>> 
>> -area: AREA STRING {
>> -struct in_addr  id;
>> -if (inet_aton($2, ) == 0) {
>> -yyerror("error parsing area");
>> -free($2);
>> -YYERROR;
>> -}
>> -free($2);
>> -area = conf_get_area(id);
>> +area: AREA areaid {
>> +

ospfd: allow specifying area by number as well as id

2019-04-28 Thread David Gwynne

it's always bothered me that i config areas on a crisco using a number,
but then have to think hard to convert that number to an address for use
in openbsd. eg, i was given area 700 in one place, which is 0.0.2.188
as an address. super annoying.

so this changes the ospfd parser so it accepts both a number or address.
i also changed it so it prints the number by default, which may be
contentious. the manpage is slightly tweaked too.

thoughts?

with this diff, i can do the following and things keep
working:

--- /etc/ospfd.conf Mon Apr 29 11:29:56 2019
+++ /etc/ospfd.conf.new Mon Apr 29 11:39:45 2019
@@ -7,5 +7,5 @@
 redistribute rtlabel "backup" set metric 65535
 
-area 0.0.2.188 {
+area 700 {
router-dead-time minimal
fast-hello-interval msec 300

Index: ospfd.conf.5
===
RCS file: /cvs/src/usr.sbin/ospfd/ospfd.conf.5,v
retrieving revision 1.55
diff -u -p -r1.55 ospfd.conf.5
--- ospfd.conf.528 Dec 2018 19:25:10 -  1.55
+++ ospfd.conf.529 Apr 2019 01:45:40 -
@@ -68,7 +68,7 @@ Macros are not expanded inside quotes.
 For example:
 .Bd -literal -offset indent
 hi="5"
-area 0.0.0.0 {
+area 0 {
interface em0 {
hello-interval $hi
}
@@ -257,10 +257,10 @@ Areas are used for grouping interfaces.
 All interface-specific parameters can
 be configured per area, overruling the global settings.
 .Bl -tag -width Ds
-.It Ic area Ar address
+.It Ic area Ar id Ns | Ns Ar address
 Specify an area section, grouping one or more interfaces.
 .Bd -literal -offset indent
-area 0.0.0.0 {
+area 0 {
interface em0
interface em1 {
metric 10
Index: parse.y
===
RCS file: /cvs/src/usr.sbin/ospfd/parse.y,v
retrieving revision 1.95
diff -u -p -r1.95 parse.y
--- parse.y 13 Feb 2019 22:57:08 -  1.95
+++ parse.y 29 Apr 2019 01:45:40 -
@@ -120,6 +120,7 @@ typedef struct {
int64_t  number;
char*string;
struct redistribute *redist;
+   struct in_addr   id;
} v;
int lineno;
 } YYSTYPE;
@@ -145,6 +146,7 @@ typedef struct {
 %typedeadtime
 %typestring dependon
 %typeredistribute
+%typeareaid
 
 %%
 
@@ -588,15 +590,8 @@ comma  : ','
| /*empty*/
;
 
-area   : AREA STRING {
-   struct in_addr  id;
-   if (inet_aton($2, ) == 0) {
-   yyerror("error parsing area");
-   free($2);
-   YYERROR;
-   }
-   free($2);
-   area = conf_get_area(id);
+area   : AREA areaid {
+   area = conf_get_area($2);
 
memcpy(, defs, sizeof(areadefs));
md_list_copy(_list, >md_list);
@@ -610,6 +605,23 @@ area   : AREA STRING {
 
 demotecount: NUMBER{ $$ = $1; }
| /*empty*/ { $$ = 1; }
+   ;
+
+areaid : NUMBER {
+   if ($1 < 0 || $1 > 0x) {
+   yyerror("invalid area id");
+   YYERROR;
+   }
+   $$.s_addr = htonl($1);
+   }
+   | STRING {
+   if (inet_aton($1, &$$) == 0) {
+   yyerror("error parsing area");
+   free($1);
+   YYERROR;
+   }
+   free($1);
+   }
;
 
 areaopts_l : areaopts_l areaoptsl nl
Index: printconf.c
===
RCS file: /cvs/src/usr.sbin/ospfd/printconf.c,v
retrieving revision 1.20
diff -u -p -r1.20 printconf.c
--- printconf.c 28 Dec 2018 19:25:10 -  1.20
+++ printconf.c 29 Apr 2019 01:45:40 -
@@ -181,7 +181,8 @@ print_config(struct ospfd_conf *conf)
printf("\n");
 
LIST_FOREACH(area, >area_list, entry) {
-   printf("area %s {\n", inet_ntoa(area->id));
+   printf("area %u { # %s\n", ntohl(area->id.s_addr),
+   inet_ntoa(area->id));
if (area->stub) {
printf("\tstub");
if (SIMPLEQ_EMPTY(>redist_list))

move the vlan_softc^Wifvlan definition from if_vlan_var.h to if_vlan.c

2019-04-25 Thread David Gwynne

nothing else in the kernel needs to look inside struct ifvlan.

so this diff moves it. the next step will be renaming ifvlan to
vlan_softc so it is like all the other drivers in the tree, and
referring to it as "sc" instead of "ifv" as a variable. but this is a
first step.

ok?

Index: if_vlan.c
===
RCS file: /cvs/src/sys/net/if_vlan.c,v
retrieving revision 1.186
diff -u -p -r1.186 if_vlan.c
--- if_vlan.c   22 Apr 2019 03:29:40 -  1.186
+++ if_vlan.c   26 Apr 2019 05:47:09 -
@@ -57,6 +57,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -71,6 +72,41 @@
 #if NBPFILTER > 0
 #include 
 #endif
+
+struct vlan_mc_entry {
+   LIST_ENTRY(vlan_mc_entry)   mc_entries;
+   union {
+   struct ether_multi  *mcu_enm;
+   } mc_u;
+#define mc_enm mc_u.mcu_enm
+   struct sockaddr_storage mc_addr;
+};
+
+struct ifvlan {
+   struct  arpcom ifv_ac;  /* make this an interface */
+   unsigned int ifv_ifidx0;/* parent interface of this vlan */
+   int ifv_rxprio;
+   struct  ifv_linkmib {
+   int ifvm_prio; /* prio to apply on packet leaving if */
+   u_int16_t ifvm_proto; /* encapsulation ethertype */
+   u_int16_t ifvm_tag; /* tag to apply on packets leaving if */
+   u_int16_t ifvm_type; /* non-standard ethertype or 0x8100 */
+   }   ifv_mib;
+   LIST_HEAD(__vlan_mchead, vlan_mc_entry) vlan_mc_listhead;
+   SRPL_ENTRY(ifvlan) ifv_list;
+   int ifv_flags;
+   struct refcnt ifv_refcnt;
+   void *lh_cookie;
+   void *dh_cookie;
+   struct ifih *ifv_ifih;
+};
+
+#defineifv_if  ifv_ac.ac_if
+#defineifv_tag ifv_mib.ifvm_tag
+#defineifv_prioifv_mib.ifvm_prio
+#defineifv_typeifv_mib.ifvm_type
+#defineIFVF_PROMISC0x01/* the parent should be made promisc */
+#defineIFVF_LLADDR 0x02/* don't inherit the parents mac */
 
 #define TAG_HASH_BITS  5
 #define TAG_HASH_SIZE  (1 << TAG_HASH_BITS)
Index: if_vlan_var.h
===
RCS file: /cvs/src/sys/net/if_vlan_var.h,v
retrieving revision 1.40
diff -u -p -r1.40 if_vlan_var.h
--- if_vlan_var.h   19 Apr 2019 04:36:12 -  1.40
+++ if_vlan_var.h   26 Apr 2019 05:47:09 -
@@ -47,44 +47,6 @@ struct   vlanreq {
 };
 
 #ifdef _KERNEL
-#include 
-
-#define mc_enm mc_u.mcu_enm
-
-struct vlan_mc_entry {
-   LIST_ENTRY(vlan_mc_entry)   mc_entries;
-   union {
-   struct ether_multi  *mcu_enm;
-   } mc_u;
-   struct sockaddr_storage mc_addr;
-};
-
-struct ifvlan {
-   struct  arpcom ifv_ac;  /* make this an interface */
-   unsigned int ifv_ifidx0;/* parent interface of this vlan */
-   int ifv_rxprio;
-   struct  ifv_linkmib {
-   int ifvm_prio; /* prio to apply on packet leaving if */
-   u_int16_t ifvm_proto; /* encapsulation ethertype */
-   u_int16_t ifvm_tag; /* tag to apply on packets leaving if */
-   u_int16_t ifvm_type; /* non-standard ethertype or 0x8100 */
-   }   ifv_mib;
-   LIST_HEAD(__vlan_mchead, vlan_mc_entry) vlan_mc_listhead;
-   SRPL_ENTRY(ifvlan) ifv_list;
-   int ifv_flags;
-   struct refcnt ifv_refcnt;
-   void *lh_cookie;
-   void *dh_cookie;
-   struct ifih *ifv_ifih;
-};
-
-#defineifv_if  ifv_ac.ac_if
-#defineifv_tag ifv_mib.ifvm_tag
-#defineifv_prioifv_mib.ifvm_prio
-#defineifv_typeifv_mib.ifvm_type
-#defineIFVF_PROMISC0x01/* the parent should be made promisc */
-#defineIFVF_LLADDR 0x02/* don't inherit the parents mac */
-
 struct mbuf*vlan_inject(struct mbuf *, uint16_t, uint16_t);
 #endif /* _KERNEL */

Re: bridge(4) output w/o KERNEL_LOCK()

2019-04-23 Thread David Gwynne




> On 22 Apr 2019, at 5:45 am, Martin Pieuchot  wrote:
> 
> Diff below removes the KERNEL_LOCK() from bridge(4)'s output fast-path.
> 
> To do so, it redefines the ifp <-> bridge relationship.  Currently every
> interface in a bridge(4) contains a pointer to that bridge's port.  This
> relationship is guaranteed to be valid as long as the KERNEL_LOCK() is
> held.  We cannot use the NET_LOCK() to protect this relation because
> wifi drivers still call bridge_output() in interrupt handlers.  So I
> decided to put the bridge's interface index in `struct ifnet' instead.
> 
> bridge_rtlookup() is now also returning an interface index for similar
> reasons.
> 
> The `interface list' and `span list' are still protected by the
> KERNEL_LOCK() in this diff.  Next step will be to move to SMR and remove
> the intermediate queue in the input path.
> 
> All of that should improve latency of bridge(4) and allow us to continue
> untangle the various locks in the Network Stack.
> 
> This has been quite extensively tested by Hrvoje Popovski.  I'm looking
> for more tests, reviews and oks :)

ok by me.

> 
> Index: net/bridgectl.c
> ===
> RCS file: /cvs/src/sys/net/bridgectl.c,v
> retrieving revision 1.17
> diff -u -p -r1.17 bridgectl.c
> --- net/bridgectl.c   8 Mar 2019 17:48:35 -   1.17
> +++ net/bridgectl.c   4 Apr 2019 19:58:21 -
> @@ -84,8 +84,7 @@ bridgectl_ioctl(struct ifnet *ifp, u_lon
>   error = ENOENT;
>   break;
>   }
> - bif = (struct bridge_iflist *)ifs->if_bridgeport;
> - if (bif == NULL || bif->bridge_sc != sc) {
> + if (ifs->if_bridgeidx != ifp->if_index) {
>   error = ESRCH;
>   break;
>   }
> @@ -126,8 +125,7 @@ bridgectl_ioctl(struct ifnet *ifp, u_lon
>   error = ENOENT;
>   break;
>   }
> - bif = (struct bridge_iflist *)ifs->if_bridgeport;
> - if (bif == NULL || bif->bridge_sc != sc) {
> + if (ifs->if_bridgeidx != ifp->if_index) {
>   error = ESRCH;
>   break;
>   }
> @@ -137,6 +135,7 @@ bridgectl_ioctl(struct ifnet *ifp, u_lon
>   error = EINVAL;
>   break;
>   }
> + bif = bridge_getbif(ifs);
>   if (brlreq->ifbr_flags & BRL_FLAG_IN) {
>   error = bridge_addrule(bif, brlreq, 0);
>   if (error)
> @@ -154,11 +153,11 @@ bridgectl_ioctl(struct ifnet *ifp, u_lon
>   error = ENOENT;
>   break;
>   }
> - bif = (struct bridge_iflist *)ifs->if_bridgeport;
> - if (bif == NULL || bif->bridge_sc != sc) {
> + if (ifs->if_bridgeidx != ifp->if_index) {
>   error = ESRCH;
>   break;
>   }
> + bif = bridge_getbif(ifs);
>   bridge_flushrule(bif);
>   break;
>   case SIOCBRDGGRL:
> @@ -167,11 +166,11 @@ bridgectl_ioctl(struct ifnet *ifp, u_lon
>   error = ENOENT;
>   break;
>   }
> - bif = (struct bridge_iflist *)ifs->if_bridgeport;
> - if (bif == NULL || bif->bridge_sc != sc) {
> + if (ifs->if_bridgeidx != ifp->if_index) {
>   error = ESRCH;
>   break;
>   }
> + bif = bridge_getbif(ifs);
>   error = bridge_brlconf(bif, bc);
>   break;
>   default:
> @@ -206,7 +205,7 @@ bridge_rtupdate(struct bridge_softc *sc,
>   goto done;
> 
>   bcopy(ea, >brt_addr, sizeof(p->brt_addr));
> - p->brt_if = ifp;
> + p->brt_ifidx = ifp->if_index;
>   p->brt_age = 1;
>   bridge_copytag(brtag, >brt_tunnel);
> 
> @@ -227,16 +226,14 @@ bridge_rtupdate(struct bridge_softc *sc,
>   dir = memcmp(ea, >brt_addr, sizeof(q->brt_addr));
>   if (dir == 0) {
>   if (setflags) {
> - q->brt_if = ifp;
> + q->brt_ifidx = ifp->if_index;
>   q->brt_flags = flags;
>   } else if (!(q->brt_flags & IFBAF_STATIC))
> - q->brt_if = ifp;
> + q->brt_ifidx = ifp->if_index;
> 
> - if (q->brt_if == ifp)
> + if (q->brt_ifidx == ifp->if_index)
>   q->brt_age = 1;
> - ifp = q->brt_if;
>   bridge_copytag(brtag, >brt_tunnel);
> -
>   goto want;
>   }
> 
> @@ -248,7 +245,7 @@ bridge_rtupdate(struct bridge_softc *sc,
>

disable mobileip(4)

2019-04-21 Thread David Gwynne

i want to remove the mobileip driver. disabling it is a good start.

it would be generous to say that mobileip is a very niche protocol. it
is not great on the wire, it cannot encap anything except ipv4, and it
lacks widespread support in other operating systems. the other bsds
appear to support it, but there are better options for tunnelling
between them anyway.

ok?

Index: GENERIC
===
RCS file: /cvs/src/sys/conf/GENERIC,v
retrieving revision 1.260
diff -u -p -r1.260 GENERIC
--- GENERIC 13 Apr 2019 17:35:10 -  1.260
+++ GENERIC 22 Apr 2019 01:32:29 -
@@ -90,7 +90,6 @@ pseudo-device carp# CARP protocol supp
 pseudo-device  etherip # EtherIP (RFC 3378)
 pseudo-device  gif # IPv[46] over IPv[46] tunnel (RFC1933)
 pseudo-device  gre # GRE encapsulation interface
-pseudo-device  mobileip# MobileIP encapsulation interface
 pseudo-device  loop# network loopback
 pseudo-device  mpe # MPLS PE interface
 pseudo-device  mpw # MPLS pseudowire support

Re: ifconfig: remove obsolete vlan code

2019-04-14 Thread David Gwynne




> On 15 Apr 2019, at 05:56, Klemens Nanni  wrote:
> 
> On Sun, Apr 14, 2019 at 07:46:59PM +0200, Sebastian Benoit wrote:
>> I dont mind keeping vlan/vlandev either, but then they should be aliases,
>> not with their own function.
> Fine with me as well.
> 
> Diff below removes the old functions while keeping `[-]vlan' and
> `[-]vlandev' as aliases for `[-]vnetid' and `[-]parent' respectively.
> 
> While testing this, I noticed that the old and new interfaces do not
> work well together, although they're supposed to do the same.
> 
> This is -CURRENT without my diff.  Create one using the old interface:
> 
>   # ifconfig vlan0 vlan 1 vlandev trunk0
>   ifconfig: The 'vlan' option is deprecated, use 'vnetid'
>   ifconfig: The 'vlandev' option is deprecated, use 'parent'
> 
> Deconfigure it using the new interface:
> 
>   # ifconfig vlan0 -vnetid
>   # ifconfig vlan0 -parent
>   ifconfig: SIOCDIFPARENT: Device busy
>   # ifconfig vlan0 | grep encap
>   encap: vnetid none parent trunk0 txprio packet
> 
> 
> Clean up, do it again but the other way around;  create with new:
> 
>   # ifconfig vlan0 destroy
>   # ifconfig vlan0 vnetid 1 parent trunk0
> 
> Delete with old:
> 
>   # ifconfig vlan0 -vnetid
>   # ifconfig vlan0 -parent
>   # ifconfig vlan0 | grep encap
>   encap: vnetid none parent none txprio packet
> 
> 
> Shouldn't there be complete interoperability?  If so, that seems like
> one more reason for removing the old interface, with or without aliases.

I don't mind if we remove the old command names or alias them to the new ones, 
I mostly want the ioctl interface the old names call to go away. Removing or 
aliasing achieves that.

The behaviour changes you're seeing above are from the ioctl interface the old 
commands are using. The one in particular that you're hitting is that the old 
ioctl implicitly brings the vlan interface up, and bringing it up is when the config 
is "committed". You can't change or remove the parent while the vlan interface 
is up, and it's now up even though you didn't ifconfig vlan0 up.

Another difference you should be aware of is that the code you're removing used 
the interface minor as the vnetid if another value wasn't explicitly set. eg, 
"ifconfig vlan10 vlandev trunk0" on a newly created interface turns into the 
following:

ifconfig vlan10 parent trunk0
ifconfig vlan10 vnetid 10
ifconfig vlan10 up

I'm trying to get rid of implicit behaviours like this with side effects in the 
network stack.

You have my OK on this diff.

dlg

> 
> So like this?  No manual bits so far.
> 
> Index: ifconfig.c
> ===
> RCS file: /cvs/src/sbin/ifconfig/ifconfig.c,v
> retrieving revision 1.399
> diff -u -p -r1.399 ifconfig.c
> --- ifconfig.c11 Apr 2019 11:32:24 -  1.399
> +++ ifconfig.c14 Apr 2019 19:52:33 -
> @@ -250,9 +250,6 @@ void  setpwe3fat(const char *, int);
> void  unsetpwe3fat(const char *, int);
> void  setpwe3neighbor(const char *, const char *);
> void  unsetpwe3neighbor(const char *, int);
> -void setvlantag(const char *, int);
> -void setvlandev(const char *, int);
> -void unsetvlandev(const char *, int);
> void  mpls_status(void);
> void  setrdomain(const char *, int);
> void  unsetrdomain(const char *, int);
> @@ -424,9 +421,10 @@ const struct cmd {
>   { "-vnetid",0,  0,  delvnetid },
>   { "parent", NEXTARG,0,  setifparent },
>   { "-parent",1,  0,  delifparent },
> - { "vlan",   NEXTARG,0,  setvlantag },
> - { "vlandev",NEXTARG,0,  setvlandev },
> - { "-vlandev",   1,  0,  unsetvlandev },
> + { "vlan",   NEXTARG,0,  setvnetid },
> + { "-vlan",  0,  0,  delvnetid },
> + { "vlandev",NEXTARG,0,  setifparent },
> + { "-vlandev",   1,  0,  delifparent },
>   { "group",  NEXTARG,0,  setifgroup },
>   { "-group", NEXTARG,0,  unsetifgroup },
>   { "autoconf",   1,  0,  setautoconf },
> @@ -4273,89 +4271,6 @@ getencap(void)
> #endif
> 
>   printf("\n");
> -}
> -
> -static int __tag = 0;
> -static int __have_tag = 0;
> -
> -/* ARGSUSED */
> -void
> -setvlantag(const char *val, int d)
> -{
> - u_int16_t tag;
> - struct vlanreq vreq;
> - const char *errmsg = NULL;
> -
> - warnx("The 'vlan' option is deprecated, use 'vnetid'");
> -
> - __tag = tag = strtonum(val, EVL_VLID_MIN, EVL_VLID_MAX, );
> - if (errmsg)
> - errx(1, "vlan tag %s: %s", val, errmsg);
> - __have_tag = 1;
> -
> - bzero((char *), sizeof(struct vlanreq));
> - ifr.ifr_data = (caddr_t)
> -
> - if (ioctl(s, SIOCGETVLAN,

tunnel interface rxprio config

2019-04-14 Thread David Gwynne

i've been working on RFC 2983 support, with extended functionality.

rfc 2983 is "Differentiated Services and Tunnels", and discusses where
prio values should go and come from on tunnel ingress and egress. we
currently support setting the packet on tunnel ingress using the txprio
functionality. this diff adds egress or rxprio handling.

the rfc talks about selecting the outer or inner dscp value on ip
tunnels. this diff adds this support, and allows config to ignore both
the inner and outer prio fields, or hardcode it to a specific value like
we do on tx. it also extends on the rfc by allowing the config to
apply to other encapsulations, eg, vlan, bpe, and the mpls tunnels can
support this too.

the diff below shows the vlan and gif diffs. i have changes for other
interfaces in a tree somewhere, but i'm happy to commit those on my own
if everyone's ok with the diff below.

ok?

Index: sys/sys/sockio.h
===
RCS file: /cvs/src/sys/sys/sockio.h,v
retrieving revision 1.81
diff -u -p -r1.81 sockio.h
--- sys/sys/sockio.h10 Apr 2019 09:49:50 -  1.81
+++ sys/sys/sockio.h14 Apr 2019 07:14:01 -
@@ -207,6 +207,9 @@
 #defineSIOCSLIFPHYECN  _IOW('i', 199, struct ifreq)/* set ecn 
copying */
 #defineSIOCGLIFPHYECN  _IOWR('i', 200, struct ifreq)   /* get ecn 
copying */
 
+#defineSIOCSRXHPRIO_IOW('i', 219, struct ifreq)/* set rx hdr 
prio */
+#defineSIOCGRXHPRIO_IOWR('i', 219, struct ifreq)   /* get rx hdr 
prio */
+
 #define SIOCSPWE3CTRLWORD  _IOW('i', 220, struct ifreq)
 #define SIOCGPWE3CTRLWORD  _IOWR('i',  220, struct ifreq)
 #define SIOCSPWE3FAT   _IOW('i', 221, struct ifreq)
Index: sys/net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.575
diff -u -p -r1.575 if.c
--- sys/net/if.c14 Apr 2019 06:57:00 -  1.575
+++ sys/net/if.c14 Apr 2019 07:14:01 -
@@ -2168,6 +2168,7 @@ ifioctl(struct socket *so, u_long cmd, c
case SIOCSVNETID:
case SIOCSVNETFLOWID:
case SIOCSTXHPRIO:
+   case SIOCSRXHPRIO:
case SIOCSIFPAIR:
case SIOCSIFPARENT:
case SIOCDIFPARENT:
Index: sys/net/if.h
===
RCS file: /cvs/src/sys/net/if.h,v
retrieving revision 1.200
diff -u -p -r1.200 if.h
--- sys/net/if.h10 Apr 2019 09:49:22 -  1.200
+++ sys/net/if.h14 Apr 2019 07:14:02 -
@@ -427,6 +427,7 @@ struct  ifreq {
 #define IF_HDRPRIO_MAX IFQ_MAXPRIO
 #define IF_HDRPRIO_PACKET  -1  /* use mbuf prio */
 #define IF_HDRPRIO_PAYLOAD -2  /* copy payload prio */
+#define IF_HDRPRIO_OUTER   -3  /* use outer prio */
 
 #define IF_PWE3_ETHERNET   1   /* ethernet or ethernet tagged */
 #define IF_PWE3_IP 2   /* IP layer 2 */
Index: sys/net/if_vlan.c
===
RCS file: /cvs/src/sys/net/if_vlan.c,v
retrieving revision 1.183
diff -u -p -r1.183 if_vlan.c
--- sys/net/if_vlan.c   15 Feb 2019 13:00:51 -  1.183
+++ sys/net/if_vlan.c   14 Apr 2019 07:14:02 -
@@ -174,6 +174,7 @@ vlan_clone_create(struct if_clone *ifc, 
 
refcnt_init(>ifv_refcnt);
ifv->ifv_prio = IF_HDRPRIO_PACKET;
+   ifv->ifv_rxprio = IF_HDRPRIO_OUTER;
 
ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST;
ifp->if_xflags = IFXF_CLONED|IFXF_MPSAFE;
@@ -373,11 +374,6 @@ vlan_input(struct ifnet *ifp0, struct mb
 
/* From now on ether_vtag is fine */
tag = EVL_VLANOFTAG(m->m_pkthdr.ether_vtag);
-   m->m_pkthdr.pf.prio = EVL_PRIOFTAG(m->m_pkthdr.ether_vtag);
-
-   /* IEEE 802.1p has prio 0 and 1 swapped */
-   if (m->m_pkthdr.pf.prio <= 1)
-   m->m_pkthdr.pf.prio = !m->m_pkthdr.pf.prio;
 
list = [TAG_HASH(tag)];
SRPL_FOREACH(ifv, , list, ifv_list) {
@@ -408,6 +404,20 @@ vlan_input(struct ifnet *ifp0, struct mb
m_adj(m, EVL_ENCAPLEN);
}
 
+   switch (ifv->ifv_rxprio) {
+   case IF_HDRPRIO_PACKET:
+   break;
+   case IF_HDRPRIO_OUTER:
+   m->m_pkthdr.pf.prio = EVL_PRIOFTAG(m->m_pkthdr.ether_vtag);
+   break;
+   default:
+   m->m_pkthdr.pf.prio = ifv->ifv_rxprio;
+   /* IEEE 802.1p has prio 0 and 1 swapped */
+   if (m->m_pkthdr.pf.prio <= 1)
+   m->m_pkthdr.pf.prio = !m->m_pkthdr.pf.prio;
+   break;
+   }
+
ml_enqueue(, m);
if_input(>ifv_if, );
SRPL_LEAVE();
@@ -736,6 +746,22 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd
break;
case SIOCGTXHPRIO:
ifr->ifr_hdrprio = ifv->ifv_prio;
+   break;
+
+   case SIOCSRXHPRIO:
+   if (ifr->ifr_hdrprio == IF_HDRPRIO_PACKET ||
+

tx mitigation again

2019-04-14 Thread David Gwynne

this is another go at implementing tx mitigation.

it is mostly the same as the previous attempts in that actual tx
is deferred to a network taskq unless a backlog of packets is
reached. when the task runs or the backlog is reached the actual hardware
transmit routine is called. this is all hidden behind the existing
api.

the big difference in this version is that an ifq_barrier call no
longer implies a taskq_barrier in the nettq. this avoids a deadlock
that nics can cause if (when) they call ifq_barrier with NET_LOCK
held. instead we just rely on the ifq serialiser barrier to do its
thing.

we do care that the task isn't being run when the ifq is about to
be freed, so we do the taskq_barrier when the ifq is being shut
down, which already happens without NET_LOCK held.

this still gives a significant performance improvement in some
situations. eg, hrvoje popovski goes from 740kpps to 1mpps when
forwarding between ix interfaces running this code.

i'd like to get it in now so we can shake any issues out of it.

ok?

Index: ifq.c
===
RCS file: /cvs/src/sys/net/ifq.c,v
retrieving revision 1.30
diff -u -p -r1.30 ifq.c
--- ifq.c   29 Mar 2019 04:21:55 -  1.30
+++ ifq.c   2 Apr 2019 22:16:19 -
@@ -70,6 +70,13 @@ struct priq {
 void   ifq_start_task(void *);
 void   ifq_restart_task(void *);
 void   ifq_barrier_task(void *);
+void   ifq_bundle_task(void *);
+
+static inline void
+ifq_run_start(struct ifqueue *ifq)
+{
+   ifq_serialize(ifq, >ifq_start);
+}
 
 void
 ifq_serialize(struct ifqueue *ifq, struct task *t)
@@ -112,6 +119,16 @@ ifq_is_serialized(struct ifqueue *ifq)
 }
 
 void
+ifq_start(struct ifqueue *ifq)
+{
+   if (ifq_len(ifq) >= min(ifq->ifq_if->if_txmit, ifq->ifq_maxlen)) {
+   task_del(ifq->ifq_softnet, >ifq_bundle);
+   ifq_run_start(ifq);
+   } else
+   task_add(ifq->ifq_softnet, >ifq_bundle);
+}
+
+void
 ifq_start_task(void *p)
 {
struct ifqueue *ifq = p;
@@ -135,11 +152,21 @@ ifq_restart_task(void *p)
 }
 
 void
+ifq_bundle_task(void *p)
+{
+   struct ifqueue *ifq = p;
+
+   ifq_run_start(ifq);
+}
+
+void
 ifq_barrier(struct ifqueue *ifq)
 {
struct cond c = COND_INITIALIZER();
struct task t = TASK_INITIALIZER(ifq_barrier_task, );
 
+   task_del(ifq->ifq_softnet, >ifq_bundle);
+
if (ifq->ifq_serializer == NULL)
return;
 
@@ -164,6 +191,7 @@ void
 ifq_init(struct ifqueue *ifq, struct ifnet *ifp, unsigned int idx)
 {
ifq->ifq_if = ifp;
+   ifq->ifq_softnet = net_tq(ifp->if_index); /* + idx */
ifq->ifq_softc = NULL;
 
mtx_init(>ifq_mtx, IPL_NET);
@@ -184,6 +212,7 @@ ifq_init(struct ifqueue *ifq, struct ifn
mtx_init(>ifq_task_mtx, IPL_NET);
TAILQ_INIT(>ifq_task_list);
ifq->ifq_serializer = NULL;
+   task_set(>ifq_bundle, ifq_bundle_task, ifq);
 
task_set(>ifq_start, ifq_start_task, ifq);
task_set(>ifq_restart, ifq_restart_task, ifq);
@@ -234,6 +263,10 @@ void
 ifq_destroy(struct ifqueue *ifq)
 {
struct mbuf_list ml = MBUF_LIST_INITIALIZER();
+
+   NET_ASSERT_UNLOCKED();
+   if (!task_del(ifq->ifq_softnet, >ifq_bundle))
+   taskq_barrier(ifq->ifq_softnet);
 
/* don't need to lock because this is the last use of the ifq */
 
Index: ifq.h
===
RCS file: /cvs/src/sys/net/ifq.h,v
retrieving revision 1.25
diff -u -p -r1.25 ifq.h
--- ifq.h   29 Mar 2019 04:21:55 -  1.25
+++ ifq.h   2 Apr 2019 22:16:19 -
@@ -25,6 +25,7 @@ struct ifq_ops;
 
 struct ifqueue {
struct ifnet*ifq_if;
+   struct taskq*ifq_softnet;
union {
void*_ifq_softc;
/*
@@ -57,6 +58,7 @@ struct ifqueue {
struct mutex ifq_task_mtx;
struct task_list ifq_task_list;
void*ifq_serializer;
+   struct task  ifq_bundle;
 
/* work to be serialised */
struct task  ifq_start;
@@ -397,6 +399,7 @@ void ifq_attach(struct ifqueue *, cons
 voidifq_destroy(struct ifqueue *);
 voidifq_add_data(struct ifqueue *, struct if_data *);
 int ifq_enqueue(struct ifqueue *, struct mbuf *);
+voidifq_start(struct ifqueue *);
 struct mbuf*ifq_deq_begin(struct ifqueue *);
 voidifq_deq_commit(struct ifqueue *, struct mbuf *);
 voidifq_deq_rollback(struct ifqueue *, struct mbuf *);
@@ -436,12 +439,6 @@ static inline unsigned int
 ifq_is_oactive(struct ifqueue *ifq)
 {
return (ifq->ifq_oactive);
-}
-
-static inline void
-ifq_start(struct ifqueue *ifq)
-{
-   ifq_serialize(ifq, >ifq_start);
 }
 
 static inline void
Index: if_var.h
===
RCS

let mpe/mpw/mpip use exp as a txprio field

2019-04-14 Thread David Gwynne

the mpls exp bits are now defined as a prio field, but we don't have
support for that currently.

this lets the mpls tunnelling interfaces configure the use of the exp
fields for prio with the same machinery used for all the other tunnel
interfaces. the interfaces default to using 0 for the value which keeps
it compat. other values can be configured with ifconfig like normal
tunnel interfaces.

ok? meh? nah?

Index: if_mpe.c
===
RCS file: /cvs/src/sys/net/if_mpe.c,v
retrieving revision 1.90
diff -u -p -r1.90 if_mpe.c
--- if_mpe.c2 Apr 2019 10:52:33 -   1.90
+++ if_mpe.c14 Apr 2019 06:49:56 -
@@ -55,6 +55,7 @@
 
 struct mpe_softc {
struct ifnetsc_if;  /* the interface */
+   int sc_txhprio;
unsigned intsc_rdomain;
struct ifaddr   sc_ifa;
struct sockaddr_mplssc_smpls;
@@ -121,6 +122,7 @@ mpe_clone_create(struct if_clone *ifc, i
bpfattach(>if_bpf, ifp, DLT_LOOP, sizeof(u_int32_t));
 #endif
 
+   sc->sc_txhprio = 0;
sc->sc_rdomain = 0;
sc->sc_ifa.ifa_ifp = ifp;
sc->sc_ifa.ifa_addr = sdltosa(ifp->if_sadl);
@@ -210,10 +212,13 @@ int
 mpe_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
struct rtentry *rt)
 {
+   struct mpe_softc *sc;
struct rt_mpls  *rtmpls;
struct shim_hdr shim;
int error;
+   int txprio;
uint8_t ttl = mpls_defttl;
+   uint8_t tos, prio;
size_t  ttloff;
socklen_t   slen;
 
@@ -243,15 +248,22 @@ mpe_output(struct ifnet *ifp, struct mbu
 
error = 0;
switch (dst->sa_family) {
-   case AF_INET:
+   case AF_INET: {
+   struct ip *ip = mtod(m, struct ip *);
+   tos = ip->ip_tos;
ttloff = offsetof(struct ip, ip_ttl);
slen = sizeof(struct sockaddr_in);
break;
+   }
 #ifdef INET6
-   case AF_INET6:
+   case AF_INET6: {
+   struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
+   uint32_t flow = bemtoh32(>ip6_flow);
+   tos = flow >> 20;
ttloff = offsetof(struct ip6_hdr, ip6_hlim);
slen = sizeof(struct sockaddr_in6);
break;
+   }
 #endif
default:
m_freem(m);
@@ -263,7 +275,23 @@ mpe_output(struct ifnet *ifp, struct mbu
ttl = *(mtod(m, uint8_t *) + ttloff);
}
 
-   shim.shim_label = rtmpls->mpls_label | MPLS_BOS_MASK | htonl(ttl);
+   sc = ifp->if_softc;
+   txprio = sc->sc_txhprio;
+
+   switch (txprio) {
+   case IF_HDRPRIO_PACKET:
+   prio = m->m_pkthdr.pf.prio;
+   break;
+   case IF_HDRPRIO_PAYLOAD:
+   prio = IFQ_TOS2PRIO(tos);
+   break;
+   default:
+   prio = txprio;
+   break;
+   }
+
+   shim.shim_label = rtmpls->mpls_label | htonl(prio << MPLS_EXP_OFFSET) |
+   MPLS_BOS_MASK | htonl(ttl);
 
m = m_prepend(m, sizeof(shim), M_NOWAIT);
if (m == NULL) {
@@ -278,7 +306,7 @@ mpe_output(struct ifnet *ifp, struct mbu
goto out;
}
memcpy(mtod(m, struct sockaddr *), rt->rt_gateway, slen);
-   mtod(m, struct sockaddr *)->sa_len = slen; /* to be sure */
+   mtod(m, struct sockaddr *)->sa_len = slen; /* to be sure */
 
m->m_pkthdr.ph_family = dst->sa_family;
 
@@ -387,6 +415,22 @@ mpe_ioctl(struct ifnet *ifp, u_long cmd,
break;
case SIOCGLIFPHYRTABLE:
ifr->ifr_rdomainid = sc->sc_rdomain;
+   break;
+
+   case SIOCSTXHPRIO:
+   if (ifr->ifr_hdrprio == IF_HDRPRIO_PACKET ||
+   ifr->ifr_hdrprio == IF_HDRPRIO_PAYLOAD)
+   ;
+   else if (ifr->ifr_hdrprio > IF_HDRPRIO_MAX ||
+   ifr->ifr_hdrprio < IF_HDRPRIO_MIN) {
+   error = EINVAL;
+   break;
+   }
+
+   sc->sc_txhprio = ifr->ifr_hdrprio;
+   break;
+   case SIOCGTXHPRIO:
+   ifr->ifr_hdrprio = sc->sc_txhprio;
break;
 
default:
Index: if_mpip.c
===
RCS file: /cvs/src/sys/net/if_mpip.c,v
retrieving revision 1.4
diff -u -p -r1.4 if_mpip.c
--- if_mpip.c   2 Apr 2019 10:50:16 -   1.4
+++ if_mpip.c   14 Apr 2019 06:49:56 -
@@ -55,6 +55,7 @@ struct mpip_softc {
unsigned intsc_dead;
uint32_tsc_flow; /* xor for mbuf flowid */
 
+   int sc_txhprio;
struct ifaddr   sc_ifa;
struct sockaddr_mplssc_smpls; /* Local label */
unsigned intsc_rdomain;
@@ -92,6 +93,7 @@

tcpdump: print some more info about md5 auth in ospf

2019-04-10 Thread David Gwynne

seeing the key-id in particular helped me debug a problem here.

ok?

Index: ospf.h
===
RCS file: /cvs/src/usr.sbin/tcpdump/ospf.h,v
retrieving revision 1.10
diff -u -p -r1.10 ospf.h
--- ospf.h  4 Aug 2010 16:47:01 -   1.10
+++ ospf.h  10 Apr 2019 22:27:29 -
@@ -220,6 +220,13 @@ struct ospfhdr {
 } ospf_un ;
 } ;
 
+struct ospf_md5_authdata {
+   uint16_tauth_md5_offset;
+   uint8_t auth_keyid;
+   uint8_t auth_len;
+   uint32_tauth_seq;
+};
+
 #defineospf_hello  ospf_un.un_hello
 #defineospf_db ospf_un.un_db
 #defineospf_lsrospf_un.un_lsr
Index: print-ospf.c
===
RCS file: /cvs/src/usr.sbin/tcpdump/print-ospf.c,v
retrieving revision 1.20
diff -u -p -r1.20 print-ospf.c
--- print-ospf.c16 Nov 2015 00:16:39 -  1.20
+++ print-ospf.c10 Apr 2019 22:27:29 -
@@ -32,6 +32,7 @@
 
 #include 
 #include 
+#include 
 
 #include "interface.h"
 #include "addrtoname.h"
@@ -554,9 +555,20 @@ ospf_print(const u_char *bp, u_int lengt
printf("\"");
break;
 
-   case OSPF_AUTH_MD5:
-   printf(" auth MD5");
+   case OSPF_AUTH_MD5: {
+   struct ospf_md5_authdata auth;
+   memcpy(, op->ospf_authdata, sizeof(auth));
+
+   printf(" auth MD5 key-id %u", auth.auth_keyid);
+   if (vflag)
+   printf(" seq %u", ntohl(auth.auth_seq));
+   if (vflag > 1) {
+   printf(" off %u len %u",
+   ntohs(auth.auth_md5_offset),
+   auth.auth_len);
+   }
break;
+   }
 
default:
printf(" ??authtype-%d??", ntohs(op->ospf_authtype));

Re: sfp module info and diagnostics

2019-04-08 Thread David Gwynne

 Warning Flag= False

10GBASE-SR


On Mon, Apr 08, 2019 at 02:21:36PM +1000, David Gwynne wrote:
> this adds support to ifconfig for reading info from transceivers.
> 
> it looks like this:
> 
> dlg@ix ifconfig$ sudo ./obj/ifconfig ix0 transceiver
> ix0: identifier SFP (03)
>   connector: Copper Pigtail (21)
>   vendor: Amphenol
>   product: 616740001
>   revision: B
>   serial: CN0V250M36J0T86
>   date: 2013-07-04
> dlg@ix ifconfig$ sudo ./obj/ifconfig ix1 transceiver 
> ix1: identifier SFP (03)
>   connector: LC (07)
>   vendor: FINISAR CORP.
>   product: FTLX8571D3BCL-FC
>   revision: A
>   serial: AQG28W3
>   date: 2013-10-19
>   temperature: 34.60 C
>   vcc: 3.3553 V
>   tx-bias: 7986.0 uA
>   tx-power: 0.6128 mW
>   rx-power: 0.6153 mW average
> dlg@ix ifconfig$ sudo ./obj/ifconfig ixl0 transceiver
> ixl0: identifier QSFP+ (0d)
> 
> this is all specified by the SFF (small formfactor) group in SNIA, but
> it is a lot of disparate documentation to get into your head. the top
> level summary is that sfp modules have an i2c bus wired up to them, and
> answer reads at device address 0xa0. there is a 256 byte page at that
> address with information like the type of module, and depending on the
> type you can find the manufacturer, product name, serial number, and so
> on.
> 
> a later spec added support for a "digital diagnostics monitoring" (DDM)
> or "digital optical monitoring" (DOM) capability where there's live
> status/diag information available at i2c address 0xa2. again, it's a 256
> byte page, but the values change all the time based on what the module
> is doing. this is where the temperature and laser power stuff is.
> 
> i've implemented basic support for the above, which is specific to
> some sfp shaped modules (so sfp+ and sfp28 too) and gbics. devices
> report whether they support the diag page, so it only fetches and
> parses that if page 0 on 0xa0 says it can. there are different specs
> for the other types of modules, in particular qsfp and related
> modules have a very different layout. however, they still use the
> same device addresses and pages, it's just that the contents of the
> page vary. support for qsfp will be forthcoming if this goes ahead.
> dumping more info generally will happen as time and interest permits
> too.
> 
> i've only implemented the kernel backend for this on ix and ixl. ixl
> support is patchy because it relies on a command that only exists in
> high API versions (like 1.7). ix seems pretty consistent. other nics can
> grow support as time and hw availability permits. i don't have an em(4)
> with optics, so that might be hard for me to do myself, but i tried to
> make the kernel side as easy as possible so people should have a good
> chance at figuring it out.
> 
> do those power units sound plausible or are the factors off?
> 
> this was originally requested by rachel roch on misc@ in "Viewing SFP
> diagnostic data in OpenBSD ?"
> 
> thoughts? 

Index: Makefile
===
RCS file: /cvs/src/sbin/ifconfig/Makefile,v
retrieving revision 1.14
diff -u -p -r1.14 Makefile
--- Makefile3 May 2016 17:52:33 -   1.14
+++ Makefile8 Apr 2019 09:23:59 -
@@ -1,10 +1,10 @@
 #  $OpenBSD: Makefile,v 1.14 2016/05/03 17:52:33 jca Exp $
 
 PROG=  ifconfig
-SRCS=  ifconfig.c brconfig.c
+SRCS=  ifconfig.c brconfig.c sff.c
 MAN=   ifconfig.8
 
-LDADD= -lutil
+LDADD= -lutil -lm
 DPADD= ${LIBUTIL}
 
 .include 
Index: ifconfig.c
===
RCS file: /cvs/src/sbin/ifconfig/ifconfig.c,v
retrieving revision 1.397
diff -u -p -r1.397 ifconfig.c
--- ifconfig.c  11 Mar 2019 11:25:48 -  1.397
+++ ifconfig.c  8 Apr 2019 09:23:59 -
@@ -340,6 +340,8 @@ voidumb_setclass(const char *, int);
 void   umb_roaming(const char *, int);
 void   utf16_to_char(uint16_t *, int, char *, size_t);
 intchar_to_utf16(const char *, uint16_t *, size_t);
+void   transceiver(const char *, int);
+void   transceiverdump(const char *, int);
 #else
 void   setignore(const char *, int);
 #endif
@@ -589,6 +591,9 @@ const structcmd {
{ "datapath",   NEXTARG,0,  switch_datapathid },
{ "portno", NEXTARG2,   0,  NULL, switch_portno },
{ "addlocal",   NEXTARG,0,  addlocal },
+   { "transceiver", 0, 0,  transceiver },
+   { "sff",0,  0,  transceiver },
+   { "sffdump",0,  0,  transceiverdump },
 #else /* SMALL */
{

sfp module info and diagnostics

2019-04-07 Thread David Gwynne

(IXL_AQ_OP_PHY_GET_REGISTER);
+   param = (struct ixl_aq_phy_reg_access *)iaq->iaq_param;
+   param->phy_iface = IXL_AQ_PHY_IF_MODULE;
+   param->dev_addr = dev;
+   htolem32(>reg, reg);
+
+   ixl_atq_exec(sc, , "ixlsffget");
+
+   switch (iaq->iaq_retval) {
+   case htole16(IXL_AQ_RC_OK):
+   break;
+   case htole16(IXL_AQ_RC_EBUSY):
+   return (EBUSY);
+   case htole16(IXL_AQ_RC_ESRCH):
+   return (ENODEV);
+   case htole16(IXL_AQ_RC_EIO):
+   case htole16(IXL_AQ_RC_EINVAL):
+   default:
+   printf("%s: %u\n", __func__, lemtoh16(>iaq_retval));
+   return (EIO);
+   }
+
+   *p = lemtoh32(>val);
+
+   return (0);
+}
+
+
+static int
+ixl_sff_set_byte(struct ixl_softc *sc, uint8_t dev, uint32_t reg, uint8_t v)
+{
+   struct ixl_atq iatq;
+   struct ixl_aq_desc *iaq;
+   struct ixl_aq_phy_reg_access *param;
+
+   memset(, 0, sizeof(iatq));
+   iaq = _desc;
+   iaq->iaq_opcode = htole16(IXL_AQ_OP_PHY_SET_REGISTER);
+   param = (struct ixl_aq_phy_reg_access *)iaq->iaq_param;
+   param->phy_iface = IXL_AQ_PHY_IF_MODULE;
+   param->dev_addr = dev;
+   htolem32(>reg, reg);
+   htolem32(>val, v);
+
+   ixl_atq_exec(sc, , "ixlsffset");
+
+   switch (iaq->iaq_retval) {
+   case htole16(IXL_AQ_RC_OK):
+   break;
+   case htole16(IXL_AQ_RC_EBUSY):
+   return (EBUSY);
+   case htole16(IXL_AQ_RC_ESRCH):
+   return (ENODEV);
+   case htole16(IXL_AQ_RC_EIO):
+   case htole16(IXL_AQ_RC_EINVAL):
+   default:
+   return (EIO);
+   }
 
return (0);
 }
Index: sbin/ifconfig/Makefile
===
RCS file: /cvs/src/sbin/ifconfig/Makefile,v
retrieving revision 1.14
diff -u -p -r1.14 Makefile
--- sbin/ifconfig/Makefile  3 May 2016 17:52:33 -   1.14
+++ sbin/ifconfig/Makefile  8 Apr 2019 02:05:36 -
@@ -1,7 +1,7 @@
 #  $OpenBSD: Makefile,v 1.14 2016/05/03 17:52:33 jca Exp $
 
 PROG=  ifconfig
-SRCS=  ifconfig.c brconfig.c
+SRCS=  ifconfig.c brconfig.c sff.c
 MAN=   ifconfig.8
 
 LDADD= -lutil
Index: sbin/ifconfig/ifconfig.c
===
RCS file: /cvs/src/sbin/ifconfig/ifconfig.c,v
retrieving revision 1.394
diff -u -p -r1.394 ifconfig.c
--- sbin/ifconfig/ifconfig.c20 Feb 2019 19:17:17 -  1.394
+++ sbin/ifconfig/ifconfig.c8 Apr 2019 02:05:36 -
@@ -340,6 +340,8 @@ voidumb_setclass(const char *, int);
 void   umb_roaming(const char *, int);
 void   utf16_to_char(uint16_t *, int, char *, size_t);
 intchar_to_utf16(const char *, uint16_t *, size_t);
+void   transceiver(const char *, int);
+void   transceiverdump(const char *, int);
 #else
 void   setignore(const char *, int);
 #endif
@@ -587,6 +589,9 @@ const structcmd {
{ "datapath",   NEXTARG,0,  switch_datapathid },
{ "portno", NEXTARG2,   0,  NULL, switch_portno },
{ "addlocal",   NEXTARG,0,  addlocal },
+   { "transceiver", 0, 0,  transceiver },
+   { "sff",0,  0,  transceiver },
+   { "sffdump",0,  0,  transceiverdump },
 #else /* SMALL */
{ "powersave",  NEXTARG0,   0,  setignore },
{ "priority",   NEXTARG,0,  setignore },
@@ -4003,6 +4008,22 @@ setmpwcontrolword(const char *value, int
imrsave.imr_flags |= IMR_FLAG_CONTROLWORD;
else
imrsave.imr_flags &= ~IMR_FLAG_CONTROLWORD;
+}
+
+intif_sff_info(int, const char *, int);
+
+void
+transceiver(const char *value, int d)
+{
+   if (if_sff_info(s, name, 0) == -1)
+   err(1, "%s %s", name, __func__);
+}
+
+void
+transceiverdump(const char *value, int d)
+{
+   if (if_sff_info(s, name, 1) == -1)
+   err(1, "%s transceiver", name);
 }
 #endif /* SMALL */
 
Index: sbin/ifconfig/sff.c
===
RCS file: sbin/ifconfig/sff.c
diff -N sbin/ifconfig/sff.c
--- /dev/null   1 Jan 1970 00:00:00 -
+++ sbin/ifconfig/sff.c 8 Apr 2019 02:05:36 -
@@ -0,0 +1,451 @@
+/* $OpenBSD$ */
+
+/*
+ * Copyright (c) David Gwynne 
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF

teach tcpdump about cdp on ppp and gre links

2019-04-03 Thread David Gwynne

ok?

Index: interface.h
===
RCS file: /cvs/src/usr.sbin/tcpdump/interface.h,v
retrieving revision 1.79
diff -u -p -r1.79 interface.h
--- interface.h 22 Oct 2018 16:12:45 -  1.79
+++ interface.h 4 Apr 2019 01:54:37 -
@@ -270,8 +270,7 @@ extern void ike_print(const u_char *, u_
 extern void udpencap_print(const u_char *, u_int, const u_char *);
 extern void ah_print(const u_char *, u_int, const u_char *);
 extern void esp_print(const u_char *, u_int, const u_char *);
-extern void cdp_print(const u_char *, u_int, u_int, const u_char *,
-   const u_char *);
+extern void cdp_print(const u_char *, u_int, u_int, int);
 extern void stp_print(const u_char *, u_int);
 extern void radius_print(const u_char *, u_int);
 extern void lwres_print(const u_char *, u_int);
Index: print-cdp.c
===
RCS file: /cvs/src/usr.sbin/tcpdump/print-cdp.c,v
retrieving revision 1.6
diff -u -p -r1.6 print-cdp.c
--- print-cdp.c 29 Mar 2016 04:07:50 -  1.6
+++ print-cdp.c 4 Apr 2019 01:54:37 -
@@ -46,20 +46,17 @@ void cdp_print_prefixes(const u_char * p
  * Returns non-zero IFF it succeeds in printing the header
  */
 void
-cdp_print(const u_char *p, u_int length, u_int caplen,
- const u_char *esrc, const u_char *edst)
+cdp_print(const u_char *p, u_int length, u_int caplen, int i)
 {
-   int i;
int type, len;
 
/* Cisco Discovery Protocol */
 
-   if (caplen < 12) {
+   if (caplen < i + 4) {
printf("[|cdp]");
return;
}
 
-   i=8;/* CDP data starts at offset 8 */
printf("CDP v%d, ttl=%ds", p[i], p[i+1]);
i+=4;   /* skip version, TTL and chksum */
 
Index: print-gre.c
===
RCS file: /cvs/src/usr.sbin/tcpdump/print-gre.c,v
retrieving revision 1.23
diff -u -p -r1.23 print-gre.c
--- print-gre.c 2 Apr 2019 11:10:54 -   1.23
+++ print-gre.c 4 Apr 2019 01:54:37 -
@@ -268,6 +271,9 @@ gre_print_0(const u_char *p, u_int lengt
break;
case ERSPAN_II:
gre_print_erspan2(p, length);
+   break;
+   case 0x2000:
+   cdp_print(p, length, l, 0);
break;
default:
printf("unknown-proto-%04x", proto);
Index: print-llc.c
===
RCS file: /cvs/src/usr.sbin/tcpdump/print-llc.c,v
retrieving revision 1.20
diff -u -p -r1.20 print-llc.c
--- print-llc.c 16 Nov 2015 00:16:39 -  1.20
+++ print-llc.c 4 Apr 2019 01:54:37 -
@@ -100,7 +100,7 @@ llc_print(const u_char *p, u_int length,
 
/* Cisco Discovery Protocol  - SNAP & ether type 0x2000 */
if (llc.ethertype[0] == 0x20 && llc.ethertype[1] == 0x00) {
-   cdp_print(p, length, caplen, esrc, edst);
+   cdp_print(p, length, caplen, 8);
return (1);
}
/* Shared Spanning Tree Protocol - SNAP & ether type 0x010b */
Index: print-ppp.c
===
RCS file: /cvs/src/usr.sbin/tcpdump/print-ppp.c,v
retrieving revision 1.32
diff -u -p -r1.32 print-ppp.c
--- print-ppp.c 6 Feb 2018 03:41:58 -   1.32
+++ print-ppp.c 4 Apr 2019 01:54:37 -
@@ -390,6 +390,9 @@ ppp_print(const u_char *p, u_int length)
case PPP_IPV6CP:
handle_ipv6cp(p, l);
break;
+   case PPP_CDP:
+   cdp_print(p, length, l, 0);
+   break;
}
 }

deprecate TASKQ_CANTSLEEP

2019-03-28 Thread David Gwynne

nothing uses it anymore, and i don't think it's useful either.

for those who don't know what it did, it marked the threads used by a
taskq so the scheduler knew they shouldn't sleep. this was used in the
early stages of the mpsafe network stack changes to mark the softnet
taskqs as nonsleeping so we could turn that back into an interrupt
context with less issues. it's pretty obvious now that softnets are
going to remain as threads though.

ok?

Index: share/man/man9/task_add.9
===
RCS file: /cvs/src/share/man/man9/task_add.9,v
retrieving revision 1.18
diff -u -p -r1.18 task_add.9
--- share/man/man9/task_add.9   16 Dec 2018 03:40:12 -  1.18
+++ share/man/man9/task_add.9   29 Mar 2019 04:26:01 -
@@ -81,9 +81,6 @@ argument:
 .Bl -tag -width xxx -offset indent
 .It Dv TASKQ_MPSAFE
 The threads servicing the taskq will be run without the kernel big lock.
-.It Dv TASKQ_CANTSLEEP
-The tasks run via the taskq cannot sleep.
-.El
 .Pp
 .Fn taskq_destroy
 causes the resources associated with a previously created taskq to be freed.
Index: sys/sys/task.h
===
RCS file: /cvs/src/sys/sys/task.h,v
retrieving revision 1.13
diff -u -p -r1.13 task.h
--- sys/sys/task.h  16 Dec 2018 03:36:02 -  1.13
+++ sys/sys/task.h  29 Mar 2019 04:26:01 -
@@ -35,7 +35,6 @@ struct task {
 TAILQ_HEAD(task_list, task);
 
 #define TASKQ_MPSAFE   (1 << 0)
-#define TASKQ_CANTSLEEP(1 << 1)
 
 #define TASK_INITIALIZER(_f, _a)  {{ NULL, NULL }, (_f), (_a), 0 }
 
Index: sys/kern/kern_task.c
===
RCS file: /cvs/src/sys/kern/kern_task.c,v
retrieving revision 1.23
diff -u -p -r1.23 kern_task.c
--- sys/kern/kern_task.c16 Dec 2018 03:36:02 -  1.23
+++ sys/kern/kern_task.c29 Mar 2019 04:26:01 -
@@ -59,9 +59,6 @@ struct taskq taskq_sys_mp = {
TAILQ_HEAD_INITIALIZER(taskq_sys_mp.tq_worklist)
 };
 
-typedef int (*sleepfn)(const volatile void *, struct mutex *, int,
-const char *, int);
-
 struct taskq *const systq = _sys;
 struct taskq *const systqmp = _sys_mp;
 
@@ -70,7 +67,7 @@ void  taskq_create_thread(void *);
 void   taskq_barrier_task(void *);
 inttaskq_sleep(const volatile void *, struct mutex *, int,
const char *, int);
-inttaskq_next_work(struct taskq *, struct task *, sleepfn);
+inttaskq_next_work(struct taskq *, struct task *);
 void   taskq_thread(void *);
 
 void
@@ -246,21 +243,7 @@ task_del(struct taskq *tq, struct task *
 }
 
 int
-taskq_sleep(const volatile void *ident, struct mutex *mtx, int priority,
-const char *wmesg, int tmo)
-{
-   u_int *flags = >p_flag;
-   int rv;
-
-   atomic_clearbits_int(flags, P_CANTSLEEP);
-   rv = msleep(ident, mtx, priority, wmesg, tmo);
-   atomic_setbits_int(flags, P_CANTSLEEP);
-
-   return (tmo);
-}
-
-int
-taskq_next_work(struct taskq *tq, struct task *work, sleepfn tqsleep)
+taskq_next_work(struct taskq *tq, struct task *work)
 {
struct task *next;
 
@@ -271,7 +254,7 @@ taskq_next_work(struct taskq *tq, struct
return (0);
}
 
-   tqsleep(tq, >tq_mtx, PWAIT, "bored", 0);
+   msleep(tq, >tq_mtx, PWAIT, "bored", 0);
}
 
TAILQ_REMOVE(>tq_worklist, next, t_entry);
@@ -291,7 +274,6 @@ taskq_next_work(struct taskq *tq, struct
 void
 taskq_thread(void *xtq)
 {
-   sleepfn tqsleep = msleep;
struct taskq *tq = xtq;
struct task work;
int last;
@@ -299,12 +281,7 @@ taskq_thread(void *xtq)
if (ISSET(tq->tq_flags, TASKQ_MPSAFE))
KERNEL_UNLOCK();
 
-   if (ISSET(tq->tq_flags, TASKQ_CANTSLEEP)) {
-   tqsleep = taskq_sleep;
-   atomic_setbits_int(>p_flag, P_CANTSLEEP);
-   }
-
-   while (taskq_next_work(tq, , tqsleep)) {
+   while (taskq_next_work(tq, )) {
(*work.t_func)(work.t_arg);
sched_pause(yield);
}
@@ -312,9 +289,6 @@ taskq_thread(void *xtq)
mtx_enter(>tq_mtx);
last = (--tq->tq_running == 0);
mtx_leave(>tq_mtx);
-
-   if (ISSET(tq->tq_flags, TASKQ_CANTSLEEP))
-   atomic_clearbits_int(>p_flag, P_CANTSLEEP);
 
if (ISSET(tq->tq_flags, TASKQ_MPSAFE))
KERNEL_LOCK();

enable mpip(4) in GENERIC?

2019-03-17 Thread David Gwynne

ok?

Index: GENERIC
===
RCS file: /cvs/src/sys/conf/GENERIC,v
retrieving revision 1.257
diff -u -p -r1.257 GENERIC
--- GENERIC 20 Dec 2018 23:00:55 -  1.257
+++ GENERIC 18 Mar 2019 03:22:03 -
@@ -94,6 +94,7 @@ pseudo-device mobileip# MobileIP encaps
 pseudo-device  loop# network loopback
 pseudo-device  mpe # MPLS PE interface
 pseudo-device  mpw # MPLS pseudowire support
+pseudo-device  mpip# MPLS IP Layer2 pseudowire support
 pseudo-device  bpe # Provider Backbone Bridge edge interface
 pseudo-device  pair# Virtual Ethernet interface pair
 pseudo-device  ppp # PPP

Re: ipv6 via ipsec tunnel

2019-03-15 Thread David Gwynne




> On 15 Mar 2019, at 16:37, Otto Moerbeek  wrote:
> 
> On Fri, Mar 15, 2019 at 04:15:55PM +1000, David Gwynne wrote:
> 
>> 
>> 
>>> On 14 Mar 2019, at 19:36, Otto Moerbeek  wrote:
>>> 
>>> Hi,
>>> 
>>> So i have a little IPv6 problem. 
>>> 
>>> I have a machine in colocation that has IPv6. I have my home cable
>>> modem connection that does not have it.
>>> 
>>> So I thought: I make my own tunnel. First I tried gif(4), that worked,
>>> but only after some fighting with mtu settings on all hosts on my home
>>> net via rad.  Performance was kinda bad. So I'm looking for an
>>> alternative. I thougt: IPSEC should be able to do this.
>>> 
>>> I have a flow from my locally created IPv6 net to any and vice versa.
>>> THe flow itself works. 
>>> 
>>> There I ran into the trouble that you cannot specify a default
>>> gateway, since my remote gw (the host in colo) it is not reachable
>>> according to route(8).
>>> 
>>> How does one solve the default route problem?  I never really
>>> understood how routing works in the presense of IPSEC flows.
>> 
>> Can you elaborate on what gif and slow meant? Also, you should be able to 
>> use gif with whatever MTU you want, even 1500 on the gif interface and 
>> fragments over the internet. You could also try gre, but I doubt it would be 
>> different to gif in terms of performance and support for MTU/fragmentation.
>> 
>> If you want ipsec and routes, you would still use tunnel and get IPsec to 
>> protect it. Or you could trick someone into making something like Cisco's 
>> vti a thing in OpenBSD.
>> 
>> dlg
> 
> gif tunnel:
> 
> ifconfig gif0 inet6 2a02:898:216:3::2 2a02:898:216:3::1 prefixlen 128
> 
> and viceversa on th eother end.
> 
> So gif tunnel with default options. With that it showed an an mtu of
> 1280 in ifconfig so I assumed that would be the max. I have a
> 200 Mb/s cable connection. Downloading IPv4 I reach that. With IPV6
> often it would be 10% of that. Plus it would only work reliably if the
> hosts in my net use an mtu of 1280 (manually or via rad).

I wonder why PMTUD isn't working in this situation.

> I now have a ipsec tunnel and that does 55 Mb/s (APU2 on both
> ends) without any need for config on the hosts in my local net.

Did you have to clamp your internal MTU for that to work too?

dlg

Re: ipv6 via ipsec tunnel

2019-03-15 Thread David Gwynne

> On 14 Mar 2019, at 19:36, Otto Moerbeek  wrote:
> 
> Hi,
> 
> So i have a little IPv6 problem. 
> 
> I have a machine in colocation that has IPv6. I have my home cable
> modem connection that does not have it.
> 
> So I thought: I make my own tunnel. First I tried gif(4), that worked,
> but only after some fighting with mtu settings on all hosts on my home
> net via rad.  Performance was kinda bad. So I'm looking for an
> alternative. I thougt: IPSEC should be able to do this.
> 
> I have a flow from my locally created IPv6 net to any and vice versa.
> THe flow itself works. 
> 
> There I ran into the trouble that you cannot specify a default
> gateway, since my remote gw (the host in colo) it is not reachable
> according to route(8).
> 
> How does one solve the default route problem?  I never really
> understood how routing works in the presense of IPSEC flows.

Can you elaborate on what gif and slow meant? Also, you should be able to use 
gif with whatever MTU you want, even 1500 on the gif interface and fragments 
over the internet. You could also try gre, but I doubt it would be different to 
gif in terms of performance and support for MTU/fragmentation.

If you want ipsec and routes, you would still use tunnel and get IPsec to 
protect it. Or you could trick someone into making something like Cisco's vti a 
thing in OpenBSD.

dlg

Re: extend BPF filter drop to allow not capturing packets

2019-03-10 Thread David Gwynne

On Tue, Mar 05, 2019 at 12:03:05PM +1000, David Gwynne wrote:
> this extends the fildrop mechanism so you can drop the packets with bpf
> using the existing fildrop method, but with an extra tweak so you can 
> avoid the cost of copying packets to userland.
> 
> i wanted to quickly drop some packets in the rx interrupt path to try
> and prioritise some traffic getting processed by the system. the initial
> version was going to use weird custom DLTs and extra bpf interface
> pointers and stuff, but most of the glue is already in place with
> the fildrop functionality.
> 
> this also adds a bit to tcpdump so you can set a fildrop action. it
> means tcpdump can be used as a quick and dirty firewall.

there's a bit more discussion about this that i should have included in
my original email.

firstly, the functionality it offers. this effectively offers a firewall
with the ability to filter arbitrary packets. this has significant
overlap with the functionality that pf offers, but there are a couple of
important differences. pf only handles IP traffic, but we don't
really have a good story when it comes to filtering non-ip. we could
implement something like pf for the next protocol that people need to
manage, but what is that next protocol? pf-like implies a highly
optimised but constrained set of filters that deeply understands the
protocol it is handling. is that next protocol ieee1905p? cdp? ipx?
macsec? where should that protocol be filtered in the stack?

im arguing that bpf with fildrop has the benefit of already existing,
it's in place, and it already has the ability to be configured with
arbitrary policy. considering we've got this far without handling
non-ip, spending more time on it seems unjustified.

secondly, the performance aspects of this diff.

bpf allows for arbitrarily complicated filters, so it is entirely
possible to slow your box down a lot by writing really complicated
filters. this is in comparison to pf where each rule has a limit
on how much work it will do, which is also mitigated by the ruleset
optimiser and skip steps. i don't have a good answer to that except to
say you can already add such filters to bpf, they just don't do anything
except copy packets at the moment.

another interesting performance consideration is that bpf runs a lot
earlier than pf, so filtering packets with bpf can avoid a lot of work
in the stack. if you want to pass IP statefully, pf is a much better
hammer, but to drop packets up front bpf is interesting.

for example, thanks to hrvoje popovski i now have a setup where im
pushing ~7 million packets per second through a box to do performance
measurements. those packets are udp from random ips to port 7 on
another set of random ips. if i have the following rule in pf.conf:

 block in quick proto udp to port 7

i can rx and drop about 550kpps. if im sshed in using another
interface, the system is super sluggish over that shell.

if i use this diff and run the following;

# tcpdump -B drop -i ix1 udp and port 7

i'm dropping about 1.2 million pps, and the box is responsive when sshed
in using another interface.

so, to summarise, bpf can already be used to drop packets, this is just
a tweak to make it faster, and a tweak so tcpdump can be used to set up
that filtering.

> Index: sys/net/bpf.c
> ===
> RCS file: /cvs/src/sys/net/bpf.c,v
> retrieving revision 1.170
> diff -u -p -r1.170 bpf.c
> --- sys/net/bpf.c 13 Jul 2018 08:51:15 -  1.170
> +++ sys/net/bpf.c 4 Mar 2019 22:30:32 -
> @@ -926,9 +926,20 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
>   *(u_int *)addr = d->bd_fildrop;
>   break;
>  
> - case BIOCSFILDROP:  /* set "filter-drop" flag */
> - d->bd_fildrop = *(u_int *)addr ? 1 : 0;
> + case BIOCSFILDROP: {/* set "filter-drop" flag */
> + unsigned int fildrop = *(u_int *)addr;
> + switch (fildrop) {
> + case BPF_FILDROP_PASS:
> + case BPF_FILDROP_CAPTURE:
> + case BPF_FILDROP_DROP:
> + d->bd_fildrop = fildrop;
> + break;
> + default:
> + error = EINVAL;
> + break;
> + }
>   break;
> + }
>  
>   case BIOCGDIRFILT:  /* get direction filter */
>   *(u_int *)addr = d->bd_dirfilt;
> @@ -1261,23 +1272,26 @@ _bpf_mtap(caddr_t arg, const struct mbuf
>   pktlen += m0->m_len;
>  
>   SRPL_FOREACH(d, , >bif_dlist, bd_next) {
> + struct srp_ref bsr;
> + struct bpf_program *bf;
> + struct bpf_insn *fcode = NULL;
> +
>   atomic_inc_long(>bd_rcount);
>  
>

extend BPF filter drop to allow not capturing packets

2019-03-04 Thread David Gwynne

this extends the fildrop mechanism so you can drop the packets with bpf
using the existing fildrop method, but with an extra tweak so you can 
avoid the cost of copying packets to userland.

i wanted to quickly drop some packets in the rx interrupt path to try
and prioritise some traffic getting processed by the system. the initial
version was going to use weird custom DLTs and extra bpf interface
pointers and stuff, but most of the glue is already in place with
the fildrop functionality.

this also adds a bit to tcpdump so you can set a fildrop action. it
means tcpdump can be used as a quick and dirty firewall.

Index: sys/net/bpf.c
===
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.170
diff -u -p -r1.170 bpf.c
--- sys/net/bpf.c   13 Jul 2018 08:51:15 -  1.170
+++ sys/net/bpf.c   4 Mar 2019 22:30:32 -
@@ -926,9 +926,20 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
*(u_int *)addr = d->bd_fildrop;
break;
 
-   case BIOCSFILDROP:  /* set "filter-drop" flag */
-   d->bd_fildrop = *(u_int *)addr ? 1 : 0;
+   case BIOCSFILDROP: {/* set "filter-drop" flag */
+   unsigned int fildrop = *(u_int *)addr;
+   switch (fildrop) {
+   case BPF_FILDROP_PASS:
+   case BPF_FILDROP_CAPTURE:
+   case BPF_FILDROP_DROP:
+   d->bd_fildrop = fildrop;
+   break;
+   default:
+   error = EINVAL;
+   break;
+   }
break;
+   }
 
case BIOCGDIRFILT:  /* get direction filter */
*(u_int *)addr = d->bd_dirfilt;
@@ -1261,23 +1272,26 @@ _bpf_mtap(caddr_t arg, const struct mbuf
pktlen += m0->m_len;
 
SRPL_FOREACH(d, , >bif_dlist, bd_next) {
+   struct srp_ref bsr;
+   struct bpf_program *bf;
+   struct bpf_insn *fcode = NULL;
+
atomic_inc_long(>bd_rcount);
 
-   if ((direction & d->bd_dirfilt) != 0)
-   slen = 0;
-   else {
-   struct srp_ref bsr;
-   struct bpf_program *bf;
-   struct bpf_insn *fcode = NULL;
-
-   bf = srp_enter(, >bd_rfilter);
-   if (bf != NULL)
-   fcode = bf->bf_insns;
-   slen = bpf_mfilter(fcode, m, pktlen);
-   srp_leave();
-   }
+   if (ISSET(d->bd_dirfilt, direction))
+   continue;
+
+   bf = srp_enter(, >bd_rfilter);
+   if (bf != NULL)
+   fcode = bf->bf_insns;
+   slen = bpf_mfilter(fcode, m, pktlen);
+   srp_leave();
 
-   if (slen > 0) {
+   if (slen == 0)
+   continue;
+   if (d->bd_fildrop != BPF_FILDROP_PASS)
+   drop = 1;
+   if (d->bd_fildrop != BPF_FILDROP_DROP) {
if (!gottime++)
microtime();
 
@@ -1285,9 +1299,6 @@ _bpf_mtap(caddr_t arg, const struct mbuf
bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn,
);
mtx_leave(>bd_mtx);
-
-   if (d->bd_fildrop)
-   drop = 1;
}
}
SRPL_LEAVE();
Index: sys/net/bpf.h
===
RCS file: /cvs/src/sys/net/bpf.h,v
retrieving revision 1.65
diff -u -p -r1.65 bpf.h
--- sys/net/bpf.h   3 Feb 2018 13:37:37 -   1.65
+++ sys/net/bpf.h   4 Mar 2019 22:30:32 -
@@ -126,6 +126,13 @@ struct bpf_version {
 #define BPF_DIRECTION_IN   1
 #define BPF_DIRECTION_OUT  (1<<1)
 
+/*
+ * Values for BIOCGFILDROP/BIOCSFILDROP
+ */
+#define BPF_FILDROP_PASS   0 /* capture, pass */
+#define BPF_FILDROP_CAPTURE1 /* capture, drop */
+#define BPF_FILDROP_DROP   2 /* no capture, drop */
+
 struct bpf_timeval {
u_int32_t   tv_sec;
u_int32_t   tv_usec;
Index: share/man/man4/bpf.4
===
RCS file: /cvs/src/share/man/man4/bpf.4,v
retrieving revision 1.38
diff -u -p -r1.38 bpf.4
--- share/man/man4/bpf.428 Apr 2016 19:07:19 -  1.38
+++ share/man/man4/bpf.44 Mar 2019 22:30:32 -
@@ -391,11 +391,24 @@ This flag is initialized to zero by defa
 .Pp
 .It Dv BIOCSFILDROP Fa "u_int *"
 .It Dv BIOCGFILDROP Fa "u_int *"
-Sets or gets the status of the
+Sets or gets the
 .Dq filter drop
-flag.
-If non-zero, packets matching any filters will be reported to the
-associated interface so that they can be dropped.
+action.
+The supported actions for packets matching

Re: Pass IPv6 through pppx(4)

2019-03-04 Thread David Gwynne

ok.

> On 5 Mar 2019, at 02:46, Denis Fondras  wrote:
> 
> Simple diff to allow IPv6 through pppx(4).
> 
> Denis
> 
> Index: net/if_pppx.c
> ===
> RCS file: /cvs/src/sys/net/if_pppx.c,v
> retrieving revision 1.66
> diff -u -p -r1.66 if_pppx.c
> --- net/if_pppx.c 11 Jul 2018 21:18:23 -  1.66
> +++ net/if_pppx.c 4 Mar 2019 16:33:50 -
> @@ -1047,6 +1047,11 @@ pppx_if_output(struct ifnet *ifp, struct
> #endif
>   if (pipex_enable) {
>   switch (dst->sa_family) {
> +#ifdef INET6
> + case AF_INET6:
> + proto = PPP_IPV6;
> + break;
> +#endif
>   case AF_INET:
>   proto = PPP_IP;
>   break;
>

interface queue drops in systat again, plus netstat

2019-03-04 Thread David Gwynne

this combines errors and qdrops into fails, and shows them by default.
if you want to look at drops or errors, you can use d or f to switch to
that view.

this also changes netstat so it shows fails by default which is a
combination of errors and qdrops too, but -d and -e force drops or
errors respectively.

it is really frustrating at the moment that i can't see qdrops anywhere,
which makes it hard to judge the effectiveness of some changes im
working on.

for example, this is before and after with netstat:

dlg@ix netstat$ netstat -I ix1 
NameMtu   Network Address  Ipkts IerrsOpkts Oerrs Colls
ix1 1500b8:ca:3a:66:e2:72 193968251 0 172754300 0
ix1 1500  192.168.1.3 192.168.1.3   193968251 0 172754300 0
ix1 1500  192.168.1.1 192.168.1.19  193968251 0 172754300 0
dlg@ix netstat$ ./obj/netstat -I ix1
NameMtu   Network Address  Ipkts IfailOpkts Ofail Colls
ix1 1500b8:ca:3a:66:e2:72 193968251 1789065 172754300 0
ix1 1500  192.168.1.3 192.168.1.3   193968251 1789065 172754300 0
ix1 1500  192.168.1.1 192.168.1.19  193968251 1789065 172754300 0
dlg@ix netstat$ ./obj/netstat -dI ix1
NameMtu   Network Address  Ipkts IdropOpkts Odrop Colls
ix1 1500b8:ca:3a:66:e2:72 193968251 1789065 172754300 0
ix1 1500  192.168.1.3 192.168.1.3   193968251 1789065 172754300 0
ix1 1500  192.168.1.1 192.168.1.19  193968251 1789065 172754300 0
dlg@ix netstat$ ./obj/netstat -eI ix1 
NameMtu   Network Address  Ipkts IerrsOpkts Oerrs Colls
ix1 1500b8:ca:3a:66:e2:72 193968251 0 172754300 0
ix1 1500  192.168.1.3 192.168.1.3   193968251 0 172754300 0
ix1 1500  192.168.1.1 192.168.1.19  193968251 0 172754300 0

thoughts?

Index: systat/if.c
===
RCS file: /cvs/src/usr.bin/systat/if.c,v
retrieving revision 1.23
diff -u -p -r1.23 if.c
--- systat/if.c 16 Jan 2015 00:03:37 -  1.23
+++ systat/if.c 4 Mar 2019 11:13:40 -
@@ -56,6 +56,49 @@ static void showifstat(struct ifstat *);
 static void showtotal(void);
 static void rt_getaddrinfo(struct sockaddr *, int, struct sockaddr **);
 
+const char ifails[] = "IFAILS";
+const char ofails[] = "OFAILS";
+
+#define IF_ERR_SUM 0
+#define IF_ERR_ERRORS  1
+#define IF_ERR_QDROPS  2
+
+struct if_err_view {
+   const char *iname;
+   const char *oname;
+   uint64_t (*icount)(const struct ifcount *);
+   uint64_t (*ocount)(const struct ifcount *);
+};
+
+static uint64_t if_err_ifails(const struct ifcount *);
+static uint64_t if_err_ofails(const struct ifcount *);
+static uint64_t if_err_ierrors(const struct ifcount *);
+static uint64_t if_err_oerrors(const struct ifcount *);
+static uint64_t if_err_iqdrops(const struct ifcount *);
+static uint64_t if_err_oqdrops(const struct ifcount *);
+
+static const struct if_err_view if_err_views[] = {
+   [IF_ERR_SUM] ={
+   .iname = ifails,
+   .oname = ofails,
+   .icount = if_err_ifails,
+   .ocount = if_err_ofails,
+   },
+   [IF_ERR_ERRORS] = {
+   .iname = "IERRS",
+   .oname = "OERRS",
+   .icount = if_err_ierrors,
+   .ocount = if_err_oerrors,
+   },
+   [IF_ERR_QDROPS] = {
+   .iname = "IQDROPS",
+   .oname = "OQDROPS",
+   .icount = if_err_iqdrops,
+   .ocount = if_err_oqdrops,
+   },
+};
+
+static const struct if_err_view *if_err_view = _err_views[IF_ERR_SUM];
 
 /* Define fields */
 field_def fields_if[] = {
@@ -63,10 +106,10 @@ field_def fields_if[] = {
{"STATE", 4, 6, 1, FLD_ALIGN_LEFT, -1, 0, 0, 0},
{"IPKTS", 5, 8, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0},
{"IBYTES", 5, 8, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0},
-   {"IERRS", 5, 8, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0},
+   {ifails, 5, 8, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0},
{"OPKTS", 5, 8, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0},
{"OBYTES", 5, 8, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0},
-   {"OERRS", 5, 8, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0},
+   {ofails, 5, 8, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0},
{"COLLS", 5, 8, 1, FLD_ALIGN_RIGHT, -1, 0, 0, 0},
{"DESC", 14, 64, 1, FLD_ALIGN_LEFT, -1, 0, 0, 0},
 };
@@ -264,9 +307,11 @@ fetchifstat(void)
UPDATE(ifc_ip, ifm_data.ifi_ipackets);
UPDATE(ifc_ib, ifm_data.ifi_ibytes);
UPDATE(ifc_ie, ifm_data.ifi_ierrors);
+   UPDATE(ifc_iq, ifm_data.ifi_iqdrops);
UPDATE(ifc_op, ifm_data.ifi_opackets);
UPDATE(ifc_ob, ifm_data.ifi_obytes);
UPDATE(ifc_oe, ifm_data.ifi_oerrors);
+   UPDATE(ifc_oq, ifm_data.ifi_oqdrops);
UPDATE(ifc_co, ifm_data.ifi_collisions);

mandoc -Tlint systat.1 fix

2019-03-04 Thread David Gwynne

lint thinks uvm_swap_get() looks like a function name, so this uses .Fn
to mark it up as one.

ok?

Index: systat.1
===
RCS file: /cvs/src/usr.bin/systat/systat.1,v
retrieving revision 1.110
diff -u -p -r1.110 systat.1
--- systat.125 Jul 2018 17:24:14 -  1.110
+++ systat.14 Mar 2019 10:58:15 -
@@ -697,7 +697,8 @@ swap pages in use
 .It swpgonly
 in use swap pages not in RAM
 .It nswget
-fault called uvm_swap_get()
+fault called
+.Fn uvm_swap_get
 .It nanon
 total anon's
 .Pp

use ifiq_input and if_rxr_livelocked in ix(4)

2019-02-28 Thread David Gwynne

using ifiq_input lets ix check the return value from that function
so it can call if_rxr_livelocked as needed. calling if_rxr_livelocked
will make it shrink the rx rings before ifiq_input has to start
dropping packets. the idea being that dropping in hardware lets the
cpu spend more time processing packets instead of freeing them.

this works best if you have my "if_input_process doesn't need to mask
interrupts" diff in the tree too. without that the rx ring gets used
more than it should, so it grows them massively to all 255 slots we give
ix rings. with the two diffs together, ix grows the rings to 20 or
30 slots and still manages to forward a lot of packets. the box
feels smoother under a DoS too.

the trick is to figure out the right thresholds in ifiq_input.

Index: if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.156
diff -u -p -r1.156 if_ix.c
--- if_ix.c 1 Mar 2019 06:15:59 -   1.156
+++ if_ix.c 1 Mar 2019 06:16:29 -
@@ -2903,7 +2903,8 @@ next_desc:
}
rxr->next_to_check = i;
 
-   if_input(ifp, );
+   if (ifiq_input(>if_rcv, ))
+   if_rxr_livelocked(>rx_ring);
 
if (!(staterr & IXGBE_RXD_STAT_DD))
return FALSE;

if_input_process doesn't need to mask interrupts

2019-02-28 Thread David Gwynne

it is in fact harmful for the stack to block hardware interrupts.

the nettq can run on cpu0, which is where interrupts for nics come in
too. if a busy nic has fed the stack a lot of work, and the stack is on
cpu0, then the stack will stop the driver pulling packets off the nic
again. the hardware will then place more packets on the rx ring
than it would have if the isr had serviced the ring, which messes
up the rx ring moderation we do. it will inflate the ring usage so the
rings will grow, letting it give the stack more work. it basically makes
the whole problem inflate.

rx ring moderation works a lot better with this diff. id argue it's
broken without it actually. without it the current watermark on
rings grows to the max under DoS conditions.

can anyone think of why the current network stack needs any interrupt
protection?

ok?

Index: if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.573
diff -u -p -r1.573 if.c
--- if.c1 Mar 2019 04:47:32 -   1.573
+++ if.c1 Mar 2019 05:14:29 -
@@ -900,7 +900,6 @@ if_input_process(struct ifnet *ifp, stru
struct mbuf *m;
struct ifih *ifih;
struct srp_ref sr;
-   int s;
 
if (ml_empty(ml))
return;
@@ -921,7 +920,6 @@ if_input_process(struct ifnet *ifp, stru
 * lists.
 */
NET_RLOCK();
-   s = splnet();
while ((m = ml_dequeue(ml)) != NULL) {
/*
 * Pass this mbuf to all input handlers of its
@@ -936,7 +934,6 @@ if_input_process(struct ifnet *ifp, stru
if (ifih == NULL)
m_freem(m);
}
-   splx(s);
NET_RUNLOCK();
 }

rework the interface input backpressure mechanism

2019-02-27 Thread David Gwynne

this changes how ifiq_input measures whether there is too much work for
the network stack to do, which in turn is used to decide whether it
should drop packets or not.

currently we count the number of packets still on the queue, and we drop
when that backlog of packets gets too high. currently that threshold
ends up being 10240 packets for every individual nic or ring on a
nic, which is "quite high". it seems to mean that when the network
stack is under load, we keep feeding it 10k packets at a time, and
dropping the rest in between.

this moves us to counting the number of times a nic or ring tried to
enqueue packets for the network stack. so, ifiq_input operations
add 1 to an ifiq_pressure variable, and the task that dequeues and
processes the packets resets it to 0. if ifiq_pressure grows too
much due to a lot of enqueue operations without matching dequeue
ops, it assumes there's too much pressure and begins to drop packets.

this is a much nicer mechanism since it effectively scales to the
system it's running on, and feeds smaller and therefore smoother bundles
of packets into the stack. this is instead of having a queue limit
we hope works well enough on all systems ranging from a raspberry
pi all the way up to a high speed xeon box with 40Gb nics.

it seems to work well in practice. i had hoped that there'd be no
difference in performance, but hrvoje popovski has tested it and noted
an increase on one system from 1.1mpps to sitting between 1.1 and
1.3mpps, and a slow box going from 730 to 745kpps.

there's some follow on work moving nics that implement rx ring
moderation to use ifiq_input directly, and reducing their rings in
hardware before we have to resort to dropping packets in software.
my initial tests with a change like that have ix forwarding over
90% of the same pps as before, but moderating its rings to about
20 or 30 packets.  currently it grows them to the full 255 slots
and we drop 2/3rds of them in ifiq_input. i'm hoping i can tune it
so there's no drop in pps but we run with much smaller rings and
without spending so much time in m_freem.

im sending this out so people can object, otherwise im committing it
tomorrow, or about 24 hours from now.

Index: if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.572
diff -u -p -r1.572 if.c
--- if.c26 Feb 2019 03:20:08 -  1.572
+++ if.c27 Feb 2019 12:05:06 -
@@ -738,7 +738,7 @@ if_enqueue_ifq(struct ifnet *ifp, struct
 void
 if_input(struct ifnet *ifp, struct mbuf_list *ml)
 {
-   ifiq_input(>if_rcv, ml, 2048);
+   ifiq_input(>if_rcv, ml);
 }
 
 int
Index: ifq.c
===
RCS file: /cvs/src/sys/net/ifq.c,v
retrieving revision 1.25
diff -u -p -r1.25 ifq.c
--- ifq.c   16 Dec 2018 03:36:02 -  1.25
+++ ifq.c   27 Feb 2019 12:05:06 -
@@ -445,6 +445,7 @@ ifiq_init(struct ifiqueue *ifiq, struct 
mtx_init(>ifiq_mtx, IPL_NET);
ml_init(>ifiq_ml);
task_set(>ifiq_task, ifiq_process, ifiq);
+   ifiq->ifiq_pressure = 0;
 
ifiq->ifiq_qdrops = 0;
ifiq->ifiq_packets = 0;
@@ -467,17 +468,20 @@ ifiq_destroy(struct ifiqueue *ifiq)
ml_purge(>ifiq_ml);
 }
 
+unsigned int ifiq_pressure_drop = 16;
+unsigned int ifiq_pressure_return = 2;
+
 int
-ifiq_input(struct ifiqueue *ifiq, struct mbuf_list *ml, unsigned int cwm)
+ifiq_input(struct ifiqueue *ifiq, struct mbuf_list *ml)
 {
struct ifnet *ifp = ifiq->ifiq_if;
struct mbuf *m;
uint64_t packets;
uint64_t bytes = 0;
+   unsigned int pressure;
 #if NBPFILTER > 0
caddr_t if_bpf;
 #endif
-   int rv = 1;
 
if (ml_empty(ml))
return (0);
@@ -518,12 +522,11 @@ ifiq_input(struct ifiqueue *ifiq, struct
ifiq->ifiq_packets += packets;
ifiq->ifiq_bytes += bytes;
 
-   if (ifiq_len(ifiq) >= cwm * 5)
+   pressure = ++ifiq->ifiq_pressure;
+   if (pressure > ifiq_pressure_drop)
ifiq->ifiq_qdrops += ml_len(ml);
-   else {
-   rv = (ifiq_len(ifiq) >= cwm * 3);
+   else
ml_enlist(>ifiq_ml, ml);
-   }
mtx_leave(>ifiq_mtx);
 
if (ml_empty(ml))
@@ -531,7 +534,7 @@ ifiq_input(struct ifiqueue *ifiq, struct
else
ml_purge(ml);
 
-   return (rv);
+   return (pressure > ifiq_pressure_return);
 }
 
 void
@@ -573,6 +576,7 @@ ifiq_process(void *arg)
return;
 
mtx_enter(>ifiq_mtx);
+   ifiq->ifiq_pressure = 0;
ml = ifiq->ifiq_ml;
ml_init(>ifiq_ml);
mtx_leave(>ifiq_mtx);
Index: ifq.h
===
RCS file: /cvs/src/sys/net/ifq.h,v
retrieving revision 1.22
diff -u -p -r1.22 ifq.h
--- ifq.h   11 Dec 2018 01:36:42 -  1.22
+++ ifq.h   27 Feb 2019 12:05:06 -
@@ -80,6 +80,7 @@ struct ifiqueue {

mpip(4): MPLS "IP Layer2 Transport" pseudowire interface

2019-02-25 Thread David Gwynne

according to the pwe3 type registry, you can use a pseudowire as a
transport for ip packets. LDP can negotiate this (not ldpd yet) as type
0x000b, but you basically end up with a p2p ip tunnel over an mpls
fabric.

this can be handy if you just want to join two sites together and might
mean you don't have to configure a whole extra routing protocol to get
connectivity up. the existing pwe3 ioctls can be used to configure this
interface, and ldpd support will be forthcoming.

can someone tell me if the conf/files bit makes sense?

ok?

Index: conf/files
===
RCS file: /cvs/src/sys/conf/files,v
retrieving revision 1.666
diff -u -p -r1.666 files
--- conf/files  20 Dec 2018 23:00:55 -  1.666
+++ conf/files  26 Feb 2019 03:46:02 -
@@ -59,6 +59,7 @@ defineonewire_bitbang
 # net device attributes - we have generic code for ether(net)
 define crypto
 define ether
+define mpls
 define sppp
 define wlan
 
@@ -563,6 +564,7 @@ pseudo-device crypto: ifnet
 pseudo-device trunk: ifnet, ether, ifmedia
 pseudo-device mpe: ifnet, ether
 pseudo-device mpw: ifnet, ether
+pseudo-device mpip: ifnet, mpls
 pseudo-device bpe: ifnet, ether, ifmedia
 pseudo-device vether: ifnet, ether
 pseudo-device pppx: ifnet
@@ -814,6 +816,7 @@ file net/if_trunk.c trunk   
needs-coun
 file net/trunklacp.c   trunk
 file net/if_mpe.c  mpe needs-count
 file net/if_mpw.c  mpw & bridgeneeds-count
+file net/if_mpip.c mpip
 file net/if_bpe.c  bpe needs-count
 file net/if_vether.c   vether  needs-count
 file net/if_pair.c pairneeds-count
Index: net/if_mpip.c
===
RCS file: net/if_mpip.c
diff -N net/if_mpip.c
--- /dev/null   1 Jan 1970 00:00:00 -
+++ net/if_mpip.c   26 Feb 2019 03:46:02 -
@@ -0,0 +1,706 @@
+/* $OpenBSD$ */
+
+/*
+ * Copyright (c) 2015 Rafael Zalamena 
+ * Copyright (c) 2019 David Gwynne 
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "bpfilter.h"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#ifdef INET6
+#include 
+#endif
+
+#include 
+
+#if NBPFILTER > 0
+#include 
+#endif /* NBPFILTER */
+
+struct mpip_neighbor {
+   struct shim_hdr n_rshim;
+   struct sockaddr_storage n_nexthop;
+};
+
+struct mpip_softc {
+   struct ifnetsc_if;
+   unsigned intsc_dead;
+   uint32_tsc_flow; /* xor for mbuf flowid */
+
+   struct ifaddr   sc_ifa;
+   struct sockaddr_mplssc_smpls; /* Local label */
+   unsigned intsc_rdomain;
+   struct mpip_neighbor*sc_neighbor;
+
+   unsigned intsc_cword; /* control word */
+   unsigned intsc_fword; /* flow-aware transport */
+   int sc_ttl;
+};
+
+void   mpipattach(int);
+intmpip_clone_create(struct if_clone *, int);
+intmpip_clone_destroy(struct ifnet *);
+intmpip_ioctl(struct ifnet *, u_long, caddr_t);
+intmpip_output(struct ifnet *, struct mbuf *, struct sockaddr *,
+   struct rtentry *);
+void   mpip_start(struct ifnet *);
+
+struct if_clone mpip_cloner =
+IF_CLONE_INITIALIZER("mpip", mpip_clone_create, mpip_clone_destroy);
+
+void
+mpipattach(int n)
+{
+   if_clone_attach(_cloner);
+}
+
+int
+mpip_clone_create(struct if_clone *ifc, int unit)
+{
+   struct mpip_softc *sc;
+   struct ifnet *ifp;
+
+   sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_CANFAIL|M_ZERO);
+   if (sc == NULL)
+   return (ENOMEM);
+
+   sc->sc_neighbor = 0;
+   sc->sc_cword = 0; /* default to no control word */
+   sc->sc_fword = 0; /* both sides have to agree on FAT first */
+   sc->sc_flow = arc4random() & 0xf;
+   sc->sc_smpls.smpls_len = sizeof(sc->sc_smpls);
+   sc->sc_smpls.smpls_family = AF_MPLS;
+   sc->sc

Re: ix(4): align rx payloads to the end of the cluster

2019-02-25 Thread David Gwynne

On Mon, Feb 25, 2019 at 08:44:35AM +0100, Claudio Jeker wrote:
> On Mon, Feb 25, 2019 at 10:49:16AM +1000, David Gwynne wrote:
> > the mcl2k2 pool, aka the intel mbuf cluster pool, gets set up to allocate
> > at least 2048 + 2 bytes, which gets rounded up by 64 bytes to 2112
> > bytes. this diff makes ix move the reception of packets to the end of
> > the 2112 byte allocation so there's space left at the front of the mbuf.
> > 
> > this in turn makes it more likely that an m_prepend at another point in
> > the system will work without an extra mbuf allocation. eg, if you're
> > bridging or routing between vlans and vlans on svlans somewhere else,
> > this will be a bit faster with this diff.
> > 
> > thoughts? ok?
> 
> I think using m_align() here may be benefitial. Since it does exactly
> that. Apart from that I have to agree, shifting the packet back makes a
> lot of sense.

Like this?

Index: if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.153
diff -u -p -r1.153 if_ix.c
--- if_ix.c 21 Feb 2019 03:16:47 -  1.153
+++ if_ix.c 25 Feb 2019 10:06:59 -
@@ -2389,8 +2395,8 @@ ixgbe_get_buf(struct rx_ring *rxr, int i
if (!mp)
return (ENOBUFS);
 
+   m_align(mp, sc->rx_mbuf_sz);
mp->m_len = mp->m_pkthdr.len = sc->rx_mbuf_sz;
-   m_adj(mp, ETHER_ALIGN);
 
error = bus_dmamap_load_mbuf(rxr->rxdma.dma_tag, rxbuf->map,
mp, BUS_DMA_NOWAIT);

Re: ioctls for MPLS pseudowire interface config

2019-02-25 Thread David Gwynne

On Mon, Feb 25, 2019 at 03:28:58PM +1000, David Gwynne wrote:
> On Mon, Feb 25, 2019 at 10:37:40AM +1000, David Gwynne wrote:
> > 
> > 
> > > On 22 Feb 2019, at 05:01, Martin Pieuchot  wrote:
> > > 
> > > On 21/02/19(Thu) 07:35, David Gwynne wrote:
> > >>> On 20 Feb 2019, at 11:21 pm, Martin Pieuchot  wrote:
> > >>> 
> > >>> On 20/02/19(Wed) 14:44, David Gwynne wrote:
> > >>>> Index: sys/net/if.c
> > >>>> ===
> > >>>> RCS file: /cvs/src/sys/net/if.c,v
> > >>>> retrieving revision 1.571
> > >>>> diff -u -p -r1.571 if.c
> > >>>> --- sys/net/if.c   9 Jan 2019 01:14:21 -   1.571
> > >>>> +++ sys/net/if.c   20 Feb 2019 04:35:42 -
> > >>>> @@ -2143,6 +2143,25 @@ ifioctl(struct socket *so, u_long cmd, c
> > >>>>NET_UNLOCK();
> > >>>>break;
> > >>>> 
> > >>>> +  case SIOCSETMPWCFG:
> > >>>> +  case SIOCSPWE3CTRLWORD:
> > >>>> +  case SIOCSPWE3FAT:
> > >>>> +  case SIOCSPWE3NEIGHBOR:
> > >>>> +  case SIOCDPWE3NEIGHBOR:
> > >>>> +  if ((error = suser(p)) != 0)
> > >>>> +  break;
> > >>>> +  /* FALLTHROUGH */
> > >>>> +  case SIOCGETMPWCFG:
> > >>>> +  case SIOCGPWE3CTRLWORD:
> > >>>> +  case SIOCGPWE3FAT:
> > >>>> +  case SIOCGPWE3NEIGHBOR:
> > >>>> +  if_ref(ifp);
> > >>>> +  KERNEL_UNLOCK();
> > >>>> +  error = ((*ifp->if_ioctl)(ifp, cmd, data));
> > >>>> +  KERNEL_LOCK();
> > >>>> +  if_put(ifp);
> > >>> 
> > >>> Why are you referencing the `ifp' and grabbing the KERNEL_LOCK()
> > >>> (recursively)?
> > >> 
> > >> ifioctl gets the ifp pointer from ifunit, which doesn't increase the ref 
> > >> count for you. I'm giving up kernel lock around the pwe3 ioctl calls 
> > >> into the driver, not taking them harder. Taking the ifp ref there 
> > >> guarantees the interface will stay alive^Wallocated over those calls.
> > > 
> > > It feels premature to me, well I'm confused.  None of the other ioctl
> > > handlers do that.  The KERNEL_LOCK() is still held in ifioctl() which
> > > guarantees serialization.  If any of the ioctl(2) calls is going to sleep,
> > > thus releasing the lock, then I'd suggest to grab the NET_RLOCK() here
> > > instead.  It still guarantees serialization of network ioctls w/ regard
> > > to detach.
> > > 
> > > Note that I'll be delighted if you want to remove/push down the NET_LOCK()
> > > from this code path, but can we keep the handlers coherent?
> > > 
> > > Even if we're using refcounting, don't we want to serialize all network
> > > ioctl(2)s?  If we're not using the NET_LOCK() for this, can this new lock
> > > guarantee that that `ifp' isn't going away?  Or do you have a better
> > > idea?
> > 
> > The network stack implicitly taking locks is hurting me way more
> > than it's helping me at the moment, particularly the net lock, so
> > I would like to spend some time narrowing that down. If the consensus
> > is that it's too much risk for drivers to keep themselves consistent
> > then I'd argue for a per ifp rwlock that the ifioctl code could
> > take and release.
> > 
> > Do you want me to do that for the pwe3 ioctls? There's a small
> > number of MPLS interfaces, so they'd be good for a test run.
> > 
> > ifunit() is notionally like if_get except it doesn't give you a
> > reference. You have to be holding a lock that prevents the interface
> > being removed from the list if you're calling ifunit. The code
> > doesn't make it clear whether the lock you need to be holding is
> > the kernel lock or the net lock, but the kernel lock is empirically
> > good enough. If you give up that lock while holding the ifp, you
> > need to account for your reference to it.
> 
> deraadt@ talked me down from giving up KERNEL_LOCK. so this is what
> the diff would be like if the interface had a lock and it was taken
> around the mpls ioctls.
> 
> my opinion on the pros and cons of this is:
> 
> pro: it keeps

Re: ioctls for MPLS pseudowire interface config

2019-02-24 Thread David Gwynne

On Mon, Feb 25, 2019 at 10:37:40AM +1000, David Gwynne wrote:
> 
> 
> > On 22 Feb 2019, at 05:01, Martin Pieuchot  wrote:
> > 
> > On 21/02/19(Thu) 07:35, David Gwynne wrote:
> >>> On 20 Feb 2019, at 11:21 pm, Martin Pieuchot  wrote:
> >>> 
> >>> On 20/02/19(Wed) 14:44, David Gwynne wrote:
> >>>> Index: sys/net/if.c
> >>>> ===
> >>>> RCS file: /cvs/src/sys/net/if.c,v
> >>>> retrieving revision 1.571
> >>>> diff -u -p -r1.571 if.c
> >>>> --- sys/net/if.c 9 Jan 2019 01:14:21 -   1.571
> >>>> +++ sys/net/if.c 20 Feb 2019 04:35:42 -
> >>>> @@ -2143,6 +2143,25 @@ ifioctl(struct socket *so, u_long cmd, c
> >>>>  NET_UNLOCK();
> >>>>  break;
> >>>> 
> >>>> +case SIOCSETMPWCFG:
> >>>> +case SIOCSPWE3CTRLWORD:
> >>>> +case SIOCSPWE3FAT:
> >>>> +case SIOCSPWE3NEIGHBOR:
> >>>> +case SIOCDPWE3NEIGHBOR:
> >>>> +if ((error = suser(p)) != 0)
> >>>> +break;
> >>>> +/* FALLTHROUGH */
> >>>> +case SIOCGETMPWCFG:
> >>>> +case SIOCGPWE3CTRLWORD:
> >>>> +case SIOCGPWE3FAT:
> >>>> +case SIOCGPWE3NEIGHBOR:
> >>>> +if_ref(ifp);
> >>>> +KERNEL_UNLOCK();
> >>>> +error = ((*ifp->if_ioctl)(ifp, cmd, data));
> >>>> +KERNEL_LOCK();
> >>>> +if_put(ifp);
> >>> 
> >>> Why are you referencing the `ifp' and grabbing the KERNEL_LOCK()
> >>> (recursively)?
> >> 
> >> ifioctl gets the ifp pointer from ifunit, which doesn't increase the ref 
> >> count for you. I'm giving up kernel lock around the pwe3 ioctl calls into 
> >> the driver, not taking them harder. Taking the ifp ref there guarantees 
> >> the interface will stay alive^Wallocated over those calls.
> > 
> > It feels premature to me, well I'm confused.  None of the other ioctl
> > handlers do that.  The KERNEL_LOCK() is still held in ifioctl() which
> > guarantees serialization.  If any of the ioctl(2) calls is going to sleep,
> > thus releasing the lock, then I'd suggest to grab the NET_RLOCK() here
> > instead.  It still guarantees serialization of network ioctls w/ regard
> > to detach.
> > 
> > Note that I'll be delighted if you want to remove/push down the NET_LOCK()
> > from this code path, but can we keep the handlers coherent?
> > 
> > Even if we're using refcounting, don't we want to serialize all network
> > ioctl(2)s?  If we're not using the NET_LOCK() for this, can this new lock
> > guarantee that that `ifp' isn't going away?  Or do you have a better
> > idea?
> 
> The network stack implicitly taking locks is hurting me way more
> than it's helping me at the moment, particularly the net lock, so
> I would like to spend some time narrowing that down. If the consensus
> is that it's too much risk for drivers to keep themselves consistent
> then I'd argue for a per ifp rwlock that the ifioctl code could
> take and release.
> 
> Do you want me to do that for the pwe3 ioctls? There's a small
> number of MPLS interfaces, so they'd be good for a test run.
> 
> ifunit() is notionally like if_get except it doesn't give you a
> reference. You have to be holding a lock that prevents the interface
> being removed from the list if you're calling ifunit. The code
> doesn't make it clear whether the lock you need to be holding is
> the kernel lock or the net lock, but the kernel lock is empirically
> good enough. If you give up that lock while holding the ifp, you
> need to account for your reference to it.

deraadt@ talked me down from giving up KERNEL_LOCK. so this is what
the diff would be like if the interface had a lock and it was taken
around the mpls ioctls.

my opinion on the pros and cons of this is:

pro: it keeps the individual driver state consistent cos changes
are serialised by the lock. this means you don't have to think too
hard about the driver locking against itself.

pro: it allows fear free use of ifq_barrier. ifq_barrier cannot deadlock
if the caller isn't holding NET_LOCK. this is the big win because it
supports the model where the ioctl can coordinate with the running stack
by publishing a change and then inserting a barrier to ensure the old
state is no longe

add the RETURN VALUES section to rwlock.9

2019-02-24 Thread David Gwynne

i had to think a bit about what rw_lock returns, so i made this.

is this worth it?

Index: rwlock.9
===
RCS file: /cvs/src/share/man/man9/rwlock.9,v
retrieving revision 1.23
diff -u -p -r1.23 rwlock.9
--- rwlock.9 4 Jun 2018 04:52:33 -   1.23
+++ rwlock.9 25 Feb 2019 01:27:21 -
@@ -188,19 +188,7 @@ functions check the status
 panicking if it is not write-, read-, any-, or unlocked, respectively.
 .Pp
 .Nm rw_status
-returns the current state of the lock:
-.Pp
-.Bl -tag -width "RW_WRITE_OTHER" -offset indent -compact
-.It Dv RW_WRITE
-Lock is write locked by the calling thread.
-.It Dv RW_WRITE_OTHER
-Lock is write locked by a different thread.
-.It Dv RW_READ
-Lock is read locked.
-The current thread may be one of the threads that has it locked.
-.It 0
-Lock is not locked.
-.El
+returns the current state of the lock.
 .Pp
 A lock declaration may be initialised with the
 .Fn RWLOCK_INITIALIZER
@@ -223,6 +211,30 @@ and
 can be called during autoconf, from process context, or from interrupt context.
 .Pp
 All other functions can be called during autoconf or from process context.
+.Sh RETURN VALUES
+.Nm rw_enter
+and
+.Nm rrw_enter
+return 0 on success, or an
+.Xr errno 2
+style value on failure.
+.Pp
+.Nm rw_status
+and
+.Nm rrw_status
+return the state of the lock:
+.Pp
+.Bl -tag -width "RW_WRITE_OTHER" -offset indent -compact
+.It Dv RW_WRITE
+Lock is write locked by the calling thread.
+.It Dv RW_WRITE_OTHER
+Lock is write locked by a different thread.
+.It Dv RW_READ
+Lock is read locked.
+The current thread may be one of the threads that has it locked.
+.It 0
+Lock is not locked.
+.El
 .Sh SEE ALSO
 .Xr witness 4 ,
 .Xr mutex 9 ,

ix(4): align rx payloads to the end of the cluster

2019-02-24 Thread David Gwynne

the mcl2k2 pool, aka the intel mbuf cluster pool, gets set up to allocate
at least 2048 + 2 bytes, which gets rounded up to a multiple of 64 bytes,
ie, 2112 bytes. this diff makes ix move the reception of packets to the end of
the 2112 byte allocation so there's space left at the front of the mbuf.

this in turn makes it more likely that an m_prepend at another point in
the system will work without an extra mbuf allocation. eg, if you're
bridging or routing between vlans and vlans on svlans somewhere else,
this will be a bit faster with this diff.

thoughts? ok?

Index: dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.152
diff -u -p -r1.152 if_ix.c
--- dev/pci/if_ix.c 22 Jun 2017 02:44:37 -  1.152
+++ dev/pci/if_ix.c 25 Feb 2019 00:40:47 -
@@ -2445,7 +2445,7 @@ ixgbe_get_buf(struct rx_ring *rxr, int i
return (ENOBUFS);
 
mp->m_len = mp->m_pkthdr.len = sc->rx_mbuf_sz;
-   m_adj(mp, ETHER_ALIGN);
+   m_adj(mp, mp->m_ext.ext_size - sc->rx_mbuf_sz);
 
error = bus_dmamap_load_mbuf(rxr->rxdma.dma_tag, rxbuf->map,
mp, BUS_DMA_NOWAIT);

Re: ioctls for MPLS pseudowire interface config

2019-02-24 Thread David Gwynne




> On 22 Feb 2019, at 05:01, Martin Pieuchot  wrote:
> 
> On 21/02/19(Thu) 07:35, David Gwynne wrote:
>>> On 20 Feb 2019, at 11:21 pm, Martin Pieuchot  wrote:
>>> 
>>> On 20/02/19(Wed) 14:44, David Gwynne wrote:
>>>> Index: sys/net/if.c
>>>> ===
>>>> RCS file: /cvs/src/sys/net/if.c,v
>>>> retrieving revision 1.571
>>>> diff -u -p -r1.571 if.c
>>>> --- sys/net/if.c   9 Jan 2019 01:14:21 -   1.571
>>>> +++ sys/net/if.c   20 Feb 2019 04:35:42 -
>>>> @@ -2143,6 +2143,25 @@ ifioctl(struct socket *so, u_long cmd, c
>>>>NET_UNLOCK();
>>>>break;
>>>> 
>>>> +  case SIOCSETMPWCFG:
>>>> +  case SIOCSPWE3CTRLWORD:
>>>> +  case SIOCSPWE3FAT:
>>>> +  case SIOCSPWE3NEIGHBOR:
>>>> +  case SIOCDPWE3NEIGHBOR:
>>>> +  if ((error = suser(p)) != 0)
>>>> +  break;
>>>> +  /* FALLTHROUGH */
>>>> +  case SIOCGETMPWCFG:
>>>> +  case SIOCGPWE3CTRLWORD:
>>>> +  case SIOCGPWE3FAT:
>>>> +  case SIOCGPWE3NEIGHBOR:
>>>> +  if_ref(ifp);
>>>> +  KERNEL_UNLOCK();
>>>> +  error = ((*ifp->if_ioctl)(ifp, cmd, data));
>>>> +  KERNEL_LOCK();
>>>> +  if_put(ifp);
>>> 
>>> Why are you referencing the `ifp' and grabbing the KERNEL_LOCK()
>>> (recursively)?
>> 
>> ifioctl gets the ifp pointer from ifunit, which doesn't increase the ref 
>> count for you. I'm giving up kernel lock around the pwe3 ioctl calls into 
>> the driver, not taking them harder. Taking the ifp ref there guarantees the 
>> interface will stay alive^Wallocated over those calls.
> 
> It feels premature to me, well I'm confused.  None of the other ioctl
> handlers do that.  The KERNEL_LOCK() is still held in ifioctl() which
> guarantees serialization.  If any of the ioctl(2) calls is going to sleep,
> thus releasing the lock, then I'd suggest to grab the NET_RLOCK() here
> instead.  It still guarantees serialization of network ioctls w/ regard
> to detach.
> 
> Note that I'll be delighted if you want to remove/push down the NET_LOCK()
> from this code path, but can we keep the handlers coherent?
> 
> Even if we're using refcounting, don't we want to serialize all network
> ioctl(2)s?  If we're not using the NET_LOCK() for this, can this new lock
> guarantee that that `ifp' isn't going away?  Or do you have a better
> idea?

The network stack implicitly taking locks is hurting me way more than it's 
helping me at the moment, particularly the net lock, so I would like to spend 
some time narrowing that down. If the consensus is that it's too much risk for 
drivers to keep themselves consistent then I'd argue for a per ifp rwlock that 
the ifioctl code could take and release.

Do you want me to do that for the pwe3 ioctls? There's a small number of MPLS 
interfaces, so they'd be good for a test run.

ifunit() is notionally like if_get except it doesn't give you a reference. You 
have to be holding a lock that prevents the interface being removed from the 
list if you're calling ifunit. The code doesn't make it clear whether the lock 
you need to be holding is the kernel lock or the net lock, but the kernel lock 
is empirically good enough. If you give up that lock while holding the ifp, you 
need to account for your reference to it.

Re: bypass interface input queues for vlan(4)

2019-02-23 Thread David Gwynne

On Fri, Feb 22, 2019 at 09:56:58AM -0300, Martin Pieuchot wrote:
> On 22/02/19(Fri) 15:01, David Gwynne wrote:
> > On Thu, Feb 21, 2019 at 04:29:27PM -0300, Martin Pieuchot wrote:
> > > On 21/02/19(Thu) 14:19, David Gwynne wrote:
> > > > right now we add vlan_input as a possible input handler on the parent
> > > > interface, and if the packet is for a vlan we take it and pretend we
> > > > received it on the vlan interface by calling if_input against that mbuf.
> > > > 
> > > > as mpi notes, the if input queue stuff looks like a lot of work,
> > > > especially for a single packet. my opinion is that we got away with
> > > > the if input stuff we've done to try and encourage an mpsafe network
> > > > stack because we amortised the cost of it over many packets off the
> > > > hardware ring. vlan does it a packet at a time.
> > > > 
> > > > this moves the handling of vlan packets back into ether_input by
> > > > calling vlan_input directly on packets that are either marked as vlan
> > > > tagged or have a vlan ethertype. note that we have to do that anyway,
> > > > this just makes it explicit.
> > > > 
> > > > vlan_input is then tweaked to implement all the important bits of if
> > > > input. part of what if input does is count the packets. because vlan
> > > > already has per cpu counters for bypassing queues on output, we can use
> > > > them again for input from any cpu. if i ever get round to making a
> > > > driver handle multiple rx rings this means we can rx vlan packets
> > > > concurrently, they don't get serialised to a single if input q.
> > > > 
> > > > finally, hrvoje popovski has tested this diff and get's a significant
> > > > bump with it. on a machine that can forward 1100Kpps without vlan, it
> > > > goes from 790Kpps with vlan to 870Kpps. On a box that can do 730Kpps
> > > > without vlans, it goes from 550Kpps with vlan to 840Kpps. We're
> > > > still trying to figure that last one out, but it does appear to be
> > > > faster.
> > > > 
> > > > thoughts? ok?
> > > 
> > > Why do we need to move stuff to ether_input() if all we want is to
> > > bypass ifiq_input()?  Isn't a 3 line diff enough^^ ?
> > 
> > Fair point. It turns out it's not quite three lines, but it's still
> > smaller.
> 
> I'm unhappy to see the bpf & packet magic reappear in pseudo-drivers.
> 
> This is going to spread in every pseudo-driver, no?  So why not keeping
> it in the new API?   Should we document if_input() vs if_input_one()?
> Should we assert that if_input_one() is only called from a network
> thread?  If yes, should we pick a better name?

Maybe. How's if_vinput? And as you suggest, it can do some more of
the magic? Let me try converting a few more drivers before we
burden it with constraints.

Index: if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.571
diff -u -p -r1.571 if.c
--- if.c 9 Jan 2019 01:14:21 -   1.571
+++ if.c 23 Feb 2019 09:06:27 -
@@ -895,11 +895,29 @@ if_ih_remove(struct ifnet *ifp, int (*in
 }
 
 void
-if_input_process(struct ifnet *ifp, struct mbuf_list *ml)
+if_input_ih(struct ifnet *ifp, struct mbuf *m)
 {
-   struct mbuf *m;
struct ifih *ifih;
struct srp_ref sr;
+
+   /*
+* Pass this mbuf to all input handlers of its
+* interface until it is consumed.
+*/
+   SRPL_FOREACH(ifih, &sr, &ifp->if_inputs, ifih_next) {
+   if ((*ifih->ifih_input)(ifp, m, ifih->ifih_cookie))
+   break;
+   }
+   SRPL_LEAVE(&sr);
+
+   if (ifih == NULL)
+   m_freem(m);
+}
+
+void
+if_input_process(struct ifnet *ifp, struct mbuf_list *ml)
+{
+   struct mbuf *m;
int s;
 
if (ml_empty(ml))
@@ -922,22 +940,32 @@ if_input_process(struct ifnet *ifp, stru
 */
NET_RLOCK();
s = splnet();
-   while ((m = ml_dequeue(ml)) != NULL) {
-   /*
-* Pass this mbuf to all input handlers of its
-* interface until it is consumed.
-*/
-   SRPL_FOREACH(ifih, &sr, &ifp->if_inputs, ifih_next) {
-   if ((*ifih->ifih_input)(ifp, m, ifih->ifih_cookie))
-   break;
-   }
-   SRPL_LEAVE(&sr);
+   while ((m = ml_dequeue(ml)) != NULL)
+   if_input_ih(ifp, m);
+   splx(s);
+   NET_RUNLOCK();
+}
+
+void
+if_vinput(struct ifnet *ifp, struct mbuf *m)
+{
+#if NBPFILTER > 0
+   cadd

a manpage for pci_mapreg_map

2019-02-21 Thread David Gwynne

plus some related functions.

i didnt know which of the int arguments is used as the flags argument to
the bus_space_map call it wraps. students tell me that having to read
the source code instead of some doco is literally (figuratively) the
worst, so here's some doco so i don't have to read the code next time.

the wording isnt the best, but i reckon it is a good start.

ok?

Index: Makefile
===
RCS file: /cvs/src/share/man/man9/Makefile,v
retrieving revision 1.291
diff -u -p -r1.291 Makefile
--- Makefile 12 Nov 2018 15:13:12 -  1.291
+++ Makefile 22 Feb 2019 06:32:12 -
@@ -25,8 +25,8 @@ MAN=  aml_evalnode.9 atomic_add_int.9 ato
malloc.9 membar_sync.9 memcmp.9 mbuf.9 mbuf_tags.9 md5.9 mi_switch.9 \
microtime.9 ml_init.9 mq_init.9 mutex.9 \
namei.9 \
-   panic.9 pci_conf_read.9 pci_intr_map.9 physio.9 pmap.9 \
-   pool.9 pool_cache_init.9 ppsratecheck.9 printf.9 psignal.9 \
+   panic.9 pci_conf_read.9 pci_mapreg_map.9 pci_intr_map.9 physio.9 \
+   pmap.9 pool.9 pool_cache_init.9 ppsratecheck.9 printf.9 psignal.9 \
RBT_INIT.9 \
radio.9 arc4random.9 rasops.9 ratecheck.9 refcnt_init.9 resettodr.9 \
rssadapt.9 route.9 rt_ifa_add.9 rt_timer_add.9 rtalloc.9 rtable_add.9 \
Index: pci_mapreg_map.9
===
RCS file: pci_mapreg_map.9
diff -N pci_mapreg_map.9
--- /dev/null   1 Jan 1970 00:00:00 -
+++ pci_mapreg_map.9 22 Feb 2019 06:32:12 -
@@ -0,0 +1,151 @@
+.\"$OpenBSD$
+.\"
+.\" Copyright (c) 2019 David Gwynne 
+.\" All rights reserved.
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.Dd $Mdocdate: June 4 2013 $
+.Dt PCI_MAPREG_MAP 9
+.Os
+.Sh NAME
+.Nm pci_mapreg_map ,
+.Nm pci_mapreg_info ,
+.Nm pci_mapreg_probe ,
+.Nm pci_mapreg_type
+.\" .Nm pci_mem_find ,
+.\" .Nm pci_io_find
+.Nd PCI register mappings
+.Sh SYNOPSIS
+.In dev/pci/pcivar.h
+.Ft int
+.Fo pci_mapreg_map
+.Fa "struct pci_attach_args *paa"
+.Fa "int reg"
+.Fa "pcireg_t type"
+.Fa "int flags"
+.Fa "bus_space_tag_t *tagp"
+.Fa "bus_space_handle_t *handlep"
+.Fa "bus_addr_t *basep"
+.Fa "bus_size_t *sizep"
+.Fa "bus_size_t maxsize"
+.Fc
+.Ft int
+.Fo pci_mapreg_info
+.Fa "pci_chipset_tag_t pc"
+.Fa "pcitag_t tag"
+.Fa "int reg"
+.Fa "pcireg_t type"
+.Fa "bus_addr_t *basep"
+.Fa "bus_size_t *sizep"
+.Fa "int *flagsp"
+.Fc
+.Ft int
+.Fo pci_mapreg_probe
+.Fa "pci_chipset_tag_t pc"
+.Fa "pcitag_t tag"
+.Fa "int reg"
+.Fa "pcireg_t *typep"
+.Fc
+.Ft pcireg_t
+.Fo pci_mapreg_type
+.Fa "pci_chipset_tag_t pc"
+.Fa "pcitag_t tag"
+.Fa "int reg"
+.Fc
+.\" .Ft int
+.\" .Fo pci_mem_find
+.\" .Fa "pci_chipset_tag_t pc"
+.\" .Fa "pcitag_t pcitag"
+.\" .Fa "int reg"
+.\" .Fa "bus_addr_t *basep"
+.\" .Fa "bus_size_t *sizep"
+.\" .Fa "int *cacheablep"
+.\" .Fc
+.\" .Ft int
+.\" .Fo pci_io_find
+.\" .Fa "pci_chipset_tag_t pc"
+.\" .Fa "pcitag_t pcitag"
+.\" .Fa "int reg"
+.\" .Fa "bus_addr_t *basep"
+.\" .Fa "bus_size_t *sizep"
+.\" .Fa "int *cacheablep"
+.\" .Fc
+.Sh DESCRIPTION
+These functions provide wrappers and helpers around
+.Xr bus_space 9
+mappings for device registers described by the Base Address Registers
+(BARs) in a PCI device's configuration space.
+.Pp
+.Nm pci_mapreg_map
+wraps a call to
+.Xr bus_space_map 9
+using information from the BAR referenced by
+.Fa reg
+for the device being attached with
+.Fa paa .
+Memory or I/O mappings are derived from the
+.Fa type
+argument.
+The size of the register mapping can be restricted by specifying a
+non-zero value in
+.Fa maxsize .
+The bus space tag and handle used for the mapping, and the base
+address and size of the mapping will be provided to the caller v

Re: bypass interface input queues for vlan(4)

2019-02-21 Thread David Gwynne

On Thu, Feb 21, 2019 at 04:29:27PM -0300, Martin Pieuchot wrote:
> On 21/02/19(Thu) 14:19, David Gwynne wrote:
> > right now we add vlan_input as a possible input handler on the parent
> > interface, and if the packet is for a vlan we take it and pretend we
> > received it on the vlan interface by calling if_input against that mbuf.
> > 
> > as mpi notes, the if input queue stuff looks like a lot of work,
> > especially for a single packet. my opinion is that we got away with
> > the if input stuff we've done to try and encourage an mpsafe network
> > stack because we amortised the cost of it over many packets off the
> > hardware ring. vlan does it a packet at a time.
> > 
> > this moves the handling of vlan packets back into ether_input by
> > calling vlan_input directly on packets that are either marked as vlan
> > tagged or have a vlan ethertype. note that we have to do that anyway,
> > this just makes it explicit.
> > 
> > vlan_input is then tweaked to implement all the important bits of if
> > input. part of what if input does is count the packets. because vlan
> > already has per cpu counters for bypassing queues on output, we can use
> > them again for input from any cpu. if i ever get round to making a
> > driver handle multiple rx rings this means we can rx vlan packets
> > concurrently, they don't get serialised to a single if input q.
> > 
> > finally, hrvoje popovski has tested this diff and get's a significant
> > bump with it. on a machine that can forward 1100Kpps without vlan, it
> > goes from 790Kpps with vlan to 870Kpps. On a box that can do 730Kpps
> > without vlans, it goes from 550Kpps with vlan to 840Kpps. We're
> > still trying to figure that last one out, but it does appear to be
> > faster.
> > 
> > thoughts? ok?
> 
> Why do we need to move stuff to ether_input() if all we want is to
> bypass ifiq_input()?  Isn't a 3 line diff enough^^ ?

Fair point. It turns out it's not quite three lines, but it's still
smaller.

> - ml_enqueue(, m);
> - if_input(>ifv_if, );
> + if_input_one(>ifv_if, m);
>  
> I'm saying that because I'm afraid of the breakage that will happen if
> we remove the input handlers.  So I'm not opposed to get rid of the
> handlers, but as you said we should consider all drivers to not break
> trunk on top of bridge on top of vlan or whatever crazy configuration
> people do.

I want to fold them up because there's semantic issues around which
handlers were added in which order, and I'd bet there's a small
performance win too. But it's mostly about the semantics.

> Another point to keep in mind if you're going to remove the handlers is:
> do we want to keep passing a single mbuf or was it a plan to pass the
> whole list?

My current thinking is you want to bundle mbufs on and off the
hardware as much as possible via lists of mbufs, but between layers
of pseudo interfaces it's better to dispatch each mbuf directly.

If the call stack becomes too deep (high?) then lists might be useful
for squashing that back down.

Index: if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.571
diff -u -p -r1.571 if.c
--- if.c 9 Jan 2019 01:14:21 -   1.571
+++ if.c 22 Feb 2019 02:13:03 -
@@ -895,11 +895,29 @@ if_ih_remove(struct ifnet *ifp, int (*in
 }
 
 void
-if_input_process(struct ifnet *ifp, struct mbuf_list *ml)
+if_input_one(struct ifnet *ifp, struct mbuf *m)
 {
-   struct mbuf *m;
struct ifih *ifih;
struct srp_ref sr;
+
+   /*
+* Pass this mbuf to all input handlers of its
+* interface until it is consumed.
+*/
+   SRPL_FOREACH(ifih, &sr, &ifp->if_inputs, ifih_next) {
+   if ((*ifih->ifih_input)(ifp, m, ifih->ifih_cookie))
+   break;
+   }
+   SRPL_LEAVE(&sr);
+
+   if (ifih == NULL)
+   m_freem(m);
+}
+
+void
+if_input_process(struct ifnet *ifp, struct mbuf_list *ml)
+{
+   struct mbuf *m;
int s;
 
if (ml_empty(ml))
@@ -922,20 +940,8 @@ if_input_process(struct ifnet *ifp, stru
 */
NET_RLOCK();
s = splnet();
-   while ((m = ml_dequeue(ml)) != NULL) {
-   /*
-* Pass this mbuf to all input handlers of its
-* interface until it is consumed.
-*/
-   SRPL_FOREACH(ifih, &sr, &ifp->if_inputs, ifih_next) {
-   if ((*ifih->ifih_input)(ifp, m, ifih->ifih_cookie))
-   break;
-   }
-   SRPL_LEAVE(&sr);
-
-   if (ifih == NULL)
-   m_freem(m);
-   }
+   while ((m = ml_dequeue(ml)) != NULL)
+   if_input_one(

bypass interface input queues for vlan(4)

2019-02-20 Thread David Gwynne

right now we add vlan_input as a possible input handler on the parent
interface, and if the packet is for a vlan we take it and pretend we
received it on the vlan interface by calling if_input against that mbuf.

as mpi notes, the if input queue stuff looks like a lot of work,
especially for a single packet. my opinion is that we got away with
the if input stuff we've done to try and encourage an mpsafe network
stack because we amortised the cost of it over many packets off the
hardware ring. vlan does it a packet at a time.

this moves the handling of vlan packets back into ether_input by
calling vlan_input directly on packets that are either marked as vlan
tagged or have a vlan ethertype. note that we have to do that anyway,
this just makes it explicit.

vlan_input is then tweaked to implement all the important bits of if
input. part of what if input does is count the packets. because vlan
already has per cpu counters for bypassing queues on output, we can use
them again for input from any cpu. if i ever get round to making a
driver handle multiple rx rings this means we can rx vlan packets
concurrently, they don't get serialised to a single if input q.

finally, hrvoje popovski has tested this diff and gets a significant
bump with it. on a machine that can forward 1100Kpps without vlan, it
goes from 790Kpps with vlan to 870Kpps. On a box that can do 730Kpps
without vlans, it goes from 550Kpps with vlan to 840Kpps. We're
still trying to figure that last one out, but it does appear to be
faster.

thoughts? ok?

i would like to apply this style of tweak to the other ethernet magic
pseudo interfaces like trunk, bridge, switch and bpe after this
with the intention of making if_input a single function pointer again.

Index: if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.571
diff -u -p -r1.571 if.c
--- if.c 9 Jan 2019 01:14:21 -   1.571
+++ if.c 21 Feb 2019 04:03:44 -
@@ -895,11 +895,29 @@ if_ih_remove(struct ifnet *ifp, int (*in
 }
 
 void
-if_input_process(struct ifnet *ifp, struct mbuf_list *ml)
+if_input_one(struct ifnet *ifp, struct mbuf *m)
 {
-   struct mbuf *m;
struct ifih *ifih;
struct srp_ref sr;
+
+   /*
+* Pass this mbuf to all input handlers of its
+* interface until it is consumed.
+*/
+   SRPL_FOREACH(ifih, &sr, &ifp->if_inputs, ifih_next) {
+   if ((*ifih->ifih_input)(ifp, m, ifih->ifih_cookie))
+   break;
+   }
+   SRPL_LEAVE(&sr);
+
+   if (ifih == NULL)
+   m_freem(m);
+}
+
+void
+if_input_process(struct ifnet *ifp, struct mbuf_list *ml)
+{
+   struct mbuf *m;
int s;
 
if (ml_empty(ml))
@@ -922,20 +940,8 @@ if_input_process(struct ifnet *ifp, stru
 */
NET_RLOCK();
s = splnet();
-   while ((m = ml_dequeue(ml)) != NULL) {
-   /*
-* Pass this mbuf to all input handlers of its
-* interface until it is consumed.
-*/
-   SRPL_FOREACH(ifih, &sr, &ifp->if_inputs, ifih_next) {
-   if ((*ifih->ifih_input)(ifp, m, ifih->ifih_cookie))
-   break;
-   }
-   SRPL_LEAVE(&sr);
-
-   if (ifih == NULL)
-   m_freem(m);
-   }
+   while ((m = ml_dequeue(ml)) != NULL)
+   if_input_one(ifp, m);
splx(s);
NET_RUNLOCK();
 }
Index: if_ethersubr.c
===
RCS file: /cvs/src/sys/net/if_ethersubr.c,v
retrieving revision 1.258
diff -u -p -r1.258 if_ethersubr.c
--- if_ethersubr.c  18 Feb 2019 03:41:21 -  1.258
+++ if_ethersubr.c  21 Feb 2019 04:03:44 -
@@ -74,6 +74,7 @@ didn't get a copy, you may request one f
 */
 
 #include "bpfilter.h"
+#include "vlan.h"
 
 #include 
 #include 
@@ -103,6 +104,10 @@ didn't get a copy, you may request one f
 #include 
 #endif
 
+#if NVLAN > 0
+#include 
+#endif
+
 #include "pppoe.h"
 #if NPPPOE > 0
 #include 
@@ -362,6 +367,17 @@ ether_input(struct ifnet *ifp, struct mb
 
ac = (struct arpcom *)ifp;
eh = mtod(m, struct ether_header *);
+   etype = ntohs(eh->ether_type);
+
+   if (ISSET(m->m_flags, M_VLANTAG) ||
+   etype == ETHERTYPE_VLAN || etype == ETHERTYPE_QINQ) {
+#if NVLAN > 0
+   m = vlan_input(ifp, m);
+   if (m == NULL)
+   return (1);
+#endif
+   goto dropanyway;
+   }
 
/* Is the packet for us? */
if (memcmp(ac->ac_enaddr, eh->ether_dhost, ETHER_ADDR_LEN) != 0) {
@@ -387,15 +403,6 @@ ether_input(struct ifnet *ifp, struct mb
m->m_flags |= M_MCAST;
ifp->if_imcasts++;
}
-
-   /*
-* HW vlan tagged packets that were not collected by vlan(4) must
-* be dropped now.
-*/
-

Re: ioctls for MPLS pseudowire interface config

2019-02-20 Thread David Gwynne




> On 20 Feb 2019, at 11:21 pm, Martin Pieuchot  wrote:
> 
> On 20/02/19(Wed) 14:44, David Gwynne wrote:
>> Index: sys/net/if.c
>> ===
>> RCS file: /cvs/src/sys/net/if.c,v
>> retrieving revision 1.571
>> diff -u -p -r1.571 if.c
>> --- sys/net/if.c 9 Jan 2019 01:14:21 -   1.571
>> +++ sys/net/if.c 20 Feb 2019 04:35:42 -
>> @@ -2143,6 +2143,25 @@ ifioctl(struct socket *so, u_long cmd, c
>>  NET_UNLOCK();
>>  break;
>> 
>> +case SIOCSETMPWCFG:
>> +case SIOCSPWE3CTRLWORD:
>> +case SIOCSPWE3FAT:
>> +case SIOCSPWE3NEIGHBOR:
>> +case SIOCDPWE3NEIGHBOR:
>> +if ((error = suser(p)) != 0)
>> +break;
>> +/* FALLTHROUGH */
>> +case SIOCGETMPWCFG:
>> +case SIOCGPWE3CTRLWORD:
>> +case SIOCGPWE3FAT:
>> +case SIOCGPWE3NEIGHBOR:
>> +if_ref(ifp);
>> +KERNEL_UNLOCK();
>> +error = ((*ifp->if_ioctl)(ifp, cmd, data));
>> +KERNEL_LOCK();
>> +if_put(ifp);
> 
> Why are you referencing the `ifp' and grabbing the KERNEL_LOCK()
> (recursively)?

ifioctl gets the ifp pointer from ifunit, which doesn't increase the ref count 
for you. I'm giving up kernel lock around the pwe3 ioctl calls into the driver, 
not taking them harder. Taking the ifp ref there guarantees the interface will 
stay alive^Wallocated over those calls.

Re: ioctls for MPLS pseudowire interface config

2019-02-19 Thread David Gwynne

jsg@ pointed out that stsp sniped the ioctl numbers i was using.

this diff moves them into 220-222.

On Wed, Feb 20, 2019 at 01:33:00PM +1000, David Gwynne wrote:
> This splits up the mpw config ioctl so the same functionality is
> implemented in a bunch of smaller and more specific ioctls. This
> simplifies configuration of an interface cos you can incrementally
> configure it instead of having to line up all the bits correctly for the
> jumbo ioctl.
> 
> It also allows extra functionality to be added incrementally in the
> future. For example, this adds the ability to configure Flow-Aware
> Transport for mpw via the SIOCSPWE3FAT and SIOCGPWE3FAT ioctls.
> 
> Another benefit is that this shrinks ifconfig output. Currently ifconfig
> mpw0 looks like this:
> 
> mpw0: flags=8843 mtu 1500
> lladdr fe:e1:ba:d0:93:1a
> index 10 priority 0 llprio 3
> encapsulation-type ethernet, control-word
> mpls label: local 16 remote 16
> neighbor: 192.168.0.27
> groups: mpw
> inet 100.64.100.2 netmask 0xff00 broadcast 100.64.100.255
> 
> After this I can make it look like this:
> 
> mpw0: flags=8843 mtu 1500
> lladdr fe:e1:ba:d0:93:1a
> index 10 priority 0 llprio 3
> mpls: label 16 pwe3 remote label 16 on 192.168.0.27 cw nofat
> groups: mpw
> inet 100.64.100.2 netmask 0xff00 broadcast 100.64.100.255
> 
> The ifconfig bits aren't built when SMALL, so this doesn't impact
> install media.
> 
> Lastly, this lets the PWE3 ioctls run without locks. Gotta start
> somewhere right?
> 
> As discussed with claudio@ at a2k19, I have written a driver for an IP
> pseudowire interface called mpip(4). It uses these same ioctls for its
> configuration.
> 
> ok?

Index: sys/sys/sockio.h
===
RCS file: /cvs/src/sys/sys/sockio.h,v
retrieving revision 1.79
diff -u -p -r1.79 sockio.h
--- sys/sys/sockio.h23 Jan 2019 08:23:18 -  1.79
+++ sys/sys/sockio.h20 Feb 2019 04:35:42 -
@@ -143,6 +143,7 @@
 #defineSIOCSSARAMS  _IOW('i', 147, struct ifreq)   /* set pppoe 
params */
 #defineSIOCGSARAMS _IOWR('i', 148, struct ifreq)   /* get pppoe 
params */
 
+#define SIOCDELLABEL_IOW('i', 151, struct ifreq)   /* del MPLS label */
 #define SIOCGPWE3   _IOWR('i', 152, struct ifreq)  /* get MPLS PWE3 cap */
 #define SIOCSETLABEL_IOW('i', 153, struct ifreq)   /* set MPLS label */
 #define SIOCGETLABEL_IOW('i', 154, struct ifreq)   /* get MPLS label */
@@ -204,6 +205,14 @@
 
 #defineSIOCSLIFPHYECN  _IOW('i', 199, struct ifreq)/* set ecn 
copying */
 #defineSIOCGLIFPHYECN  _IOWR('i', 200, struct ifreq)   /* get ecn 
copying */
+
+#define SIOCSPWE3CTRLWORD  _IOW('i', 220, struct ifreq)
+#define SIOCGPWE3CTRLWORD  _IOWR('i',  220, struct ifreq)
+#define SIOCSPWE3FAT   _IOW('i', 221, struct ifreq)
+#define SIOCGPWE3FAT   _IOWR('i', 221, struct ifreq)
+#define SIOCSPWE3NEIGHBOR  _IOW('i', 222, struct if_laddrreq)
+#define SIOCGPWE3NEIGHBOR  _IOWR('i', 222, struct if_laddrreq)
+#define SIOCDPWE3NEIGHBOR  _IOW('i', 222, struct ifreq)
 
 #defineSIOCSVH _IOWR('i', 245, struct ifreq)   /* set carp 
param */
 #defineSIOCGVH _IOWR('i', 246, struct ifreq)   /* get carp 
param */
Index: sys/net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.571
diff -u -p -r1.571 if.c
--- sys/net/if.c9 Jan 2019 01:14:21 -   1.571
+++ sys/net/if.c20 Feb 2019 04:35:42 -
@@ -2143,6 +2143,25 @@ ifioctl(struct socket *so, u_long cmd, c
NET_UNLOCK();
break;
 
+   case SIOCSETMPWCFG:
+   case SIOCSPWE3CTRLWORD:
+   case SIOCSPWE3FAT:
+   case SIOCSPWE3NEIGHBOR:
+   case SIOCDPWE3NEIGHBOR:
+   if ((error = suser(p)) != 0)
+   break;
+   /* FALLTHROUGH */
+   case SIOCGETMPWCFG:
+   case SIOCGPWE3CTRLWORD:
+   case SIOCGPWE3FAT:
+   case SIOCGPWE3NEIGHBOR:
+   if_ref(ifp);
+   KERNEL_UNLOCK();
+   error = ((*ifp->if_ioctl)(ifp, cmd, data));
+   KERNEL_LOCK();
+   if_put(ifp);
+   break;
+
case SIOCSETKALIVE:
case SIOCDIFPHYADDR:
case SIOCSLIFPHYADDR:
Index: sys/net/if_mpw.c
===
RCS file: /cvs/src/sys/net/if_mpw.c,v
retrieving revision 1.44
diff -u -p -r1.44 if_mpw.c
--- sys/net/if_mpw.c20 Feb 2019 01:04:53 -  1.44
+++ sys/net/if_mpw.c20 Feb 2019 04:35:42 -
@@ -44,6 +44,11 @@
 #include 
 #endif
 
+struct mpw_neighbor {
+   struct shim_hdr n_

ioctls for MPLS pseudowire interface config

2019-02-19 Thread David Gwynne

This splits up the mpw config ioctl so the same functionality is
implemented in a bunch of smaller and more specific ioctls. This
simplifies configuration of an interface cos you can incrementally
configure it instead of having to line up all the bits correctly for the
jumbo ioctl.

It also allows extra functionality to be added incrementally in the
future. For example, this adds the ability to configure Flow-Aware
Transport for mpw via the SIOCSPWE3FAT and SIOCGPWE3FAT ioctls.

Another benefit is that this shrinks ifconfig output. Currently ifconfig
mpw0 looks like this:

mpw0: flags=8843 mtu 1500
lladdr fe:e1:ba:d0:93:1a
index 10 priority 0 llprio 3
encapsulation-type ethernet, control-word
mpls label: local 16 remote 16
neighbor: 192.168.0.27
groups: mpw
inet 100.64.100.2 netmask 0xff00 broadcast 100.64.100.255

After this I can make it look like this:

mpw0: flags=8843 mtu 1500
lladdr fe:e1:ba:d0:93:1a
index 10 priority 0 llprio 3
mpls: label 16 pwe3 remote label 16 on 192.168.0.27 cw nofat
groups: mpw
inet 100.64.100.2 netmask 0xff00 broadcast 100.64.100.255

The ifconfig bits aren't built when SMALL, so this doesn't impact
install media.

Lastly, this lets the PWE3 ioctls run without locks. Gotta start
somewhere right?

As discussed with claudio@ at a2k19, I have written a driver for an IP
pseudowire interface called mpip(4). It uses these same ioctls for its
configuration.

ok?

Index: sys/sys/sockio.h
===
RCS file: /cvs/src/sys/sys/sockio.h,v
retrieving revision 1.79
diff -u -p -r1.79 sockio.h
--- sys/sys/sockio.h23 Jan 2019 08:23:18 -  1.79
+++ sys/sys/sockio.h20 Feb 2019 03:29:05 -
@@ -143,6 +143,7 @@
 #defineSIOCSSARAMS  _IOW('i', 147, struct ifreq)   /* set pppoe 
params */
 #defineSIOCGSARAMS _IOWR('i', 148, struct ifreq)   /* get pppoe 
params */
 
+#define SIOCDELLABEL_IOW('i', 151, struct ifreq)   /* del MPLS label */
 #define SIOCGPWE3   _IOWR('i', 152, struct ifreq)  /* get MPLS PWE3 cap */
 #define SIOCSETLABEL_IOW('i', 153, struct ifreq)   /* set MPLS label */
 #define SIOCGETLABEL_IOW('i', 154, struct ifreq)   /* get MPLS label */
@@ -204,6 +205,14 @@
 
 #defineSIOCSLIFPHYECN  _IOW('i', 199, struct ifreq)/* set ecn 
copying */
 #defineSIOCGLIFPHYECN  _IOWR('i', 200, struct ifreq)   /* get ecn 
copying */
+
+#define SIOCSPWE3CTRLWORD  _IOW('i', 210, struct ifreq)
+#define SIOCGPWE3CTRLWORD  _IOWR('i', 210, struct ifreq)
+#define SIOCSPWE3FAT   _IOW('i', 211, struct ifreq)
+#define SIOCGPWE3FAT   _IOWR('i', 211, struct ifreq)
+#define SIOCSPWE3NEIGHBOR  _IOW('i', 212, struct if_laddrreq)
+#define SIOCGPWE3NEIGHBOR  _IOWR('i', 212, struct if_laddrreq)
+#define SIOCDPWE3NEIGHBOR  _IOW('i', 212, struct ifreq)
 
 #defineSIOCSVH _IOWR('i', 245, struct ifreq)   /* set carp 
param */
 #defineSIOCGVH _IOWR('i', 246, struct ifreq)   /* get carp 
param */
Index: sys/net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.571
diff -u -p -r1.571 if.c
--- sys/net/if.c9 Jan 2019 01:14:21 -   1.571
+++ sys/net/if.c20 Feb 2019 03:29:05 -
@@ -2143,6 +2143,25 @@ ifioctl(struct socket *so, u_long cmd, c
NET_UNLOCK();
break;
 
+   case SIOCSETMPWCFG:
+   case SIOCSPWE3CTRLWORD:
+   case SIOCSPWE3FAT:
+   case SIOCSPWE3NEIGHBOR:
+   case SIOCDPWE3NEIGHBOR:
+   if ((error = suser(p)) != 0)
+   break;
+   /* FALLTHROUGH */
+   case SIOCGETMPWCFG:
+   case SIOCGPWE3CTRLWORD:
+   case SIOCGPWE3FAT:
+   case SIOCGPWE3NEIGHBOR:
+   if_ref(ifp);
+   KERNEL_UNLOCK();
+   error = ((*ifp->if_ioctl)(ifp, cmd, data));
+   KERNEL_LOCK();
+   if_put(ifp);
+   break;
+
case SIOCSETKALIVE:
case SIOCDIFPHYADDR:
case SIOCSLIFPHYADDR:
Index: sys/net/if_mpw.c
===
RCS file: /cvs/src/sys/net/if_mpw.c,v
retrieving revision 1.44
diff -u -p -r1.44 if_mpw.c
--- sys/net/if_mpw.c20 Feb 2019 01:04:53 -  1.44
+++ sys/net/if_mpw.c20 Feb 2019 03:29:05 -
@@ -44,6 +44,11 @@
 #include 
 #endif
 
+struct mpw_neighbor {
+   struct shim_hdr n_rshim;
+   struct sockaddr_storage n_nexthop;
+};
+
 struct mpw_softc {
struct arpcom   sc_ac;
 #define sc_if  sc_ac.ac_if
@@ -56,8 +61,7 @@ struct mpw_softc {
unsigned intsc_fword;
uint32_tsc_flow;
uint32_tsc_type;
-   struct shim_hdr sc_rshim;
-   struct sockaddr_storage

ix transmit code doesn't need atomic ops

2019-02-19 Thread David Gwynne

mpi's recent post about vlan performance which happened to use ix
reminded me that i had some diffz for ix that may be relevant to
the discussion.

this uses the loads and stores of the producer and consumer indexes
to calculate free space in the start path and for figuring out how
much space to reclaim in txeof. this is instead of coordinating with an
actual "available slots" counter using atomic ops.

it also avoids doing a bunch of work in the txeof path. instead of
putting the mbuf on the last descriptor for a packet, we leave it on the
first and use the index to the last to poll for packet completion. this
means we can leave the mbuf and dmamap in the first slot, which
simplifies the encap code. by only reading the last descriptor from the
ring, we are no longer scrubbing entries like we used to, but this
doesn't seem to be a problem in practice.

anyway, i think the big cost when transmitting is writing to the
register at the end of the loop. we effectively call start for every
packet, which means we post each packet to the chip. this is why
tx mitigation helps; it reduces the number of register writes.

dlg

Index: if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.152
diff -u -p -r1.152 if_ix.c
--- if_ix.c 22 Jun 2017 02:44:37 -  1.152
+++ if_ix.c 19 Feb 2019 11:36:43 -
@@ -385,6 +385,7 @@ ixgbe_start(struct ifqueue *ifq)
struct ix_softc *sc = ifp->if_softc;
struct tx_ring  *txr = sc->tx_rings;
struct mbuf *m_head;
+   unsigned int head, free, used;
int  post = 0;
 
if (!(ifp->if_flags & IFF_RUNNING) || ifq_is_oactive(ifq))
@@ -392,13 +393,21 @@ ixgbe_start(struct ifqueue *ifq)
if (!sc->link_up)
return;
 
-   bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, 0,
-   txr->txdma.dma_map->dm_mapsize,
-   BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
+   head = txr->next_avail_desc;
+   free = txr->next_to_clean;
+   if (free <= head)
+   free += sc->num_tx_desc;
+   free -= head;
+
+   membar_consumer();
+
+   bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
+   0, txr->txdma.dma_map->dm_mapsize,
+   BUS_DMASYNC_POSTWRITE);
 
for (;;) {
/* Check that we have the minimal number of TX descriptors. */
-   if (txr->tx_avail <= IXGBE_TX_OP_THRESHOLD) {
+   if (free <= IXGBE_TX_OP_THRESHOLD) {
ifq_set_oactive(ifq);
break;
}
@@ -407,11 +416,14 @@ ixgbe_start(struct ifqueue *ifq)
if (m_head == NULL)
break;
 
-   if (ixgbe_encap(txr, m_head)) {
+   used = ixgbe_encap(txr, m_head);
+   if (used == 0) {
m_freem(m_head);
continue;
}
 
+   free -= used;
+
 #if NBPFILTER > 0
if (ifp->if_bpf)
bpf_mtap_ether(ifp->if_bpf, m_head, BPF_DIRECTION_OUT);
@@ -426,7 +438,7 @@ ixgbe_start(struct ifqueue *ifq)
 
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
0, txr->txdma.dma_map->dm_mapsize,
-   BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+   BUS_DMASYNC_PREWRITE);
 
/*
 * Advance the Transmit Descriptor Tail (Tdt), this tells the
@@ -579,8 +591,8 @@ ixgbe_watchdog(struct ifnet * ifp)
printf("%s: Queue(%d) tdh = %d, hw tdt = %d\n", ifp->if_xname, 
i,
IXGBE_READ_REG(hw, IXGBE_TDH(i)),
IXGBE_READ_REG(hw, IXGBE_TDT(i)));
-   printf("%s: TX(%d) desc avail = %d, Next TX to Clean = %d\n", 
ifp->if_xname,
-   i, txr->tx_avail, txr->next_to_clean);
+   printf("%s: TX(%d) Next TX to Clean = %d\n", ifp->if_xname,
+   i, txr->next_to_clean);
}
ifp->if_flags &= ~IFF_RUNNING;
sc->watchdog_events++;
@@ -1150,7 +1162,7 @@ ixgbe_encap(struct tx_ring *txr, struct 
 {
struct ix_softc *sc = txr->sc;
uint32_tolinfo_status = 0, cmd_type_len;
-   int i, j, error;
+   int i, j, ntxc;
int first, last = 0;
bus_dmamap_tmap;
struct ixgbe_tx_buf *txbuf;
@@ -1177,36 +1189,34 @@ ixgbe_encap(struct tx_ring *txr, struct 
/*
 * Map the packet for DMA.
 */
-   error = bus_dmamap_load_mbuf(txr->txdma.dma_tag, map, m_head,
-   BUS_DMA_NOWAIT);
-   switch (error) {
+   switch (bus_dmamap_load_mbuf(txr->txdma.dma_tag, map,
+   m_head, BUS_DMA_NOWAIT)) {
case 0:
break;
case EFBIG:
if (m_defrag(m_head, M_NOWAIT) == 0 &&
-   (error =

Re: mpw(4) hacks for bridge(4)

2019-02-13 Thread David Gwynne

yes!

ok by me.

> On 14 Feb 2019, at 00:02, Martin Pieuchot  wrote:
> 
> Now that mpw(4) is an Ethernet interface, can I get rid of the following
> hacks?
> 
> Index: net/if_bridge.c
> ===
> RCS file: /cvs/src/sys/net/if_bridge.c,v
> retrieving revision 1.319
> diff -u -p -r1.319 if_bridge.c
> --- net/if_bridge.c   29 Jan 2019 17:47:35 -  1.319
> +++ net/if_bridge.c   13 Feb 2019 14:01:11 -
> @@ -36,7 +36,6 @@
> #include "pf.h"
> #include "carp.h"
> #include "vlan.h"
> -#include "mpw.h"
> 
> #include 
> #include 
> @@ -311,13 +310,7 @@ bridge_ioctl(struct ifnet *ifp, u_long c
>   error = ifpromisc(ifs, 1);
>   if (error != 0)
>   break;
> - }
> -#if NMPW > 0
> - else if (ifs->if_type == IFT_MPLSTUNNEL) {
> - /* Nothing needed */
> - }
> -#endif /* NMPW */
> - else {
> + } else {
>   error = EINVAL;
>   break;
>   }
> @@ -367,8 +360,7 @@ bridge_ioctl(struct ifnet *ifp, u_long c
>   error = ENOENT;
>   break;
>   }
> - if (ifs->if_type != IFT_ETHER &&
> - ifs->if_type != IFT_MPLSTUNNEL) {
> + if (ifs->if_type != IFT_ETHER) {
>   error = EINVAL;
>   break;
>   }
> @@ -813,15 +805,6 @@ bridge_output(struct ifnet *ifp, struct 
>   (bif->bif_flags & IFBIF_STP) &&
>   (bif->bif_state == BSTP_IFSTATE_DISCARDING))
>   continue;
> -#if NMPW > 0
> - /*
> -  * Split horizon: avoid broadcasting messages from
> -  * wire to another wire.
> -  */
> - if (ifp->if_type == IFT_MPLSTUNNEL &&
> - dst_if->if_type == IFT_MPLSTUNNEL)
> - continue;
> -#endif /* NMPW */
>   if ((bif->bif_flags & IFBIF_DISCOVER) == 0 &&
>   (m->m_flags & (M_BCAST | M_MCAST)) == 0)
>   continue;
> @@ -1276,16 +1259,6 @@ bridge_broadcast(struct bridge_softc *sc
> 
>   if (bridge_localbroadcast(dst_if, eh, m))
>   sc->sc_if.if_oerrors++;
> -
> -#if NMPW > 0
> - /*
> -  * Split horizon: avoid broadcasting messages from wire to
> -  * another wire.
> -  */
> - if (ifp->if_type == IFT_MPLSTUNNEL &&
> - dst_if->if_type == IFT_MPLSTUNNEL)
> - continue;
> -#endif /* NMPW */
> 
>   /* If last one, reuse the passed-in mbuf */
>   if (SLIST_NEXT(bif, bif_next) == NULL) {

make sbcreatecontrol take void * instead of caddr_t

2019-02-12 Thread David Gwynne

this makes it easier to call sbcreatecontrol without requiring casts.

it makes the argument const as well, and promotes the length variable to
size_t.

ok?

Index: sys/socketvar.h
===
RCS file: /cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.88
diff -u -p -r1.88 socketvar.h
--- sys/socketvar.h 19 Nov 2018 13:15:37 -  1.88
+++ sys/socketvar.h 12 Feb 2019 23:52:42 -
@@ -290,7 +290,7 @@ int sbappendcontrol(struct socket *, str
 void   sbappendrecord(struct socket *, struct sockbuf *, struct mbuf *);
 void   sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n);
 struct mbuf *
-   sbcreatecontrol(caddr_t p, int size, int type, int level);
+   sbcreatecontrol(const void *, size_t, int type, int level);
 void   sbdrop(struct socket *, struct sockbuf *, int);
 void   sbdroprecord(struct sockbuf *sb);
 void   sbflush(struct socket *, struct sockbuf *);
Index: kern/uipc_socket2.c
===
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.99
diff -u -p -r1.99 uipc_socket2.c
--- kern/uipc_socket2.c 19 Nov 2018 13:15:37 -  1.99
+++ kern/uipc_socket2.c 12 Feb 2019 23:52:42 -
@@ -1020,14 +1020,14 @@ sbdroprecord(struct sockbuf *sb)
  * with the specified type for presentation on a socket buffer.
  */
 struct mbuf *
-sbcreatecontrol(caddr_t p, int size, int type, int level)
+sbcreatecontrol(const void *p, size_t size, int type, int level)
 {
struct cmsghdr *cp;
struct mbuf *m;
 
if (CMSG_SPACE(size) > MCLBYTES) {
-   printf("sbcreatecontrol: message too large %d\n", size);
-   return NULL;
+   printf("sbcreatecontrol: message too large %zu\n", size);
+   return (NULL);
}
 
if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)

Re: disable the ability to change tun(4) mode from p2p to bcast and back again

2019-02-04 Thread David Gwynne

> On 4 Feb 2019, at 22:00, Tim Kuijsten  wrote:
> 
> On Mon, Feb 04, 2019 at 12:07:22PM +1000, David Gwynne wrote:
>> Currently you can change a tun interface from being point to point to
>> being a broadcast interface. Why?
> 
> I'm using broadcast mode in my own wireguard implementation because there can 
> be more than one peer on the network:
> https://github.com/timkuijsten/uwg/blob/ccd39c6a9bdf36575a3bb3db06c438a2241c1134/ifn.c#L1868

But there's only one process sucking on the /dev entry, so there's just the one 
pipe. Does it make a difference to the routes you can add whether tun is only 
point to point, or is broadcast required? I don't see uwg itself adding routes, 
do you do that outside it?

dlg

disable the ability to change tun(4) mode from p2p to bcast and back again

2019-02-03 Thread David Gwynne

Currently you can change a tun interface from being point to point to
being a broadcast interface. Why?

This cuts out the ability to change it. Note that the ioctl code is
shared by tap, so it still has IFF_BROADCAST code that gets run, you
should just not be able to change the flags, only read them.

With the above in mind, this also removes the ability to make a tap
interface point to point. Why would you want that too?

This was noticed by tedu while playing with wg, and it confused me. But
that is true for a lot of tap stuff atm. Does anyone really use all the
ioctl buttons that tap provides?

ok?

Index: if_tun.c
===
RCS file: /cvs/src/sys/net/if_tun.c,v
retrieving revision 1.184
diff -u -p -r1.184 if_tun.c
--- if_tun.c3 Feb 2019 23:04:49 -   1.184
+++ if_tun.c4 Feb 2019 02:00:14 -
@@ -104,7 +104,7 @@ int tundebug = TUN_DEBUG;
 #endif
 
 /* Only these IFF flags are changeable by TUNSIFINFO */
-#define TUN_IFF_FLAGS (IFF_UP|IFF_POINTOPOINT|IFF_MULTICAST|IFF_BROADCAST)
+#define TUN_IFF_FLAGS (IFF_UP)
 
 void   tunattach(int);
 
@@ -650,15 +650,9 @@ tun_dev_ioctl(struct tun_softc *tp, u_lo
break;
 #endif
case TUNSIFMODE:
-   switch (*(int *)data & (IFF_POINTOPOINT|IFF_BROADCAST)) {
-   case IFF_POINTOPOINT:
-   case IFF_BROADCAST:
-   tp->tun_if.if_flags &= ~TUN_IFF_FLAGS;
-   tp->tun_if.if_flags |= *(int *)data & TUN_IFF_FLAGS;
-   break;
-   default:
+   if ((*(int *)data & (IFF_POINTOPOINT|IFF_BROADCAST)) !=
+   (tp->tun_if.if_flags & (IFF_POINTOPOINT|IFF_BROADCAST)))
return (EINVAL);
-   }
break;
 
case FIONBIO:

avoid byteswapping at runtime in tun(4)

2019-02-03 Thread David Gwynne

this has reads from tun load the AF out of the data rather than point to
it, then has the switch statement compare to the swapped AF values.

ok?

Index: if_tun.c
===
RCS file: /cvs/src/sys/net/if_tun.c,v
retrieving revision 1.184
diff -u -p -r1.184 if_tun.c
--- if_tun.c3 Feb 2019 23:04:49 -   1.184
+++ if_tun.c4 Feb 2019 00:18:54 -
@@ -833,7 +833,7 @@ int
 tun_dev_write(struct tun_softc *tp, struct uio *uio, int ioflag)
 {
struct ifnet*ifp;
-   u_int32_t   *th;
+   u_int32_t   th;
struct mbuf *top, **mp, *m;
int error = 0, tlen;
size_t  mlen;
@@ -842,7 +842,7 @@ tun_dev_write(struct tun_softc *tp, stru
TUNDEBUG(("%s: tunwrite\n", ifp->if_xname));
 
if (uio->uio_resid == 0 || uio->uio_resid > ifp->if_mtu +
-   (tp->tun_flags & TUN_LAYER2 ? ETHER_HDR_LEN : sizeof(*th))) {
+   (tp->tun_flags & TUN_LAYER2 ? ETHER_HDR_LEN : sizeof(th))) {
TUNDEBUG(("%s: len=%d!\n", ifp->if_xname, uio->uio_resid));
return (EMSGSIZE);
}
@@ -917,11 +917,11 @@ tun_dev_write(struct tun_softc *tp, stru
}
 #endif
 
-   th = mtod(top, u_int32_t *);
+   th = *mtod(top, u_int32_t *);
/* strip the tunnel header */
-   top->m_data += sizeof(*th);
-   top->m_len  -= sizeof(*th);
-   top->m_pkthdr.len -= sizeof(*th);
+   top->m_data += sizeof(th);
+   top->m_len  -= sizeof(th);
+   top->m_pkthdr.len -= sizeof(th);
top->m_pkthdr.ph_rtableid = ifp->if_rdomain;
top->m_pkthdr.ph_ifidx = ifp->if_index;
 
@@ -930,17 +930,17 @@ tun_dev_write(struct tun_softc *tp, stru
 
NET_LOCK();
 
-   switch (ntohl(*th)) {
-   case AF_INET:
+   switch (th) {
+   case htonl(AF_INET):
ipv4_input(ifp, top);
break;
 #ifdef INET6
-   case AF_INET6:
+   case htonl(AF_INET6):
ipv6_input(ifp, top);
break;
 #endif
 #ifdef MPLS
-   case AF_MPLS:
+   case htonl(AF_MPLS):
mpls_input(ifp, top);
break;
 #endif

Re: fork rt_ifa_{add,del} for mpls local input routes

2019-02-02 Thread David Gwynne

On Fri, Feb 01, 2019 at 02:40:17PM -0200, Martin Pieuchot wrote:
> On 31/01/19(Thu) 13:31, David Gwynne wrote:
> > On Wed, Jan 30, 2019 at 11:54:45AM -0200, Martin Pieuchot wrote:
> > > On 30/01/19(Wed) 11:48, David Gwynne wrote:
> > > > mpls uses AF_MPLS routes with RTF_LOCAL set on them to know which tags
> > > > are used as input for the mpe and mpw interfaces. setting this up
> > > > currently goes through rt_ifa_add, but that has a couple of features
> > > > that are undesirable for mpls.
> > > > 
> > > > Firstly, rt_ifa_add unconditionally sets RTF_MPATH on the routes it
> > > > adds, which means multiple mpe and mpw interfaces can "own" the same
> > > > input tag. mpe tries to work around this by maintaining a global list of
> > > > mpe interfaces, and iterates over them when a new label is added. That's
> > > > ok (sort of) for mpe, but it doesn't take the tags used by mpw into
> > > > account.
> > > 
> > > I don't understand how this works, where are these 'tag' and 'label'
> > > stored?  What do you mean with "'own' the same input tag"?
> > 
> > I'm using tag and label interchangeably here, but it refers to the number
> > that appears in the MPLS shim header on the wire. Each label is supposed
> > to represent a single "forwarding equivalence class", ie, each MPLS label
> > is only supposed to do one thing. When we're talking about mpe and mpw
> > input, we're talking about configuring each interface with a local
> > MPLS label. Packets sent to the label on the local machine should
> > end up as packets coming in on an mpe or mpw interface.
> > 
> > Right now mpw and mpe use rt_ifa_add() to claim ownership of a local
> > tag. Once something is using a tag, nothing else should be able to add
> > an entry to the mpls rtable with that same tag. Because rt_ifa_add()
> > adds RTF_MPATH unconditionally, it is possible to add two things with
> > the same label to the mpls rtable.
> > 
> > > What is the current 'gateway' value of MPLS RTF_LOCAL routes? 
> > 
> > It's the struct sockaddr_dl *if_sadl in struct ifnet.
> > 
> > > > Secondly, I'd like to start pulling apart the restriction on the use of
> > > > mpls only in rdomain 1. rt_ifa_add doesn't help this situation because
> > > > it assumes that we're adding a route inside the rdomain the interface is
> > > > in, rather than the one it tunnels in. Changing this assumption means
> > > > forking rt_ifa_add, and oh look, that's what I've started here.
> > > 
> > > I'm afraid of adding new rtrequest(9) calls, especially with new error
> > > conditions.  Why can't you add the rdomain of the tunnel?
> > 
> > ifp->if_rdomain refers to the rdomain of the traffic going over an
> > interface. For tunnels it is the rdomain of packets inside the
> > tunnel. Another way of saying it is that a tunnel provides an overlay
> > on top of an underlay network, and ifp->if_rdomain is the rdomain
> > of the overlay traffic.
> > 
> > When we're adding the mpls rtable entries we're specifying connectivity
> > outside the tunnel, aka routes in the network underlay. Currently
> > rt_ifa_{add,del} force anything with RTF_MPLS into rdomain 0 despite
> > what the overlay is, and despite where you might want to run the
> > underlay. One of the configurations claudio@ and I talked about for
> > the firewalls at work was running MPLS in rdomain 1 and leaving our
> > current config running in rdomain 0. This model is not currently
> > possible with RTF_MPLS routes forced to the mpls rtable in rdomain 0.
> 
> I don't understand how you're going to select the rtable.  It will be
> different than the MPLS interface's rdomain, right?

The tunnel interfaces have SIOCSLIFPHYRTABLE and SIOCGLIFPHYRTABLE. I
was going to reuse that for specifying the rdomain MPLS "tunnels" in.

> The current codes for !AF_MPLS routes use the current rdomain because
> that's where the L2 entries live in our kernel.

Well, IP things still add their own l2 to the interfaces inside the
rdomain just like now. My current theory is that MPLS using it to add
the input tag was an elegant^Wconvenient hack since you want to do
mostly the same thing as IP, just outside the current rdomain. The hack
assumed MPLS could only exist in rdomain 0, hence the hardcoded values.

>
> > > If all you need is remove RTF_MPATH, then do so :)
> > > 
> > > We can add it to all rt_ifa_add(9) calls to be explicit!
> > 
> > That won't solve the rdomain thing though.
> >

Re: fork rt_ifa_{add,del} for mpls local input routes

2019-01-30 Thread David Gwynne

On Wed, Jan 30, 2019 at 11:54:45AM -0200, Martin Pieuchot wrote:
> On 30/01/19(Wed) 11:48, David Gwynne wrote:
> > mpls uses AF_MPLS routes with RTF_LOCAL set on them to know which tags
> > are used as input for the mpe and mpw interfaces. setting this up
> > currently goes through rt_ifa_add, but that has a couple of features
> > that are undesirable for mpls.
> > 
> > Firstly, rt_ifa_add unconditionally sets RTF_MPATH on the routes it
> > adds, which means multiple mpe and mpw interfaces can "own" the same
> > input tag. mpe tries to work around this by maintaining a global list of
> > mpe interfaces, and iterates over them when a new label is added. That's
> > ok (sort of) for mpe, but it doesn't take the tags used by mpw into
> > account.
> 
> I don't understand how this works, where are these 'tag' and 'label'
> stored?  What do you mean with "'own' the same input tag"?

I'm using tag and label interchangeably here, but it refers to the number
that appears in the MPLS shim header on the wire. Each label is supposed
to represent a single "forwarding equivalence class", ie, each MPLS label
is only supposed to do one thing. When we're talking about mpe and mpw
input, we're talking about configuring each interface with a local
MPLS label. Packets sent to the label on the local machine should
end up as packets coming in on an mpe or mpw interface.

Right now mpw and mpe use rt_ifa_add() to claim ownership of a local
tag. Once something is using a tag, nothing else should be able to add
an entry to the mpls rtable with that same tag. Because rt_ifa_add()
adds RTF_MPATH unconditionally, it is possible to add two things with
the same label to the mpls rtable.

> What is the current 'gateway' value of MPLS RTF_LOCAL routes? 

It's the struct sockaddr_dl *if_sadl in struct ifnet.

> > Secondly, I'd like to start pulling apart the restriction on the use of
> > mpls only in rdomain 1. rt_ifa_add doesn't help this situation because
> > it assumes that we're adding a route inside the rdomain the interface is
> > in, rather than the one it tunnels in. Changing this assumption means
> > forking rt_ifa_add, and oh look, that's what I've started here.
> 
> I'm afraid of adding new rtrequest(9) calls, especially with new error
> conditions.  Why can't you add the rdomain of the tunnel?

ifp->if_rdomain refers to the rdomain of the traffic going over an
interface. For tunnels it is the rdomain of packets inside the
tunnel. Another way of saying it is that a tunnel provides an overlay
on top of an underlay network, and ifp->if_rdomain is the rdomain
of the overlay traffic.

When we're adding the mpls rtable entries we're specifying connectivity
outside the tunnel, aka routes in the network underlay. Currently
rt_ifa_{add,del} force anything with RTF_MPLS into rdomain 0 despite
what the overlay is, and despite where you might want to run the
underlay. One of the configurations claudio@ and I talked about for
the firewalls at work was running MPLS in rdomain 1 and leaving our
current config running in rdomain 0. This model is not currently
possible with RTF_MPLS routes forced to the mpls rtable in rdomain 0.

> If all you need is remove RTF_MPATH, then do so :)
> 
> We can add it to all rt_ifa_add(9) calls to be explicit!

That won't solve the rdomain thing though.

How about making rt_ifa_{add,del} as wrappers around the thing that
lets you not specify RTF_MPATH, and explicitly configure the rdomain?
This effectively replaces rt_ifa_{add,del} with rt_ifa_{add,del}_rdomain
respectively, but provides the old names as wrappers on the new one.

At least it keeps everything in one place...

I'm not wedded to the names, it was just the least worst I came up with
inbetween meetings.

Index: route.c
===
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.379
diff -u -p -r1.379 route.c
--- route.c 23 Nov 2018 16:24:11 -  1.379
+++ route.c 31 Jan 2019 03:00:37 -
@@ -1032,17 +1032,24 @@ rt_maskedcopy(struct sockaddr *src, stru
 int
 rt_ifa_add(struct ifaddr *ifa, int flags, struct sockaddr *dst)
 {
+   return (rt_ifa_add_rdomain(ifa, flags | RTF_MPATH, dst,
+ifa->ifa_ifp->if_rdomain));
+}
+
+int
+rt_ifa_add_rdomain(struct ifaddr *ifa, int flags, struct sockaddr *dst,
+unsigned int rtableid)
+{
struct ifnet*ifp = ifa->ifa_ifp;
struct rtentry  *rt;
struct sockaddr_rtlabel  sa_rl;
struct rt_addrinfo   info;
-   unsigned int rtableid = ifp->if_rdomain;
uint8_t  prio = ifp->if_priority + RTP_STATIC;
int  error;

memset(&info, 0, sizeof(info));
info.rti_ifa = ifa;
-   info.rti_flags = flags | RTF_MPATH;
+   inf

Re: fork rt_ifa_{add,del} for mpls local input routes

2019-01-29 Thread David Gwynne

I mean rdomain 0 below, not 1.

> On 30 Jan 2019, at 11:48, David Gwynne  wrote:
> 
> mpls uses AF_MPLS routes with RTF_LOCAL set on them to know which tags
> are used as input for the mpe and mpw interfaces. setting this up
> currently goes through rt_ifa_add, but that has a couple of features
> that are undesirable for mpls.
> 
> Firstly, rt_ifa_add unconditionally sets RTF_MPATH on the routes it
> adds, which means multiple mpe and mpw interfaces can "own" the same
> input tag. mpe tries to work around this by maintaining a global list of
> mpe interfaces, and iterates over them when a new label is added. That's
> ok (sort of) for mpe, but it doesn't take the tags used by mpw into
> account.
> 
> Secondly, I'd like to start pulling apart the restriction on the use of
> mpls only in rdomain 1. rt_ifa_add doesn't help this situation because
> it assumes that we're adding a route inside the rdomain the interface is
> in, rather than the one it tunnels in. Changing this assumption means
> forking rt_ifa_add, and oh look, that's what I've started here.
> 
> So, if I have the following:
> 
> mpe2: flags=51 rdomain 2 mtu 1500
>index 6 priority 0 llprio 3
>mpls label 1000
>groups: mpe
>inet 192.168.0.25 --> 0.0.0.0 netmask 0x
> mpw0: flags=8843 mtu 1500
>lladdr fe:e1:ba:d0:93:1a
>index 7 priority 0 llprio 3
>encapsulation-type ethernet, control-word
>mpls label: local 16 remote 16
>neighbor: 192.168.0.27
>groups: mpw
>inet 100.64.100.2 netmask 0xff00 broadcast 100.64.100.255
> 
> The following now does what's expected:
> 
> dlg@cpe0 sys$ sudo ifconfig mpe3 create
> dlg@cpe0 sys$ sudo ifconfig mpe3 mplslabel 16  
> ifconfig: SIOCSETLABEL: File exists
> 
> ok?
> 
> Index: net/if_mpe.c
> ===
> RCS file: /cvs/src/sys/net/if_mpe.c,v
> retrieving revision 1.76
> diff -u -p -r1.76 if_mpe.c
> --- net/if_mpe.c  30 Jan 2019 01:09:36 -  1.76
> +++ net/if_mpe.c  30 Jan 2019 01:40:47 -
> @@ -132,10 +132,8 @@ mpe_clone_destroy(struct ifnet *ifp)
> 
>   LIST_REMOVE(sc, sc_list);
> 
> - if (sc->sc_smpls.smpls_label) {
> - rt_ifa_del(>sc_ifa, RTF_MPLS,
> - smplstosa(>sc_smpls));
> - }
> + if (sc->sc_smpls.smpls_label)
> + mpls_ifa_del(>sc_ifa, >sc_smpls);
> 
>   if_detach(ifp);
>   free(sc, M_DEVBUF, sizeof *sc);
> @@ -331,13 +329,11 @@ mpe_ioctl(struct ifnet *ifp, u_long cmd,
>   ifm = ifp->if_softc;
>   if (ifm->sc_smpls.smpls_label) {
>   /* remove old MPLS route */
> - rt_ifa_del(>sc_ifa, RTF_MPLS,
> - smplstosa(>sc_smpls));
> + mpls_ifa_del(>sc_ifa, >sc_smpls);
>   }
>   /* add new MPLS route */
>   ifm->sc_smpls.smpls_label = shim.shim_label;
> - error = rt_ifa_add(>sc_ifa, RTF_MPLS|RTF_LOCAL,
> - smplstosa(>sc_smpls));
> + error = mpls_ifa_add(>sc_ifa, >sc_smpls);
>   if (error) {
>   ifm->sc_smpls.smpls_label = 0;
>   break;
> @@ -348,10 +344,8 @@ mpe_ioctl(struct ifnet *ifp, u_long cmd,
>   /* XXX does not make sense, the MPLS route is on rtable 0 */
>   ifm = ifp->if_softc;
>   if (ifr->ifr_rdomainid != ifp->if_rdomain) {
> - if (ifm->sc_smpls.smpls_label) {
> - rt_ifa_add(>sc_ifa, RTF_MPLS,
> - smplstosa(>sc_smpls));
> - }
> + if (ifm->sc_smpls.smpls_label)
> + mpls_ifa_add(>sc_ifa, >sc_smpls);
>   }
>   /* return with ENOTTY so that the parent handler finishes */
>   return (ENOTTY);
> Index: net/if_mpw.c
> ===
> RCS file: /cvs/src/sys/net/if_mpw.c,v
> retrieving revision 1.31
> diff -u -p -r1.31 if_mpw.c
> --- net/if_mpw.c  30 Jan 2019 01:09:36 -  1.31
> +++ net/if_mpw.c  30 Jan 2019 01:40:47 -
> @@ -116,8 +116,7 @@ mpw_clone_destroy(struct ifnet *ifp)
>   ifp->if_flags &= ~IFF_RUNNING;
> 
>   if (sc->sc_smpls.smpls_label) {
> - rt_ifa_del(>sc_ifa, RTF_MPLS,
> - smplstosa(>sc_smpls));
> + mpls_ifa_del(>sc_ifa, >sc_smpls);
>   }

fork rt_ifa_{add,del} for mpls local input routes

2019-01-29 Thread David Gwynne

mpls uses AF_MPLS routes with RTF_LOCAL set on them to know which tags
are used as input for the mpe and mpw interfaces. setting this up
currently goes through rt_ifa_add, but that has a couple of features
that are undesirable for mpls.

Firstly, rt_ifa_add unconditionally sets RTF_MPATH on the routes it
adds, which means multiple mpe and mpw interfaces can "own" the same
input tag. mpe tries to work around this by maintaining a global list of
mpe interfaces, and iterates over them when a new label is added. That's
ok (sort of) for mpe, but it doesn't take the tags used by mpw into
account.

Secondly, I'd like to start pulling apart the restriction on the use of
mpls only in rdomain 1. rt_ifa_add doesn't help this situation because
it assumes that we're adding a route inside the rdomain the interface is
in, rather than the one it tunnels in. Changing this assumption means
forking rt_ifa_add, and oh look, that's what I've started here.

So, if I have the following:

mpe2: flags=51 rdomain 2 mtu 1500
index 6 priority 0 llprio 3
mpls label 1000
groups: mpe
inet 192.168.0.25 --> 0.0.0.0 netmask 0x
mpw0: flags=8843 mtu 1500
lladdr fe:e1:ba:d0:93:1a
index 7 priority 0 llprio 3
encapsulation-type ethernet, control-word
mpls label: local 16 remote 16
neighbor: 192.168.0.27
groups: mpw
inet 100.64.100.2 netmask 0xff00 broadcast 100.64.100.255

The following now does what's expected:

dlg@cpe0 sys$ sudo ifconfig mpe3 create
dlg@cpe0 sys$ sudo ifconfig mpe3 mplslabel 16  
ifconfig: SIOCSETLABEL: File exists

ok?

Index: net/if_mpe.c
===
RCS file: /cvs/src/sys/net/if_mpe.c,v
retrieving revision 1.76
diff -u -p -r1.76 if_mpe.c
--- net/if_mpe.c30 Jan 2019 01:09:36 -  1.76
+++ net/if_mpe.c30 Jan 2019 01:40:47 -
@@ -132,10 +132,8 @@ mpe_clone_destroy(struct ifnet *ifp)
 
LIST_REMOVE(sc, sc_list);
 
-   if (sc->sc_smpls.smpls_label) {
-   rt_ifa_del(>sc_ifa, RTF_MPLS,
-   smplstosa(>sc_smpls));
-   }
+   if (sc->sc_smpls.smpls_label)
+   mpls_ifa_del(>sc_ifa, >sc_smpls);
 
if_detach(ifp);
free(sc, M_DEVBUF, sizeof *sc);
@@ -331,13 +329,11 @@ mpe_ioctl(struct ifnet *ifp, u_long cmd,
ifm = ifp->if_softc;
if (ifm->sc_smpls.smpls_label) {
/* remove old MPLS route */
-   rt_ifa_del(>sc_ifa, RTF_MPLS,
-   smplstosa(>sc_smpls));
+   mpls_ifa_del(>sc_ifa, >sc_smpls);
}
/* add new MPLS route */
ifm->sc_smpls.smpls_label = shim.shim_label;
-   error = rt_ifa_add(>sc_ifa, RTF_MPLS|RTF_LOCAL,
-   smplstosa(>sc_smpls));
+   error = mpls_ifa_add(>sc_ifa, >sc_smpls);
if (error) {
ifm->sc_smpls.smpls_label = 0;
break;
@@ -348,10 +344,8 @@ mpe_ioctl(struct ifnet *ifp, u_long cmd,
/* XXX does not make sense, the MPLS route is on rtable 0 */
ifm = ifp->if_softc;
if (ifr->ifr_rdomainid != ifp->if_rdomain) {
-   if (ifm->sc_smpls.smpls_label) {
-   rt_ifa_add(>sc_ifa, RTF_MPLS,
-   smplstosa(>sc_smpls));
-   }
+   if (ifm->sc_smpls.smpls_label)
+   mpls_ifa_add(>sc_ifa, >sc_smpls);
}
/* return with ENOTTY so that the parent handler finishes */
return (ENOTTY);
Index: net/if_mpw.c
===
RCS file: /cvs/src/sys/net/if_mpw.c,v
retrieving revision 1.31
diff -u -p -r1.31 if_mpw.c
--- net/if_mpw.c30 Jan 2019 01:09:36 -  1.31
+++ net/if_mpw.c30 Jan 2019 01:40:47 -
@@ -116,8 +116,7 @@ mpw_clone_destroy(struct ifnet *ifp)
ifp->if_flags &= ~IFF_RUNNING;
 
if (sc->sc_smpls.smpls_label) {
-   rt_ifa_del(>sc_ifa, RTF_MPLS,
-   smplstosa(>sc_smpls));
+   mpls_ifa_del(>sc_ifa, >sc_smpls);
}
 
ether_ifdetach(ifp);
@@ -162,9 +161,8 @@ mpw_ioctl(struct ifnet *ifp, u_long cmd,
/* Teardown all configuration if got no nexthop */
sin = (struct sockaddr_in *) _nexthop;
if (sin->sin_addr.s_addr == 0) {
-   if (rt_ifa_del(>sc_ifa, RTF_MPLS,
-   smplstosa(>sc_smpls)) == 0)
-   sc->sc_smpls.smpls_label = 0;
+   mpls_ifa_del(>sc_ifa, >sc_smpls);
+   sc->sc_smpls.smpls_label = 0;
 
memset(>sc_rshim, 0, sizeof(sc->sc_rshim));

Re: MPLSv6 1/2: kernel diff

2019-01-27 Thread David Gwynne




> On 27 Dec 2018, at 5:42 pm, Claudio Jeker  wrote:
> 
> On Wed, Dec 26, 2018 at 09:27:59PM +0100, Denis Fondras wrote:
>> Resend because of nasty typo :/
>> 
>> On Mon, Dec 24, 2018 at 08:43:10PM -0200, Martin Pieuchot wrote:
>>> I'm not happy with adding the IFF_MULTICAST flag and SIOC{ADD,DEL}MULTI
>>> ioctls.  It seems to be a common pattern in existing pseudo-drivers,
>>> so this shouldn't block you.  However I'd greatly appreciate if you
>>> could explain to us which code is asking for this and if this could be
>>> improved.
>>> 
>> 
>> Interface needs IFF_MULTICAST when enabling IPv6 as per in6_ifattach().
>> When an address is configured, the interface joins multicast groups with
>> in6_joingroup() (hence use SIOCADDMULTI) but only if IFF_MULTICAST is set.
>> It is useful for external-facing interfaces, not so much for internal ones
>> (like mpe) as they won't process all-{nodes, routers} packets.
>> 
>> We may remove the test in in6_ifattach() because there are many tests through
>> the stack. It works if I disable the test in in6_ifattach() and remove the
>> IFF_MULTICAST. However I haven't tested further so it may break elsewhere.
>> 
> 
> Doing IFF_MULTICAST on a true IFF_POINTOPOINT interface is indeed trivial
> and safe. I think that bit of the mpe(4) diff is OK and I would not go and
> try to work around this in in6 code. IPv6 requires multicast and it was an
> oversight in mpe(4) to not add it.

My current thinking on this is that IFF_MULTICAST should be used by v6 to 
decide if an interface can do neighbour discovery or not; it should not be used 
to decide if an interface can have an IPv6 address assigned or not. You should 
be able to assign v6 to any interface that can transport it, but if it can't do 
ND then you'll need to statically assign prefixes. Assigning prefixes is what 
bgpd implementing VPNv6 would be doing, and it would work fine.

This might mean we don't need IFF_MULTICAST on the layer three p2p tunnel 
interfaces...

Cheers,
dlg

< 1 2 3 4 5 6 7 8 9 10 >

301 - 400 of 1094 matches

Mail list logo