rtsock: Check prefixlength before deciding an existing L2 cached route is the same as a new route being added

2021-11-27 Thread Mitchell Krome
Hi,

I ran into a problem where when using /31 netmasks on point to point
links, I am unable to add a larger summary route that happens to have
the same network address as the /31 link route the box has. It's a
bit hard to explain so hopefully the art below helps.


bsd1rtr2
  +-+ +-+
  | vio0  172.19.6.33/31|<--->|172.19.6.32/31  if1  |
  +-+ +-+
172.19.6.32/27 -> 172.19.6.32

$route show
...
172.17.6.32/31 172.17.6.33UCn10 - 4 vio0
172.17.6.3250:40:61:d6:05:e3  UHLch 79 5959 - 3 vio0
172.17.6.3352:54:00:e7:0c:c5  UHLl   0   70 - 1 vio0

When I try to add the /27 route, it comes back with invalid argument

$ doas route add 172.17.6.32/27 172.17.6.32
add net 172.17.6.32/27: gateway 172.17.6.32: Invalid argument

I managed to track this down to a check in rtsock.c where it thinks that
the route I am adding is a modification of the existing L2 route for
172.19.7.32. The Invalid Argument is because later on in the RTM_CHANGE
code it wants a AF of the new nexthop to be the same as the route it
thinks its changing. When using /30 routes this would never happen as
you wouldn't get a host on the network address I guess. 

Diff below fixed the problem by also checking the prefix length matches
before assuming the route isn't a new route and jumping into RTM_CHANGE
code.

The code in change: also checks the AF are the same, before it will
do a modification so I'm not sure if this should also check that before
assuming the route isn't actually a new one?

As far as I could tell reading the other code, it seemed like the only
way to get RTF_CACHED flag was to be a L2 nexthop, so I don't think that
there would be a case where the prefix length of the new and old entries
don't match but the intent was to go through the change path. 




diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c
index a717d112e..9dbc8ca90 100644
--- a/sys/net/rtsock.c
+++ b/sys/net/rtsock.c
@@ -937,7 +937,10 @@ rtm_output(struct rt_msghdr *rtm, struct rtentry **prt,
 * cached route because this can lead to races in the
 * receive path.  Instead we update the L2 cache.
 */
-   if ((rt != NULL) && ISSET(rt->rt_flags, RTF_CACHED)) {
+   plen = rtable_satoplen(info->rti_info[RTAX_DST]->sa_family,
+   info->rti_info[RTAX_NETMASK]);
+   if ((rt != NULL) && (plen == rt->rt_plen) &&
+   ISSET(rt->rt_flags, RTF_CACHED)) {
ifp = if_get(rt->rt_ifidx);
if (ifp == NULL) {
rtfree(rt);



Re: disabling a cpu socket

2021-11-27 Thread Mark Kettenis
> Date: Sat, 27 Nov 2021 20:28:39 +
> From: Stuart Henderson 
> 
> I have some amd64 machines which are doing 600+ gettimeofday/second
> at quiet times and way more when they're busy and I'd quite like to
> get them onto userland tsc, however they're dual socket and the skew
> between cores on the different sockets is too great. There's no way to
> disable a socket in BIOS settings, and physically removing a cpu would
> be very inconvenient.

I don't think ~1000 system calls per second is something you really
need to be worried about.

> 
> Where would I need to make changes (as a local patch obviously) to skip
> a cpu? Can I just avoid doing the cpu_intr_init/cpu_start_secondary/
> sched_init_cpu/ncpus++/etc from amd64/cpu.c 638-645 ? i.e. these bits

Best way is probably to skip attaching the CPUs in dev/acpi/acpimadt.c

>  631 case CPU_ROLE_AP:
>  632 /*
>  633  * report on an AP
>  634  */
>  635 printf("apid %d (application processor)\n", 
> caa->cpu_apicid);
>  636 
>  637 #if defined(MULTIPROCESSOR)
>  638 cpu_intr_init(ci);
>  639 cpu_start_secondary(ci);
>  640 sched_init_cpu(ci);
>  641 ncpus++;
>  642 if (ci->ci_flags & CPUF_PRESENT) {
>  643 ci->ci_next = cpu_info_list->ci_next;
>  644 cpu_info_list->ci_next = ci;
>  645 }
>  646 #else
>  647 printf("%s: not started\n", sc->sc_dev.dv_xname);
>  648 #endif
> 
> 



Re: update iwx(4) firmware to -67

2021-11-27 Thread Matthias Schmidt
Hi Stefan,

* Stefan Sperling wrote:
> This patch updates iwx(4) to new firmware images (API version -67).
> 
> Intel has published a related security advisory:
> https://www.intel.com/content/www/us/en/security-center/advisory/intel-sa-00509.html
> 
> Make sure to get a fresh kernel from -current sources and update
> to iwx-firmware-20211101 with fw_update before trying this patch.
> The new firmware version shows as "fw ver 67.8f59b80b.0" in dmesg.
> I have tested on AX200 and AX201 and I am not seeing any issues.
> 
> iwx(4) devices which are using the iwx-Qu-c0-hr-b0-63 image did
> not receive a firmware update. I cannot tell why.

Tested on the following device and works totally flawlessly here!

iwx0: hw rev 0x340, fw ver 67.8f59b80b.0, address 48:51:c5:0e:3e:68
iwx0 at pci2 dev 0 function 0 "Intel Wi-Fi 6 AX200" rev 0x1a, msix

Cheers

Matthias

> ok?
> 
>  
> diff 777a5184786f624f57b85b8460919a1130498508 
> 1d9e5ec96bf733cdb7c339f7179b945ff1e0a937
> blob - 1d87b3522c60081eac0b0f54cdd772984de4b340
> blob + d48e22237cb568cb0bed6a49e82d3be6d4744da4
> --- sys/dev/pci/if_iwx.c
> +++ sys/dev/pci/if_iwx.c
> @@ -9266,7 +9266,7 @@ iwx_attach(struct device *parent, struct device *self,
>  
>   switch (PCI_PRODUCT(pa->pa_id)) {
>   case PCI_PRODUCT_INTEL_WL_22500_1:
> - sc->sc_fwname = "iwx-cc-a0-63";
> + sc->sc_fwname = "iwx-cc-a0-67";
>   sc->sc_device_family = IWX_DEVICE_FAMILY_22000;
>   sc->sc_integrated = 0;
>   sc->sc_ltr_delay = IWX_SOC_FLAGS_LTR_APPLY_DELAY_NONE;
> @@ -9283,7 +9283,7 @@ iwx_attach(struct device *parent, struct device *self,
>   return;
>   }
>  
> - sc->sc_fwname = "iwx-QuZ-a0-hr-b0-63";
> + sc->sc_fwname = "iwx-QuZ-a0-hr-b0-67";
>   sc->sc_device_family = IWX_DEVICE_FAMILY_22000;
>   sc->sc_integrated = 1;
>   sc->sc_ltr_delay = IWX_SOC_FLAGS_LTR_APPLY_DELAY_200;
> 



disabling a cpu socket

2021-11-27 Thread Stuart Henderson
I have some amd64 machines which are doing 600+ gettimeofday/second
at quiet times and way more when they're busy and I'd quite like to
get them onto userland tsc, however they're dual socket and the skew
between cores on the different sockets is too great. There's no way to
disable a socket in BIOS settings, and physically removing a cpu would
be very inconvenient.

Where would I need to make changes (as a local patch obviously) to skip
a cpu? Can I just avoid doing the cpu_intr_init/cpu_start_secondary/
sched_init_cpu/ncpus++/etc from amd64/cpu.c 638-645 ? i.e. these bits

 631 case CPU_ROLE_AP:
 632 /*
 633  * report on an AP
 634  */
 635 printf("apid %d (application processor)\n", 
caa->cpu_apicid);
 636 
 637 #if defined(MULTIPROCESSOR)
 638 cpu_intr_init(ci);
 639 cpu_start_secondary(ci);
 640 sched_init_cpu(ci);
 641 ncpus++;
 642 if (ci->ci_flags & CPUF_PRESENT) {
 643 ci->ci_next = cpu_info_list->ci_next;
 644 cpu_info_list->ci_next = ci;
 645 }
 646 #else
 647 printf("%s: not started\n", sc->sc_dev.dv_xname);
 648 #endif



[patch] ehci: change explicit function names to __func__ in debugging printfs

2021-11-27 Thread Mikhail
While fiddling with ehci.c and fighting with urndis I noticed that most
of the debugging printf's use explicit names, which is inconvenient for
grep'ing. Also, couple of items used wrong function names
(ehci_check_intr instead of ehci_check_qh_intr).

Compilation tested with DIAGNOSTIC and EHCI_DEBUG defines.

diff --git sys/dev/usb/ehci.c sys/dev/usb/ehci.c
index afc423dbbb6..37ff3beeb7a 100644
--- sys/dev/usb/ehci.c
+++ sys/dev/usb/ehci.c
@@ -314,10 +314,10 @@ ehci_init(struct ehci_softc *sc)
sc->sc_offs = EREAD1(sc, EHCI_CAPLENGTH);
 
sparams = EREAD4(sc, EHCI_HCSPARAMS);
-   DPRINTF(("ehci_init: sparams=0x%x\n", sparams));
+   DPRINTF(("%s: sparams=0x%x\n", __func__, sparams));
sc->sc_noport = EHCI_HCS_N_PORTS(sparams);
cparams = EREAD4(sc, EHCI_HCCPARAMS);
-   DPRINTF(("ehci_init: cparams=0x%x\n", cparams));
+   DPRINTF(("%s: cparams=0x%x\n", __func__, cparams));
 
/* MUST clear segment register if 64 bit capable. */
if (EHCI_HCC_64BIT(cparams))
@@ -521,7 +521,7 @@ ehci_intr1(struct ehci_softc *sc)
/* In case the interrupt occurs before initialization has completed. */
if (sc == NULL) {
 #ifdef DIAGNOSTIC
-   printf("ehci_intr1: sc == NULL\n");
+   printf("%s: sc == NULL\n", __func__);
 #endif
return (0);
}
@@ -704,7 +704,7 @@ ehci_check_qh_intr(struct ehci_softc *sc, struct usbd_xfer 
*xfer)
 * short packet (SPD and not ACTIVE).
 */
if (letoh32(lsqtd->qtd.qtd_status) & EHCI_QTD_ACTIVE) {
-   DPRINTFN(12, ("ehci_check_intr: active ex=%p\n", ex));
+   DPRINTFN(12, ("%s: active ex=%p\n", __func__, ex));
for (sqtd = ex->sqtdstart; sqtd != lsqtd; sqtd=sqtd->nextqtd) {
usb_syncmem(>dma,
sqtd->offs + offsetof(struct ehci_qtd, qtd_status),
@@ -724,7 +724,7 @@ ehci_check_qh_intr(struct ehci_softc *sc, struct usbd_xfer 
*xfer)
if (EHCI_QTD_GET_BYTES(status) != 0)
goto done;
}
-   DPRINTFN(12, ("ehci_check_intr: ex=%p std=%p still active\n",
+   DPRINTFN(12, ("%s: ex=%p std=%p still active\n", __func__,
  ex, ex->sqtdstart));
usb_syncmem(>dma,
lsqtd->offs + offsetof(struct ehci_qtd, qtd_status),
@@ -876,7 +876,7 @@ ehci_idone(struct usbd_xfer *xfer)
int s = splhigh();
if (ex->isdone) {
splx(s);
-   printf("ehci_idone: ex=%p is done!\n", ex);
+   printf("%s: ex=%p is done!\n", __func__, ex);
return;
}
ex->isdone = 1;
@@ -904,8 +904,8 @@ ehci_idone(struct usbd_xfer *xfer)
}
 
cerr = EHCI_QTD_GET_CERR(status);
-   DPRINTFN(/*10*/2, ("ehci_idone: len=%d, actlen=%d, cerr=%d, "
-   "status=0x%x\n", xfer->length, actlen, cerr, status));
+   DPRINTFN(/*10*/2, ("%s: len=%d, actlen=%d, cerr=%d, "
+   "status=0x%x\n", __func__, xfer->length, actlen, cerr, status));
xfer->actlen = actlen;
if ((status & EHCI_QTD_HALTED) != 0) {
if ((status & EHCI_QTD_BABBLE) == 0 && cerr > 0)
@@ -920,7 +920,7 @@ ehci_idone(struct usbd_xfer *xfer)
usbd_xfer_isread(xfer) ?
BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
usb_transfer_complete(xfer);
-   DPRINTFN(/*12*/2, ("ehci_idone: ex=%p done\n", ex));
+   DPRINTFN(/*12*/2, ("%s: ex=%p done\n", __func__, ex));
 }
 
 void
@@ -1367,7 +1367,7 @@ ehci_open(struct usbd_pipe *pipe)
int ival, speed, naks;
int hshubaddr, hshubport;
 
-   DPRINTFN(1, ("ehci_open: pipe=%p, addr=%d, endpt=%d\n",
+   DPRINTFN(1, ("%s: pipe=%p, addr=%d, endpt=%d\n", __func__,
pipe, addr, ed->bEndpointAddress));
 
if (sc->sc_bus.dying)
@@ -1408,7 +1408,7 @@ ehci_open(struct usbd_pipe *pipe)
speed = EHCI_QH_SPEED_HIGH;
break;
default:
-   panic("ehci_open: bad device speed %d", dev->speed);
+   panic("%s: bad device speed %d", __func__, dev->speed);
}
 
/*
@@ -1512,18 +1512,18 @@ ehci_open(struct usbd_pipe *pipe)
}
/* Spec page 271 says intervals > 16 are invalid */
if (ed->bInterval == 0 || ed->bInterval > 16) {
-   printf("ehci: opening pipe with invalid bInterval\n");
+   printf("%s: opening pipe with invalid bInterval\n", 
__func__);
return (USBD_INVAL);
}
if (UGETW(ed->wMaxPacketSize) == 0) {
-   printf("ehci: zero length endpoint open request\n");
+   printf("%s: zero length endpoint open request\n", 
__func__);
return 

Re: slaacd(8): prevent crash when interface disappears

2021-11-27 Thread Florian Obser
anyone?

On 2021-11-18 09:02 +01, Florian Obser  wrote:
> This is split in two for easier review and I also intend to commit it
> like this.
>
> The first diff shuffles setting of if_index around so that it's
> available in all switch cases and uses it consistently instead of
> ifm->ifm_index.
>
> That way we can copy the if_name == NULL check into each case block
> preventing crashes when an interface disappears at just the wrong
> moment.
>
> OK?
>
> (btw. I found this because I transmogrified slaacd into gelatod and kn@
> reported a crash while running in debug mode so I could spot what's
> wrong with it. Much harder to find in slaacd...)
>
> commit 2880598cb424e5d889ecdbf06d9d72d777b11569
> Author: Florian Obser 
> Date:   Wed Nov 17 20:13:55 2021 +0100
>
> normalize if_index handling in route messages
>
> diff --git frontend.c frontend.c
> index 72d7929c5df..29446c11e16 100644
> --- frontend.c
> +++ frontend.c
> @@ -793,30 +793,28 @@ handle_route_message(struct rt_msghdr *rtm, struct 
> sockaddr **rti_info)
>   switch (rtm->rtm_type) {
>   case RTM_IFINFO:
>   ifm = (struct if_msghdr *)rtm;
> - if_name = if_indextoname(ifm->ifm_index, ifnamebuf);
> + if_index = ifm->ifm_index;
> + if_name = if_indextoname(if_index, ifnamebuf);
>   if (if_name == NULL) {
> - log_debug("RTM_IFINFO: lost if %d", ifm->ifm_index);
> - if_index = ifm->ifm_index;
> + log_debug("RTM_IFINFO: lost if %d", if_index);
>   frontend_imsg_compose_engine(IMSG_REMOVE_IF, 0, 0,
>   _index, sizeof(if_index));
>   remove_iface(if_index);
> + break;
> + }
> + xflags = get_xflags(if_name);
> + if (xflags == -1 || !(xflags & (IFXF_AUTOCONF6 |
> + IFXF_AUTOCONF6TEMP))) {
> + log_debug("RTM_IFINFO: %s(%d) no(longer) autoconf6", 
> if_name,
> + if_index);
> + frontend_imsg_compose_engine(IMSG_REMOVE_IF, 0,
> + 0, _index, sizeof(if_index));
> + remove_iface(if_index);
>   } else {
> - xflags = get_xflags(if_name);
> - if (xflags == -1 || !(xflags & (IFXF_AUTOCONF6 |
> - IFXF_AUTOCONF6TEMP))) {
> - log_debug("RTM_IFINFO: %s(%d) no(longer) "
> -"autoconf6", if_name, ifm->ifm_index);
> - if_index = ifm->ifm_index;
> - frontend_imsg_compose_engine(IMSG_REMOVE_IF, 0,
> - 0, _index, sizeof(if_index));
> - remove_iface(if_index);
> - } else {
> - update_iface(ifm->ifm_index, if_name);
> + update_iface(if_index, if_name);
>  #ifndef  SMALL
> - update_autoconf_addresses(ifm->ifm_index,
> - if_name);
> + update_autoconf_addresses(if_index, if_name);
>  #endif   /* SMALL */
> - }
>   }
>   break;
>   case RTM_IFANNOUNCE:
> @@ -830,27 +828,29 @@ handle_route_message(struct rt_msghdr *rtm, struct 
> sockaddr **rti_info)
>   break;
>   case RTM_NEWADDR:
>   ifm = (struct if_msghdr *)rtm;
> - if_name = if_indextoname(ifm->ifm_index, ifnamebuf);
> - log_debug("RTM_NEWADDR: %s[%u]", if_name, ifm->ifm_index);
> - update_iface(ifm->ifm_index, if_name);
> + if_index = ifm->ifm_index;
> + if_name = if_indextoname(if_index, ifnamebuf);
> + log_debug("RTM_NEWADDR: %s[%u]", if_name, if_index);
> + update_iface(if_index, if_name);
>   break;
>   case RTM_DELADDR:
>   ifm = (struct if_msghdr *)rtm;
> - if_name = if_indextoname(ifm->ifm_index, ifnamebuf);
> + if_index = ifm->ifm_index;
> + if_name = if_indextoname(if_index, ifnamebuf);
>   if (rtm->rtm_addrs & RTA_IFA && rti_info[RTAX_IFA]->sa_family
>   == AF_INET6) {
> - del_addr.if_index = ifm->ifm_index;
> + del_addr.if_index = if_index;
>   memcpy(_addr.addr, rti_info[RTAX_IFA], sizeof(
>   del_addr.addr));
>   frontend_imsg_compose_engine(IMSG_DEL_ADDRESS,
>   0, 0, _addr, sizeof(del_addr));
> - log_debug("RTM_DELADDR: %s[%u]", if_name,
> - ifm->ifm_index);
> + log_debug("RTM_DELADDR: %s[%u]", if_name, if_index);
>   }
>   break;
>   case RTM_CHGADDRATTR:
>   ifm 

Re: urndis0: IOERROR

2021-11-27 Thread Mikhail
On Mon, Nov 22, 2021 at 04:28:54PM +0100, Gerhard Roth wrote:
> 
> Your OpenBSD pcap contains not "URB_CONTROL in" packets. Don't know where
> they got lost but since you cannot see any data sent by the function on the
> control pipe, it is almost impossible to do any debugging here. You only see
> what the host sends to the function by miss the replies.
> 

I did some more investigation and patched ehci.c to dump qTDs and USB
registers in ehci_idone function, dump shows that transaction is HALTED
and has XACTERR, also sometimes I see such errors:

Nov 27 17:55:00 edge /bsd: uhub3: device problem, disabling port 2

Spec[1] says the following about XactErr:

> Transaction Error (XactErr). Set to a 1 by the Host Controller during
> status update in the case where the host did not receive a valid
> response from the device (Timeout, CRC, Bad PID, etc.). This bit will
> only be set for IN transactions.

I also tested latest FreeBSD and NetBSD, but they didn't even show
description string, and NetBSD ended with the same "device problem"
output as above.

The modem works on Debian Linux 11, but in journal it says that it uses
CDC.

qTD dump follows, maybe someone will have quick clue what's happening (I
added a comment about a thing which looks strange), but I tend to an
idea of changing the modem.

[1] - 
https://www.intel.com/content/dam/www/public/us/en/documents/technical-specifications/ehci-specification-for-usb.pdf

Nov 26 21:30:49 edge /bsd: urndis0 at uhub3 port 2 configuration 2 interface 0 
"Yota Devices LTD Modem YOTA 4G LTE" rev 2.00/3.34 addr 5
Nov 26 21:30:49 edge /bsd: urndis0: using RNDIS
Nov 26 21:30:50 edge /bsd: ehci_check_qh_intr: no EHCI_QTD_ACTIVE
Nov 26 21:30:50 edge /bsd: QTD(0xfd80d8438f80) at 0xd8438f80:
Nov 26 21:30:50 edge /bsd:   next=0xd8438e80<> altnext=0xd8438e80<>
Nov 26 21:30:50 edge /bsd:   status=0x8e00: toggle=1 bytes=0x0 ioc=0 
c_page=0x0
Nov 26 21:30:50 edge /bsd: cerr=3 pid=2 stat=0x0
Nov 26 21:30:50 edge /bsd:   buffer[0]=0xd8439ac8
Nov 26 21:30:50 edge /bsd:   buffer[1]=0x
Nov 26 21:30:50 edge /bsd:   buffer[2]=0x
Nov 26 21:30:50 edge /bsd:   buffer[3]=0x
Nov 26 21:30:50 edge /bsd:   buffer[4]=0x
Nov 26 21:30:50 edge /bsd: QTD(0xfd80d8438e80) at 0xd8438e80:
Nov 26 21:30:50 edge /bsd:   next=0x0001 altnext=0x0001
Nov 26 21:30:50 edge /bsd:   status=0x80008148: toggle=1 bytes=0x0 ioc=1 
c_page=0x0
Nov 26 21:30:50 edge /bsd: cerr=0 pid=1 stat=0x48
Nov 26 21:30:50 edge /bsd:   buffer[0]=0x
Nov 26 21:30:50 edge /bsd:   buffer[1]=0x
Nov 26 21:30:50 edge /bsd:   buffer[2]=0x
Nov 26 21:30:50 edge /bsd:   buffer[3]=0x
Nov 26 21:30:50 edge /bsd:   buffer[4]=0x
Nov 26 21:30:50 edge /bsd: cmd=0x00020031, sts=0xc008, ien=0x0037
Nov 26 21:30:50 edge /bsd: frindex=0x3712 ctrdsegm=0x 
periodic=0xd844b000 async=0xd8437e80
Nov 26 21:30:50 edge /bsd: port 1 status=0x1005
Nov 26 21:30:50 edge /bsd: port 2 status=0x1000
Nov 26 21:30:50 edge /bsd: port 3 status=0x1000

Nov 26 21:30:50 edge /bsd: ehci_check_qh_intr: status & EHCI_QTD_HALTED
Nov 26 21:30:50 edge /bsd: QTD(0xfd80d8438e80) at 0xd8438e80:
Nov 26 21:30:50 edge /bsd:   next=0xd8438e00<> altnext=0xd8438e00<>
Nov 26 21:30:50 edge /bsd:   status=0x00080248: toggle=0 bytes=0x8 ioc=0 
c_page=0x0
Nov 26 21:30:50 edge /bsd: cerr=0 pid=2 stat=0x48
  ^
This is strange, the spec says that XactErr can be set only for IN
transactions, but pid=2 means it's SETUP. Also, this qTD address is the
same (0xfd80d8438e80) as previous one, which was also HALTED and
XACTERR.

Nov 26 21:30:50 edge /bsd:   buffer[0]=0xd8439ac0
Nov 26 21:30:50 edge /bsd:   buffer[1]=0x
Nov 26 21:30:50 edge /bsd:   buffer[2]=0x
Nov 26 21:30:50 edge /bsd:   buffer[3]=0x
Nov 26 21:30:50 edge /bsd:   buffer[4]=0x
Nov 26 21:30:50 edge /bsd: QTD(0xfd80d8438e00) at 0xd8438e00:
Nov 26 21:30:50 edge /bsd:   next=0xd8438f80<> altnext=0xd8438f80<>
Nov 26 21:30:50 edge /bsd:   status=0x80180c80: toggle=1 bytes=0x18 ioc=0 
c_page=0x0
Nov 26 21:30:50 edge /bsd: cerr=3 pid=0 stat=0x80
Nov 26 21:30:50 edge /bsd:   buffer[0]=0xd8459dc0
Nov 26 21:30:50 edge /bsd:   buffer[1]=0x
Nov 26 21:30:50 edge /bsd:   buffer[2]=0x
Nov 26 21:30:50 edge /bsd:   buffer[3]=0x
Nov 26 21:30:50 edge /bsd:   buffer[4]=0x
Nov 26 21:30:50 edge /bsd: QTD(0xfd80d8438f80) at 0xd8438f80:
Nov 26 21:30:50 edge /bsd:   next=0x0001 altnext=0x0001
Nov 26 21:30:50 edge /bsd:   status=0x80008d80: toggle=1 bytes=0x0 ioc=1 
c_page=0x0
Nov 26 21:30:50 edge /bsd: cerr=3 pid=1 stat=0x80
Nov 26 21:30:50 edge /bsd:   buffer[0]=0x
Nov 26 21:30:50 edge /bsd:   buffer[1]=0x
Nov 26 21:30:50 edge /bsd:   buffer[2]=0x
Nov 26 21:30:50 edge /bsd:   buffer[3]=0x
Nov 26 21:30:50 edge /bsd:   buffer[4]=0x
Nov 26 21:30:50 edge /bsd: 

tcpdump print-ipsec: move to EVP_CIPHER_CTX on the heap

2021-11-27 Thread Theo Buehler
EVP_CIPHER_CTX will become opaque, so this will need to change.
Allocate ctx once in esp_init() and modify all accesses accordingly.

Two more things:

Fix the error check of EVP_CipherInit() in esp_init(). The return
values are 1 for success and 1 for failure, so the usual OpenSSL idiom
applies.

In esp_decrypt() I'm not 100% sure how to react to EVP_CipherInit() or
EVP_Cipher() failure (both are currently unchecked). If decryption
fails, the function will likely return a few lines down since the
padding doesn't check out, so a silent return seemed like the
appropriate action.

Index: print-ipsec.c
===
RCS file: /cvs/src/usr.sbin/tcpdump/print-ipsec.c,v
retrieving revision 1.26
diff -u -p -r1.26 print-ipsec.c
--- print-ipsec.c   24 Jan 2020 22:46:37 -  1.26
+++ print-ipsec.c   27 Nov 2021 16:57:20 -
@@ -59,7 +59,7 @@ struct esp_hdr {
 
 static int espinit = 0;
 static int espauthlen = 12;
-static EVP_CIPHER_CTX ctx;
+static EVP_CIPHER_CTX *ctx;
 
 int
 esp_init (char *espspec)
@@ -105,8 +105,12 @@ esp_init (char *espspec)
}
key[i] = strtoul(s, NULL, 16);
}
-   EVP_CIPHER_CTX_init();
-   if (EVP_CipherInit(, evp, key, NULL, 0) < 0) {
+   if ((ctx = EVP_CIPHER_CTX_new()) == NULL) {
+   free(key);
+   error("espkey init failed");
+   }
+   if (!EVP_CipherInit(ctx, evp, key, NULL, 0)) {
+   EVP_CIPHER_CTX_free(ctx);
free(key);
error("espkey init failed");
}
@@ -115,16 +119,16 @@ esp_init (char *espspec)
return (0);
 }
 
-void 
+void
 esp_decrypt (const u_char *bp, u_int len, const u_char *bp2)
 {
const struct ip *ip;
u_char *data, pad, nh;
int blocksz;
- 
+
ip = (const struct ip *)bp2;
 
-   blocksz = EVP_CIPHER_CTX_block_size();
+   blocksz = EVP_CIPHER_CTX_block_size(ctx);
 
/* Skip fragments and short packets */
if (ntohs(ip->ip_off) & 0x3fff)
@@ -149,12 +153,15 @@ esp_decrypt (const u_char *bp, u_int len
len -= espauthlen;
 
/* the first block contains the IV */
-   EVP_CipherInit(, NULL, NULL, data, 0);
+   if (!EVP_CipherInit(ctx, NULL, NULL, data, 0))
+   return;
+
len -= blocksz;
data += blocksz;
 
/* decrypt remaining payload */
-   EVP_Cipher(, data, data, len);
+   if (!EVP_Cipher(ctx, data, data, len))
+   return;
 
nh = data[len - 1];
pad = data[len - 2];



Re: iwm/iwx: try to make roaming more reliable

2021-11-27 Thread Stefan Sperling
On Sat, Nov 27, 2021 at 09:57:45AM -0600, Aaron Poffenberger wrote:
> I see two differences. Before the patch, before "deauth" I see "sending
> delba" but not after patching, and before "firmware has detected
> regulatory domain 'US'", but not after.

I decided to try not sending a DELBA because it is immediately followed
by a DEAUTH anyway. The AP will clear block-ack session state in response
to DELBA, and delete *all* state in response to DEAUTH. So omitting DELBA
should be fine, and eliminates a frame we need to manage to send out
successfully in order to roam away. There is an #if 0 in the patch
which you can switch to an #if 1 in order to send the DELBA frame if
you would like to try that.

Not getting a regulatory notification from the firmware does not matter.
We do not do anything with such information yet beyond printing it to
dmesg in debug mode.



Re: iwm/iwx: try to make roaming more reliable

2021-11-27 Thread Aaron Poffenberger
On 2021-11-27 12:44 +0100, Stefan Sperling wrote:
> This patch reworks the steps involved in roaming to a new access
> point on iwm(4) and iwx(4) interfaces.
> 
> The current implementation suffers from race conditions which can
> leave the interface in a state where it gets "stuck". I have seen
> this happen on iwm(4) 9560 in particular, while testing the driver
> with new firmware images recently published by Intel. This may well
> be related to other hangs people have reported in multi-AP environments
> on both iwm(4) and iwx(4).

During testing I didn't see the race condition without the patch, but
in the past there have been occassions where I had to run `...
netstart.sh iwm0`. So far, I see no regressions with it.

The card is an 8265 in a Lenovo T450s.

iwm0 at pci2 dev 0 function 0 "Intel Dual Band Wireless-AC 8265" rev 0x78, msi

The wireless access points are both TP-Link EAP 245s, one a hardware
V1, and the other V3.


Before Patch:
iwm0: sending delba to 50:c7:bf:94:08:44 on channel 11 mode 11n
iwm0: sending deauth to 50:c7:bf:94:08:44 on channel 11 mode 11n
iwm0: firmware has detected regulatory domain 'US' (0x5553)
iwm0: roaming from 50:c7:bf:94:08:44 chan 11 to b0:95:75:6d:f0:97 chan 44
iwm0: RUN -> AUTH
iwm0: sending auth to b0:95:75:6d:f0:97 on channel 44 mode 11a
iwm0: AUTH -> ASSOC
iwm0: sending assoc_req to b0:95:75:6d:f0:97 on channel 44 mode 11a
iwm0: ASSOC -> RUN


After Patch:
iwm0: sending deauth to 50:c7:bf:94:08:44 on channel 11 mode 11n
iwm0: roaming from 50:c7:bf:94:08:44 chan 11 to b0:95:75:6d:f0:97 chan 44
iwm0: RUN -> AUTH
iwm0: sending auth to b0:95:75:6d:f0:97 on channel 44 mode 11a
iwm0: AUTH -> ASSOC
iwm0: sending assoc_req to b0:95:75:6d:f0:97 on channel 44 mode 11a
iwm0: ASSOC -> RUN


I see two differences. Before the patch, before "deauth" I see "sending
delba" but not after patching, and before "firmware has detected
regulatory domain 'US'", but not after.

Perhaps they're unimportant, or don't show up in all roaming
conditions.

Cheers,

--Aaron



iwm/iwx: try to make roaming more reliable

2021-11-27 Thread Stefan Sperling
This patch reworks the steps involved in roaming to a new access
point on iwm(4) and iwx(4) interfaces.

The current implementation suffers from race conditions which can
leave the interface in a state where it gets "stuck". I have seen
this happen on iwm(4) 9560 in particular, while testing the driver
with new firmware images recently published by Intel. This may well
be related to other hangs people have reported in multi-AP environments
on both iwm(4) and iwx(4).

With this patch in place, net80211 can now pass control to drivers
which provide an optional bgscan_done() handler. In this handler the
driver will do everything necessary to prepare the device for switching
APs, and then call back into net80211 to trigger the AP switch.

In the previous implementation, net80211 asked the driver to prepare
the device and immediately proceeded with switching APs. The driver
may need to wait for firmware commands to complete, which is done in a
task context. So we have a situation where the driver-side task and
net80211's switching of APs (which involves sending frames) kind of
happen in parallel, and things may break.

To test roaming, you need to do the following:

All APs involved need to use the same SSID for this to work as intended.
If you do not have such a setup, you cannot effectively test this patch
(unless you just want to run the patch anyway and look for regressions).

Use wifi with 'ifconfig iwm0/iwx0 debug' enabled to make roaming attempts
appear in /var/log/messages, and watch this file with a command such as:

  tail -f /var/log/messages

Move towards another access point and trigger a background scan by
running this command as root:

  ifconfig iwm0 scan

It may take a few scan attempts until roaming triggers.
Successful roaming displays the following in /var/log/messages:

  iwm0: roaming from 00:2b:a2:95:e3:e4 chan 1 to 00:2b:a2:95:e3:f4 chan 36
  iwm0: RUN -> AUTH
  iwm0: sending auth to 00:1a:dd:da:e3:f4 on channel 36 mode 11a
  iwm0: AUTH -> ASSOC
  iwm0: sending assoc_req to 00:2b:a2:95:e3:f4 on channel 36 mode 11a
  iwm0: ASSOC -> RUN

If it doesn't do this but enters INIT or SCAN or whatever, roaming failed.
This can happen for various reasons. One of them is that our attempt to
AUTH to the new AP times out, and I don't know how this could be fixed.
In any case, the driver should recover from any roaming failure by going
though the regular INIT->SCAN->AUTH->ASSOC->RUN sequence with one of
the available APs, and interface link should come back up.

If you want to force roaming from one particular AP to another, this can
be done by forcing the channel number corresponding to the first AP,
associating to this AP, and then clearing the forced channel number:

  # ifconfig iwm0 chan 1
  wait for association to succeed, then clear the forced channel:
  # ifconfig iwm0 -chan
  now trigger a background scan as usual:
  # ifconfig iwm0 scan
 
diff e5ddbb84043d48bc602408a6bf0e30fb062e3280 
af878e690f6195efea7bb4639bf75fe439f3cddc
blob - f1908d2923d90c7be84c010fc68342afda860d0c
blob + dfccd0bd1327325b7c834d1e60f1b37d7a19dc18
--- sys/dev/pci/if_iwm.c
+++ sys/dev/pci/if_iwm.c
@@ -477,6 +477,9 @@ voidiwm_add_task(struct iwm_softc *, struct taskq 
*, 
 void   iwm_del_task(struct iwm_softc *, struct taskq *, struct task *);
 intiwm_scan(struct iwm_softc *);
 intiwm_bgscan(struct ieee80211com *);
+void   iwm_bgscan_done(struct ieee80211com *,
+   struct ieee80211_node_switch_bss_arg *, size_t);
+void   iwm_bgscan_done_task(void *);
 intiwm_umac_scan_abort(struct iwm_softc *);
 intiwm_lmac_scan_abort(struct iwm_softc *);
 intiwm_scan_abort(struct iwm_softc *);
@@ -8287,6 +8290,81 @@ iwm_bgscan(struct ieee80211com *ic) 
return 0;
 }
 
+void
+iwm_bgscan_done(struct ieee80211com *ic,
+struct ieee80211_node_switch_bss_arg *arg, size_t arg_size)
+{
+   struct iwm_softc *sc = ic->ic_softc;
+
+   free(sc->bgscan_unref_arg, M_DEVBUF, sc->bgscan_unref_arg_size);
+   sc->bgscan_unref_arg = arg;
+   sc->bgscan_unref_arg_size = arg_size;
+   iwm_add_task(sc, sc->sc_nswq, >bgscan_done_task);
+}
+
+void
+iwm_bgscan_done_task(void *arg)
+{
+   struct iwm_softc *sc = arg;
+   struct ieee80211com *ic = >sc_ic;
+   struct iwm_node *in = (void *)ic->ic_bss;
+   struct ieee80211_node *ni = >in_ni;
+   int tid, err = 0, s = splnet();
+
+   if ((sc->sc_flags & IWM_FLAG_SHUTDOWN) ||
+   (ic->ic_flags & IEEE80211_F_BGSCAN) == 0 ||
+   ic->ic_state != IEEE80211_S_RUN) {
+   err = ENXIO;
+   goto done;
+   }
+
+   for (tid = 0; tid < IWM_MAX_TID_COUNT; tid++) {
+   int qid = IWM_FIRST_AGG_TX_QUEUE + tid;
+
+   if ((sc->tx_ba_queue_mask & (1 << qid)) == 0)
+   continue;
+
+   err = iwm_sta_tx_agg(sc, ni, tid, 0, 0, 0);
+   if (err)
+   goto done;
+   err = iwm_disable_txq(sc, IWM_STATION_ID,