selecting proper GOP when there are multiple GOPs
Hello, The below diff originally posted by Alexei K. on bugs@: Garbled screen when booting with UEFI https://marc.info/?l=openbsd-bugs=165087969227708=2 The same problem had been reported periodically and we have asked to use "machine gop" to workaround it. But the diff from Alexei seems to be a proper way. I've tested it by some my machines including HPE DL20 Gen10 which has a virtual video and serial console. I'd like to commit the diff and ask people to test it. ok? Index: sys/arch/amd64/stand/efiboot/efiboot.c === RCS file: /disk/cvs/openbsd/src/sys/arch/amd64/stand/efiboot/efiboot.c,v retrieving revision 1.38 diff -u -p -r1.38 efiboot.c --- sys/arch/amd64/stand/efiboot/efiboot.c 7 Jun 2021 00:04:20 - 1.38 +++ sys/arch/amd64/stand/efiboot/efiboot.c 2 May 2022 07:53:38 - @@ -424,8 +424,9 @@ efi_memprobe_internal(void) /*** * Console ***/ -static SIMPLE_TEXT_OUTPUT_INTERFACE *conout = NULL; -static SIMPLE_INPUT_INTERFACE *conin; +static SIMPLE_TEXT_OUTPUT_INTERFACE*conout = NULL; +static SIMPLE_INPUT_INTERFACE *conin; +static EFI_GRAPHICS_OUTPUT *gop = NULL; static EFI_GUID con_guid = EFI_CONSOLE_CONTROL_PROTOCOL_GUID; static EFI_GUID gop_guid @@ -444,6 +445,30 @@ efi_video_init(void) int i, mode80x25, mode100x31; UINTNcols, rows; EFI_STATUS status; + EFI_HANDLE *handles; + UINTNnhandles; + EFI_GRAPHICS_OUTPUT *first_gop = NULL; + EFI_DEVICE_PATH *devp_test = NULL; + + status = BS->LocateHandleBuffer(ByProtocol, _guid, NULL, , + ); + if (status != EFI_SUCCESS) + panic("BS->LocateHandleBuffer() returns %d", status); + for (i = 0; i < nhandles; i++) { + status = BS->HandleProtocol(handles[i], _guid, + (void **)); + if (first_gop == NULL) + first_gop = gop; + status = BS->HandleProtocol(handles[i], _guid, + (void **)_test); + if (status == EFI_SUCCESS) + break; + } + if (status != EFI_SUCCESS) + gop = first_gop; + if (gop == NULL) + panic("no gop found"); + BS->FreePool(handles); conout = ST->ConOut; status = BS->LocateProtocol(_guid, NULL, (void **)); @@ -808,7 
+833,6 @@ efi_com_putc(dev_t dev, int c) */ static EFI_GUID acpi_guid = ACPI_20_TABLE_GUID; static EFI_GUID smbios_guid = SMBIOS_TABLE_GUID; -static EFI_GRAPHICS_OUTPUT *gop; static int gopmode = -1; #defineefi_guidcmp(_a, _b) memcmp((_a), (_b), sizeof(EFI_GUID)) @@ -853,57 +877,54 @@ efi_makebootargs(void) /* * Frame buffer */ - status = BS->LocateProtocol(_guid, NULL, (void **)); - if (!EFI_ERROR(status)) { - if (gopmode < 0) { - for (i = 0; i < gop->Mode->MaxMode; i++) { - status = gop->QueryMode(gop, i, , ); - if (EFI_ERROR(status)) - continue; - gopsiz = gopi->HorizontalResolution * - gopi->VerticalResolution; - if (gopsiz > bestsiz) { - gopmode = i; - bestsiz = gopsiz; - } + if (gopmode < 0) { + for (i = 0; i < gop->Mode->MaxMode; i++) { + status = gop->QueryMode(gop, i, , ); + if (EFI_ERROR(status)) + continue; + gopsiz = gopi->HorizontalResolution * + gopi->VerticalResolution; + if (gopsiz > bestsiz) { + gopmode = i; + bestsiz = gopsiz; } } - if (gopmode >= 0 && gopmode != gop->Mode->Mode) { - curmode = gop->Mode->Mode; - if (efi_gop_setmode(gopmode) != EFI_SUCCESS) - (void)efi_gop_setmode(curmode); - } - - gopi = gop->Mode->Info; - switch (gopi->PixelFormat) { - case PixelBlueGreenRedReserved8BitPerColor: - ei->fb_red_mask =
Re: wg(4): 'Address already in use' when wgrtable is changed
Hi, On Tue, 29 Mar 2022 17:28:23 +0900 Yuichiro NAITO wrote: > There is one thing I'm worrying about. > Ifconfig doesn't show wgrtable value with your patch. > In my use case as follows, it seems that setting `wgrtable 1` is > ignored. > > ``` > # route -T1 add default `cat /etc/mygate` > # ifconfig wg0 create wgport 7111 wgkey `cat /etc/mykey.wg0` > # ifconfig wg0 up > # ifconfig wg0 wgrtable 1 > # ifconfig wg0 > wg0: flags=80c3 mtu 1420 > index 6 priority 0 llprio 3 > wgport 7111 > wgpubkey e/CYTG1RGqT4jmrY0Fom8cAdtOWP7F/gBVwamyINRlg= > groups: wg > ``` Thank you for pointing this out. In this case, wg0 is binding 7111/udp on rdomain 0. So I have supposed ignoring "wgrtable 1" is correct. But if we configure wgrtable when creating, % doas ifconfig wg0 create wgport 7111 wgrtable 1 wgkey `openssl rand -base64 32` up % doas ifconfig wg0 wg0: flags=80c3 mtu 1420 index 13 priority 0 llprio 3 wgport 7111 wgrtable 1 wgpubkey /4v4hsi426MsVZojJ0rwRvk8kK0jSckjcU2Z1L/k5W8= groups: wg % It displays "wgrtable 1". And actually % netstat -T0 -naf inet | grep 7111 % netstat -T1 -naf inet | grep 7111 udp 0 0 *.7111 *.* % it binds 7111/udp on rtable 1. So I start wondering why binding 7111/udp on table 1 fails with EADDRINUSE when 7111/udp on rtable 0 is used. > On 3/28/22 15:59, YASUOKA Masahiko wrote: >> On Mon, 28 Mar 2022 15:20:02 +0900 >> Yuichiro NAITO wrote: >>> Thanks for the explanation. >>> I understand how your patch works. >>> >>> I want to ask the goal of your patch. >>> It seems just removing 'Address already in use' message. >>> Is my guessing right? >> Yes. There is nothing to do, since the command is to bind the same >> port, protocol, and domain of prevous. >> The code seems to do such the skip already, but it lacks consideration >> for rtable_l2(rtable) != rtable case. 
>> >>> On 3/28/22 14:01, YASUOKA Masahiko wrote: >>>> Hi, >>>> On Mon, 28 Mar 2022 12:12:39 +0900 >>>> Yuichiro NAITO wrote: >>>>> On 3/27/22 18:25, YASUOKA Masahiko wrote: >>>>>> Hi, >>>>>> On Wed, 9 Mar 2022 15:28:44 +0900 >>>>>> Yuichiro NAITO wrote: >>>>>>> I see 'Address already in use' message, >>>>>>> when I change wgrtable for a running wg interface. >>>>>>> It doesn't make sense to me. >>>>>>> >>>>>>> It can be reproduced by the following command sequence. >>>>>>> >>>>>>> ``` >>>>>>> # route -T1 add default `cat /etc/mygate` >>>>>>> # ifconfig wg0 create wgport 7111 wgkey `cat /etc/mykey.wg0` >>>>>>> # ifconfig wg0 up >>>>>>> # ifconfig wg0 wgrtable 1 >>>>>>> ifconfig: SIOCSWG: Address already in use >>>>>>> ``` >>>>>>> >>>>>>> When I down wg0 interface before changing wgrtable, >>>>>>> It succeeds and no messages are shown. >>>>>>> >>>>>>> I investigated the reason why 'Address already in use' is shown. >>>>>>> >>>>>>> If wgrtable is specified by ifconfig argument, >>>>>>> `wg_ioctl_set` function in `sys/net/if_wg.c` is called. >>>>>>> >>>>>>> And if the wg interface is running, `wg_bind` function is called. >>>>>>> `wg_bind` creates new sockets (IPv4 and 6) and replace them from old >>>>>>> ones. >>>>>>> >>>>>>> If only wgrtable is changed, `wg_bind` binds as same port as existing >>>>>>> sockets. >>>>>>> So 'Address already in use' is shown. >>>>>>> >>>>>>> Here is a simple patch to close existing sockets before `wg_bind`. >>>>>>> It works for me but I'm not 100% sure this is right fix. >>>>>>> >>>>>>> Any other ideas? >>>>>>> >>>>>>> ``` >>>>>>> diff --git a/sys/net/if_wg.c b/sys/net/if_wg.c >>>>>>> index 4dae3e3c976..0159664fb34 100644 >>>>>>> --- a/sys/net/if_wg.c >>>>>>> +++ b/sys/net/if_wg.c >>>>>>> @@ -2253,11 +2253,14 @@ wg_ioctl_set(struct wg_softc *sc, struct >>>
Re: wg(4): 'Address already in use' when wgrtable is changed
On Mon, 28 Mar 2022 15:20:02 +0900 Yuichiro NAITO wrote: > Thanks for the explanation. > I understand how your patch works. > > I want to ask the goal of your patch. > It seems just removing 'Address already in use' message. > Is my guessing right? Yes. There is nothing to do, since the command is to bind the same port, protocol, and domain of prevous. The code seems to do such the skip already, but it lacks consideration for rtable_l2(rtable) != rtable case. > On 3/28/22 14:01, YASUOKA Masahiko wrote: >> Hi, >> On Mon, 28 Mar 2022 12:12:39 +0900 >> Yuichiro NAITO wrote: >>> On 3/27/22 18:25, YASUOKA Masahiko wrote: >>>> Hi, >>>> On Wed, 9 Mar 2022 15:28:44 +0900 >>>> Yuichiro NAITO wrote: >>>>> I see 'Address already in use' message, >>>>> when I change wgrtable for a running wg interface. >>>>> It doesn't make sense to me. >>>>> >>>>> It can be reproduced by the following command sequence. >>>>> >>>>> ``` >>>>> # route -T1 add default `cat /etc/mygate` >>>>> # ifconfig wg0 create wgport 7111 wgkey `cat /etc/mykey.wg0` >>>>> # ifconfig wg0 up >>>>> # ifconfig wg0 wgrtable 1 >>>>> ifconfig: SIOCSWG: Address already in use >>>>> ``` >>>>> >>>>> When I down wg0 interface before changing wgrtable, >>>>> It succeeds and no messages are shown. >>>>> >>>>> I investigated the reason why 'Address already in use' is shown. >>>>> >>>>> If wgrtable is specified by ifconfig argument, >>>>> `wg_ioctl_set` function in `sys/net/if_wg.c` is called. >>>>> >>>>> And if the wg interface is running, `wg_bind` function is called. >>>>> `wg_bind` creates new sockets (IPv4 and 6) and replace them from old >>>>> ones. >>>>> >>>>> If only wgrtable is changed, `wg_bind` binds as same port as existing >>>>> sockets. >>>>> So 'Address already in use' is shown. >>>>> >>>>> Here is a simple patch to close existing sockets before `wg_bind`. >>>>> It works for me but I'm not 100% sure this is right fix. >>>>> >>>>> Any other ideas? 
>>>>> >>>>> ``` >>>>> diff --git a/sys/net/if_wg.c b/sys/net/if_wg.c >>>>> index 4dae3e3c976..0159664fb34 100644 >>>>> --- a/sys/net/if_wg.c >>>>> +++ b/sys/net/if_wg.c >>>>> @@ -2253,11 +2253,14 @@ wg_ioctl_set(struct wg_softc *sc, struct >>>>> wg_data_io *data) >>>>> if (port != sc->sc_udp_port || rtable != sc->sc_udp_rtable) { >>>>> TAILQ_FOREACH(peer, >sc_peer_seq, p_seq_entry) >>>>> wg_peer_clear_src(peer); >>>>> >>>>> - if (sc->sc_if.if_flags & IFF_RUNNING) >>>>> + if (sc->sc_if.if_flags & IFF_RUNNING) { >>>>> + if (port == sc->sc_udp_port) >>>>> + wg_unbind(sc); >>>>> if ((ret = wg_bind(sc, , )) != 0) >>>>> goto error; >>>>> + } >>>>> >>>>> sc->sc_udp_port = port; >>>>> sc->sc_udp_rtable = rtable; >>>>> } >>>>> ``` >>>> If rdomain 1 exists, the error will not shown. >>>># ifconfig vether0 rdomain 1 up >>>># ifconfig wg0 create wgport 7111 wgkey `openssl rand -base64 32` up >>>># ifconfig wg0 wgrtable 1 >>>># >>> >>> Yes, if rdomain 1 is created before `ifconfig wg0 wgrtable 1`, >>> setting wgrtable succeeds and there is no problem. >>> >>>> In the case which you reported to, it is supposed that rtable 1 exists >>>> but rdomain 1 doesn't exist. >>>> Even when "wgtable 1" is configured, becase there is no dedicated >>>> rdomain, rdomain 0 will be used to bind the UDP port. >>> >>> Exactly, it's the case that I reported and want to fix. >>> >>>> So what wg(4) should do for this case is "nothing". >>> >>> I'm a little bit confused. >>> As you said, I can confirm your patch doesn't set wgrtable in my use >>> case. >>> It is not the result that I wanted
Re: wg(4): 'Address already in use' when wgrtable is changed
Hi, On Mon, 28 Mar 2022 12:12:39 +0900 Yuichiro NAITO wrote: > On 3/27/22 18:25, YASUOKA Masahiko wrote: >> Hi, >> On Wed, 9 Mar 2022 15:28:44 +0900 >> Yuichiro NAITO wrote: >>> I see 'Address already in use' message, >>> when I change wgrtable for a running wg interface. >>> It doesn't make sense to me. >>> >>> It can be reproduced by the following command sequence. >>> >>> ``` >>> # route -T1 add default `cat /etc/mygate` >>> # ifconfig wg0 create wgport 7111 wgkey `cat /etc/mykey.wg0` >>> # ifconfig wg0 up >>> # ifconfig wg0 wgrtable 1 >>> ifconfig: SIOCSWG: Address already in use >>> ``` >>> >>> When I down wg0 interface before changing wgrtable, >>> It succeeds and no messages are shown. >>> >>> I investigated the reason why 'Address already in use' is shown. >>> >>> If wgrtable is specified by ifconfig argument, >>> `wg_ioctl_set` function in `sys/net/if_wg.c` is called. >>> >>> And if the wg interface is running, `wg_bind` function is called. >>> `wg_bind` creates new sockets (IPv4 and 6) and replace them from old >>> ones. >>> >>> If only wgrtable is changed, `wg_bind` binds as same port as existing >>> sockets. >>> So 'Address already in use' is shown. >>> >>> Here is a simple patch to close existing sockets before `wg_bind`. >>> It works for me but I'm not 100% sure this is right fix. >>> >>> Any other ideas? 
>>> >>> ``` >>> diff --git a/sys/net/if_wg.c b/sys/net/if_wg.c >>> index 4dae3e3c976..0159664fb34 100644 >>> --- a/sys/net/if_wg.c >>> +++ b/sys/net/if_wg.c >>> @@ -2253,11 +2253,14 @@ wg_ioctl_set(struct wg_softc *sc, struct >>> wg_data_io *data) >>> if (port != sc->sc_udp_port || rtable != sc->sc_udp_rtable) { >>> TAILQ_FOREACH(peer, >sc_peer_seq, p_seq_entry) >>> wg_peer_clear_src(peer); >>> >>> - if (sc->sc_if.if_flags & IFF_RUNNING) >>> + if (sc->sc_if.if_flags & IFF_RUNNING) { >>> + if (port == sc->sc_udp_port) >>> + wg_unbind(sc); >>> if ((ret = wg_bind(sc, , )) != 0) >>> goto error; >>> + } >>> >>> sc->sc_udp_port = port; >>> sc->sc_udp_rtable = rtable; >>> } >>> ``` >> If rdomain 1 exists, the error will not shown. >> # ifconfig vether0 rdomain 1 up >> # ifconfig wg0 create wgport 7111 wgkey `openssl rand -base64 32` up >> # ifconfig wg0 wgrtable 1 >> # > > Yes, if rdomain 1 is created before `ifconfig wg0 wgrtable 1`, > setting wgrtable succeeds and there is no problem. > >> In the case which you reported to, it is supposed that rtable 1 exists >> but rdomain 1 doesn't exist. >> Even when "wgtable 1" is configured, becase there is no dedicated >> rdomain, rdomain 0 will be used to bind the UDP port. > > Exactly, it's the case that I reported and want to fix. > >> So what wg(4) should do for this case is "nothing". > > I'm a little bit confused. > As you said, I can confirm your patch doesn't set wgrtable in my use > case. > It is not the result that I wanted. # ifconfig wg0 create wgport 7111 wgkey `openssl rand -base64 32` up -> bind 7111/udp on rdomain 0 (1) is expected. (1) # ifconfig wg0 wgrtable 1 -> bind 7111/udp on rdomain 0 (2) is expected, since there is no "domain 1". If trying to do (1) and (2), then it causes EADDRINUSE since it is to bind the same port, proto, and domain. The latest diff is skip (2) properly. 
Previous >> - if (port != sc->sc_udp_port || rtable != sc->sc_udp_rtable) { "rtable != sc->sc_udp_rtable" was wrong since rdomain for rtable may not exist. This is the cause of EADDRINUSE. >> So the diff is updated. >> ok? >> Index: sys/net/if_wg.c >> === >> RCS file: /disk/cvs/openbsd/src/sys/net/if_wg.c,v >> retrieving revision 1.22 >> diff -u -p -r1.22 if_wg.c >> --- sys/net/if_wg.c 22 Feb 2022 01:15:02 - 1.22 >> +++ sys/net/if_wg.c 27 Mar 2022 09:17:08 - >> @@ -2250,7 +2250,8 @@ wg_ioctl_set(struct wg_softc *sc, struct >> else >> rtable = sc->sc_udp_rtable; >> - if (port != sc->sc_udp_port || rtable != sc->sc_udp_rtable) { >> +if (port != sc->sc_udp_port || >> +rtable_l2(rtable) != sc->sc_udp_rtable) { >> TAILQ_FOREACH(peer, >sc_peer_seq, p_seq_entry) >> wg_peer_clear_src(peer); >> > > -- > Yuichiro NAITO (naito.yuich...@gmail.com) >
Re: wg(4): 'Address already in use' when wgrtable is changed
Hi, On Wed, 9 Mar 2022 15:28:44 +0900 Yuichiro NAITO wrote: > I see 'Address already in use' message, > when I change wgrtable for a running wg interface. > It doesn't make sense to me. > > It can be reproduced by the following command sequence. > > ``` > # route -T1 add default `cat /etc/mygate` > # ifconfig wg0 create wgport 7111 wgkey `cat /etc/mykey.wg0` > # ifconfig wg0 up > # ifconfig wg0 wgrtable 1 > ifconfig: SIOCSWG: Address already in use > ``` > > When I down wg0 interface before changing wgrtable, > It succeeds and no messages are shown. > > I investigated the reason why 'Address already in use' is shown. > > If wgrtable is specified by ifconfig argument, > `wg_ioctl_set` function in `sys/net/if_wg.c` is called. > > And if the wg interface is running, `wg_bind` function is called. > `wg_bind` creates new sockets (IPv4 and 6) and replace them from old > ones. > > If only wgrtable is changed, `wg_bind` binds as same port as existing > sockets. > So 'Address already in use' is shown. > > Here is a simple patch to close existing sockets before `wg_bind`. > It works for me but I'm not 100% sure this is right fix. > > Any other ideas? > > ``` > diff --git a/sys/net/if_wg.c b/sys/net/if_wg.c > index 4dae3e3c976..0159664fb34 100644 > --- a/sys/net/if_wg.c > +++ b/sys/net/if_wg.c > @@ -2253,11 +2253,14 @@ wg_ioctl_set(struct wg_softc *sc, struct > wg_data_io *data) > if (port != sc->sc_udp_port || rtable != sc->sc_udp_rtable) { > TAILQ_FOREACH(peer, >sc_peer_seq, p_seq_entry) > wg_peer_clear_src(peer); > > - if (sc->sc_if.if_flags & IFF_RUNNING) > + if (sc->sc_if.if_flags & IFF_RUNNING) { > + if (port == sc->sc_udp_port) > + wg_unbind(sc); > if ((ret = wg_bind(sc, , )) != 0) > goto error; > + } > > sc->sc_udp_port = port; > sc->sc_udp_rtable = rtable; > } > ``` If rdomain 1 exists, the error will not shown. 
# ifconfig vether0 rdomain 1 up # ifconfig wg0 create wgport 7111 wgkey `openssl rand -base64 32` up # ifconfig wg0 wgrtable 1 # In the case you reported, it is supposed that rtable 1 exists but rdomain 1 doesn't exist. Even when "wgrtable 1" is configured, because there is no dedicated rdomain, rdomain 0 will be used to bind the UDP port. So what wg(4) should do for this case is "nothing". So the diff is updated. ok? Index: sys/net/if_wg.c === RCS file: /disk/cvs/openbsd/src/sys/net/if_wg.c,v retrieving revision 1.22 diff -u -p -r1.22 if_wg.c --- sys/net/if_wg.c 22 Feb 2022 01:15:02 - 1.22 +++ sys/net/if_wg.c 27 Mar 2022 09:17:08 - @@ -2250,7 +2250,8 @@ wg_ioctl_set(struct wg_softc *sc, struct else rtable = sc->sc_udp_rtable; - if (port != sc->sc_udp_port || rtable != sc->sc_udp_rtable) { + if (port != sc->sc_udp_port || + rtable_l2(rtable) != sc->sc_udp_rtable) { TAILQ_FOREACH(peer, &sc->sc_peer_seq, p_seq_entry) wg_peer_clear_src(peer);
Re: parallel ip forwarding
Hi, On Sat, 25 Dec 2021 21:50:47 +0300 Vitaliy Makkoveev wrote: > On Fri, Dec 24, 2021 at 12:50:23PM +0100, Alexander Bluhm wrote: >> On Fri, Dec 24, 2021 at 04:16:28PM +0900, YASUOKA Masahiko wrote: >> > > - npppd l2pt ipsecflowinfo is not MP safe >> > >> > Does this mean the things we are discussing on the "Fix >> > ipsp_spd_lookup() for transport mode" thread? I wonder if there is >> > another issue. >> >> In this mail thread I was concerned about things might get worse. >> >> Currently I see these problems: >> >> tdb_free() will be called with a shared netlock. From there >> ipsp_ids_free() is called. >> >> if (--ids->id_refcount > 0) >> return; >> >> This ref count needs to be atomic. >> >> if (LIST_EMPTY(_ids_gc_list)) >> timeout_add_sec(_ids_gc_timeout, 1); >> LIST_INSERT_HEAD(_ids_gc_list, ids, id_gc_list); >> >> And some mutex should protect ipsp_ids_gc_list. Thanks, I suppose I could catch up the problem. > The diff below adds `ipsec_flows_mtx' mutex(9) to protect `ipsp_ids_*' > list and trees. ipsp_ids_lookup() returns `ids' with bumped reference > counter. This direction seems good. One thing, I found a problem. > Index: sys/netinet/ip_spd.c > === > RCS file: /cvs/src/sys/netinet/ip_spd.c,v > retrieving revision 1.110 > diff -u -p -r1.110 ip_spd.c > --- sys/netinet/ip_spd.c 16 Dec 2021 15:38:03 - 1.110 > +++ sys/netinet/ip_spd.c 25 Dec 2021 18:34:22 - > @@ -418,6 +418,7 @@ ipsp_spd_lookup(struct mbuf *m, int af, > /* Cached entry is good. */ > error = ipsp_spd_inp(m, inp, ipo, tdbout); > mtx_leave(_tdb_mtx); > + ipsp_ids_free(ids); > return error; > >nomatchout: > @@ -452,6 +453,7 @@ ipsp_spd_lookup(struct mbuf *m, int af, > dignore ? : >ipo_dst, > ipo->ipo_sproto, ids ? ids: ipo->ipo_ids, > >ipo_addr, >ipo_mask); > + ipsp_ids_free(ids); > mtx_enter(_tdb_mtx); > if ((tdbp_new != NULL) && > (tdbp_new->tdb_flags & TDBF_DELETED)) { ids will remain unfreed since there are some code paths which doesn't pass the above lines. 
I tried to fix that, but adding a lot of ipsp_ids_free() looks a mess. Instead, how about changing ipsp_spd_lookup() to take a "struct ipsec_ids *ids" as an argument and letting the caller take the resposibility of the ids? Index: sys/net/if_bridge.c === RCS file: /disk/cvs/openbsd/src/sys/net/if_bridge.c,v retrieving revision 1.362 diff -u -p -r1.362 if_bridge.c --- sys/net/if_bridge.c 23 Dec 2021 12:21:48 - 1.362 +++ sys/net/if_bridge.c 30 Dec 2021 08:12:18 - @@ -1595,7 +1595,7 @@ bridge_ipsec(struct ifnet *ifp, struct e } } else { /* Outgoing from the bridge. */ error = ipsp_spd_lookup(m, af, hlen, IPSP_DIRECTION_OUT, - NULL, NULL, , 0); + NULL, NULL, , NULL); if (error == 0 && tdb != NULL) { /* * We don't need to do loop detection, the Index: sys/net/if_veb.c === RCS file: /disk/cvs/openbsd/src/sys/net/if_veb.c,v retrieving revision 1.21 diff -u -p -r1.21 if_veb.c --- sys/net/if_veb.c8 Nov 2021 04:15:46 - 1.21 +++ sys/net/if_veb.c30 Dec 2021 08:12:18 - @@ -746,7 +746,7 @@ veb_ipsec_proto_out(struct mbuf *m, sa_f #endif tdb = ipsp_spd_lookup(m, af, iphlen, , IPSP_DIRECTION_OUT, - NULL, NULL, 0); + NULL, NULL, NULL); if (tdb == NULL) return (m); Index: sys/netinet/ip_ipsp.c === RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_ipsp.c,v retrieving revision 1.267 diff -u -p -r1.267 ip_ipsp.c --- sys/netinet/ip_ipsp.c 20 Dec 2021 15:59:09 - 1.267 +++ sys/netinet/ip_ipsp.c 30 Dec 2021 08:12:18 - @@ -47,6 +47,8 @@ #include #include #include +#include +#include #include #include @@ -84,6 +86,13 @@ void tdb_hashstats(void); do { } while (0) #endif +/* + * Locks used to protect global data and struct members: + * F ipsec_flows_mtx + */ + +struct mutex ipsec_flows_mtx = MUTEX_INITIALIZER(IPL_SOFTNET); + inttdb_rehash(void); void tdb_tim
Re: parallel ip forwarding
Hello, On Fri, 24 Dec 2021 00:55:04 +0100 Alexander Bluhm wrote: > On Fri, Dec 03, 2021 at 08:35:45PM +0100, Alexander Bluhm wrote: >> Note that IPsec still has the workaround to disable multiple queues. > > I think we can remove the ipsec_in_use workaround now. The IPsec > path is protected with the kernel lock. > > There are some issues left: > - npppd l2tp ipsecflowinfo is not MP safe Does this mean the things we are discussing in the "Fix ipsp_spd_lookup() for transport mode" thread? I wonder if there is another issue. > - the acquire SA feature is not MP safe > - Hrvoje has seen a panic with sasync
Re: Fix ipsp_spd_lookup() for transport mode
Hi, On Mon, 20 Dec 2021 13:20:46 +0100 Alexander Bluhm wrote: > On Tue, Dec 14, 2021 at 06:25:20PM +0900, YASUOKA Masahiko wrote: >> Yes, if there is another better idea, it will be welcome. >> For this moment, the diff is the best idea for me. > > Sorry, no better idea. I have no experiance with l2pt. Codewise > the diff looks fine, but I don't understand the consequences. Thank you for your review and comments. >> +if (tdbflow != NULL) >> +rn = rn_lookup((caddr_t)>tdb_filter, >> +(caddr_t)>tdb_filtermask, rnh); > > Does rn_lookup() modify the radix tree? I looks like rn_lookup -> > rn_addmask -> rn_insert() does that. This will make it impossible > to make IPsec MP capable. The radix tree is not MP safe, art has > been implemented as an alternative. An ipsp_spd_lookup() should > not modify the flows. It is stange that a function named rn_lookup() > does modifications. Did I miss something? rn_lookup() doesn't make any modification. rn_lookup() calls rn_addmask() with second argument search=1. 183 /* return a perfect match if m_arg is set, else do a regular rn_match */ 184 struct radix_node * 185 rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head) 186 { 187 struct radix_node *x, *tm; 188 caddr_t netmask = 0; 189 190 if (m_arg) { 191 tm = rn_addmask(m_arg, 1, head->rnh_treetop->rn_off); and then rn_addmask() 416 struct radix_node * 417 rn_addmask(void *n_arg, int search, int skip) 418 { (snip) 449 if (tm || search) 450 return (tm); 451 tm = malloc(max_keylen + 2 * sizeof(*tm), M_RTABLE, M_NOWAIT | M_ZERO); 452 if (tm == NULL) 453 return (0); 454 saved_tm = tm; 455 netmask = cp = (caddr_t)(tm + 2); 456 memcpy(cp, addmask_key, mlen); 457 tm = rn_insert(cp, mask_rnhead, , tm); returns at #449-450 before calling rn_insert(). It seems that rn_addmask() does read only operations when "search". > Why do you call rn_lookup() here? Since rn_match() doesn't take a mask and returns the best one. 
For an example, if there are multiple peers behind a NAT, flows like below can be configured at the same time. (a) Windows: REMOTE_IP:1701/udp <=> LOCAL_IP:1701/udp (b) Linux:REMOTE_IP:ANY/udp <=> LOCAL_IP:1701/udp If source port of a packet from the Linux is 1701, rn_match() will return (a) for it, then ipsp_spd_lookup() will fail to verify that the given tdb matches the policy. Policies can be created with wildcards (any port, any protocol), then it is compared with a packet whose port and protocol is concreted. Since rn_match() is to find a bestmatch, it can't find a wildcard policy properly if there is a non wildcard policy which is overlapped by the wildcard. So the diff uses rn_lookup() to find the correct policy. > Could we add the masks earlier when the flows are added? > >> +else if (tdbp != NULL) >> +rn = rn_lookup((caddr_t)>tdb_filter, >> +(caddr_t)>tdb_filtermask, rnh); > > What are the consequences of this chunk for regular IPsec? I have thought that again. Now I realized the problem is only for transport mode. For tunnel mode, since best match is always preferred, rn_lookup() should be used. I'll update the diff that uses rn_lookup() for transport mode only. >> /* Match source/dest IDs. */ >> -if (ipo->ipo_ids) >> -if (tdbp->tdb_ids == NULL || >> -!ipsp_ids_match(ipo->ipo_ids, >> tdbp->tdb_ids)) >> +if (ipo->ipo_ids != NULL) { >> +if ((tdbp->tdb_flags & TDBF_TUNNELING) == 0 && >> +(tdbp->tdb_flags & TDBF_UDPENCAP) != 0) { >> +/* >> + * Skip IDs check for transport mode >> + * with NAT-T. Multiple clients (IDs) >> + * can use a same policy. aima>> + */ >> +} else if (tdbp->tdb_ids == NULL && >> +!ipsp_ids_match(ipo->ipo_ids, >> +tdbp->tdb_ids)) >> goto nomatchin; >> +} > > This was added to make IPsec/l2tp work in rev 1.85. And now you > change it to make it work. I wish markus@ or mikeb@ could give a > clue. At the change of 1.85, "ipsec-id bundles" is intr
Re: Fix ipsp_spd_lookup() for transport mode
Hi, On Tue, 14 Dec 2021 01:20:49 +0100 Alexander Bluhm wrote: > I don't know much about l2tp, pipex or npppd. So I cannot say if > the new logic is correct. But I guess you have tested that. Yes, I've tested some L2TP/IPsec cases already. > The tdb mutex and ref counting looks correct. > >> +struct tdb *tdb, *tdblocal = NULL; > > The variable names tdb and tdbp are used very inconsistently within > IPsec. Don't use both. I think tdpb and a tdbflow are sufficient. Ok, > >> +if (ipsecflowinfo != 0) >> +ids = ipsp_ids_lookup(ipsecflowinfo); > > Can you move that to the place where it is needed? Yes, > Perhaps it is easier to understand this way: > > if (ipsecflowinfo != 0) { Sure. Let me update the diff. > It is hard to say whether the new > rn_lookup(tdbp->tdb_filter/tdbp->tdb_filtermask) changes existing > IPsec behavior for setups without l2tp. I suppose it has no regression on other setups. But I'll look it more carefully and test the other setups. > Do we need it there? Yes, if there is another better idea, it will be welcome. For this moment, the diff is the best idea for me. > I never ran into problems patching the correct policy. Index: sys/netinet/ip_ipsp.c === RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_ipsp.c,v retrieving revision 1.264 diff -u -p -r1.264 ip_ipsp.c --- sys/netinet/ip_ipsp.c 11 Dec 2021 16:33:47 - 1.264 +++ sys/netinet/ip_ipsp.c 14 Dec 2021 06:32:07 - @@ -91,6 +91,8 @@ void tdb_soft_timeout(void *); void tdb_soft_firstuse(void *); inttdb_hash(u_int32_t, union sockaddr_union *, u_int8_t); void tdb_dodelete(struct tdb *, int locked); +intsockaddr_encap_match(struct sockaddr_encap *, + struct sockaddr_encap *, struct sockaddr_encap *); int ipsec_in_use = 0; u_int64_t ipsec_last_added = 0; @@ -510,6 +512,78 @@ gettdbbysrc(u_int rdomain, union sockadd tdb_ref(tdbp); mtx_leave(_sadb_mtx); return tdbp; +} + +/* + * Get an SA given the flow, the direction, the security protocol type, and + * the desired IDs. 
+ */ +struct tdb * +gettdbbyflow(u_int rdomain, int direction, struct sockaddr_encap *senflow, +u_int8_t sproto, struct ipsec_ids *ids) +{ + u_int32_t hashval; + struct tdb *tdbp; + union sockaddr_union srcdst; + + if (ids == NULL)/* ids is mandatory */ + return NULL; + + memset(, 0, sizeof(srcdst)); + switch (senflow->sen_type) { + case SENT_IP4: + srcdst.sin.sin_len = sizeof(srcdst.sin); + srcdst.sin.sin_family = AF_INET; + if (direction == IPSP_DIRECTION_OUT) + srcdst.sin.sin_addr = senflow->Sen.Sip4.Dst; + else + srcdst.sin.sin_addr = senflow->Sen.Sip4.Src; + break; + case SENT_IP6: + srcdst.sin6.sin6_len = sizeof(srcdst.sin6); + srcdst.sin6.sin6_family = AF_INET6; + if (direction == IPSP_DIRECTION_OUT) + srcdst.sin6.sin6_addr = senflow->Sen.Sip6.Dst; + else + srcdst.sin6.sin6_addr = senflow->Sen.Sip6.Src; + break; + } + + mtx_enter(_sadb_mtx); + hashval = tdb_hash(0, , sproto); + + for (tdbp = tdbdst[hashval]; tdbp != NULL; tdbp = tdbp->tdb_dnext) + if (tdbp->tdb_sproto == sproto && + tdbp->tdb_rdomain == rdomain && + (tdbp->tdb_flags & TDBF_INVALID) == 0 && + ipsp_ids_match(ids, tdbp->tdb_ids) && + ((direction == IPSP_DIRECTION_OUT && + !memcmp(>tdb_dst, , srcdst.sa.sa_len)) || + (direction == IPSP_DIRECTION_IN && + !memcmp(>tdb_src, , srcdst.sa.sa_len { + if (sockaddr_encap_match(>tdb_filter, + >tdb_filtermask, senflow)) + break; + } + + tdb_ref(tdbp); + mtx_leave(_sadb_mtx); + return tdbp; +} + +int +sockaddr_encap_match(struct sockaddr_encap *addr, struct sockaddr_encap *mask, +struct sockaddr_encap *dest) +{ + size_t off; + + for (off = offsetof(struct sockaddr_encap, sen_type); + off < dest->sen_len; off++) { + if ((*((u_char *)addr + off) & *((u_char *)mask + off)) != + (*((u_char *)dest + off) & *((u_char *)mask + off))) + break; + } + return (off < dest->sen_len)? 
0 : 1; } #ifdef DDB Index: sys/netinet/ip_ipsp.h === RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_ipsp.h,v retrieving revision 1.230 diff -u -p -r1.230 ip_ipsp.h --- sys/netinet/ip_ipsp.h 11 Dec 2021 16:33:47 - 1.230 +++
Re: Fix ipsp_spd_lookup() for transport mode
On Wed, 1 Dec 2021 00:27:06 +0100 Alexander Bluhm wrote: > On Tue, Nov 30, 2021 at 05:53:34PM +0300, Vitaliy Makkoveev wrote: >> Hi, >> >> This question is mostly for bluhm@. Should the gettdbbyflow() grab the >> extra reference on returned `tdbp' like other other gettdb*() do? I'm >> pointing this because we are going to not rely on the netlock when doing >> `tdbp' dereference. > > Yes. Call tdb_ref(tdbp) withing the tdb_sadb_mtx mutex. > > The interesting question is when to unref it. You use the same > variable for the tdb parameter and the tdb from gettdbbyflow(). > Tracking when you don't use the new TDB anymore, gets tricky. Let me update the diff. That grabs a reference now. Also the diff fixes gettdbbyflow(). Comparing ids was missing. Index: sys/netinet/ip_ipsp.c === RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_ipsp.c,v retrieving revision 1.258 diff -u -p -r1.258 ip_ipsp.c --- sys/netinet/ip_ipsp.c 29 Nov 2021 19:19:00 - 1.258 +++ sys/netinet/ip_ipsp.c 1 Dec 2021 12:19:53 - @@ -90,6 +90,8 @@ void tdb_firstuse(void *); void tdb_soft_timeout(void *); void tdb_soft_firstuse(void *); inttdb_hash(u_int32_t, union sockaddr_union *, u_int8_t); +intsockaddr_encap_match(struct sockaddr_encap *, + struct sockaddr_encap *, struct sockaddr_encap *); int ipsec_in_use = 0; u_int64_t ipsec_last_added = 0; @@ -507,6 +509,78 @@ gettdbbysrc(u_int rdomain, union sockadd tdb_ref(tdbp); mtx_leave(_sadb_mtx); return tdbp; +} + +/* + * Get an SA given the flow, the direction, the security protocol type, and + * the desired IDs. 
+ */ +struct tdb * +gettdbbyflow(u_int rdomain, int direction, struct sockaddr_encap *senflow, +u_int8_t sproto, struct ipsec_ids *ids) +{ + u_int32_t hashval; + struct tdb *tdbp; + union sockaddr_union srcdst; + + if (ids == NULL)/* ids is mandatory */ + return NULL; + + memset(, 0, sizeof(srcdst)); + switch (senflow->sen_type) { + case SENT_IP4: + srcdst.sin.sin_len = sizeof(srcdst.sin); + srcdst.sin.sin_family = AF_INET; + if (direction == IPSP_DIRECTION_OUT) + srcdst.sin.sin_addr = senflow->Sen.Sip4.Dst; + else + srcdst.sin.sin_addr = senflow->Sen.Sip4.Src; + break; + case SENT_IP6: + srcdst.sin6.sin6_len = sizeof(srcdst.sin6); + srcdst.sin6.sin6_family = AF_INET6; + if (direction == IPSP_DIRECTION_OUT) + srcdst.sin6.sin6_addr = senflow->Sen.Sip6.Dst; + else + srcdst.sin6.sin6_addr = senflow->Sen.Sip6.Src; + break; + } + + mtx_enter(_sadb_mtx); + hashval = tdb_hash(0, , sproto); + + for (tdbp = tdbdst[hashval]; tdbp != NULL; tdbp = tdbp->tdb_dnext) + if (tdbp->tdb_sproto == sproto && + tdbp->tdb_rdomain == rdomain && + (tdbp->tdb_flags & TDBF_INVALID) == 0 && + ipsp_ids_match(ids, tdbp->tdb_ids) && + ((direction == IPSP_DIRECTION_OUT && + !memcmp(>tdb_dst, , srcdst.sa.sa_len)) || + (direction == IPSP_DIRECTION_IN && + !memcmp(>tdb_src, , srcdst.sa.sa_len { + if (sockaddr_encap_match(>tdb_filter, + >tdb_filtermask, senflow)) + break; + } + + tdb_ref(tdbp); + mtx_leave(_sadb_mtx); + return tdbp; +} + +int +sockaddr_encap_match(struct sockaddr_encap *addr, struct sockaddr_encap *mask, +struct sockaddr_encap *dest) +{ + size_t off; + + for (off = offsetof(struct sockaddr_encap, sen_type); + off < dest->sen_len; off++) { + if ((*((u_char *)addr + off) & *((u_char *)mask + off)) != + (*((u_char *)dest + off) & *((u_char *)mask + off))) + break; + } + return (off < dest->sen_len)? 
0 : 1; } #ifdef DDB Index: sys/netinet/ip_ipsp.h === RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_ipsp.h,v retrieving revision 1.224 diff -u -p -r1.224 ip_ipsp.h --- sys/netinet/ip_ipsp.h 30 Nov 2021 13:17:43 - 1.224 +++ sys/netinet/ip_ipsp.h 1 Dec 2021 12:19:53 - @@ -565,6 +565,8 @@ struct tdb *gettdbbysrcdst_dir(u_int, u_ union sockaddr_union *, u_int8_t, int); #define gettdbbysrcdst(a,b,c,d,e) gettdbbysrcdst_dir((a),(b),(c),(d),(e),0) #define gettdbbysrcdst_rev(a,b,c,d,e) gettdbbysrcdst_dir((a),(b),(c),(d),(e),1) +struct tdb *gettdbbyflow(u_int, int, struct sockaddr_encap *, u_int8_t, + struct ipsec_ids *); void
Re: Fix ipsp_spd_lookup() for transport mode
Hi, Let me update the diff. Previous has a problem in ipsp_spd_lookup() which uses "rn" without initialization. On Sat, 20 Nov 2021 21:44:20 +0900 (JST) YASUOKA Masahiko wrote: > On Wed, 12 May 2021 19:11:09 +0900 (JST) > YASUOKA Masahiko wrote: >> Radek reported a problem to misc@ that multiple Windows clients behind >> a NAT cannot use a L2TP/IPsec server simultaneously. >> >> https://marc.info/?t=16099681611=1=2 >> >> There is two problems. First is pipex(4) doesn't pass the proper >> ipsecflowinfo to ip_output(). Second is the IPsec policy check which >> is done by ipsp_spd_lookup() returns -1 (EINVAL) if the given tdb is >> not cached. This happens when its flow is shared by another tdb (for >> another client of the same NAT). > > This problem is not fixed yet. The diff for the second problem was > not committed in. It was to fix the check in ipsp_spd_lookup() by > making a IPsec policy have a list of IDs. > > Also my colleague Kawai pointed out there is another problem if there > is a Linux client among with Windows clients behind a NAT. Windows > uses 1701/udp for its local ID, but the Linux uses ANY/udp for its > local ID. > > In the situation, policies will be overlapped. > > (a) Windows: REMOTE_IP:1701/udp <=> LOCAL_IP:1701/udp > (b) Linux:REMOTE_IP:ANY/udp <=> LOCAL_IP:1701/udp > > Since we use a radix tree for the policies, when rn_match() is used to > find a policy, as it's best match, (b) is never selected. > > Let me update the diff. > > As for the incomming, we know the tdb when is used. The diff uses the > tdb to find the proper policy. > > As for the outgoing, other than using "ipsecflowinfo" there is no way > to select a proper policy. So only when "ipsecflowinfo" is used, get > a tdb from the packet flow and the IDs (retributed by the > ipsecflowinfo), then we can find the proper policy by the tdb. > > Also the diff skips the IDs check against the policy only if it is > transport mode and using NAT-T. 
Since when NAT-T is used for a policy > for transport mode is shared by multiple clients which has a different > IDs, checking the IDs is difficult and I think the checks other than > is enough. > > ok? comments? > > Fix some problems when accepting IPsec transport mode connections from > multiple clients behind a NAT. In the situation, policies can be > overlapped, but previous could not choice a proper policy both for > incoming and outgoing. To solve this problem, use > tdb->tdb_filter{,mask} to find a proper policy for incoming and find the > tdb by the given ipsecflowinfo and use it for outgoing. Also skip > checking IDs of the policy since a policy is shared by multiple clients > in the situation. Index: sys/netinet/ip_ipsp.c === RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_ipsp.c,v retrieving revision 1.258 diff -u -p -r1.258 ip_ipsp.c --- sys/netinet/ip_ipsp.c 29 Nov 2021 19:19:00 - 1.258 +++ sys/netinet/ip_ipsp.c 30 Nov 2021 04:44:48 - @@ -90,6 +90,8 @@ void tdb_firstuse(void *); void tdb_soft_timeout(void *); void tdb_soft_firstuse(void *); inttdb_hash(u_int32_t, union sockaddr_union *, u_int8_t); +intsockaddr_encap_match(struct sockaddr_encap *, + struct sockaddr_encap *, struct sockaddr_encap *); int ipsec_in_use = 0; u_int64_t ipsec_last_added = 0; @@ -507,6 +509,76 @@ gettdbbysrc(u_int rdomain, union sockadd tdb_ref(tdbp); mtx_leave(_sadb_mtx); return tdbp; +} + +/* + * Get an SA given the flow, the direction, the security protocol type, and + * the desired IDs. 
+ */ +struct tdb * +gettdbbyflow(u_int rdomain, int direction, struct sockaddr_encap *senflow, +u_int8_t sproto, struct ipsec_ids *ids) +{ + u_int32_t hashval; + struct tdb *tdbp; + union sockaddr_union srcdst; + + if (ids == NULL)/* ids is mandatory */ + return NULL; + + memset(, 0, sizeof(srcdst)); + switch (senflow->sen_type) { + case SENT_IP4: + srcdst.sin.sin_len = sizeof(srcdst.sin); + srcdst.sin.sin_family = AF_INET; + if (direction == IPSP_DIRECTION_OUT) + srcdst.sin.sin_addr = senflow->Sen.Sip4.Dst; + else + srcdst.sin.sin_addr = senflow->Sen.Sip4.Src; + break; + case SENT_IP6: + srcdst.sin6.sin6_len = sizeof(srcdst.sin6); + srcdst.sin6.sin6_family = AF_INET6; + if (direction == IPSP_DIRECTION_OUT) + srcdst.sin6.sin6_addr = senflow->Sen.Sip6.Dst; + el
Fix ipsp_spd_lookup() for transport mode (was Re: Fix IPsec NAT-T for L2TP/IPsec)
Hi, On Wed, 12 May 2021 19:11:09 +0900 (JST) YASUOKA Masahiko wrote: > Radek reported a problem to misc@ that multiple Windows clients behind > a NAT cannot use a L2TP/IPsec server simultaneously. > > https://marc.info/?t=16099681611=1=2 > > There is two problems. First is pipex(4) doesn't pass the proper > ipsecflowinfo to ip_output(). Second is the IPsec policy check which > is done by ipsp_spd_lookup() returns -1 (EINVAL) if the given tdb is > not cached. This happens when its flow is shared by another tdb (for > another client of the same NAT). This problem is not fixed yet. The diff for the second problem was not committed. It was to fix the check in ipsp_spd_lookup() by making an IPsec policy have a list of IDs. Also my colleague Kawai pointed out there is another problem if there is a Linux client along with Windows clients behind a NAT. Windows uses 1701/udp for its local ID, but Linux uses ANY/udp for its local ID. In that situation, the policies will overlap. (a) Windows: REMOTE_IP:1701/udp <=> LOCAL_IP:1701/udp (b) Linux: REMOTE_IP:ANY/udp <=> LOCAL_IP:1701/udp Since we use a radix tree for the policies, when rn_match() is used to find a policy, it returns the best match, so (b) is never selected. Let me update the diff. As for incoming packets, we know which tdb is used. The diff uses the tdb to find the proper policy. As for outgoing packets, other than using "ipsecflowinfo" there is no way to select a proper policy. So only when "ipsecflowinfo" is used, we get a tdb from the packet flow and the IDs (retrieved via the ipsecflowinfo), then we can find the proper policy by the tdb. Also the diff skips the IDs check against the policy only if it is transport mode and using NAT-T. Since, when NAT-T is used, a policy for transport mode is shared by multiple clients which have different IDs, checking the IDs is difficult, and I think the other checks are enough. ok? comments? 
Fix some problems when accepting IPsec transport mode connections from multiple clients behind a NAT. In the situation, policies can be overlapped, but previous could not choice a proper policy both for incoming and outgoing. To solve this problem, use tdb->tdb_filter{,mask} to find a proper policy for incoming and find the tdb by the given ipsecflowinfo and use it for outgoing. Also skip checking IDs of the policy since a policy is shared by multiple clients in the situation. Index: sys/netinet/ip_ipsp.c === RCS file: /cvs/src/sys/netinet/ip_ipsp.c,v retrieving revision 1.251 diff -u -p -r1.251 ip_ipsp.c --- sys/netinet/ip_ipsp.c 18 Nov 2021 11:04:10 - 1.251 +++ sys/netinet/ip_ipsp.c 20 Nov 2021 12:42:36 - @@ -91,6 +91,8 @@ void tdb_firstuse(void *); void tdb_soft_timeout(void *); void tdb_soft_firstuse(void *); inttdb_hash(u_int32_t, union sockaddr_union *, u_int8_t); +intsockaddr_encap_match(struct sockaddr_encap *, + struct sockaddr_encap *, struct sockaddr_encap *); int ipsec_in_use = 0; u_int64_t ipsec_last_added = 0; @@ -501,6 +503,76 @@ gettdbbysrc(u_int rdomain, union sockadd mtx_leave(_sadb_mtx); return tdbp; +} + +/* + * Get an SA given the flow, the direction, the security protocol type, and + * the desired IDs. 
+ */ +struct tdb * +gettdbbyflow(u_int rdomain, int direction, struct sockaddr_encap *senflow, +u_int8_t sproto, struct ipsec_ids *ids) +{ + u_int32_t hashval; + struct tdb *tdbp; + union sockaddr_union srcdst; + + if (ids == NULL)/* ids is mandatory */ + return NULL; + + memset(, 0, sizeof(srcdst)); + switch (senflow->sen_type) { + case SENT_IP4: + srcdst.sin.sin_len = sizeof(srcdst.sin); + srcdst.sin.sin_family = AF_INET; + if (direction == IPSP_DIRECTION_OUT) + srcdst.sin.sin_addr = senflow->Sen.Sip4.Dst; + else + srcdst.sin.sin_addr = senflow->Sen.Sip4.Src; + break; + case SENT_IP6: + srcdst.sin6.sin6_len = sizeof(srcdst.sin6); + srcdst.sin6.sin6_family = AF_INET6; + if (direction == IPSP_DIRECTION_OUT) + srcdst.sin6.sin6_addr = senflow->Sen.Sip6.Dst; + else + srcdst.sin6.sin6_addr = senflow->Sen.Sip6.Src; + break; + } + + mtx_enter(_sadb_mtx); + hashval = tdb_hash(0, , sproto); + + for (tdbp = tdbdst[hashval]; tdbp != NULL; tdbp = tdbp->tdb_dnext) + if (tdbp->tdb_sproto == sproto && + tdbp->tdb_rdomain == rdomain && + (tdbp->tdb_flags & TDBF_INVALID) == 0 && + ((directi
Re: diff: ipsec.conf(5), clarify "aes" accepts 128:256 bits
Hi, On Tue, 2 Nov 2021 07:03:43 + Jason McIntyre wrote: > On Tue, Nov 02, 2021 at 12:02:07PM +0900, YASUOKA Masahiko wrote: >> I'd like to clarify "aes" in ipsec.conf accepts 128:256 bits. >> >> sbin/ipsecctl/ike.c: >> 201 case ENCXF_AES: >> 202 enc_alg = "AES"; >> 203 key_length = "128,128:256"; >> 204 break; >> >> >> ok? >> >> Clarify "aes" will accept keys which length is in 128:256 bits. >> > > i notice that the enc lists in ipsec.conf.5 and iked.conf.5 differ. > aren;t they supposed to be in sync? > > for example, iked.conf.5 doesn;t mention "aes" or "aesctr". also the > *-gmac and *-gcm-12 discrepancy. As for "aes", *only isakmpd(8)* supports the "aes" keyword or a range for the key length. So there is no need to sync it to iked.conf.5. Also I believe "aesctr" is meant to support the 160:288 range for the key length, but the implementation doesn't seem to be complete. I have another plan to handle this separately, then I'll update the man page. Other than the key length range, it seems there are some differences between iked.conf.5 and ipsec.conf.5. 1. "-gcm-12": missing this in ipsec.conf.5 is ok since isakmpd(8) doesn't support it yet. (It is actually an alias ID for "-gcm" though.) 2. "-gmac" and "null": iked.conf.5 has a separate list for them to clarify they don't do encryption. Applied the same to isakmpd.conf.5. 3. "chacha20-poly1305": It is missing in ipsec.conf.5. 4. explanation of "[IKE only]" or "[phase 2]": It is missing in ipsec.conf.5. Copied the section from iked.conf and modified it. 5. explanation of "keysize" for AES-CTR and so on: The explanation in ipsec.conf.5 is better. Copied that to iked.conf.5. 6. "cast": the ipsecctl(8) program doesn't actually support the "cast" keyword; it supports "cast128" instead. Corrected "cast" to "cast128". ok? 
Index: sbin/iked/iked.conf.5 === RCS file: /cvs/src/sbin/iked/iked.conf.5,v retrieving revision 1.87 diff -u -p -r1.87 iked.conf.5 --- sbin/iked/iked.conf.5 26 Oct 2021 17:31:22 - 1.87 +++ sbin/iked/iked.conf.5 3 Nov 2021 05:42:48 - @@ -998,9 +998,9 @@ keyword. 3DES requires 24 bytes to form its 168-bit key. This is because the most significant bit of each byte is used for parity. .Pp -The keysize of AES-CTR is actually 128-bit. +The keysize of AES-CTR can be 128, 192, or 256 bits. However as well as the key, a 32-bit nonce has to be supplied. -Thus 160 bits of key material have to be supplied. +Thus 160, 224, or 288 bits of key material, respectively, have to be supplied. The same applies to AES-GCM, AES-GMAC and Chacha20-Poly1305, however in the latter case the keysize is 256 bit. .Pp Index: sbin/ipsecctl/ipsec.conf.5 === RCS file: /cvs/src/sbin/ipsecctl/ipsec.conf.5,v retrieving revision 1.160 diff -u -p -r1.160 ipsec.conf.5 --- sbin/ipsecctl/ipsec.conf.5 22 Oct 2021 12:30:54 - 1.160 +++ sbin/ipsecctl/ipsec.conf.5 3 Nov 2021 05:42:49 - @@ -637,10 +637,10 @@ keyword: The following cipher types are permitted with the .Ic enc keyword: -.Bl -column "aes-128-gmac" "Key Length" "Description" -offset indent +.Bl -column "chacha20-poly1305" "128-256 bits" "Description" -offset indent .It Em "Cipher" Ta Em "Key Length" Ta "" .It Li 3des Ta "168 bits" Ta "" -.It Li aes Ta "128 bits" Ta "" +.It Li aes Ta "128-256 bits" Ta "" .It Li aes-128 Ta "128 bits" Ta "" .It Li aes-192 Ta "192 bits" Ta "" .It Li aes-256 Ta "256 bits" Ta "" @@ -651,21 +651,37 @@ keyword: .It Li aes-128-gcm Ta "160 bits" Ta "[phase 2 only, IKE only]" .It Li aes-192-gcm Ta "224 bits" Ta "[phase 2 only, IKE only]" .It Li aes-256-gcm Ta "288 bits" Ta "[phase 2 only, IKE only]" +.It Li blowfish Ta "160 bits" Ta "" +.It Li cast128 Ta "128 bits" Ta "" +.It Li chacha20-poly1305 Ta "288 bits" Ta "" +.El +.Pp +The following cipher types provide only authentication, not encryption: +.Bl -column 
"chacha20-poly1305" "128-256 bits" "Description" -offset indent .It Li aes-128-gmac Ta "160 bits" Ta "[phase 2 only, IKE only]" .It Li aes-192-g
diff: isakmpd.conf.5, clarify ANY can be used for some params
ok? Clarify that ANY can be used for several parameters of IPsec transform. Index: sbin/isakmpd/isakmpd.conf.5 === RCS file: /cvs/src/sbin/isakmpd/isakmpd.conf.5,v retrieving revision 1.135 diff -u -p -r1.135 isakmpd.conf.5 --- sbin/isakmpd/isakmpd.conf.5 17 Apr 2018 12:13:29 - 1.135 +++ sbin/isakmpd/isakmpd.conf.5 2 Nov 2021 02:57:23 - @@ -726,7 +726,7 @@ See below. Parameters for IPsec transform configuration .Bl -tag -width Ds .It Em AUTHENTICATION_ALGORITHM -The optional authentication algorithm in the case of this +The optional authentication algorithm or ANY in the case of this being an ESP transform. .It Em ENCAPSULATION_MODE The encapsulation mode as given by the RFCs. @@ -745,7 +745,8 @@ List of lifetimes, each element is a .Aq Sy Lifetime section name. .It Em TRANSFORM_ID -The transform ID as given by the RFCs. +The transform ID as given by the RFCs, or ANY to denote that any +transform proposed will be accepted. .El .It Aq Sy IPsec-ID Parameters for IPsec ID configuration
diff: ipsec.conf(5), clarify "aes" accepts 128:256 bits
I'd like to clarify "aes" in ipsec.conf accepts 128:256 bits. sbin/ipsecctl/ike.c: 201 case ENCXF_AES: 202 enc_alg = "AES"; 203 key_length = "128,128:256"; 204 break; ok? Clarify "aes" will accept keys which length is in 128:256 bits. Index: sbin/ipsecctl/ipsec.conf.5 === RCS file: /cvs/src/sbin/ipsecctl/ipsec.conf.5,v retrieving revision 1.160 diff -u -p -r1.160 ipsec.conf.5 --- sbin/ipsecctl/ipsec.conf.5 22 Oct 2021 12:30:54 - 1.160 +++ sbin/ipsecctl/ipsec.conf.5 2 Nov 2021 02:58:13 - @@ -637,10 +637,10 @@ keyword: The following cipher types are permitted with the .Ic enc keyword: -.Bl -column "aes-128-gmac" "Key Length" "Description" -offset indent +.Bl -column "aes-128-gmac" "128-256 bits" "Description" -offset indent .It Em "Cipher" Ta Em "Key Length" Ta "" .It Li 3des Ta "168 bits" Ta "" -.It Li aes Ta "128 bits" Ta "" +.It Li aes Ta "128-256 bits" Ta "" .It Li aes-128 Ta "128 bits" Ta "" .It Li aes-192 Ta "192 bits" Ta "" .It Li aes-256 Ta "256 bits" Ta ""
Re: Exit status of pkg_add
Hi, # drop ccing misc@ The diff seems ok for me. ok to commit it in? On Tue, 19 Oct 2021 10:42:04 +0900 Yuichiro NAITO wrote: > Following patch changes pkg_add to return a error code, > if a package name is wrong. > > diff --git a/usr.sbin/pkg_add/OpenBSD/AddDelete.pm > b/usr.sbin/pkg_add/OpenBSD/AddDelete.pm > index 7a968cbf05d..39bee874ff1 100644 > --- a/usr.sbin/pkg_add/OpenBSD/AddDelete.pm > +++ b/usr.sbin/pkg_add/OpenBSD/AddDelete.pm > @@ -403,12 +403,13 @@ sub check_root > sub choose_location > { > my ($state, $name, $list, $is_quirks) = @_; > if (@$list == 0) { > if (!$is_quirks) { > $state->errsay("Can't find #1", $name); > + $state->{bad}++; > $state->run_quirks( > sub { > my $quirks = shift; > $quirks->filter_obsolete([$name], $state); > }); > } > > Is it OK? > > On 10/18/21 16:53, Yuichiro NAITO wrote: >> Hi, I have a question about exit status of pkg_add command. >> When I wrote a package install script which included typo in a package >> name >> (of course it's my fault), the script didn't stop in spite of `set >> -e`. >> Because pkg_add command returns 0 even if a package name is wrong. >> Is this exit status intended or design policy of pkg_add command? >> If not, I want a error status getting returned. >> It will save my time to look for a typo or similar bug. >> I can't see 'EXIT STATUS' section in the pkg_add manual of OpenBSD >> 7.0. >> So, I e-mailed this question. >> > > -- > Yuichiro NAITO (naito.yuich...@gmail.com) >
Re: Fix IPsec NAT-T for L2TP/IPsec
On Wed, 12 May 2021 19:11:09 +0900 (JST) YASUOKA Masahiko wrote: > Radek reported a problem to misc@ that multiple Windows clients behind > a NAT cannot use a L2TP/IPsec server simultaneously. > > https://marc.info/?t=16099681611=1=2 > > There is two problems. First is pipex(4) doesn't pass the proper > ipsecflowinfo to ip_output(). Second is the IPsec policy check which > is done by ipsp_spd_lookup() returns -1 (EINVAL) if the given tdb is > not cached. This happens when its flow is shared by another tdb (for > another client of the same NAT). > > The following 2 diffs fix these problem. > > comment? > ok? > > diff #1 > > Fix IPsec NAT-T work with pipex. The original diff #1 used m_tag to specify the ipsecflowinfo. I noticed "ph_cookie" is usable instead of the m_tag. It seems simpler. Is it better? Index: sys/net/if_etherip.c === RCS file: /disk/cvs/openbsd/src/sys/net/if_etherip.c,v retrieving revision 1.48 diff -u -p -r1.48 if_etherip.c --- sys/net/if_etherip.c9 Jan 2021 21:00:58 - 1.48 +++ sys/net/if_etherip.c12 May 2021 23:29:41 - @@ -547,7 +547,7 @@ ip_etherip_output(struct ifnet *ifp, str etheripstat_pkt(etherips_opackets, etherips_obytes, m->m_pkthdr.len - (sizeof(struct ip) + sizeof(struct etherip_header))); - ip_send(m); + ip_send(m, 0); return (0); } Index: sys/net/if_gif.c === RCS file: /disk/cvs/openbsd/src/sys/net/if_gif.c,v retrieving revision 1.132 diff -u -p -r1.132 if_gif.c --- sys/net/if_gif.c20 Feb 2021 04:58:29 - 1.132 +++ sys/net/if_gif.c12 May 2021 23:29:45 - @@ -340,7 +340,7 @@ gif_send(struct gif_softc *sc, struct mb ip->ip_src = sc->sc_tunnel.t_src4; ip->ip_dst = sc->sc_tunnel.t_dst4; - ip_send(m); + ip_send(m, 0); break; } #ifdef INET6 Index: sys/net/if_gre.c === RCS file: /disk/cvs/openbsd/src/sys/net/if_gre.c,v retrieving revision 1.171 diff -u -p -r1.171 if_gre.c --- sys/net/if_gre.c10 Mar 2021 10:21:47 - 1.171 +++ sys/net/if_gre.c12 May 2021 23:29:52 - @@ -1999,7 +1999,7 @@ gre_ip_output(const struct gre_tunnel *t switch 
(tunnel->t_af) { case AF_INET: - ip_send(m); + ip_send(m, 0); break; #ifdef INET6 case AF_INET6: Index: sys/net/pf.c === RCS file: /disk/cvs/openbsd/src/sys/net/pf.c,v retrieving revision 1.1116 diff -u -p -r1.1116 pf.c --- sys/net/pf.c27 Apr 2021 09:38:29 - 1.1116 +++ sys/net/pf.c12 May 2021 23:29:56 - @@ -2896,7 +2896,7 @@ pf_send_tcp(const struct pf_rule *r, sa_ switch (af) { case AF_INET: - ip_send(m); + ip_send(m, 0); break; #ifdef INET6 case AF_INET6: Index: sys/net/pipex.c === RCS file: /disk/cvs/openbsd/src/sys/net/pipex.c,v retrieving revision 1.132 diff -u -p -r1.132 pipex.c --- sys/net/pipex.c 10 Mar 2021 10:21:48 - 1.132 +++ sys/net/pipex.c 12 May 2021 23:31:24 - @@ -1258,7 +1258,7 @@ pipex_pptp_output(struct mbuf *m0, struc gre->flags = htons(gre->flags); m0->m_pkthdr.ph_ifidx = session->ifindex; - ip_send(m0); + ip_send(m0, 0); if (len > 0) { /* network layer only */ /* countup statistics */ session->stat.opackets++; @@ -1704,7 +1704,7 @@ pipex_l2tp_output(struct mbuf *m0, struc ip->ip_tos = 0; ip->ip_off = 0; - ip_send(m0); + ip_send(m0, session->proto.l2tp.ipsecflowinfo); break; #ifdef INET6 case AF_INET6: Index: sys/netinet/ip_icmp.c === RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_icmp.c,v retrieving revision 1.186 diff -u -p -r1.186 ip_icmp.c --- sys/netinet/ip_icmp.c 30 Mar 2021 08:37:10 - 1.186 +++ sys/netinet/ip_icmp.c 12 May 2021 23:31:57 - @@ -860,7 +860,7 @@ icmp_send(struct mbuf *m, struct mbuf *o ipstat_inc(ips_localout); ip_send_raw(m); } else - ip_send(m); + ip_send(m, 0); } u_int32_t Index: sys/netinet/ip_input.c === RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_input.c,v retrieving revision 1.359 diff -u -p -r1.359 ip_input.c --- sys/netinet/ip_input.c 30 Apr 2021 13:52:48 - 1.359 +++ sys/n
Re: Fix IPsec NAT-T for L2TP/IPsec
On Wed, 12 May 2021 19:15:29 +0300 Vitaliy Makkoveev wrote: >> On 12 May 2021, at 18:42, YASUOKA Masahiko wrote: >> On Wed, 12 May 2021 17:26:51 +0300 >> Vitaliy Makkoveev wrote: >>> On Wed, May 12, 2021 at 07:11:09PM +0900, YASUOKA Masahiko wrote: >>>> Radek reported a problem to misc@ that multiple Windows clients behind a >>>> NAT >>>> cannot use a L2TP/IPsec server simultaneously. >>>> >>>> https://marc.info/?t=16099681611=1=2 >>>> >>>> There is two problems. First is pipex(4) doesn't pass the proper >>>> ipsecflowinfo to ip_output(). Second is the IPsec policy check which is >>>> done by ipsp_spd_lookup() returns -1 (EINVAL) if the given tdb is not >>>> cached. This happens when its flow is shared by another tdb (for another >>>> client of the same NAT). >>>> >>>> The following 2 diffs fix these problem. >>>> >>>> comment? >>>> ok? >>>> >>> >>> Hi. >>> >>> I have two comments for the diff 1: >>> >>> 1. You should add PACKET_TAG_IPSEC_FLOWINFO description to >>>m_tag_get(9). >>> 2. You introduced mbuf(9) leak in pipex_l2tp_output() error path. I >>> pointed the place in your diff. >> >> Good catch. Thanks. >> > > m_freem(9) accepts NULL so this check before is redundant. Yes, > It seems to me that "Used by the IPv4 stack to specify the IPsec flow > of an output IP packet. The tag contains a u_int32_t identifying the > IPsec flow.” is enough. Anyway it’s better to ask jmc@. Ok, > Also I like to remove PACKET_TAG_PIPEX with separate diff. I removed PACKET_TAG_PIPEX separately. Let me update the diff. 
Index: sys/net/pipex.c === RCS file: /disk/cvs/openbsd/src/sys/net/pipex.c,v retrieving revision 1.132 diff -u -p -r1.132 pipex.c --- sys/net/pipex.c 10 Mar 2021 10:21:48 - 1.132 +++ sys/net/pipex.c 12 May 2021 23:18:52 - @@ -1628,6 +1628,7 @@ pipex_l2tp_output(struct mbuf *m0, struc #ifdef INET6 struct ip6_hdr *ip6; #endif + struct m_tag *mtag; hlen = sizeof(struct pipex_l2tp_header) + ((pipex_session_is_l2tp_data_sequencing_on(session)) @@ -1704,6 +1705,15 @@ pipex_l2tp_output(struct mbuf *m0, struc ip->ip_tos = 0; ip->ip_off = 0; + if (session->proto.l2tp.ipsecflowinfo > 0) { + if ((mtag = m_tag_get(PACKET_TAG_IPSEC_FLOWINFO, + sizeof(u_int32_t), M_NOWAIT)) == NULL) + goto drop; + *(u_int32_t *)(mtag + 1) = + session->proto.l2tp.ipsecflowinfo; + m_tag_prepend(m0, mtag); + } + ip_send(m0); break; #ifdef INET6 @@ -1733,6 +1743,7 @@ pipex_l2tp_output(struct mbuf *m0, struc return; drop: + m_freem(m0); session->stat.oerrors++; } Index: sys/netinet/ip_input.c === RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_input.c,v retrieving revision 1.359 diff -u -p -r1.359 ip_input.c --- sys/netinet/ip_input.c 30 Apr 2021 13:52:48 - 1.359 +++ sys/netinet/ip_input.c 12 May 2021 23:18:52 - @@ -1790,6 +1790,8 @@ ip_send_do_dispatch(void *xmq, int flags struct mbuf_queue *mq = xmq; struct mbuf *m; struct mbuf_list ml; + struct m_tag *mtag; + u_int32_t ipsecflowinfo = 0; mq_delist(mq, ); if (ml_empty()) @@ -1797,7 +1799,12 @@ ip_send_do_dispatch(void *xmq, int flags NET_LOCK(); while ((m = ml_dequeue()) != NULL) { - ip_output(m, NULL, NULL, flags, NULL, NULL, 0); + if ((mtag = m_tag_find(m, PACKET_TAG_IPSEC_FLOWINFO, NULL)) + != NULL) { + ipsecflowinfo = *(u_int32_t *)(mtag + 1); + m_tag_delete(m, mtag); + } + ip_output(m, NULL, NULL, flags, NULL, NULL, ipsecflowinfo); } NET_UNLOCK(); } Index: sys/sys/mbuf.h === RCS file: /disk/cvs/openbsd/src/sys/sys/mbuf.h,v retrieving revision 1.252 diff -u -p -r1.252 mbuf.h --- sys/sys/mbuf.h 25 Feb 2021 02:43:31 - 1.252 +++ sys/sys/mbuf.h 12 
May 2021 23:18:52 - @@ -469,6 +469,7 @@ struct m_tag *m_tag_next(struct mbuf *, /* Packet tag types */ #define PACKET_TAG_IPSEC_IN_DONE 0x0001 /* IPsec applied, in */ #define PACKET_TAG_IPSEC_OUT_DONE 0x0002 /* IPsec applied, out */ +#define PACKET_TAG_IPSEC_FLOWINFO
Re: Fix IPsec NAT-T for L2TP/IPsec
On Wed, 12 May 2021 17:26:51 +0300 Vitaliy Makkoveev wrote: > On Wed, May 12, 2021 at 07:11:09PM +0900, YASUOKA Masahiko wrote: >> Radek reported a problem to misc@ that multiple Windows clients behind a NAT >> cannot use a L2TP/IPsec server simultaneously. >> >> https://marc.info/?t=16099681611=1=2 >> >> There is two problems. First is pipex(4) doesn't pass the proper >> ipsecflowinfo to ip_output(). Second is the IPsec policy check which is >> done by ipsp_spd_lookup() returns -1 (EINVAL) if the given tdb is not >> cached. This happens when its flow is shared by another tdb (for another >> client of the same NAT). >> >> The following 2 diffs fix these problem. >> >> comment? >> ok? >> > > Hi. > > I have two comments for the diff 1: > > 1. You should add PACKET_TAG_IPSEC_FLOWINFO description to > m_tag_get(9). > 2. You introduced mbuf(9) leak in pipex_l2tp_output() error path. I >pointed the place in your diff. Good catch. Thanks. Let me update the diff. Index: sys/net/pipex.c === RCS file: /disk/cvs/openbsd/src/sys/net/pipex.c,v retrieving revision 1.132 diff -u -p -r1.132 pipex.c --- sys/net/pipex.c 10 Mar 2021 10:21:48 - 1.132 +++ sys/net/pipex.c 12 May 2021 15:33:33 - @@ -1628,6 +1628,7 @@ pipex_l2tp_output(struct mbuf *m0, struc #ifdef INET6 struct ip6_hdr *ip6; #endif + struct m_tag *mtag; hlen = sizeof(struct pipex_l2tp_header) + ((pipex_session_is_l2tp_data_sequencing_on(session)) @@ -1704,6 +1705,15 @@ pipex_l2tp_output(struct mbuf *m0, struc ip->ip_tos = 0; ip->ip_off = 0; + if (session->proto.l2tp.ipsecflowinfo > 0) { + if ((mtag = m_tag_get(PACKET_TAG_IPSEC_FLOWINFO, + sizeof(u_int32_t), M_NOWAIT)) == NULL) + goto drop; + *(u_int32_t *)(mtag + 1) = + session->proto.l2tp.ipsecflowinfo; + m_tag_prepend(m0, mtag); + } + ip_send(m0); break; #ifdef INET6 @@ -1733,6 +1743,8 @@ pipex_l2tp_output(struct mbuf *m0, struc return; drop: + if (m0 != NULL) + m_freem(m0); session->stat.oerrors++; } Index: sys/netinet/ip_input.c === RCS file: 
/disk/cvs/openbsd/src/sys/netinet/ip_input.c,v retrieving revision 1.359 diff -u -p -r1.359 ip_input.c --- sys/netinet/ip_input.c 30 Apr 2021 13:52:48 - 1.359 +++ sys/netinet/ip_input.c 12 May 2021 15:31:52 - @@ -1790,6 +1790,8 @@ ip_send_do_dispatch(void *xmq, int flags struct mbuf_queue *mq = xmq; struct mbuf *m; struct mbuf_list ml; + struct m_tag *mtag; + u_int32_t ipsecflowinfo = 0; mq_delist(mq, ); if (ml_empty()) @@ -1797,7 +1799,12 @@ ip_send_do_dispatch(void *xmq, int flags NET_LOCK(); while ((m = ml_dequeue()) != NULL) { - ip_output(m, NULL, NULL, flags, NULL, NULL, 0); + if ((mtag = m_tag_find(m, PACKET_TAG_IPSEC_FLOWINFO, NULL)) + != NULL) { + ipsecflowinfo = *(u_int32_t *)(mtag + 1); + m_tag_delete(m, mtag); + } + ip_output(m, NULL, NULL, flags, NULL, NULL, ipsecflowinfo); } NET_UNLOCK(); } Index: sys/sys/mbuf.h === RCS file: /disk/cvs/openbsd/src/sys/sys/mbuf.h,v retrieving revision 1.252 diff -u -p -r1.252 mbuf.h --- sys/sys/mbuf.h 25 Feb 2021 02:43:31 - 1.252 +++ sys/sys/mbuf.h 12 May 2021 15:31:52 - @@ -469,6 +469,7 @@ struct m_tag *m_tag_next(struct mbuf *, /* Packet tag types */ #define PACKET_TAG_IPSEC_IN_DONE 0x0001 /* IPsec applied, in */ #define PACKET_TAG_IPSEC_OUT_DONE 0x0002 /* IPsec applied, out */ +#define PACKET_TAG_IPSEC_FLOWINFO 0x0004 /* IPsec flowinfo */ #define PACKET_TAG_WIREGUARD 0x0040 /* WireGuard data */ #define PACKET_TAG_GRE 0x0080 /* GRE processing done */ #define PACKET_TAG_DLT 0x0100 /* data link layer type */ @@ -479,7 +480,7 @@ struct m_tag *m_tag_next(struct mbuf *, #define PACKET_TAG_CARP_BAL_IP 0x4000 /* carp(4) ip balanced marker */ #define MTAG_BITS \ -("\20\1IPSEC_IN_DONE\2IPSEC_OUT_DONE\3IPSEC_IN_CRYPTO_DONE" \ +("\20\1IPSEC_IN_DONE\2IPSEC_OUT_DONE\3IPSEC_FLOWINFO" \ "\4IPSEC_OUT_CRYPTO_NEEDED\5IPSEC_PENDING_TDB\6BRIDGE\7WG\10GRE\11DLT" \ "\12PF_DIVERT\14PF_REASSEMBLED\15SRCROUTE\16TUNNEL\17CARP_BAL_IP") Index: share/man/man9
Fix IPsec NAT-T for L2TP/IPsec
Hi, Radek reported a problem to misc@ that multiple Windows clients behind a NAT cannot use a L2TP/IPsec server simultaneously. https://marc.info/?t=16099681611=1=2 There is two problems. First is pipex(4) doesn't pass the proper ipsecflowinfo to ip_output(). Second is the IPsec policy check which is done by ipsp_spd_lookup() returns -1 (EINVAL) if the given tdb is not cached. This happens when its flow is shared by another tdb (for another client of the same NAT). The following 2 diffs fix these problem. comment? ok? diff #1 Fix IPsec NAT-T work with pipex. Index: sys/net/pipex.c === RCS file: /disk/cvs/openbsd/src/sys/net/pipex.c,v retrieving revision 1.132 diff -u -p -r1.132 pipex.c --- sys/net/pipex.c 10 Mar 2021 10:21:48 - 1.132 +++ sys/net/pipex.c 12 May 2021 09:38:32 - @@ -1628,6 +1628,7 @@ pipex_l2tp_output(struct mbuf *m0, struc #ifdef INET6 struct ip6_hdr *ip6; #endif + struct m_tag *mtag; hlen = sizeof(struct pipex_l2tp_header) + ((pipex_session_is_l2tp_data_sequencing_on(session)) @@ -1703,6 +1704,15 @@ pipex_l2tp_output(struct mbuf *m0, struc ip->ip_ttl = MAXTTL; ip->ip_tos = 0; ip->ip_off = 0; + + if (session->proto.l2tp.ipsecflowinfo > 0) { + if ((mtag = m_tag_get(PACKET_TAG_IPSEC_FLOWINFO, + sizeof(u_int32_t), M_NOWAIT)) == NULL) + goto drop; + *(u_int32_t *)(mtag + 1) = + session->proto.l2tp.ipsecflowinfo; + m_tag_prepend(m0, mtag); + } ip_send(m0); break; Index: sys/netinet/ip_input.c === RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_input.c,v retrieving revision 1.359 diff -u -p -r1.359 ip_input.c --- sys/netinet/ip_input.c 30 Apr 2021 13:52:48 - 1.359 +++ sys/netinet/ip_input.c 12 May 2021 09:38:32 - @@ -1790,6 +1790,8 @@ ip_send_do_dispatch(void *xmq, int flags struct mbuf_queue *mq = xmq; struct mbuf *m; struct mbuf_list ml; + struct m_tag *mtag; + u_int32_t ipsecflowinfo = 0; mq_delist(mq, ); if (ml_empty()) @@ -1797,7 +1799,12 @@ ip_send_do_dispatch(void *xmq, int flags NET_LOCK(); while ((m = ml_dequeue()) != NULL) { - ip_output(m, NULL, 
NULL, flags, NULL, NULL, 0); + if ((mtag = m_tag_find(m, PACKET_TAG_IPSEC_FLOWINFO, NULL)) + != NULL) { + ipsecflowinfo = *(u_int32_t *)(mtag + 1); + m_tag_delete(m, mtag); + } + ip_output(m, NULL, NULL, flags, NULL, NULL, ipsecflowinfo); } NET_UNLOCK(); } Index: sys/sys/mbuf.h === RCS file: /disk/cvs/openbsd/src/sys/sys/mbuf.h,v retrieving revision 1.252 diff -u -p -r1.252 mbuf.h --- sys/sys/mbuf.h 25 Feb 2021 02:43:31 - 1.252 +++ sys/sys/mbuf.h 12 May 2021 09:38:32 - @@ -469,6 +469,7 @@ struct m_tag *m_tag_next(struct mbuf *, /* Packet tag types */ #define PACKET_TAG_IPSEC_IN_DONE 0x0001 /* IPsec applied, in */ #define PACKET_TAG_IPSEC_OUT_DONE 0x0002 /* IPsec applied, out */ +#define PACKET_TAG_IPSEC_FLOWINFO 0x0004 /* IPsec flowinfo */ #define PACKET_TAG_WIREGUARD 0x0040 /* WireGuard data */ #define PACKET_TAG_GRE 0x0080 /* GRE processing done */ #define PACKET_TAG_DLT 0x0100 /* data link layer type */ @@ -479,7 +480,7 @@ struct m_tag *m_tag_next(struct mbuf *, #define PACKET_TAG_CARP_BAL_IP 0x4000 /* carp(4) ip balanced marker */ #define MTAG_BITS \ -("\20\1IPSEC_IN_DONE\2IPSEC_OUT_DONE\3IPSEC_IN_CRYPTO_DONE" \ +("\20\1IPSEC_IN_DONE\2IPSEC_OUT_DONE\3IPSEC_FLOWINFO" \ "\4IPSEC_OUT_CRYPTO_NEEDED\5IPSEC_PENDING_TDB\6BRIDGE\7WG\10GRE\11DLT" \ "\12PF_DIVERT\14PF_REASSEMBLED\15SRCROUTE\16TUNNEL\17CARP_BAL_IP") diff #2 Make the IPsec flow can have multiple `ipsec_ids' so that ipsp_spd_lookup() can check whether the `ipsec_ids` of the given tdb is belonged with a flow shared by mutlple clients behind a NAT. 
Index: sys/net/pfkeyv2.c === RCS file: /disk/cvs/openbsd/src/sys/net/pfkeyv2.c,v retrieving revision 1.211 diff -u -p -r1.211 pfkeyv2.c --- sys/net/pfkeyv2.c 4 May 2021 09:28:04 - 1.211 +++ sys/net/pfkeyv2.c 12 May 2021 10:07:11 - @@ -1106,6 +1106,7 @@ pfkeyv2_send(struct socket *so, void *me int i, j, rval = 0, mode = PFKEYV2_SENDMESSAGE_BROADCAST; int delflag = 0; struct sockaddr_encap encapdst, encapnetmask; + struct ipsec_ids *ids, *ids0; struct ipsec_policy *ipo; struct ipsec_acquire *ipa; struct
Re: monotonic time going back by wrong skews
Hi, I'm sorry. I sent the wrong diff to people. The results from giovanni@ and mcmer seem wrong. I suppose stu@ used the correct diff. giovanni and mcmer, can you test with the correct diff again? I attached the correct diff at the end of this mail. I'm sorry again. On Tue, 6 Apr 2021 09:21:40 +0200 Giovanni Bechis wrote: > On Mon, Apr 05, 2021 at 07:14:49PM +0900, YASUOKA Masahiko wrote: >> Hi, >> >> > Another issue that I see is that people have not reported, at least > [...] >> > publicly, that this runs fine on their normal OpenBSD machines. >> >> Some dmesgs posted on public lists seems to have the same problem. >> >> https://marc.info/?l=openbsd-bugs=2=1=disabling+user+TSC=b >> https://marc.info/?l=openbsd-tech=2=1=disabling+user+TSC=b >> https://marc.info/?l=openbsd-ports=2=1=disabling+user+TSC=b >> >> For example, >> >> https://marc.info/?l=openbsd-bugs=161618496905444=2 >> >> |Subject:wg(4) crash >> |From: Stuart Henderson >> |bios0: vendor Dell Inc. version "2.9.0" date 12/06/2019 >> |bios0: Dell Inc. PowerEdge R620 >> |cpu1: disabling user TSC (skew=135) >> |cpu1: smt 0, core 0, package 1 >> >> https://marc.info/?l=openbsd-ports=161306073708427=2 >> |Subject:Re: sysutils/nut README APC over USB device chgrp/chmod >> |From: Marcus MERIGHI >> |bios0: vendor American Megatrends Inc. version "3.1" date 06/07/2018 >> |cpu11: disabling user TSC (skew=240) >> |cpu11: smt 0, core 3, package 1 >> >> these 2 are real machine and using 2 CPU sockets. >> >> https://marc.info/?l=openbsd-ports=161562278114172=2 >> |Subject:ruby27 vs Puppet >> |From: Giovanni Bechis >> |bios0: vendor Phoenix Technologies LTD version "6.00" date 12/12/2018 >> |bios0: VMware, Inc. VMware Virtual Platform >> |cpu1: disabling user TSC (skew=-12705) >> >> VMware. seems the same problem of mine. >> >> I'll ask people to do the same test which cheloha@ write in previous >> mail. >> > Attached my data and dmesg produced by the script on my VMware vm.
> > Cheers > Giovanni Index: sys/arch/amd64/amd64/tsc.c === RCS file: /var/cvs/openbsd/src/sys/arch/amd64/amd64/tsc.c,v retrieving revision 1.23 diff -u -p -r1.23 tsc.c --- sys/arch/amd64/amd64/tsc.c 23 Feb 2021 04:44:30 - 1.23 +++ sys/arch/amd64/amd64/tsc.c 5 Apr 2021 10:28:00 - @@ -311,16 +311,42 @@ tsc_read_bp(struct cpu_info *ci, uint64_ *aptscp = tsc_sync_val; } +#defineTSC_SYNC_NTIMES 1000 + +static int tsc_difs[MAXCPUS][TSC_SYNC_NTIMES]; + +void +tsc_debug(void) +{ + int i, cpuid = curcpu()->ci_cpuid; + + for (i = 0; i < TSC_SYNC_NTIMES; i++) { + if (i % 10 == 0) + printf("%5d", tsc_difs[cpuid][i]); + else + printf(" %5d", tsc_difs[cpuid][i]); + if (i % 10 == 9) + printf("\n"); + } + printf("\n"); +} + void tsc_sync_bp(struct cpu_info *ci) { + int i, mindif = INT_MAX, dif; uint64_t bptsc, aptsc; - tsc_read_bp(ci, , ); /* discarded - cache effects */ - tsc_read_bp(ci, , ); + for (i = 0; i < TSC_SYNC_NTIMES; i++) { + tsc_read_bp(ci, , ); + dif = bptsc - aptsc; + if (abs(dif) < abs(mindif)) + mindif = dif; + tsc_difs[ci->ci_cpuid][i] = dif; + } /* Compute final value to adjust for skew. */ - ci->ci_tsc_skew = bptsc - aptsc; + ci->ci_tsc_skew = mindif; } /* @@ -351,8 +377,10 @@ tsc_post_ap(struct cpu_info *ci) void tsc_sync_ap(struct cpu_info *ci) { - tsc_post_ap(ci); - tsc_post_ap(ci); + int i; + + for (i = 0; i < TSC_SYNC_NTIMES; i++) + tsc_post_ap(ci); } void
Re: monotonic time going back by wrong skews
On Mon, 5 Apr 2021 14:24:03 +0200 (CEST) Mark Kettenis wrote: >> Date: Mon, 05 Apr 2021 19:14:49 +0900 (JST) >> From: YASUOKA Masahiko >> >> Hi, >> >> On Mon, 5 Apr 2021 10:43:00 +0300 >> Paul Irofti wrote: >> > On 05.04.2021 06:13, Scott Cheloha wrote: >> >> On Mon, Mar 29, 2021 at 02:00:01PM +0900, YASUOKA Masahiko wrote: >> >>> On Thu, 25 Mar 2021 19:41:35 +0100 (CET) >> >>> Mark Kettenis wrote: >> >>>>> From: Scott Cheloha >> >>>>> Date: Thu, 25 Mar 2021 13:18:04 -0500 >> >>>>>> On Wed, Mar 24, 2021 at 05:40:21PM +0900, YASUOKA Masahiko wrote: >> >>>>> Which diff did you apply? Yasuoka provided two diffs. >> >>>>> >> >>>>> In any case, ignore this diff: >> >>>>> >> >>>>>> diff --git a/sys/arch/amd64/amd64/tsc.c b/sys/arch/amd64/amd64/tsc.c >> >>>>>> index 238a5a068e1..3b951a8b5a3 100644 >> >>>>>> --- a/sys/arch/amd64/amd64/tsc.c >> >>>>>> +++ b/sys/arch/amd64/amd64/tsc.c >> >>>>>> @@ -212,7 +212,8 @@ cpu_recalibrate_tsc(struct timecounter *tc) >> >>>>>> u_int >> >>>>>> tsc_get_timecount(struct timecounter *tc) >> >>>>>> { >> >>>>>> - return rdtsc_lfence() + curcpu()->ci_tsc_skew; >> >>>>>> + //return rdtsc_lfence() + curcpu()->ci_tsc_skew; >> >>>>>> + return rdtsc_lfence(); >> >>>>>> } >> >>>>>> >> >>>>>> void >> >>>>> >> >>>>> >> >>>>> We don't want to discard the skews, that's wrong. >> >>> >> >>> I'm sorry for the confusion. >> >> No problem. >> >> >> >>>>> The reason it "fixes" Yasuoka's problem is because the real skews >> >>>>> on the ESXi VMs in question are probably close to zero but our >> >>>>> synchronization algorithm is picking huge (wrong) skews due to >> >>>>> some other variable interfering with our measurement. >> >>>> >> >>>> Right. If a VM exit happens while we're doing our measurement, you'll >> >>>> see a significant delay. And a guest OS can't prevent those from >> >>>> happening. But even on real hardware SMM mode may interfere with our >> >>>> measurement. 
>> >>> >> >>> For machines like the ESXi VMs, the measurement seems to have to >> >>> exclude such delayed values as outliers. I think taking a lot of >> >>> samples and choice the minimum is a good enough way for the purpose. >> >>> >> >>> I updated the diff. >> >>> >> >>> - delete lines for debug >> >>> - make tsc quality lower if skew is not good enough >> >>> - reduce difference from NetBSD >> >>> >> >>> comment? ok? >> >> If more iterations fixes your problem, great. It isn't going to make >> >> things worse for machines with sync'd TSCs, makes the TSC usable on >> >> another class of machine, and is relatively cheap. >> >> This is ok cheloha@. >> >> You need another ok, though. >> > >> > >> > The diff is obviously fine. But it is still a heuristic with no real >> > motivation except for this particular ESXi VM case. So my question >> > about why we choose the minimum instead of the median or the mean has >> > not been answered. >> >> Because median or mean is affected by outliers. We actually see >> some outliers in samples from the VMware. >> >> I suppose there is a better mesure, but I am currently no idia and had >> not used that kind of measure in kernel. On the other hand, finding >> the minimum is very simple. > > Using the median should take care of the outliers though. You are right. I misunderstood the meaning. > I'm not at all convinced that taking the absolute value of the > difference makes sense. It probably works in this case since the > actual skew on your VM is zero. So measurements close to zero are > "good". But what if the skew isn't zero? Take for example an AP that > is running ahead of the BP by 5000 ticks. In that case, the right > value for the skew is -5000. But now imagine that the BP gets > "interrupted&qu
Re: monotonic time going back by wrong skews
Hi, On Mon, 5 Apr 2021 10:43:00 +0300 Paul Irofti wrote: > On 05.04.2021 06:13, Scott Cheloha wrote: >> On Mon, Mar 29, 2021 at 02:00:01PM +0900, YASUOKA Masahiko wrote: >>> On Thu, 25 Mar 2021 19:41:35 +0100 (CET) >>> Mark Kettenis wrote: >>>>> From: Scott Cheloha >>>>> Date: Thu, 25 Mar 2021 13:18:04 -0500 >>>>>> On Wed, Mar 24, 2021 at 05:40:21PM +0900, YASUOKA Masahiko wrote: >>>>> Which diff did you apply? Yasuoka provided two diffs. >>>>> >>>>> In any case, ignore this diff: >>>>> >>>>>> diff --git a/sys/arch/amd64/amd64/tsc.c b/sys/arch/amd64/amd64/tsc.c >>>>>> index 238a5a068e1..3b951a8b5a3 100644 >>>>>> --- a/sys/arch/amd64/amd64/tsc.c >>>>>> +++ b/sys/arch/amd64/amd64/tsc.c >>>>>> @@ -212,7 +212,8 @@ cpu_recalibrate_tsc(struct timecounter *tc) >>>>>> u_int >>>>>> tsc_get_timecount(struct timecounter *tc) >>>>>> { >>>>>> -return rdtsc_lfence() + curcpu()->ci_tsc_skew; >>>>>> +//return rdtsc_lfence() + curcpu()->ci_tsc_skew; >>>>>> +return rdtsc_lfence(); >>>>>> } >>>>>> >>>>>> void >>>>> >>>>> >>>>> We don't want to discard the skews, that's wrong. >>> >>> I'm sorry for the confusion. >> No problem. >> >>>>> The reason it "fixes" Yasuoka's problem is because the real skews >>>>> on the ESXi VMs in question are probably close to zero but our >>>>> synchronization algorithm is picking huge (wrong) skews due to >>>>> some other variable interfering with our measurement. >>>> >>>> Right. If a VM exit happens while we're doing our measurement, you'll >>>> see a significant delay. And a guest OS can't prevent those from >>>> happening. But even on real hardware SMM mode may interfere with our >>>> measurement. >>> >>> For machines like the ESXi VMs, the measurement seems to have to >>> exclude such delayed values as outliers. I think taking a lot of >>> samples and choice the minimum is a good enough way for the purpose. >>> >>> I updated the diff. 
>>> >>> - delete lines for debug >>> - make tsc quality lower if skew is not good enough >>> - reduce difference from NetBSD >>> >>> comment? ok? >> If more iterations fixes your problem, great. It isn't going to make >> things worse for machines with sync'd TSCs, makes the TSC usable on >> another class of machine, and is relatively cheap. >> This is ok cheloha@. >> You need another ok, though. > > > The diff is obviously fine. But it is still a heuristic with no real > motivation except for this particular ESXi VM case. So my question > about why we choose the minimum instead of the median or the mean has > not been answered. Because median or mean is affected by outliers. We actually see some outliers in samples from the VMware. I suppose there is a better mesure, but I am currently no idia and had not used that kind of measure in kernel. On the other hand, finding the minimum is very simple. > Another issue that I see is that people have not reported, at least > publicly, that this runs fine on their normal OpenBSD machines. Some dmesgs posted on public lists seems to have the same problem. https://marc.info/?l=openbsd-bugs=2=1=disabling+user+TSC=b https://marc.info/?l=openbsd-tech=2=1=disabling+user+TSC=b https://marc.info/?l=openbsd-ports=2=1=disabling+user+TSC=b For example, https://marc.info/?l=openbsd-bugs=161618496905444=2 |Subject:wg(4) crash |From: Stuart Henderson |bios0: vendor Dell Inc. version "2.9.0" date 12/06/2019 |bios0: Dell Inc. PowerEdge R620 |cpu1: disabling user TSC (skew=135) |cpu1: smt 0, core 0, package 1 https://marc.info/?l=openbsd-ports=161306073708427=2 |Subject:Re: sysutils/nut README APC over USB device chgrp/chmod |From: Marcus MERIGHI |bios0: vendor American Megatrends Inc. version "3.1" date 06/07/2018 |cpu11: disabling user TSC (skew=240) |cpu11: smt 0, core 3, package 1 these 2 are real machine and using 2 CPU sockets. 
https://marc.info/?l=openbsd-ports=161562278114172=2 |Subject:ruby27 vs Puppet |From: Giovanni Bechis |bios0: vendor Phoenix Technologies LTD version "6.00" date 12/12/2018 |bios0: VMware, Inc. VMware Virtual Platform |cpu1: disabling user TSC (skew=-12705) VMware. seems the same problem of mine. I'll ask people to do the same test
Re: monotonic time going back by wrong skews
On Thu, 25 Mar 2021 19:41:35 +0100 (CET) Mark Kettenis wrote: >> From: Scott Cheloha >> Date: Thu, 25 Mar 2021 13:18:04 -0500 >> > On Wed, Mar 24, 2021 at 05:40:21PM +0900, YASUOKA Masahiko wrote: >> Which diff did you apply? Yasuoka provided two diffs. >> >> In any case, ignore this diff: >> >> > diff --git a/sys/arch/amd64/amd64/tsc.c b/sys/arch/amd64/amd64/tsc.c >> > index 238a5a068e1..3b951a8b5a3 100644 >> > --- a/sys/arch/amd64/amd64/tsc.c >> > +++ b/sys/arch/amd64/amd64/tsc.c >> > @@ -212,7 +212,8 @@ cpu_recalibrate_tsc(struct timecounter *tc) >> > u_int >> > tsc_get_timecount(struct timecounter *tc) >> > { >> > - return rdtsc_lfence() + curcpu()->ci_tsc_skew; >> > + //return rdtsc_lfence() + curcpu()->ci_tsc_skew; >> > + return rdtsc_lfence(); >> > } >> > >> > void >> >> >> We don't want to discard the skews, that's wrong. I'm sorry for the confusion. >> The reason it "fixes" Yasuoka's problem is because the real skews >> on the ESXi VMs in question are probably close to zero but our >> synchronization algorithm is picking huge (wrong) skews due to >> some other variable interfering with our measurement. > > Right. If a VM exit happens while we're doing our measurement, you'll > see a significant delay. And a guest OS can't prevent those from > happening. But even on real hardware SMM mode may interfere with our > measurement. For machines like the ESXi VMs, the measurement seems to have to exclude such delayed values as outliers. I think taking a lot of samples and choice the minimum is a good enough way for the purpose. I updated the diff. - delete lines for debug - make tsc quality lower if skew is not good enough - reduce difference from NetBSD comment? ok? 
Index: sys/arch/amd64//amd64/tsc.c === RCS file: /disk/cvs/openbsd/src/sys/arch/amd64/amd64/tsc.c,v retrieving revision 1.23 diff -u -p -r1.23 tsc.c --- sys/arch/amd64//amd64/tsc.c 23 Feb 2021 04:44:30 - 1.23 +++ sys/arch/amd64//amd64/tsc.c 29 Mar 2021 04:18:31 - @@ -38,6 +38,7 @@ int tsc_is_invariant; #defineTSC_DRIFT_MAX 250 #define TSC_SKEW_MAX 100 +#defineTSC_SYNC_ROUNDS 1000 int64_ttsc_drift_observed; volatile int64_t tsc_sync_val; @@ -235,6 +236,7 @@ tsc_timecounter_init(struct cpu_info *ci printf("%s: disabling user TSC (skew=%lld)\n", ci->ci_dev->dv_xname, (long long)ci->ci_tsc_skew); tsc_timecounter.tc_user = 0; + tsc_timecounter.tc_quality = -1000; } if (!(ci->ci_flags & CPUF_PRIMARY) || @@ -314,13 +316,19 @@ tsc_read_bp(struct cpu_info *ci, uint64_ void tsc_sync_bp(struct cpu_info *ci) { + int i, val, diff; uint64_t bptsc, aptsc; - tsc_read_bp(ci, , ); /* discarded - cache effects */ - tsc_read_bp(ci, , ); + val = INT_MAX; + for (i = 0; i < TSC_SYNC_ROUNDS; i++) { + tsc_read_bp(ci, , ); + diff = bptsc - aptsc; + if (abs(diff) < abs(val)) + val = diff; + } /* Compute final value to adjust for skew. */ - ci->ci_tsc_skew = bptsc - aptsc; + ci->ci_tsc_skew = val; } /* @@ -351,8 +359,10 @@ tsc_post_ap(struct cpu_info *ci) void tsc_sync_ap(struct cpu_info *ci) { - tsc_post_ap(ci); - tsc_post_ap(ci); + int i; + + for (i = 0; i < TSC_SYNC_ROUNDS; i++) + tsc_post_ap(ci); } void
Re: fyi: get HP EliteBook 830 G7/G8 booting
On Fri, 26 Mar 2021 12:12:44 +0100 (CET) Mark Kettenis wrote: >> Date: Fri, 26 Mar 2021 19:43:23 +0900 (JST) >> From: YASUOKA Masahiko >> >> Hi, >> >> On Fri, 26 Mar 2021 09:30:43 +0100 >> Jan Klemkow wrote: >> > If you want to boot OpenBSD on an HP EliteBook 830 G7/G8, the bootloader >> > will hang while loading the kernel. Because, the UEFI loads the >> > bootloader on the same place in memory, where the bootloader will copy >> > the kernel. We are unable to load the kernel on arbitrary memory. >> > Thus, the following diff will help you, to get OpenBSD running on these >> > machines. It moves the hardcoded Kernel address to a free place. >> >> The openbsd efiboot copies the kernel to that place after >> ExitBootServices(). >> >> sys/arch/amd64/stand/efiboot/exec_i386.c >> 152 /* >> 153 * Move the loaded kernel image to the usual place after >> calling >> 154 * ExitBootServices(). >> 155 */ >> 156 #ifdef __amd64__ >> 157 protect_writeable(marks[MARK_START] + delta, >> 158 marks[MARK_END] - marks[MARK_START]); >> 159 #endif >> 160 memmove((void *)marks[MARK_START] + delta, (void >> *)marks[MARK_START], >> 161 marks[MARK_END] - marks[MARK_START]); >> 162 for (i = 0; i < MARK_MAX; i++) >> 163 marks[i] += delta; >> 164 >> 165 #ifdef __amd64__ >> 166 (*run_i386)((u_long)run_i386, entry, howto, bootdev, >> BOOTARG_APIVER, >> 167 marks[MARK_END], extmem, cnvmem, ac, (intptr_t)av); >> >> >> I think it should work without the ld.script change.. > > The (likely) problem is that the memmove() on line 160 is overwriting > the bootloader code itself. > > There are essentially two ways to fix this: > > 1. Have the bootloader relocate itself to an address that doesn't >conflict with the kernel to be loaded. > > 2. Make it possible for the kernel to be loaded at a (somewhat) >arbitrary physical address. > > In my view #2 is the way forward. 
There are other reasons why that > would be beneficial as it would make it less predictable at which > physical address the kernel code lives which could prevent some > attacks that use the direct map. > > #2 is also the approach taken by the EFIBOOT on armv7 and arm64. On > arm64 for example, EFIBOOT loads the kernel into a 64MB memory block > that is aligned on a 2MB boundary. The kernel then figures out its > load address based on that and and patches things up accordingly. In this senario, what efiboot should do is just jumping "start64" (entry point for 64bit) of the kernel, and other things are done after the start64? > mlarkin@ was doing some work to change how we load the amd64 kernel. > His approach was to let the bootloader build the initial page tables > and jump into the kernel in 64-bit mode with the MMU enabled. That > was more focussed on running the kernel at a randomized virtual > address. But it should be fairly easy to make it run at a different > physical address as well this way. Unfortunately that effort was > mostly focussed on the legacy bootloader.
Re: fyi: get HP EliteBook 830 G7/G8 booting
Hi, On Fri, 26 Mar 2021 09:30:43 +0100 Jan Klemkow wrote: > If you want to boot OpenBSD on an HP EliteBook 830 G7/G8, the bootloader > will hang while loading the kernel. Because, the UEFI loads the > bootloader on the same place in memory, where the bootloader will copy > the kernel. We are unable to load the kernel on arbitrary memory. > Thus, the following diff will help you, to get OpenBSD running on these > machines. It moves the hardcoded Kernel address to a free place. The openbsd efiboot copies the kernel to that place after ExitBootServices(). sys/arch/amd64/stand/efiboot/exec_i386.c 152 /* 153 * Move the loaded kernel image to the usual place after calling 154 * ExitBootServices(). 155 */ 156 #ifdef __amd64__ 157 protect_writeable(marks[MARK_START] + delta, 158 marks[MARK_END] - marks[MARK_START]); 159 #endif 160 memmove((void *)marks[MARK_START] + delta, (void *)marks[MARK_START], 161 marks[MARK_END] - marks[MARK_START]); 162 for (i = 0; i < MARK_MAX; i++) 163 marks[i] += delta; 164 165 #ifdef __amd64__ 166 (*run_i386)((u_long)run_i386, entry, howto, bootdev, BOOTARG_APIVER, 167 marks[MARK_END], extmem, cnvmem, ac, (intptr_t)av); I think it should work without the ld.script change..
Re: monotonic time going back by wrong skews
Hi, > Second, why is taking the minimum value the optimal choice? I would > assume an average would be better. Basically if you have a sequency > like 900, 900, 900, 900, 0, 900, 900, 900 you pick 0 which could lead > to some problems, right? Or am I missing something?" Skews on VMware >> -8445 -6643 -52183 0-3-4-7 -11-5 0 >>-11-9-5-3-4-3-7 8-5-6 >> -5-9-3-9-7-1-5-5-9-2 >> -6-4-6-4 -11-8-3-4-8-1 >> -9-1-8 1-8 6-5-4 2-2 >> -8-3-1-5-2-2 1 2-2-9 >>-12 0-9-2-2-5-2 1 2 0 The first 3 seem to be strange. Also there is such a value in the middle of the sampling. >> 9-1 -10 50505-1 2 6 -11 2-2 I suppose such values should be excluded. Also I did the same test on my VAIO. It seems more constant than VMware. The full result is attached at the end. Is it possible that the calculation code is being affected by the CPU scheduler of its hypervisor? Thanks, On Wed, 24 Mar 2021 13:04:32 +0200 Paul Irofti wrote: > Hi, > > Thank you for taking this to tech@ as requested! > > I will reproduce here what I replied to Yasouka and Scott (which I > think proposed taking the minimum skew value) in private. > > "First, thank you very much for the in-depth analysis. I would suggest > you take this to a public forum like tech@ so that we can keep the > discussion opened and civilized. > > I remember when I wrote the CPU synchronization code, that I tried > doing sampling but it had some issues that now I don't remember of. So > let us try this on real hardware too. This is another argument for > moving this to tech@. > > Second, why is taking the minimum value the optimal choice? I would > assume an average would be better. Basically if you have a sequency > like 900, 900, 900, 900, 0, 900, 900, 900 you pick 0 which could lead > to some problems, right? Or am I missing something?" > > So could people give the minimum skew approach a spin on real machines > to see if there are any issues popping up?
> > All the best, > Paul > > On 3/24/21 10:40 AM, YASUOKA Masahiko wrote: >> Hi, >> I hit a problem which is caused by going back of monotonic time. It >> happens on hosts on VMware ESXi. >> I wrote the program which repeats the problem. >> % cc -o monotime monotime.c -lpthread >> % ./monotime >> 194964 Starting >> 562210 Starting >> 483046 Starting >> 148865 Starting >> 148865 Back 991.808048665 => 991.007447931 >> 562210 Back 991.808048885 => 991.007448224 >> 483046 Back 991.808049115 => 991.007449172 >> 148865 Stopped >> 562210 Stopped >> 483046 Stopped >> 194964 Stopped >> % uname -a >> OpenBSD yasuoka-ob-c.tokyo.iiji.jp 6.8 GENERIC.MP#5 amd64 >> % sysctl kern.version >> kern.version=OpenBSD 6.8 (GENERIC.MP) #5: Mon Feb 22 04:36:10 MST 2021 >> >> r...@syspatch-68-amd64.openbsd.org:/usr/src/sys/arch/amd64/compile/GENERIC.MP >> % >> monotime.c >> >> #include >> #include >> #include >> #include >> #include >> #include >> #include >> #define NTHREAD 4 >> #define NTRY 5 >> void * >> start(void *dummy) >> { >> int i; >> struct timespec ts0, ts1; >> printf("%d Starting\n", (int)getthrid()); >> clock_gettime(CLOCK_MONOTONIC, ); >> for (i = 0; i < NTRY; i++) { >> clock_gettime(CLOCK_MONOTONIC, ); >> if (timespeccmp(, , <=)) { >> ts0 = ts1; >> continue; >> } >> printf("%d Back %lld.%09lu => %lld.%09lu\n", >> (int)getthrid(), ts0.tv_sec, ts0.tv_nsec, ts1.tv_sec, >> ts1.tv_nsec); >> break; >> } >> printf("%d Stopped\n", (int)getthrid()); >> return (NULL); >> } >> int >> main(int argc, char *argv[]) >> { >> int i, n = NTHREAD; >> pthread_t *threads; >> threads = calloc(n, sizeof(pthread_t)); >> for (i = 0; i < n; i++) >> pthread_create([i], NULL, start, NULL); >> for (i = 0; i < n; i++) >> pthread_join(threads[i], NULL); >> } >> >> The machine has 4 vCPUs and showing the following message on boot. >>cpu1: disabling user TSC
monotonic time going back by wrong skews
Hi, I hit a problem which is caused by going back of monotonic time. It happens on hosts on VMware ESXi. I wrote the program which repeats the problem. % cc -o monotime monotime.c -lpthread % ./monotime 194964 Starting 562210 Starting 483046 Starting 148865 Starting 148865 Back 991.808048665 => 991.007447931 562210 Back 991.808048885 => 991.007448224 483046 Back 991.808049115 => 991.007449172 148865 Stopped 562210 Stopped 483046 Stopped 194964 Stopped % uname -a OpenBSD yasuoka-ob-c.tokyo.iiji.jp 6.8 GENERIC.MP#5 amd64 % sysctl kern.version kern.version=OpenBSD 6.8 (GENERIC.MP) #5: Mon Feb 22 04:36:10 MST 2021 r...@syspatch-68-amd64.openbsd.org:/usr/src/sys/arch/amd64/compile/GENERIC.MP % monotime.c #include #include #include #include #include #include #include #define NTHREAD 4 #define NTRY5 void * start(void *dummy) { int i; struct timespec ts0, ts1; printf("%d Starting\n", (int)getthrid()); clock_gettime(CLOCK_MONOTONIC, ); for (i = 0; i < NTRY; i++) { clock_gettime(CLOCK_MONOTONIC, ); if (timespeccmp(, , <=)) { ts0 = ts1; continue; } printf("%d Back %lld.%09lu => %lld.%09lu\n", (int)getthrid(), ts0.tv_sec, ts0.tv_nsec, ts1.tv_sec, ts1.tv_nsec); break; } printf("%d Stopped\n", (int)getthrid()); return (NULL); } int main(int argc, char *argv[]) { int i, n = NTHREAD; pthread_t *threads; threads = calloc(n, sizeof(pthread_t)); for (i = 0; i < n; i++) pthread_create([i], NULL, start, NULL); for (i = 0; i < n; i++) pthread_join(threads[i], NULL); } The machine has 4 vCPUs and showing the following message on boot. cpu1: disabling user TSC (skew=-5310) cpu2: disabling user TSC (skew=-5335) cpu3: disabling user TSC (skew=-7386) This means "user TSC" is disabled because of TSC of cpu{1,2,3} is much delayed against cpu0. Simply ignoring the skews by the following diff seems to workaround this problem. 
diff --git a/sys/arch/amd64/amd64/tsc.c b/sys/arch/amd64/amd64/tsc.c index 238a5a068e1..3b951a8b5a3 100644 --- a/sys/arch/amd64/amd64/tsc.c +++ b/sys/arch/amd64/amd64/tsc.c @@ -212,7 +212,8 @@ cpu_recalibrate_tsc(struct timecounter *tc) u_int tsc_get_timecount(struct timecounter *tc) { - return rdtsc_lfence() + curcpu()->ci_tsc_skew; + //return rdtsc_lfence() + curcpu()->ci_tsc_skew; + return rdtsc_lfence(); } void So I supposed the skews are not calculated properly. Also I found NetBSD changed the skew calculating so that it checks 1000 times and take the minimum value. https://github.com/NetBSD/src/commit/1dec05c1ae197b4acfc7038e49dfddabcbed0dff https://github.com/NetBSD/src/commit/66d76b89792bac1c71cd5507ba62b08ad02129ef I checked skews on the machine by the following debug code. diff --git a/sys/arch/amd64/amd64/tsc.c b/sys/arch/amd64/amd64/tsc.c index 238a5a068e1..83e835e4f82 100644 --- a/sys/arch/amd64/amd64/tsc.c +++ b/sys/arch/amd64/amd64/tsc.c @@ -302,16 +302,42 @@ tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp) *aptscp = tsc_sync_val; } +#defineTSC_SYNC_NTIMES 1000 + +static int tsc_difs[MAXCPUS][TSC_SYNC_NTIMES]; + +void +tsc_debug(void) +{ + int i, cpuid = curcpu()->ci_cpuid; + + for (i = 0; i < TSC_SYNC_NTIMES; i++) { + if (i % 10 == 0) + printf("%5d", tsc_difs[cpuid][i]); + else + printf(" %5d", tsc_difs[cpuid][i]); + if (i % 10 == 9) + printf("\n"); + } + printf("\n"); +} + void tsc_sync_bp(struct cpu_info *ci) { + int i, mindif = INT_MAX, dif; uint64_t bptsc, aptsc; - tsc_read_bp(ci, , ); /* discarded - cache effects */ - tsc_read_bp(ci, , ); + for (i = 0; i < TSC_SYNC_NTIMES; i++) { + tsc_read_bp(ci, , ); + dif = bptsc - aptsc; + if (abs(dif) < abs(mindif)) + mindif = dif; + tsc_difs[ci->ci_cpuid][i] = dif; + } /* Compute final value to adjust for skew. 
*/ - ci->ci_tsc_skew = bptsc - aptsc; + ci->ci_tsc_skew = mindif; } /* @@ -342,8 +368,10 @@ tsc_post_ap(struct cpu_info *ci) void tsc_sync_ap(struct cpu_info *ci) { - tsc_post_ap(ci); - tsc_post_ap(ci); + int i; + + for (i = 0; i < TSC_SYNC_NTIMES; i++) + tsc_post_ap(ci); } void Stopped at db_enter+0x10: popq%rbp ddb{0}> machine ddbcpu 1 Stopped at x86_ipi_db+0x12:leave ddb{1}> call tsc_debug -8445 -6643 -52183 0-3-4-7 -11-5 0 -11-9-5-3-4-3-7 8-5-6 -5-9-3-9-7-1-5
Re: diff: efiboot: alignment for media which has IoAlign > 1
On Wed, 10 Mar 2021 13:15:58 +0100 (CET) Mark Kettenis wrote: >> On Wed, 10 Mar 2021 20:35:41 +0900 (JST) >> YASUOKA Masahiko wrote: >> > efiboot cannot load the kernel properly on some machines if booted >> > from CD-ROM. In that case boot fails with a message like follow: >> > >> >booting cd0a:. [359648read symbols: Unknown error: code 255 >> > >> > As far as Asou and my test, this happens on hosts on VMware ESXi 6.7, >> > 7.0 and asou's physical machine. >> > >> > The problem happens because efiboot calls ReadBlocks function with an >> > unaligned pointer for medias which requires an aligned pointer. When >> > efiboot loads a kernel, the pointer becomes unaligned since there is >> > an ELF section located at unaligned place in CD-ROM. Previously our >> > kernel didn't have such a section but it does after switching lld as >> > the default linker. >> > >> > For test, let me show sample commands which creates a bootable cdrom >> > image for EFI: >> > >> > mkdir -p efiboot/EFI/BOOT >> > cp /usr/mdec/BOOTX64.EFI efiboot/EFI/BOOT >> > makefs -M 1m -m 1m -t msdos -o fat_type=12,sectors_per_cluster=1 \ >> > efiboot.img efiboot >> > mkdir -p cd-dir/etc >> > cp bsd.rd cd-dir/ >> > echo "set image bsd.rd" > cd-dir/etc/boot.conf >> > makefs -t cd9660 -o >> > 'rockridge,bootimage=i386;/usr/mdec/cdbr,no-emul-boot,allow-multidot,bootimage=efi;efiboot.img,no-emul-boot' >> > \ >> >boot.iso cd-dir >> > >> > the diff is to fix the problem. >> > >> > ok? > > Maybe it is better to always bounce through an aligned buffer? That > would make the code a little bit slower but a lot simpler. And the > overhead of doing the copy should be small compared to the actual I/O. Indeed. It became much simpler. As I tested on ESXi 7.0, vaio, and qemu, I don't feel significant performance regression. ok? 
Index: sys/arch/amd64/stand/efiboot/efidev.c === RCS file: /var/cvs/openbsd/src/sys/arch/amd64/stand/efiboot/efidev.c,v retrieving revision 1.32 diff -u -p -r1.32 efidev.c --- sys/arch/amd64/stand/efiboot/efidev.c 9 Dec 2020 18:10:18 - 1.32 +++ sys/arch/amd64/stand/efiboot/efidev.c 11 Mar 2021 05:59:41 - @@ -84,10 +84,10 @@ efid_init(struct diskinfo *dip, void *ha static EFI_STATUS efid_io(int rw, efi_diskinfo_t ed, u_int off, int nsect, void *buf) { - u_intblks, lba, i_lblks, i_tblks, i_nblks; + u_intblks, start, end; EFI_STATUS status = EFI_SUCCESS; - static u_char *iblk = NULL; - static u_int iblksz = 0; + static u_char *ibuf = NULL; + static u_int ibufsz = 0; /* block count of the intrisic block size in DEV_BSIZE */ blks = EFI_BLKSPERSEC(ed); @@ -95,90 +95,46 @@ efid_io(int rw, efi_diskinfo_t ed, u_int /* block size < 512. HP Stream 13 actually has such a disk. */ return (EFI_UNSUPPORTED); - /* leading and trailing unaligned blocks in intrisic block */ - i_lblks = ((off % blks) == 0)? 0 : blks - (off % blks); - i_tblks = (nsect > i_lblks)? (off + nsect) % blks : 0; - - /* aligned blocks in intrisic block */ - i_nblks = (nsect > i_lblks + i_tblks)? 
nsect - (i_lblks + i_tblks) : 0; - - lba = (off + i_lblks) / blks; - - /* allocate the space for reading unaligned blocks */ - if (ed->blkio->Media->BlockSize != DEV_BSIZE) { - if (iblk && iblksz < ed->blkio->Media->BlockSize) { - free(iblk, iblksz); - iblk = NULL; - } - if (iblk == NULL) { - iblk = alloc(ed->blkio->Media->BlockSize); - iblksz = ed->blkio->Media->BlockSize; - } + start = off / blks; + end = (off + nsect + blks - 1) / blks; + /* +* Prepare a buffer to use an aligned memory always that might be +* required by some medias +*/ + if (ibuf && ibufsz < (end - start) * ed->blkio->Media->BlockSize) { + free(ibuf, ibufsz); + ibuf = NULL; + } + if (ibuf == NULL) { + ibufsz = (end - start) * ed->blkio->Media->BlockSize; + ibuf = alloc(ibufsz); } + switch (rw) { case F_READ: - if (i_lblks > 0) { - status = EFI_CALL(ed->blkio->ReadBlocks, - ed->blkio
Re: diff: efiboot: alignment for media which has IoAlign > 1
Sorry for making noise, let me update the diff. > + if (ed->blkio->Media->IoAlign > 1 && > + ((UINTN)buf + i_lblks * DEV_BSIZE) > + % ed->blkio->Media->IoAlign == 0) first condition was reversed.. On Wed, 10 Mar 2021 20:35:41 +0900 (JST) YASUOKA Masahiko wrote: > efiboot cannot load the kernel properly on some machines if booted > from CD-ROM. In that case boot fails with a message like follow: > >booting cd0a:. [359648read symbols: Unknown error: code 255 > > As far as Asou and my test, this happens on hosts on VMware ESXi 6.7, > 7.0 and asou's physical machine. > > The problem happens because efiboot calls ReadBlocks function with an > unaligned pointer for medias which requires an aligned pointer. When > efiboot loads a kernel, the pointer becomes unaligned since there is > an ELF section located at unaligned place in CD-ROM. Previously our > kernel didn't have such a section but it does after switching lld as > the default linker. > > For test, let me show sample commands which creates a bootable cdrom > image for EFI: > > mkdir -p efiboot/EFI/BOOT > cp /usr/mdec/BOOTX64.EFI efiboot/EFI/BOOT > makefs -M 1m -m 1m -t msdos -o fat_type=12,sectors_per_cluster=1 \ > efiboot.img efiboot > mkdir -p cd-dir/etc > cp bsd.rd cd-dir/ > echo "set image bsd.rd" > cd-dir/etc/boot.conf > makefs -t cd9660 -o > 'rockridge,bootimage=i386;/usr/mdec/cdbr,no-emul-boot,allow-multidot,bootimage=efi;efiboot.img,no-emul-boot' > \ > boot.iso cd-dir > > the diff is to fix the problem. > > ok? 
Index: sys/arch/amd64/stand/efiboot/efidev.c === RCS file: /disk/cvs/openbsd/src/sys/arch/amd64/stand/efiboot/efidev.c,v retrieving revision 1.32 diff -u -p -r1.32 efidev.c --- sys/arch/amd64/stand/efiboot/efidev.c 9 Dec 2020 18:10:18 - 1.32 +++ sys/arch/amd64/stand/efiboot/efidev.c 10 Mar 2021 11:41:39 - @@ -84,7 +84,7 @@ efid_init(struct diskinfo *dip, void *ha static EFI_STATUS efid_io(int rw, efi_diskinfo_t ed, u_int off, int nsect, void *buf) { - u_intblks, lba, i_lblks, i_tblks, i_nblks; + u_inti, blks, lba, i_lblks, i_tblks, i_nblks; EFI_STATUS status = EFI_SUCCESS; static u_char *iblk = NULL; static u_int iblksz = 0; @@ -127,10 +127,29 @@ efid_io(int rw, efi_diskinfo_t ed, u_int min(nsect, i_lblks) * DEV_BSIZE); } if (i_nblks > 0) { - status = EFI_CALL(ed->blkio->ReadBlocks, - ed->blkio, ed->mediaid, lba, - ed->blkio->Media->BlockSize * (i_nblks / blks), - buf + (i_lblks * DEV_BSIZE)); + /* +* Pass the buffer directly to the EFI function only if +* the buffer is properly aligned as the media requires +*/ + if (ed->blkio->Media->IoAlign <= 1 || + ((UINTN)buf + i_lblks * DEV_BSIZE) + % ed->blkio->Media->IoAlign == 0) + status = EFI_CALL(ed->blkio->ReadBlocks, + ed->blkio, ed->mediaid, lba, + ed->blkio->Media->BlockSize * (i_nblks / + blks), buf + i_lblks * DEV_BSIZE); + else { + for (i = 0; i < i_nblks; i += blks) { + status = EFI_CALL(ed->blkio->ReadBlocks, + ed->blkio, ed->mediaid, + lba + i / blks, + ed->blkio->Media->BlockSize, iblk); + if (EFI_ERROR(status)) + break; + memcpy(buf + i * DEV_BSIZE, iblk, + ed->blkio->Media->BlockSize); + } + } if (EFI_ERROR(status)) goto on_eio; } @@ -160,10 +179,30 @@ efid_io(int rw, efi_diskinfo_t ed, u_int ed->blkio->Media->BlockSize, iblk); } if (i_nblks > 0) { - status = EFI_CALL(ed->blkio->WriteBlocks, - ed->blkio, ed->mediaid, lba, -
diff: efiboot: alignment for media which has IoAlign > 1
Hi, efiboot cannot load the kernel properly on some machines if booted from CD-ROM. In that case boot fails with a message like the following: booting cd0a:. [359648read symbols: Unknown error: code 255 As far as Asou and I have tested, this happens on hosts on VMware ESXi 6.7, 7.0 and Asou's physical machine. The problem happens because efiboot calls the ReadBlocks function with an unaligned pointer for media which require an aligned pointer. When efiboot loads a kernel, the pointer becomes unaligned since there is an ELF section located at an unaligned place on the CD-ROM. Previously our kernel didn't have such a section but it does after switching to lld as the default linker. For testing, here are sample commands which create a bootable cdrom image for EFI: mkdir -p efiboot/EFI/BOOT cp /usr/mdec/BOOTX64.EFI efiboot/EFI/BOOT makefs -M 1m -m 1m -t msdos -o fat_type=12,sectors_per_cluster=1 \ efiboot.img efiboot mkdir -p cd-dir/etc cp bsd.rd cd-dir/ echo "set image bsd.rd" > cd-dir/etc/boot.conf makefs -t cd9660 -o 'rockridge,bootimage=i386;/usr/mdec/cdbr,no-emul-boot,allow-multidot,bootimage=efi;efiboot.img,no-emul-boot' \ boot.iso cd-dir The diff below fixes the problem. ok?
Index: sys/arch/amd64/stand/efiboot/efidev.c === RCS file: /disk/cvs/openbsd/src/sys/arch/amd64/stand/efiboot/efidev.c,v retrieving revision 1.32 diff -u -p -r1.32 efidev.c --- sys/arch/amd64/stand/efiboot/efidev.c 9 Dec 2020 18:10:18 - 1.32 +++ sys/arch/amd64/stand/efiboot/efidev.c 10 Mar 2021 10:58:35 - @@ -84,7 +84,7 @@ efid_init(struct diskinfo *dip, void *ha static EFI_STATUS efid_io(int rw, efi_diskinfo_t ed, u_int off, int nsect, void *buf) { - u_intblks, lba, i_lblks, i_tblks, i_nblks; + u_inti, blks, lba, i_lblks, i_tblks, i_nblks; EFI_STATUS status = EFI_SUCCESS; static u_char *iblk = NULL; static u_int iblksz = 0; @@ -127,10 +127,29 @@ efid_io(int rw, efi_diskinfo_t ed, u_int min(nsect, i_lblks) * DEV_BSIZE); } if (i_nblks > 0) { - status = EFI_CALL(ed->blkio->ReadBlocks, - ed->blkio, ed->mediaid, lba, - ed->blkio->Media->BlockSize * (i_nblks / blks), - buf + (i_lblks * DEV_BSIZE)); + /* +* Pass the buffer directly to the EFI function only if +* the buffer is properly aligned as the media requires +*/ + if (ed->blkio->Media->IoAlign > 1 && + ((UINTN)buf + i_lblks * DEV_BSIZE) + % ed->blkio->Media->IoAlign == 0) + status = EFI_CALL(ed->blkio->ReadBlocks, + ed->blkio, ed->mediaid, lba, + ed->blkio->Media->BlockSize * (i_nblks / + blks), buf + i_lblks * DEV_BSIZE); + else { + for (i = 0; i < i_nblks; i += blks) { + status = EFI_CALL(ed->blkio->ReadBlocks, + ed->blkio, ed->mediaid, + lba + i / blks, + ed->blkio->Media->BlockSize, iblk); + if (EFI_ERROR(status)) + break; + memcpy(buf + i * DEV_BSIZE, iblk, + ed->blkio->Media->BlockSize); + } + } if (EFI_ERROR(status)) goto on_eio; } @@ -160,10 +179,30 @@ efid_io(int rw, efi_diskinfo_t ed, u_int ed->blkio->Media->BlockSize, iblk); } if (i_nblks > 0) { - status = EFI_CALL(ed->blkio->WriteBlocks, - ed->blkio, ed->mediaid, lba, - ed->blkio->Media->BlockSize * (i_nblks / blks), - buf + (i_lblks * DEV_BSIZE)); + /* +* Pass the buffer directly to the EFI function only if +* the buffer is properly aligned as the 
media requires +*/ + if (ed->blkio->Media->IoAlign > 1 && + ((UINTN)buf + i_lblks * DEV_BSIZE) + % ed->blkio->Media->IoAlign == 0) + status = EFI_CALL(ed->blkio->WriteBlocks, + ed->blkio, ed->mediaid, lba, +
Re: 2 diffs for dev/acpi/dsdt.c
Hi, Let me update "diff #2". On Fri, 26 Feb 2021 13:42:32 +0900 (JST) YASUOKA Masahiko wrote: > My vaio repeatedly crashed by "Data modified on freelist"(*1) or other > memory corruptions. After my long time debug, I found the route cause > is a handling of references of LocalX, like the following: > > If ((SMRW (0x0B, 0x16, 0x21, RefOf (Local0)) == Zero)) > > In the called control method, "RefOf (Local1)" is referred as Arg3, is > stored a value like the following: > > Arg3 = \_SB.PCI0.LPCB.EC0.SMD0 > > In aml_store(), lvalue is reset if lvalue is a LocalX. But since that > was done before resolving the reference, lvalue was not reset if > lvalue is a reference of LocalX. > > diff #1 fixes that problem. It resets lvalue after resolving > references. > > ok? > > diff #2 adds aml_die() if any memory corruption occurs when creating > field in a buffer. This actually happens on my vaio (pro pk 14) if > diff #1 is not applied. > > ok? > > diff #1 > > Index: sys/dev/acpi/dsdt.c > === > RCS file: /var/cvs/openbsd/src/sys/dev/acpi/dsdt.c,v > retrieving revision 1.257 > diff -u -p -r1.257 dsdt.c > --- sys/dev/acpi/dsdt.c 17 Dec 2020 17:57:19 - 1.257 > +++ sys/dev/acpi/dsdt.c 26 Feb 2021 04:12:03 - > @@ -2961,11 +2961,11 @@ aml_store(struct aml_scope *scope, struc > aml_rwfield(rhs, 0, rhs->v_field.bitlen, , ACPI_IOREAD); > rhs = > } > + > + lhs = aml_gettgt(lhs, AMLOP_STORE); > /* Store to LocalX: free value */ > if (lhs->stack >= AMLOP_LOCAL0 && lhs->stack <= AMLOP_LOCAL7) > aml_freevalue(lhs); > - > - lhs = aml_gettgt(lhs, AMLOP_STORE); > switch (lhs->type) { > case AML_OBJTYPE_UNINITIALIZED: > aml_copyvalue(lhs, rhs); > > diff #2 > > Index: sys/dev/acpi/dsdt.c > === > RCS file: /var/cvs/openbsd/src/sys/dev/acpi/dsdt.c,v > retrieving revision 1.257 > diff -u -p -r1.257 dsdt.c > --- sys/dev/acpi/dsdt.c 17 Dec 2020 17:57:19 - 1.257 > +++ sys/dev/acpi/dsdt.c 26 Feb 2021 04:33:21 - > @@ -2742,11 +2742,17 @@ aml_rwfield(struct aml_value *fld, int b > } else if (mode == 
ACPI_IOREAD) { > /* bufferfield:read */ > _aml_setvalue(val, AML_OBJTYPE_INTEGER, 0, 0); > + if (ref1->length < aml_bytepos(fld->v_field.bitpos) + > + aml_bytelen(fld->v_field.bitlen)) > + aml_die("bufferfield:read out of range"); > aml_bufcpy(>v_integer, 0, ref1->v_buffer, > fld->v_field.bitpos, fld->v_field.bitlen); > } else { > /* bufferfield:write */ > val = aml_convert(val, AML_OBJTYPE_INTEGER, -1); > + if (ref1->length < aml_bytepos(fld->v_field.bitpos) + > + aml_bytelen(fld->v_field.bitlen)) > + aml_die("bufferfield:write out of range"); > aml_bufcpy(ref1->v_buffer, fld->v_field.bitpos, >v_integer, > 0, fld->v_field.bitlen); > aml_delref(, "wrbuffld"); It's better to die when creating a field which refers out of range memory. ok? Index: sys/dev/acpi/dsdt.c === RCS file: /disk/cvs/openbsd/src/sys/dev/acpi/dsdt.c,v retrieving revision 1.257 diff -u -p -r1.257 dsdt.c --- sys/dev/acpi/dsdt.c 17 Dec 2020 17:57:19 - 1.257 +++ sys/dev/acpi/dsdt.c 27 Feb 2021 09:58:31 - @@ -2790,6 +2790,11 @@ aml_createfield(struct aml_value *field, data->type != AML_OBJTYPE_BUFFER) data = aml_convert(data, AML_OBJTYPE_BUFFER, -1); + if (field->type == AML_OBJTYPE_BUFFERFIELD && + data->length < aml_bytepos(bpos) + aml_bytelen(blen)) + aml_die("%s(%s) out of range\n", aml_mnem(opcode, 0), + aml_nodename(field->node)); + field->v_field.type = opcode; field->v_field.bitpos = bpos; field->v_field.bitlen = blen;
2 diffs for dev/acpi/dsdt.c
Hi, My vaio repeatedly crashed with "Data modified on freelist"(*1) or other memory corruption. After a long debugging session, I found the root cause is the handling of references to LocalX, like the following: If ((SMRW (0x0B, 0x16, 0x21, RefOf (Local0)) == Zero)) In the called control method, "RefOf (Local1)" is referred to as Arg3, and a value is stored to it like the following: Arg3 = \_SB.PCI0.LPCB.EC0.SMD0 In aml_store(), lvalue is reset if lvalue is a LocalX. But since that was done before resolving the reference, lvalue was not reset if lvalue is a reference to a LocalX. diff #1 fixes that problem. It resets lvalue after resolving references. ok? diff #2 adds aml_die() if any memory corruption occurs when creating a field in a buffer. This actually happens on my vaio (pro pk 14) if diff #1 is not applied. ok? diff #1 Index: sys/dev/acpi/dsdt.c === RCS file: /var/cvs/openbsd/src/sys/dev/acpi/dsdt.c,v retrieving revision 1.257 diff -u -p -r1.257 dsdt.c --- sys/dev/acpi/dsdt.c 17 Dec 2020 17:57:19 - 1.257 +++ sys/dev/acpi/dsdt.c 26 Feb 2021 04:12:03 - @@ -2961,11 +2961,11 @@ aml_store(struct aml_scope *scope, struc aml_rwfield(rhs, 0, rhs->v_field.bitlen, , ACPI_IOREAD); rhs = } + + lhs = aml_gettgt(lhs, AMLOP_STORE); /* Store to LocalX: free value */ if (lhs->stack >= AMLOP_LOCAL0 && lhs->stack <= AMLOP_LOCAL7) aml_freevalue(lhs); - - lhs = aml_gettgt(lhs, AMLOP_STORE); switch (lhs->type) { case AML_OBJTYPE_UNINITIALIZED: aml_copyvalue(lhs, rhs); diff #2 Index: sys/dev/acpi/dsdt.c === RCS file: /var/cvs/openbsd/src/sys/dev/acpi/dsdt.c,v retrieving revision 1.257 diff -u -p -r1.257 dsdt.c --- sys/dev/acpi/dsdt.c 17 Dec 2020 17:57:19 - 1.257 +++ sys/dev/acpi/dsdt.c 26 Feb 2021 04:33:21 - @@ -2742,11 +2742,17 @@ aml_rwfield(struct aml_value *fld, int b } else if (mode == ACPI_IOREAD) { /* bufferfield:read */ _aml_setvalue(val, AML_OBJTYPE_INTEGER, 0, 0); + if (ref1->length < aml_bytepos(fld->v_field.bitpos) + + aml_bytelen(fld->v_field.bitlen)) + aml_die("bufferfield:read out of
range"); aml_bufcpy(>v_integer, 0, ref1->v_buffer, fld->v_field.bitpos, fld->v_field.bitlen); } else { /* bufferfield:write */ val = aml_convert(val, AML_OBJTYPE_INTEGER, -1); + if (ref1->length < aml_bytepos(fld->v_field.bitpos) + + aml_bytelen(fld->v_field.bitlen)) + aml_die("bufferfield:write out of range"); aml_bufcpy(ref1->v_buffer, fld->v_field.bitpos, >v_integer, 0, fld->v_field.bitlen); aml_delref(, "wrbuffld"); *1 example console log Data modified on freelist: word -35183627074926 of object 0x824a3060 size 0x10 previous type temp (invalid addr 0x8027023e55f0) uvm_fault(0x81f63958, 0x8027023e55f8, 0, 1) -> e kernel: page fault trap, code=0 Stopped at malloc+0x482: movq0x8(%r14),%rcx Running script... ddb{0}> malloc(10,91,5) at malloc+0x482 i915_gem_do_execbuffer(802ab078,80ee0c00,8000337a7970,820ca000,0) at i915_gem_do_execbuffer+0xa52 i915_gem_execbuffer2_ioctl(802ab078,8000337a7970,80ee0c00) at i915_gem_execbuffer2_ioctl+0x144 drmioctl(15700,80406469,8000337a7970,3,8000336a8798) at drmioctl+0xd8 VOP_IOCTL(fd8227abbeb0,80406469,8000337a7970,3,fd826bd1dd88,8000336a8798) at VOP_IOCTL+0x55 vn_ioctl(fd82282ee8e8,80406469,8000337a7970,8000336a8798) at vn_ioctl+0x64 sys_ioctl(8000336a8798,8000337a7a80,8000337a7ae0) at sys_ioctl+0x3c2 syscall(8000337a7b50) at syscall+0x389 Xsyscall(6,36,0,36,80406469,7f7f5c00) at Xsyscall+0x128 end of kernel end trace frame: 0x7f7f5bd0, count: -9
Re: pppac(4): remove `sc_dead' logic
ok yasuoka Thanks, On Tue, 9 Feb 2021 12:06:08 +0300 Vitaliy Makkoveev wrote: > `sc_dead' is used to prevent pppac_ioctl() be called on dying pppac(4) > interface. But now if_detach() makes dying `ifp' inaccessible and waits > for references which are in-use. This logic is not required anymore. > Also I moved if_detach() before klist_invalidate() to prevent the case > while pppac_qstart() bump `sc_rsel'. > > Index: sys/net/if_pppx.c > === > RCS file: /cvs/src/sys/net/if_pppx.c,v > retrieving revision 1.108 > diff -u -p -r1.108 if_pppx.c > --- sys/net/if_pppx.c 1 Feb 2021 07:46:55 - 1.108 > +++ sys/net/if_pppx.c 9 Feb 2021 09:05:23 - > @@ -930,7 +930,6 @@ RBT_GENERATE(pppx_ifs, pppx_if, pxi_entr > > struct pppac_softc { > struct ifnetsc_if; > - unsigned intsc_dead;/* [N] */ > dev_t sc_dev; /* [I] */ > LIST_ENTRY(pppac_softc) > sc_entry; /* [K] */ > @@ -1305,17 +1304,16 @@ pppacclose(dev_t dev, int flags, int mod > int s; > > NET_LOCK(); > - sc->sc_dead = 1; > CLR(ifp->if_flags, IFF_RUNNING); > NET_UNLOCK(); > > + if_detach(ifp); > + > s = splhigh(); > klist_invalidate(>sc_rsel.si_note); > klist_invalidate(>sc_wsel.si_note); > splx(s); > > - if_detach(ifp); > - > pool_put(_session_pool, sc->sc_multicast_session); > NET_LOCK(); > pipex_destroy_all_sessions(sc); > @@ -1330,12 +1328,8 @@ pppacclose(dev_t dev, int flags, int mod > static int > pppac_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) > { > - struct pppac_softc *sc = ifp->if_softc; > /* struct ifreq *ifr = (struct ifreq *)data; */ > int error = 0; > - > - if (sc->sc_dead) > - return (ENXIO); > > switch (cmd) { > case SIOCSIFADDR:
Re: npppd(8)/pppac(4): remove dummy TUNSIFMODE ioctl(2) call
Yes, ok yasuoka On Fri, 29 Jan 2021 14:32:39 +0300 Vitaliy Makkoveev wrote: > Since OpenBSD 6.7 npppd(8) can't work over tun(4) anymore. I propose to > remove dummy TUNSIFMODE ioctl(2) call. > > Index: sys/net/if_pppx.c > === > RCS file: /cvs/src/sys/net/if_pppx.c,v > retrieving revision 1.106 > diff -u -p -r1.106 if_pppx.c > --- sys/net/if_pppx.c 25 Dec 2020 12:59:53 - 1.106 > +++ sys/net/if_pppx.c 29 Jan 2021 11:10:40 - > @@ -920,12 +920,6 @@ pppx_if_ioctl(struct ifnet *ifp, u_long > RBT_GENERATE(pppx_ifs, pppx_if, pxi_entry, pppx_if_cmp); > > /* > - * pppac(4) - PPP Access Concentrator interface > - */ > - > -#include > - > -/* > * Locks used to protect struct members and global data > * I immutable after creation > * K kernel lock > @@ -1188,9 +1182,6 @@ pppacioctl(dev_t dev, u_long cmd, caddr_ > > NET_LOCK(); > switch (cmd) { > - case TUNSIFMODE: /* make npppd happy */ > - break; > - > case FIONBIO: > break; > case FIONREAD: > Index: usr.sbin/npppd/npppd/npppd_iface.c > === > RCS file: /cvs/src/usr.sbin/npppd/npppd/npppd_iface.c,v > retrieving revision 1.14 > diff -u -p -r1.14 npppd_iface.c > --- usr.sbin/npppd/npppd/npppd_iface.c2 Jan 2021 13:15:15 - > 1.14 > +++ usr.sbin/npppd/npppd/npppd_iface.c29 Jan 2021 11:10:41 - > @@ -275,7 +275,6 @@ npppd_iface_reinit(npppd_iface *_this, s > int > npppd_iface_start(npppd_iface *_this) > { > - int x; > charbuf[PATH_MAX]; > > NPPPD_IFACE_ASSERT(_this != NULL); > @@ -285,16 +284,6 @@ npppd_iface_start(npppd_iface *_this) > if ((_this->devf = priv_open(buf, O_RDWR | O_NONBLOCK)) < 0) { > npppd_iface_log(_this, LOG_ERR, "open(%s) failed: %m", buf); > goto fail; > - } > - > - if (_this->using_pppx == 0) { > - x = IFF_BROADCAST; > - if (ioctl(_this->devf, TUNSIFMODE, ) != 0) { > - npppd_iface_log(_this, LOG_ERR, > - "ioctl(TUNSIFMODE=IFF_BROADCAST) failed " > - "in %s(): %m", __func__); > - goto fail; > - } > } > > event_set(&_this->ev, _this->devf, EV_READ | EV_PERSIST, >
Re: Wireguard: can't remove multiple peers at once.
Hi, On Thu, 14 Jan 2021 08:54:36 +0900 Yuichiro NAITO wrote: > Could anybody please review my code? > > Yasuoka-san is a colleague of mine at work. > So, he is interested in this topic. That’s why I CCed him on this mail. > I don’t mean he is a reviewer. > >> 2021/01/12 11:27、Yuichiro NAITO のメール: >> I have set up multiple peers on a wg0 interface, >> and tried to remove more than one peer at once. >> Ifconfig(1) only removes the first peer. >> >> The command line was like the following. >> >> ``` >> # ifconfig wg0 -wgpeer -wgpeer -wgpeer >> ``` >> >> Only the first peer was removed. >> >> I think the next peer pointer isn't calculated when removing a peer >> in sys/net/if_wg.c: wg_ioctl_set() function. >> >> I have tried the following patch, which fixes this problem. Yes, the diff seems good. I made the following whitespace change. > @@ -2333,6 +2333,11 @@ wg_ioctl_set(struct wg_softc *sc, struct wg_data_io > *data) > } > > peer_p = (struct wg_peer_io *)aip_p; > + continue; > + next_peer: > + aip_p = _p->p_aips[0]; > + aip_p += peer_o.p_aips_count; > + peer_p = (struct wg_peer_io *)aip_p; > } > > error: It seems we prefer putting goto labels at the beginning of the line. ok? Fix wg(4) ioctl to be able to handle multiple wgpeers. Diff from Yuichiro NAITO.
Index: sys/net/if_wg.c === RCS file: /cvs/src/sys/net/if_wg.c,v retrieving revision 1.14 diff -u -p -r1.14 if_wg.c --- sys/net/if_wg.c 1 Sep 2020 19:06:59 - 1.14 +++ sys/net/if_wg.c 14 Jan 2021 07:26:48 - @@ -2270,7 +2270,7 @@ wg_ioctl_set(struct wg_softc *sc, struct /* Peer must have public key */ if (!(peer_o.p_flags & WG_PEER_HAS_PUBLIC)) - continue; + goto next_peer; /* 0 = latest protocol, 1 = this protocol */ if (peer_o.p_protocol_version != 0) { @@ -2283,7 +2283,7 @@ wg_ioctl_set(struct wg_softc *sc, struct /* Get local public and check that peer key doesn't match */ if (noise_local_keys(>sc_local, public, NULL) == 0 && bcmp(public, peer_o.p_public, WG_KEY_SIZE) == 0) - continue; + goto next_peer; /* Lookup peer, or create if it doesn't exist */ if ((peer = wg_peer_lookup(sc, peer_o.p_public)) == NULL) { @@ -2291,7 +2291,7 @@ wg_ioctl_set(struct wg_softc *sc, struct * Also, don't create a new one if we only want to * update. */ if (peer_o.p_flags & (WG_PEER_REMOVE|WG_PEER_UPDATE)) - continue; + goto next_peer; if ((peer = wg_peer_create(sc, peer_o.p_public)) == NULL) { @@ -2303,7 +2303,7 @@ wg_ioctl_set(struct wg_softc *sc, struct /* Remove peer and continue if specified */ if (peer_o.p_flags & WG_PEER_REMOVE) { wg_peer_destroy(peer); - continue; + goto next_peer; } if (peer_o.p_flags & WG_PEER_HAS_ENDPOINT) @@ -2332,6 +2332,11 @@ wg_ioctl_set(struct wg_softc *sc, struct aip_p++; } + peer_p = (struct wg_peer_io *)aip_p; + continue; +next_peer: + aip_p = _p->p_aips[0]; + aip_p += peer_o.p_aips_count; peer_p = (struct wg_peer_io *)aip_p; }
Re: pipex(4)/npppd(8): remove dummy PIPEX{G,S}MODE ioctl(2) calls
Yes, ok yasuoka On Wed, 30 Dec 2020 03:02:55 +0300 Vitaliy Makkoveev wrote: > This time pipex(4) related ioctl(2) calls PIPEX{S,G}MODE are pretty > dummy and were kept for backward compatibility reasons. The diff below > removes them. > > ok? > > Index: share/man/man4/pipex.4 > === > RCS file: /cvs/src/share/man/man4/pipex.4,v > retrieving revision 1.13 > diff -u -p -r1.13 pipex.4 > --- share/man/man4/pipex.49 Aug 2020 14:35:31 - 1.13 > +++ share/man/man4/pipex.429 Dec 2020 23:51:57 - > @@ -57,20 +57,6 @@ or > devices. > The added requests are as follows: > .Bl -tag -width Ds > -.It Dv PIPEXGMODEFa "int *" > -Get the devices's > -.Nm > -operation mode. > -1 to enable > -.Nm > -on this device; 0 to disable. > -.It Dv PIPEXSMODEFa "int *" > -Set the device's > -.Nm > -operation mode. > -1 to enable > -.Nm > -on this device; 0 to disable. > .It Dv PIPEXASESSION Fa "struct pipex_session_req *" > Add a new PPP session to be handled by > .Nm . > Index: sys/net/pipex.c > === > RCS file: /cvs/src/sys/net/pipex.c,v > retrieving revision 1.127 > diff -u -p -r1.127 pipex.c > --- sys/net/pipex.c 30 Aug 2020 19:48:16 - 1.127 > +++ sys/net/pipex.c 29 Dec 2020 23:51:59 - > @@ -163,13 +163,6 @@ pipex_ioctl(void *ownersc, u_long cmd, c > > NET_ASSERT_LOCKED(); > switch (cmd) { > - case PIPEXSMODE: > - break; > - > - case PIPEXGMODE: > - *(int *)data = 1; > - break; > - > case PIPEXCSESSION: > ret = pipex_config_session( > (struct pipex_session_config_req *)data, ownersc); > Index: sys/net/pipex.h > === > RCS file: /cvs/src/sys/net/pipex.h,v > retrieving revision 1.28 > diff -u -p -r1.28 pipex.h > --- sys/net/pipex.h 27 Aug 2020 10:47:52 - 1.28 > +++ sys/net/pipex.h 29 Dec 2020 23:51:59 - > @@ -165,8 +165,6 @@ struct pipex_session_descr_req { > > > /* PIPEX ioctls */ > -#define PIPEXSMODE _IOW ('p', 1, int) > -#define PIPEXGMODE _IOR ('p', 2, int) > #define PIPEXASESSION_IOW ('p', 3, struct pipex_session_req) > #define PIPEXDSESSION_IOWR('p', 4, struct pipex_session_close_req) > 
#define PIPEXCSESSION_IOW ('p', 5, struct pipex_session_config_req) > Index: usr.sbin/npppd/npppd/npppd_iface.c > === > RCS file: /cvs/src/usr.sbin/npppd/npppd/npppd_iface.c,v > retrieving revision 1.13 > diff -u -p -r1.13 npppd_iface.c > --- usr.sbin/npppd/npppd/npppd_iface.c5 Dec 2015 16:10:31 - > 1.13 > +++ usr.sbin/npppd/npppd/npppd_iface.c29 Dec 2020 23:52:00 - > @@ -96,11 +96,6 @@ static void npppd_iface_io_event_handle > static int npppd_iface_log (npppd_iface *, int, const char *, ...) > __printflike(3,4); > > -#ifdef USE_NPPPD_PIPEX > -static int npppd_iface_pipex_enable(npppd_iface *_this); > -static int npppd_iface_pipex_disable(npppd_iface *_this); > -#endif /* USE_NPPPD_PIPEX */ > - > > /** initialize npppd_iface */ > void > @@ -311,12 +306,7 @@ npppd_iface_start(npppd_iface *_this) > goto fail; > } > > -#ifdef USE_NPPPD_PIPEX > - if (npppd_iface_pipex_enable(_this) != 0) { > - log_printf(LOG_WARNING, > - "npppd_iface_pipex_enable() failed: %m"); > - } > -#else > +#ifndef USE_NPPPD_PIPEX > if (_this->using_pppx) { > npppd_iface_log(_this, LOG_ERR, > "pipex is required when using pppx interface"); > @@ -358,13 +348,6 @@ npppd_iface_stop(npppd_iface *_this) > in_host_route_delete(&_this->ip4addr, ); > } > if (_this->devf >= 0) { > -#ifdef USE_NPPPD_PIPEX > - if (npppd_iface_pipex_disable(_this) != 0) { > - log_printf(LOG_CRIT, > - "npppd_iface_pipex_disable() failed: %m"); > - } > -#endif /* USE_NPPPD_PIPEX */ > - > event_del(&_this->ev); > close(_this->devf); > npppd_iface_log(_this, LOG_INFO, "Stopped"); > @@ -381,32 +364,6 @@ npppd_iface_fini(npppd_iface *_this) > NPPPD_IFACE_ASSERT(_this != NULL); > _this->initialized = 0; > } > - > - > -/*** > - * PIPEX related functions > - ***/ > -#ifdef USE_NPPPD_PIPEX > - > -/** enable PIPEX on PPPAC interface */ > -int > -npppd_iface_pipex_enable(npppd_iface *_this) > -{ > - int enable = 1; > - > - return ioctl(_this->devf, PIPEXSMODE, ); > -} > - > -/** disable PIPEX on PPPAC interface */ > -int > 
-npppd_iface_pipex_disable(npppd_iface *_this) > -{ > - int disable = 0; > - > -
Re: diff: pfctl: error message for nonexisting rtable
the condition was reversed. ok? Index: parse.y === RCS file: /cvs/src/sbin/pfctl/parse.y,v retrieving revision 1.702 diff -u -p -r1.702 parse.y --- parse.y 17 Sep 2020 10:09:43 - 1.702 +++ parse.y 17 Sep 2020 14:23:42 - @@ -1216,7 +1216,7 @@ antispoof_opt : LABEL label { if ($2 < 0 || $2 > RT_TABLEID_MAX) { yyerror("invalid rtable id"); YYERROR; - } else if (lookup_rtable($2) >= 1) { + } else if (lookup_rtable($2) < 1) { yyerror("rtable %lld does not exist", $2); YYERROR; } @@ -2003,7 +2003,7 @@ filter_opt: USER uids { if ($2 < 0 || $2 > RT_TABLEID_MAX) { yyerror("invalid rtable id"); YYERROR; - } else if (lookup_rtable($2) >= 1) { + } else if (lookup_rtable($2) < 1) { yyerror("rtable %lld does not exist", $2); YYERROR; }
Re: diff: pfctl: error message for nonexisting rtable
Hi, I just committed yours. Thanks, On Wed, 16 Sep 2020 16:07:40 +0200 Klemens Nanni wrote: > On Wed, Sep 16, 2020 at 07:49:19PM +0900, YASUOKA Masahiko wrote: >> New diff is using -1 for ENOENT. >> >> Also domainid == 0 is a valid domain id, but previous diff cannot make >> a cache of it since 0 is the default value. So new diff is doing >> >> -static u_int found[RT_TABLEID_MAX+1]; >> +static struct { >> +int found; >> +int domainid; >> +}rtables[RT_TABLEID_MAX+1]; >> >> to distinguish the default 0 and domainid 0. > This looks more complicated than it needs to be, but I also don't want > to bikeshed it; given that the parser is happy with this and we plan to > remove this code alltogether anyway in the next release cycle: OK kn. > > Alternatively, here's a much simpler diff resembling what I had in mind. > Feel free to commit this instead (with my OK), give me an OK for it or > go ahead with yours. > > It uses the same function and reflects the fact that every rdomain is a > rtable but not every rtable is also a rdomain (your choice of `domainid' > seems inconsistent with that). 
> > Index: parse.y > === > RCS file: /cvs/src/sbin/pfctl/parse.y,v > retrieving revision 1.701 > diff -u -p -r1.701 parse.y > --- parse.y 28 Jan 2020 15:40:35 - 1.701 > +++ parse.y 16 Sep 2020 13:58:23 - > @@ -392,7 +392,7 @@ intinvalid_redirect(struct node_host * > u_int16_t parseicmpspec(char *, sa_family_t); > int kw_casecmp(const void *, const void *); > int map_tos(char *string, int *); > -int rdomain_exists(u_int); > +int lookup_rtable(u_int); > int filteropts_to_rule(struct pf_rule *, struct filter_opts *); > > TAILQ_HEAD(loadanchorshead, loadanchors) > @@ -1216,6 +1216,9 @@ antispoof_opt : LABEL label { > if ($2 < 0 || $2 > RT_TABLEID_MAX) { > yyerror("invalid rtable id"); > YYERROR; > + } else if (lookup_rtable($2) >= 1) { > + yyerror("rtable %lld does not exist", $2); > + YYERROR; > } > antispoof_opts.rtableid = $2; > } > @@ -2000,6 +2003,9 @@ filter_opt : USER uids { > if ($2 < 0 || $2 > RT_TABLEID_MAX) { > yyerror("invalid rtable id"); > YYERROR; > + } else if (lookup_rtable($2) >= 1) { > + yyerror("rtable %lld does not exist", $2); > + YYERROR; > } > filter_opts.rtableid = $2; > } > @@ -2475,7 +2481,7 @@ if_item : STRING{ > | RDOMAIN NUMBER{ > if ($2 < 0 || $2 > RT_TABLEID_MAX) > yyerror("rdomain %lld outside range", $2); > - else if (rdomain_exists($2) != 1) > + else if (lookup_rtable($2) != 2) > yyerror("rdomain %lld does not exist", $2); > > $$ = calloc(1, sizeof(struct node_if)); > @@ -5868,37 +5874,38 @@ map_tos(char *s, int *val) > } > > int > -rdomain_exists(u_int rdomain) > +lookup_rtable(u_int rtableid) > { > size_t len; > struct rt_tableinfo info; > int mib[6]; > static u_int found[RT_TABLEID_MAX+1]; > > - if (found[rdomain] == 1) > - return 1; > + if (found[rtableid]) > + return found[rtableid]; > > mib[0] = CTL_NET; > mib[1] = PF_ROUTE; > mib[2] = 0; > mib[3] = 0; > mib[4] = NET_RT_TABLE; > - mib[5] = rdomain; > + mib[5] = rtableid; > > len = sizeof(info); > if (sysctl(mib, 6, , , NULL, 0) == -1) { > if (errno == ENOENT) { > /* table 
nonexistent */ > + found[rtableid] = 0; > return 0; > } > err(1, "%s", __func__); > } > - if (info.rti_domainid == rdomain) { > - found[rdomain] = 1; > - return 1; > + if (info.rti_domainid == rtableid) { > + found[rtableid] = 2; > + return 2; > } > - /* rdomain is a table, but not an rdomain */ > - return 0; > + found[rtableid] = 1; > + return 1; > } > > int
Re: diff: pfctl: error message for nonexisting rtable
Hi, On Wed, 16 Sep 2020 12:04:55 +0200 Klemens Nanni wrote: > Using the function verb would reads a bit clearer/more intuitive, > i.e. Yes, "if (!rtable_exists($2))" seems better. >> @@ -5887,17 +5897,37 @@ rdomain_exists(u_int rdomain) >> >> len = sizeof(info); >> if (sysctl(mib, 6, , , NULL, 0) == -1) { >> -if (errno == ENOENT) { >> +if (errno == ENOENT) >> /* table nonexistent */ >> -return 0; >> -} >> -err(1, "%s", __func__); >> -} >> -if (info.rti_domainid == rdomain) { >> -found[rdomain] = 1; >> +domainid[rdomain] = RT_TABLEID_MAX; > This does not look correct, RT_TABLEID_MAX (255) is the biggest *valid* > id, so you cannot use it to denote a nonexistent routing table. Good catch. Thanks, > Perhaps use `static int domainid[RT_TABLEID_MAX+1]' and `-1' to reflect > ENOENT? New diff is using -1 for ENOENT. Also domainid == 0 is a valid domain id, but previous diff cannot make a cache of it since 0 is the default value. So new diff is doing - static u_int found[RT_TABLEID_MAX+1]; + static struct { + int found; + int domainid; + }rtables[RT_TABLEID_MAX+1]; to distinguish the default 0 and domainid 0. ok? Make pfctl check if the rtable really exists when parsing the config. 
Index: sbin/pfctl/parse.y === RCS file: /cvs/src/sbin/pfctl/parse.y,v retrieving revision 1.701 diff -u -p -r1.701 parse.y --- sbin/pfctl/parse.y 28 Jan 2020 15:40:35 - 1.701 +++ sbin/pfctl/parse.y 16 Sep 2020 10:40:25 - @@ -392,7 +392,9 @@ int invalid_redirect(struct node_host * u_int16_t parseicmpspec(char *, sa_family_t); int kw_casecmp(const void *, const void *); int map_tos(char *string, int *); +int get_domainid(u_int); int rdomain_exists(u_int); +int rtable_exists(u_int); int filteropts_to_rule(struct pf_rule *, struct filter_opts *); TAILQ_HEAD(loadanchorshead, loadanchors) @@ -1217,6 +1219,10 @@ antispoof_opt: LABEL label { yyerror("invalid rtable id"); YYERROR; } + else if (!rtable_exists($2)) { + yyerror("rtable %lld does not exist", $2); + YYERROR; + } antispoof_opts.rtableid = $2; } ; @@ -2001,6 +2007,10 @@ filter_opt : USER uids { yyerror("invalid rtable id"); YYERROR; } + else if (!rtable_exists($2)) { + yyerror("rtable %lld does not exist", $2); + YYERROR; + } filter_opts.rtableid = $2; } | DIVERTTO STRING PORT portplain { @@ -2475,7 +2485,7 @@ if_item : STRING{ | RDOMAIN NUMBER{ if ($2 < 0 || $2 > RT_TABLEID_MAX) yyerror("rdomain %lld outside range", $2); - else if (rdomain_exists($2) != 1) + else if (!rdomain_exists($2)) yyerror("rdomain %lld does not exist", $2); $$ = calloc(1, sizeof(struct node_if)); @@ -5868,36 +5878,60 @@ map_tos(char *s, int *val) } int -rdomain_exists(u_int rdomain) +get_domainid(u_int rtable) { size_t len; struct rt_tableinfo info; int mib[6]; - static u_int found[RT_TABLEID_MAX+1]; + static struct { + int found; + int domainid; + }rtables[RT_TABLEID_MAX+1]; - if (found[rdomain] == 1) - return 1; + if (rtables[rtable].found) + return rtables[rtable].domainid; mib[0] = CTL_NET; mib[1] = PF_ROUTE; mib[2] = 0; mib[3] = 0; mib[4] = NET_RT_TABLE; - mib[5] = rdomain; + mib[5] = rtable; len = sizeof(info); if (sysctl(mib, 6, , , NULL, 0) == -1) { - if (errno == ENOENT) { + if (errno == ENOENT) /* table nonexistent */ - return 0; 
- } - err(1, "%s", __func__); - } - if (info.rti_domainid == rdomain) { - found[rdomain] = 1; + rtables[rtable].domainid = -1; + else + err(1, "%s", __func__); + } else + rtables[rtable].domainid = info.rti_domainid; +
Re: diff: pfctl: error message for nonexisting rtable
Hi, So, it seems we need to more code and test for pf(4) part. Let me continue this separetely. On Mon, 14 Sep 2020 11:07:53 +0200 Klemens Nanni wrote: > On Mon, Sep 14, 2020 at 02:09:27PM +0900, YASUOKA Masahiko wrote: >> Make pfctl check if the rtable really exists when parsing the config. > I concur, but you can do this with less (duplicated) code. > > Instead of copying rdomain_exists() into rtable_exists() with the > `rti_domainid' check omitted, tweak (and rename) rdomain_exists() into > returning the information whether the given ID is just an rtable. > > rdomain_exists() merges the "invalid id" and "id is an rtable but not > an rdmomain" cases - make those separate return codes, check/adjust > existing callers and use it for your new checks. Yes, I could reduce the code. Thanks, ok? Make pfctl check if the rtable really exists when parsing the config. Index: sbin/pfctl/parse.y === RCS file: /cvs/src/sbin/pfctl/parse.y,v retrieving revision 1.701 diff -u -p -r1.701 parse.y --- sbin/pfctl/parse.y 28 Jan 2020 15:40:35 - 1.701 +++ sbin/pfctl/parse.y 16 Sep 2020 09:11:21 - @@ -392,7 +392,9 @@ int invalid_redirect(struct node_host * u_int16_t parseicmpspec(char *, sa_family_t); int kw_casecmp(const void *, const void *); int map_tos(char *string, int *); +int get_domainid(u_int); int rdomain_exists(u_int); +int rtable_exists(u_int); int filteropts_to_rule(struct pf_rule *, struct filter_opts *); TAILQ_HEAD(loadanchorshead, loadanchors) @@ -1217,6 +1219,10 @@ antispoof_opt: LABEL label { yyerror("invalid rtable id"); YYERROR; } + else if (rtable_exists($2) != 1) { + yyerror("rtable %lld does not exist", $2); + YYERROR; + } antispoof_opts.rtableid = $2; } ; @@ -2001,6 +2007,10 @@ filter_opt : USER uids { yyerror("invalid rtable id"); YYERROR; } + else if (rtable_exists($2) != 1) { + yyerror("rtable %lld does not exist", $2); + YYERROR; + } filter_opts.rtableid = $2; } | DIVERTTO STRING PORT portplain { @@ -5868,15 +5878,15 @@ map_tos(char *s, int *val) } int 
-rdomain_exists(u_int rdomain) +get_domainid(u_int rdomain) { size_t len; struct rt_tableinfo info; int mib[6]; - static u_int found[RT_TABLEID_MAX+1]; + static u_int domainid[RT_TABLEID_MAX+1]; - if (found[rdomain] == 1) - return 1; + if (domainid[rdomain] != 0) + return domainid[rdomain]; mib[0] = CTL_NET; mib[1] = PF_ROUTE; @@ -5887,17 +5897,37 @@ rdomain_exists(u_int rdomain) len = sizeof(info); if (sysctl(mib, 6, , , NULL, 0) == -1) { - if (errno == ENOENT) { + if (errno == ENOENT) /* table nonexistent */ - return 0; - } - err(1, "%s", __func__); - } - if (info.rti_domainid == rdomain) { - found[rdomain] = 1; + domainid[rdomain] = RT_TABLEID_MAX; + else + err(1, "%s", __func__); + } else + domainid[rdomain] = info.rti_domainid; + + return domainid[rdomain]; +} + +int +rdomain_exists(u_int rdomain) +{ + int domainid; + + domainid = get_domainid(rdomain); + if (domainid == rdomain) return 1; - } /* rdomain is a table, but not an rdomain */ + return 0; +} + +int +rtable_exists(u_int rtable) +{ + int domainid; + + domainid = get_domainid(rtable); + if (domainid < RT_TABLEID_MAX) + return 1; return 0; }
Re: diff: pfctl: error message for nonexisting rtable
Hi, On Tue, 15 Sep 2020 02:31:24 +0200 Klemens Nanni wrote: > On Tue, Sep 15, 2020 at 12:30:35AM +0200, Klemens Nanni wrote: >> Actually, that should just work regardless of whether the rounting >> domain exists at ruleset creation time; just like it is the case with >> interface names/groups which may come and go at runtime without >> requiring changes to the ruleset. >> >> Rules on nonexistent interfaces won't match, routing domains (and >> ultimately routing tables) should behave the same, I think. >> >> Here's a diff that does this for routing domains allowing me to always >> use `on rdomain 5' - I've tested it with a few examplatory rulesets and >> behaviour is as expected. >> >> It will need more eye balling and I am not pushing such changes before >> release, but if that is a general direction we agree, your proposed >> `rtable' fix could move along and become just as flexible instead. > More on this: > > # ifconfig lo1 rdomain 1 > # echo pass on rdomain 1 | pfctl -f- > # ifconfig lo1 destroy > # pfctl -sr > > pass on rdomain 1 all flags S/SA > > The ruleset stays valid and continues to work as soon as routing domain > `1' reappears, there is no reason to require existence of it at ruleset > creation; this is safe because routing domains are just normative > numbers, there's no further state when it comes to filtering - either > the id on the packet matches the number in the ruleset or it doesn't. > > Routing tables however are more involved as they can be used to *alter* > a packet's flow in pf.conf(5), so requiring them to be present at > ruleset creation makes sense to guarantee that pf will only ever change > routing table ids to valid ones. It's not clear to me why a non-existing rdomain is accepted but a non-existing rtable is rejected. I suppose we could make pf(4) handle a packet for a non-existing routing table as if the routing table were empty.
> Routing domains can be deleted, but that doesn't invalidate rules like > `on rdomain 1', which simply won't match when the given id does not > exist. > > Routing tables however cannot be deleted, they get moved to the default > routing domain whenever their corresponding routing domain disappears; > this is in line with only ever loading valid routing table ids into pf. > > So unless I missed something, that ruleset creation (`pfctl -f ...') > is the only occasion pf actually needs to validate routing table ids: > they are guaranteed to always exist from then on. > > Given this, my diff looks fine as is and should not change `rtable' > behaviour - YASUOKA's diff is also fine as is and actually implements > the validity check I just mentioned, obsoleting my initial feedback.
diff: pfctl: error message for nonexisting rtable
Hi, A pf rule with "on rdomain n" for a nonexisting rdomain n causes the error /etc/pf.conf:XXX: rdomain n does not exist. But a rule with "rtable n" for a nonexisting rtable n causes the error pfctl: DIOCADDRULE: Device busy. It is hard to find the cause from this error message. /etc/pf.conf:XXX: rtable n does not exist would be better. ok? Make pfctl check if the rtable really exists when parsing the config. Index: sbin/pfctl/parse.y === RCS file: /cvs/src/sbin/pfctl/parse.y,v retrieving revision 1.701 diff -u -p -r1.701 parse.y --- sbin/pfctl/parse.y 28 Jan 2020 15:40:35 - 1.701 +++ sbin/pfctl/parse.y 14 Sep 2020 04:54:39 - @@ -393,6 +393,7 @@ u_int16_t parseicmpspec(char *, sa_famil int kw_casecmp(const void *, const void *); int map_tos(char *string, int *); int rdomain_exists(u_int); +int rtable_exists(u_int); int filteropts_to_rule(struct pf_rule *, struct filter_opts *); TAILQ_HEAD(loadanchorshead, loadanchors) @@ -1217,6 +1218,10 @@ antispoof_opt: LABEL label { yyerror("invalid rtable id"); YYERROR; } + else if (rtable_exists($2) != 1) { + yyerror("rtable %lld does not exist", $2); + YYERROR; + } antispoof_opts.rtableid = $2; } ; @@ -2001,6 +2006,10 @@ filter_opt : USER uids { yyerror("invalid rtable id"); YYERROR; } + else if (rtable_exists($2) != 1) { + yyerror("rtable %lld does not exist", $2); + YYERROR; + } filter_opts.rtableid = $2; } | DIVERTTO STRING PORT portplain { @@ -5899,6 +5908,36 @@ rdomain_exists(u_int rdomain) } /* rdomain is a table, but not an rdomain */ return 0; +} + +int +rtable_exists(u_int rtable) +{ + size_t len; + struct rt_tableinfo info; + int mib[6]; + static u_int found[RT_TABLEID_MAX+1]; + + if (found[rtable] == 1) + return 1; + + mib[0] = CTL_NET; + mib[1] = PF_ROUTE; + mib[2] = 0; + mib[3] = 0; + mib[4] = NET_RT_TABLE; + mib[5] = rtable; + + len = sizeof(info); + if (sysctl(mib, 6, , , NULL, 0) == -1) { + if (errno == ENOENT) { + /* table nonexistent */ + return 0; + } + err(1, "%s", __func__); + } + found[rtable] = 1; + return 1; } int
Re: httpd: use the original uri for REQUEST_URI
Anyone? This is a tiny change but makes httpd(8) more correct. The diff is not so complicated. On Thu, 03 Sep 2020 13:09:49 +0900 (JST) YASUOKA Masahiko wrote: > Let me update the diff. Previous doesn't have an error handling when > strdup() failed. > > On Thu, 03 Sep 2020 13:02:51 +0900 (JST) > YASUOKA Masahiko wrote: >> The diff makes REQUEST_URI in FastCGI become the original request >> URI. Currently it is an url which is url decoded and canonicalized. >> I could not find a specification of REQUEST_URI, but I suppose it is >> the URI in HTTP request. Apache httpd and nginx is using the original >> URI for it. >> >> ok? >> >> >> Use the original requested URI for REQUEST_URI. > > Index: usr.sbin/httpd/http.h > === > RCS file: /cvs/src/usr.sbin/httpd/http.h,v > retrieving revision 1.15 > diff -u -p -r1.15 http.h > --- usr.sbin/httpd/http.h 8 May 2019 21:41:06 - 1.15 > +++ usr.sbin/httpd/http.h 3 Sep 2020 04:09:26 - > @@ -246,6 +246,7 @@ struct http_descriptor { > /* Rewritten path and query remain NULL if not used */ > char*http_path_alias; > char*http_query_alias; > + char*http_path_orig; > > /* A tree of headers and attached lists for repeated headers. 
*/ > struct kv *http_lastheader; > Index: usr.sbin/httpd/server_fcgi.c > === > RCS file: /cvs/src/usr.sbin/httpd/server_fcgi.c,v > retrieving revision 1.83 > diff -u -p -r1.83 server_fcgi.c > --- usr.sbin/httpd/server_fcgi.c 24 Aug 2020 15:49:11 - 1.83 > +++ usr.sbin/httpd/server_fcgi.c 3 Sep 2020 04:09:26 - > @@ -299,13 +299,13 @@ server_fcgi(struct httpd *env, struct cl > } > > if (!desc->http_query) { > - if (fcgi_add_param(, "REQUEST_URI", desc->http_path, > + if (fcgi_add_param(, "REQUEST_URI", desc->http_path_orig, > clt) == -1) { > errstr = "failed to encode param"; > goto fail; > } > } else { > - if (asprintf(, "%s?%s", desc->http_path, > + if (asprintf(, "%s?%s", desc->http_path_orig, > desc->http_query) == -1) { > errstr = "failed to encode param"; > goto fail; > Index: usr.sbin/httpd/server_http.c > === > RCS file: /cvs/src/usr.sbin/httpd/server_http.c,v > retrieving revision 1.140 > diff -u -p -r1.140 server_http.c > --- usr.sbin/httpd/server_http.c 3 Aug 2020 10:59:53 - 1.140 > +++ usr.sbin/httpd/server_http.c 3 Sep 2020 04:09:26 - > @@ -100,6 +100,8 @@ server_httpdesc_free(struct http_descrip > > free(desc->http_path); > desc->http_path = NULL; > + free(desc->http_path_orig); > + desc->http_path_orig = NULL; > free(desc->http_path_alias); > desc->http_path_alias = NULL; > free(desc->http_query); > @@ -1204,9 +1206,13 @@ server_response(struct httpd *httpd, str > char*hostval, *query; > const char *errstr = NULL; > > - /* Decode the URL */ > + /* Preserve original path */ > if (desc->http_path == NULL || > - url_decode(desc->http_path) == NULL) > + (desc->http_path_orig = strdup(desc->http_path)) == NULL) > + goto fail; > + > + /* Decode the URL */ > + if (url_decode(desc->http_path) == NULL) > goto fail; > > /* Canonicalize the request path */
Re: httpd: use the original uri for REQUEST_URI
Let me update the diff. Previous doesn't have an error handling when strdup() failed. On Thu, 03 Sep 2020 13:02:51 +0900 (JST) YASUOKA Masahiko wrote: > The diff makes REQUEST_URI in FastCGI become the original request > URI. Currently it is an url which is url decoded and canonicalized. > I could not find a specification of REQUEST_URI, but I suppose it is > the URI in HTTP request. Apache httpd and nginx is using the original > URI for it. > > ok? > > > Use the original requested URI for REQUEST_URI. Index: usr.sbin/httpd/http.h === RCS file: /cvs/src/usr.sbin/httpd/http.h,v retrieving revision 1.15 diff -u -p -r1.15 http.h --- usr.sbin/httpd/http.h 8 May 2019 21:41:06 - 1.15 +++ usr.sbin/httpd/http.h 3 Sep 2020 04:09:26 - @@ -246,6 +246,7 @@ struct http_descriptor { /* Rewritten path and query remain NULL if not used */ char*http_path_alias; char*http_query_alias; + char*http_path_orig; /* A tree of headers and attached lists for repeated headers. */ struct kv *http_lastheader; Index: usr.sbin/httpd/server_fcgi.c === RCS file: /cvs/src/usr.sbin/httpd/server_fcgi.c,v retrieving revision 1.83 diff -u -p -r1.83 server_fcgi.c --- usr.sbin/httpd/server_fcgi.c24 Aug 2020 15:49:11 - 1.83 +++ usr.sbin/httpd/server_fcgi.c3 Sep 2020 04:09:26 - @@ -299,13 +299,13 @@ server_fcgi(struct httpd *env, struct cl } if (!desc->http_query) { - if (fcgi_add_param(, "REQUEST_URI", desc->http_path, + if (fcgi_add_param(, "REQUEST_URI", desc->http_path_orig, clt) == -1) { errstr = "failed to encode param"; goto fail; } } else { - if (asprintf(, "%s?%s", desc->http_path, + if (asprintf(, "%s?%s", desc->http_path_orig, desc->http_query) == -1) { errstr = "failed to encode param"; goto fail; Index: usr.sbin/httpd/server_http.c === RCS file: /cvs/src/usr.sbin/httpd/server_http.c,v retrieving revision 1.140 diff -u -p -r1.140 server_http.c --- usr.sbin/httpd/server_http.c3 Aug 2020 10:59:53 - 1.140 +++ usr.sbin/httpd/server_http.c3 Sep 2020 04:09:26 - @@ -100,6 +100,8 @@ 
server_httpdesc_free(struct http_descrip free(desc->http_path); desc->http_path = NULL; + free(desc->http_path_orig); + desc->http_path_orig = NULL; free(desc->http_path_alias); desc->http_path_alias = NULL; free(desc->http_query); @@ -1204,9 +1206,13 @@ server_response(struct httpd *httpd, str char*hostval, *query; const char *errstr = NULL; - /* Decode the URL */ + /* Preserve original path */ if (desc->http_path == NULL || - url_decode(desc->http_path) == NULL) + (desc->http_path_orig = strdup(desc->http_path)) == NULL) + goto fail; + + /* Decode the URL */ + if (url_decode(desc->http_path) == NULL) goto fail; /* Canonicalize the request path */
httpd: use the original uri for REQUEST_URI
The diff makes REQUEST_URI in FastCGI become the original request URI. Currently it is an url which is url decoded and canonicalized. I could not find a specification of REQUEST_URI, but I suppose it is the URI in HTTP request. Apache httpd and nginx is using the original URI for it. ok? Use the original requested URI for REQUEST_URI. Index: usr.sbin/httpd/http.h === RCS file: /cvs/src/usr.sbin/httpd/http.h,v retrieving revision 1.15 diff -u -p -r1.15 http.h --- usr.sbin/httpd/http.h 8 May 2019 21:41:06 - 1.15 +++ usr.sbin/httpd/http.h 3 Sep 2020 04:00:49 - @@ -246,6 +246,7 @@ struct http_descriptor { /* Rewritten path and query remain NULL if not used */ char*http_path_alias; char*http_query_alias; + char*http_path_orig; /* A tree of headers and attached lists for repeated headers. */ struct kv *http_lastheader; Index: usr.sbin/httpd/server_fcgi.c === RCS file: /cvs/src/usr.sbin/httpd/server_fcgi.c,v retrieving revision 1.83 diff -u -p -r1.83 server_fcgi.c --- usr.sbin/httpd/server_fcgi.c24 Aug 2020 15:49:11 - 1.83 +++ usr.sbin/httpd/server_fcgi.c3 Sep 2020 04:00:49 - @@ -299,13 +299,13 @@ server_fcgi(struct httpd *env, struct cl } if (!desc->http_query) { - if (fcgi_add_param(, "REQUEST_URI", desc->http_path, + if (fcgi_add_param(, "REQUEST_URI", desc->http_path_orig, clt) == -1) { errstr = "failed to encode param"; goto fail; } } else { - if (asprintf(, "%s?%s", desc->http_path, + if (asprintf(, "%s?%s", desc->http_path_orig, desc->http_query) == -1) { errstr = "failed to encode param"; goto fail; Index: usr.sbin/httpd/server_http.c === RCS file: /cvs/src/usr.sbin/httpd/server_http.c,v retrieving revision 1.140 diff -u -p -r1.140 server_http.c --- usr.sbin/httpd/server_http.c3 Aug 2020 10:59:53 - 1.140 +++ usr.sbin/httpd/server_http.c3 Sep 2020 04:00:49 - @@ -100,6 +100,8 @@ server_httpdesc_free(struct http_descrip free(desc->http_path); desc->http_path = NULL; + free(desc->http_path_orig); + desc->http_path_orig = NULL; free(desc->http_path_alias); 
desc->http_path_alias = NULL; free(desc->http_query); @@ -1203,6 +1205,10 @@ server_response(struct httpd *httpd, str int portval = -1, ret; char*hostval, *query; const char *errstr = NULL; + + /* preserve original path */ + if (desc->http_path != NULL) + desc->http_path_orig = strdup(desc->http_path); /* Decode the URL */ if (desc->http_path == NULL ||
Re: Make pipex more common for pppac and pppx
On Mon, 24 Aug 2020 20:07:48 +0300 Vitaliy Makkoveev wrote: > I pointed some comments inline. Thanks, >> +case PIPEXASESSION: >> +{ >> +struct pipex_session_req *req = >> +(struct pipex_session_req *)data; >> +if ((error = pipex_init_session(, req)) != 0) >> +break; >> +error = pipex_link_session(session, >sc_if, sc); >> +break; >> +} > > If pipex_link_session() fails `session' will be leaked. Yes, it's a good catch. >> +case PIPEXDSESSION: >> +{ >> +struct pipex_session_close_req *req = >> +(struct pipex_session_close_req *)data; >> +session = pipex_lookup_by_session_id(req->pcr_protocol, >> +req->pcr_session_id); >> +if (session == NULL || session->ifindex != sc->sc_if.if_index) { > > Can you compare with `session->ownersc' instead of `ifindex' like other > code does? For consistency with other code. Yes, it's better. > What about to introduce pppac_{add,del}_session() and move related code > into them? Also I agreed. > Also I see no such reason to kill pipex_{add,destroy}_session() because > they play with `pipex_rd_head{4,6}' and you don't need newly introduced > `session->is_pppx' which you use only once for that reason. pipex_{add,destroy}_session() should be killed since they are only for pppac. I think such functions should have "pppac_" prefix and placed in if_pppx.c. Also I'd like to move pipex_rd_head{4,6} things to pppac_{add,del}_session with a next step. Yes, we might be able to kill is_pppx. But I'd like to discuss that as a next step as well. I'd like to commit this for this moment, and continue further discussion. ok? 
Index: sys/net/if_pppx.c === RCS file: /cvs/src/sys/net/if_pppx.c,v retrieving revision 1.101 diff -u -p -r1.101 if_pppx.c --- sys/net/if_pppx.c 14 Aug 2020 11:05:38 - 1.101 +++ sys/net/if_pppx.c 26 Aug 2020 06:25:34 - @@ -163,7 +163,6 @@ struct pppx_if { struct ifnetpxi_if; struct pppx_dev *pxi_dev; /* [I] */ struct pipex_session*pxi_session; /* [I] */ - struct pipex_iface_context pxi_ifcontext; /* [N] */ }; static inline int @@ -181,12 +180,6 @@ intpppx_add_session(struct pppx_dev *, struct pipex_session_req *); intpppx_del_session(struct pppx_dev *, struct pipex_session_close_req *); -intpppx_config_session(struct pppx_dev *, - struct pipex_session_config_req *); -intpppx_get_stat(struct pppx_dev *, - struct pipex_session_stat_req *); -intpppx_get_closed(struct pppx_dev *, - struct pipex_session_list_req *); intpppx_set_session_descr(struct pppx_dev *, struct pipex_session_descr_req *); @@ -424,17 +417,6 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t NET_LOCK(); switch (cmd) { - case PIPEXSMODE: - /* -* npppd always enables on open, and only disables before -* closing. we cheat and let open and close do that, so lie -* to npppd. 
-*/ - break; - case PIPEXGMODE: - *(int *)addr = 1; - break; - case PIPEXASESSION: error = pppx_add_session(pxd, (struct pipex_session_req *)addr); @@ -445,21 +427,6 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t (struct pipex_session_close_req *)addr); break; - case PIPEXCSESSION: - error = pppx_config_session(pxd, - (struct pipex_session_config_req *)addr); - break; - - case PIPEXGSTAT: - error = pppx_get_stat(pxd, - (struct pipex_session_stat_req *)addr); - break; - - case PIPEXGCLOSED: - error = pppx_get_closed(pxd, - (struct pipex_session_list_req *)addr); - break; - case PIPEXSIFDESCR: error = pppx_set_session_descr(pxd, (struct pipex_session_descr_req *)addr); @@ -472,7 +439,7 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t break; default: - error = ENOTTY; + error = pipex_ioctl(pxd, cmd, addr); break; } NET_UNLOCK(); @@ -741,11 +708,7 @@ pppx_add_session(struct pppx_dev *pxd, s if_addrhooks_run(ifp); } - /* fake a pipex interface context */ - pxi->pxi_ifcontext.ifindex = ifp->if_index; - pxi->pxi_ifcontext.pipexmode = PIPEX_ENABLED; - - error = pipex_link_session(session, >pxi_ifcontext); + error = pipex_link_session(session,
Re: Make pipex more common for pppac and pppx
Hi, Thank you for your comments. On Mon, 17 Aug 2020 00:15:08 +0300 Vitaliy Makkoveev wrote: > I like your idea to kill `pipex_iface_context'. I had trying to keep it > by myself and this was wrong way. Could you rework your diff to be > against the recent sources? I'm sorry, the diff was against an old version. >> @@ -1122,8 +1051,11 @@ pppacopen(dev_t dev, int flags, int mode, struct proc >> *p) >> #if NBPFILTER > 0 >> bpfattach(>if_bpf, ifp, DLT_LOOP, sizeof(uint32_t)); >> #endif >> - >> -pipex_iface_init(>sc_pipex_iface, ifp->if_index); >> +/* virtual pipex_session entry for multicast */ >> +session = pool_get(_session_pool, PR_WAITOK | PR_ZERO); >> +session->is_multicast = 1; >> +session->ifindex = ifp->if_index; >> +sc->sc_multicast_session = session; >> > Interface index is not required for multicast session, because it's > never used. Also I like to alloc `sc_multicast_session' before > if_attach(). The diff used `ifindex' to select all sessions associated with the same pppac(4). But the latest diff uses `ownersc' instead for the same purpose. Also, the allocation was moved to an earlier part of the function. >> @@ -1382,7 +1340,10 @@ pppacclose(dev_t dev, int flags, int mode, struct >> proc *p) >> klist_invalidate(>sc_wsel.si_note); >> splx(s); >> >> -pipex_iface_fini(>sc_pipex_iface); >> +pool_put(_session_pool, sc->sc_multicast_session); >> +NET_LOCK(); >> +pipex_destroy_all_sessions(sc); >> +NET_UNLOCK(); >> >> if_detach(ifp); > > The recent sources has pppac(4) with unlocked start routine. I like you > detach `ifp' before destroy `sc_multicast_session'. The lines were moved after if_detach(). I'll test this more this weekend, then I'll ask for an ok.
Index: sys/net/if_pppx.c === RCS file: /cvs/src/sys/net/if_pppx.c,v retrieving revision 1.101 diff -u -p -r1.101 if_pppx.c --- sys/net/if_pppx.c 14 Aug 2020 11:05:38 - 1.101 +++ sys/net/if_pppx.c 20 Aug 2020 05:19:55 - @@ -163,7 +163,6 @@ struct pppx_if { struct ifnetpxi_if; struct pppx_dev *pxi_dev; /* [I] */ struct pipex_session*pxi_session; /* [I] */ - struct pipex_iface_context pxi_ifcontext; /* [N] */ }; static inline int @@ -181,12 +180,6 @@ intpppx_add_session(struct pppx_dev *, struct pipex_session_req *); intpppx_del_session(struct pppx_dev *, struct pipex_session_close_req *); -intpppx_config_session(struct pppx_dev *, - struct pipex_session_config_req *); -intpppx_get_stat(struct pppx_dev *, - struct pipex_session_stat_req *); -intpppx_get_closed(struct pppx_dev *, - struct pipex_session_list_req *); intpppx_set_session_descr(struct pppx_dev *, struct pipex_session_descr_req *); @@ -424,17 +417,6 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t NET_LOCK(); switch (cmd) { - case PIPEXSMODE: - /* -* npppd always enables on open, and only disables before -* closing. we cheat and let open and close do that, so lie -* to npppd. 
-*/ - break; - case PIPEXGMODE: - *(int *)addr = 1; - break; - case PIPEXASESSION: error = pppx_add_session(pxd, (struct pipex_session_req *)addr); @@ -445,21 +427,6 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t (struct pipex_session_close_req *)addr); break; - case PIPEXCSESSION: - error = pppx_config_session(pxd, - (struct pipex_session_config_req *)addr); - break; - - case PIPEXGSTAT: - error = pppx_get_stat(pxd, - (struct pipex_session_stat_req *)addr); - break; - - case PIPEXGCLOSED: - error = pppx_get_closed(pxd, - (struct pipex_session_list_req *)addr); - break; - case PIPEXSIFDESCR: error = pppx_set_session_descr(pxd, (struct pipex_session_descr_req *)addr); @@ -472,7 +439,7 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t break; default: - error = ENOTTY; + error = pipex_ioctl(pxd, cmd, addr); break; } NET_UNLOCK(); @@ -741,11 +708,7 @@ pppx_add_session(struct pppx_dev *pxd, s if_addrhooks_run(ifp); } - /* fake a pipex interface context */ - pxi->pxi_ifcontext.ifindex = ifp->if_index; - pxi->pxi_ifcontext.pipexmode = PIPEX_ENABLED; - - error = pipex_link_session(session, >pxi_ifcontext); + error =
Re: Make pipex more common for pppac and pppx
Let me update the diff. A bug found by the test. diff --git a/sys/net/if_pppx.c b/sys/net/if_pppx.c index 62b85bc34af..6d3de6973bd 100644 --- a/sys/net/if_pppx.c +++ b/sys/net/if_pppx.c @@ -163,7 +163,6 @@ struct pppx_if { struct ifnetpxi_if; struct pppx_dev *pxi_dev; /* [I] */ struct pipex_session*pxi_session; /* [I] */ - struct pipex_iface_context pxi_ifcontext; /* [N] */ }; static inline int @@ -181,12 +180,6 @@ intpppx_add_session(struct pppx_dev *, struct pipex_session_req *); intpppx_del_session(struct pppx_dev *, struct pipex_session_close_req *); -intpppx_config_session(struct pppx_dev *, - struct pipex_session_config_req *); -intpppx_get_stat(struct pppx_dev *, - struct pipex_session_stat_req *); -intpppx_get_closed(struct pppx_dev *, - struct pipex_session_list_req *); intpppx_set_session_descr(struct pppx_dev *, struct pipex_session_descr_req *); @@ -424,17 +417,6 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) NET_LOCK(); switch (cmd) { - case PIPEXSMODE: - /* -* npppd always enables on open, and only disables before -* closing. we cheat and let open and close do that, so lie -* to npppd. 
-*/ - break; - case PIPEXGMODE: - *(int *)addr = 1; - break; - case PIPEXASESSION: error = pppx_add_session(pxd, (struct pipex_session_req *)addr); @@ -445,21 +427,6 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) (struct pipex_session_close_req *)addr); break; - case PIPEXCSESSION: - error = pppx_config_session(pxd, - (struct pipex_session_config_req *)addr); - break; - - case PIPEXGSTAT: - error = pppx_get_stat(pxd, - (struct pipex_session_stat_req *)addr); - break; - - case PIPEXGCLOSED: - error = pppx_get_closed(pxd, - (struct pipex_session_list_req *)addr); - break; - case PIPEXSIFDESCR: error = pppx_set_session_descr(pxd, (struct pipex_session_descr_req *)addr); @@ -472,7 +439,7 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; default: - error = ENOTTY; + error = pipex_ioctl(pxd, cmd, addr); break; } NET_UNLOCK(); @@ -742,11 +709,7 @@ pppx_add_session(struct pppx_dev *pxd, struct pipex_session_req *req) if_addrhooks_run(ifp); } - /* fake a pipex interface context */ - pxi->pxi_ifcontext.ifindex = ifp->if_index; - pxi->pxi_ifcontext.pipexmode = PIPEX_ENABLED; - - error = pipex_link_session(session, >pxi_ifcontext); + error = pipex_link_session(session, ifp, pxd); if (error) goto detach; @@ -786,40 +749,6 @@ pppx_del_session(struct pppx_dev *pxd, struct pipex_session_close_req *req) return (0); } -int -pppx_config_session(struct pppx_dev *pxd, -struct pipex_session_config_req *req) -{ - struct pppx_if *pxi; - - pxi = pppx_if_find(pxd, req->pcr_session_id, req->pcr_protocol); - if (pxi == NULL) - return (EINVAL); - - return pipex_config_session(req, >pxi_ifcontext); -} - -int -pppx_get_stat(struct pppx_dev *pxd, struct pipex_session_stat_req *req) -{ - struct pppx_if *pxi; - - pxi = pppx_if_find(pxd, req->psr_session_id, req->psr_protocol); - if (pxi == NULL) - return (EINVAL); - - return pipex_get_stat(req, >pxi_ifcontext); -} - -int -pppx_get_closed(struct pppx_dev *pxd, struct 
pipex_session_list_req *req) -{ - /* XXX: Only opened sessions exist for pppx(4) */ - memset(req, 0, sizeof(*req)); - - return 0; -} - int pppx_set_session_descr(struct pppx_dev *pxd, struct pipex_session_descr_req *req) @@ -1022,9 +951,8 @@ struct pppac_softc { struct selinfo sc_rsel; struct mutexsc_wsel_mtx; struct selinfo sc_wsel; - - struct pipex_iface_context - sc_pipex_iface; + struct pipex_session + *sc_multicast_session; struct mbuf_queue sc_mq; @@ -1084,6 +1012,7 @@ pppacopen(dev_t dev, int flags, int mode, struct proc *p) { struct pppac_softc *sc; struct ifnet *ifp; + struct pipex_session *session; sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); if (pppac_lookup(dev) != NULL) { @@
Make pipex more common for pppac and pppx
This diff makes pipex more common between pppac and pppx. - Delete "pipex_iface_context". It was created before pppx existed, and this causes some confusion. For example, session->pipex_iface is the device context for pppac(4), but it is not for pppx(4). 623 Static int 624 pipex_get_closed(struct pipex_session_list_req *req, 625 struct pipex_iface_context *iface) 626 { 627 struct pipex_session *session, *session_tmp; 628 629 NET_ASSERT_LOCKED(); 630 bzero(req, sizeof(*req)); 631 LIST_FOREACH_SAFE(session, _close_wait_list, state_list, 632 session_tmp) { 633 if (session->pipex_iface != iface) 634 continue; At line 633, it is used to verify the ownership. But PIPEXGCLOSED is meant to get all closed sessions associated with the *device* (not the interface), so we need another way to verify the owner. - The diff adds "void *ownersc" to the session for this purpose. - PIPEXGCLOSED for pppx is actually broken. The diff fixes this. - pipex_iface_context has a dummy session for multicast and it's not used by pppx(4). The diff moves all multicast things to pppac local. - Also, session creation and deletion for pppac cannot be used by pppx. Move them to pppac local. - Make PIPEX{S,G}MODE dummy. I'd like to delete them afterward. The diff is still under review and testing. Comments?
diff --git a/sys/net/if_pppx.c b/sys/net/if_pppx.c index 62b85bc34af..6d3de6973bd 100644 --- a/sys/net/if_pppx.c +++ b/sys/net/if_pppx.c @@ -163,7 +163,6 @@ struct pppx_if { struct ifnetpxi_if; struct pppx_dev *pxi_dev; /* [I] */ struct pipex_session*pxi_session; /* [I] */ - struct pipex_iface_context pxi_ifcontext; /* [N] */ }; static inline int @@ -181,12 +180,6 @@ intpppx_add_session(struct pppx_dev *, struct pipex_session_req *); intpppx_del_session(struct pppx_dev *, struct pipex_session_close_req *); -intpppx_config_session(struct pppx_dev *, - struct pipex_session_config_req *); -intpppx_get_stat(struct pppx_dev *, - struct pipex_session_stat_req *); -intpppx_get_closed(struct pppx_dev *, - struct pipex_session_list_req *); intpppx_set_session_descr(struct pppx_dev *, struct pipex_session_descr_req *); @@ -424,17 +417,6 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) NET_LOCK(); switch (cmd) { - case PIPEXSMODE: - /* -* npppd always enables on open, and only disables before -* closing. we cheat and let open and close do that, so lie -* to npppd. 
-*/ - break; - case PIPEXGMODE: - *(int *)addr = 1; - break; - case PIPEXASESSION: error = pppx_add_session(pxd, (struct pipex_session_req *)addr); @@ -445,21 +427,6 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) (struct pipex_session_close_req *)addr); break; - case PIPEXCSESSION: - error = pppx_config_session(pxd, - (struct pipex_session_config_req *)addr); - break; - - case PIPEXGSTAT: - error = pppx_get_stat(pxd, - (struct pipex_session_stat_req *)addr); - break; - - case PIPEXGCLOSED: - error = pppx_get_closed(pxd, - (struct pipex_session_list_req *)addr); - break; - case PIPEXSIFDESCR: error = pppx_set_session_descr(pxd, (struct pipex_session_descr_req *)addr); @@ -472,7 +439,7 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; default: - error = ENOTTY; + error = pipex_ioctl(pxd, cmd, addr); break; } NET_UNLOCK(); @@ -742,11 +709,7 @@ pppx_add_session(struct pppx_dev *pxd, struct pipex_session_req *req) if_addrhooks_run(ifp); } - /* fake a pipex interface context */ - pxi->pxi_ifcontext.ifindex = ifp->if_index; - pxi->pxi_ifcontext.pipexmode = PIPEX_ENABLED; - - error = pipex_link_session(session, >pxi_ifcontext); + error = pipex_link_session(session, ifp, pxd); if (error) goto detach; @@ -786,40 +749,6 @@ pppx_del_session(struct pppx_dev *pxd, struct pipex_session_close_req *req) return (0); } -int -pppx_config_session(struct pppx_dev *pxd, -struct pipex_session_config_req *req) -{ - struct pppx_if *pxi; - - pxi = pppx_if_find(pxd,
Re: pppac(4): destroy sessions the same way as pppx(4) does
On Wed, 12 Aug 2020 12:26:22 +0300 Vitaliy Makkoveev wrote: > We destroy pppx(4) related sessions while we performing PIPEXDSESSION > command. But with pppac(4) we set session's state to > PIPEX_STATE_CLOSE_WAIT2 and we wait garbage collector to do destruction. pppac's PIPEXDSESSION set the states PIPEX_STATE_CLOSED. It is to wait until pipex{in,out}q becomes empty. > We removed `pipex{in,out}q'. So we can safe destroy session in any time. > I propose to make pppac(4) session destruction path the same as pppx(4) > does. Now we destroy them while performing PIPEXDSESSION commad too. Yes. I agree this point. > Also there is no in-kernel garbage collector for pppac(4) sessions. > yasuoka@ pointed me that npppd(8) should kill expired sessions. > > This not only makes pppac(4) closer to pppx(4) but simplify code and > allow us to make safe pppx(4) session processing by pipex_timer(). > So this is preparation step to restore in-kernel timeout for pppx(4) > too. Below, I am asking to keep the timeout behavior. There is a bug for pppx(4) but it had been working for pppac(4) for long time. If you really want to change the behavior please provide a reason. I have not so strong opinion but I don't want to change the behavior without a reason. > Index: sys/net/pipex.c > === > RCS file: /cvs/src/sys/net/pipex.c,v > retrieving revision 1.124 > diff -u -p -r1.124 pipex.c > --- sys/net/pipex.c 12 Aug 2020 08:41:39 - 1.124 > +++ sys/net/pipex.c 12 Aug 2020 09:07:12 - > @@ -536,29 +536,6 @@ out: > return error; > } > > -int > -pipex_notify_close_session(struct pipex_session *session) > -{ > - NET_ASSERT_LOCKED(); > - session->state = PIPEX_STATE_CLOSE_WAIT; > - session->stat.idle_time = 0; > - LIST_INSERT_HEAD(_close_wait_list, session, state_list); > - > - return (0); > -} > - Unrelated but ok. 
> -int > -pipex_notify_close_session_all(void) > -{ > - struct pipex_session *session; > - > - NET_ASSERT_LOCKED(); > - LIST_FOREACH(session, _session_list, session_list) > - if (session->state == PIPEX_STATE_OPENED) > - pipex_notify_close_session(session); > - return (0); > -} > - Unrelated but ok. Since it's not used. > Static int > pipex_close_session(struct pipex_session_close_req *req, > struct pipex_iface_context *iface) > @@ -573,13 +550,9 @@ pipex_close_session(struct pipex_session > if (session->pipex_iface != iface) > return (EINVAL); > > - /* remove from close_wait list */ > - if (session->state == PIPEX_STATE_CLOSE_WAIT) > - LIST_REMOVE(session, state_list); > - This must be kept. Useland may PIPEXDSESSION before PIPEXGCLOSED for this session. > /* get statistics before destroy the session */ > req->pcr_stat = session->stat; > - session->state = PIPEX_STATE_CLOSED; > + pipex_destroy_session(session); > > return (0); > } ok > @@ -739,47 +712,25 @@ pipex_timer_stop(void) > Static void > pipex_timer(void *ignored_arg) > { > - struct pipex_session *session, *session_tmp; > + struct pipex_session *session; > > timeout_add_sec(_timer_ch, pipex_prune); > > NET_LOCK(); > /* walk through */ > - LIST_FOREACH_SAFE(session, _session_list, session_list, > - session_tmp) { > - switch (session->state) { > - case PIPEX_STATE_OPENED: > - if (session->timeout_sec == 0) > - continue; > - > - session->stat.idle_time++; > - if (session->stat.idle_time < session->timeout_sec) > - continue; > - > - pipex_notify_close_session(session); > - break; > - > - case PIPEX_STATE_CLOSE_WAIT: > - case PIPEX_STATE_CLOSE_WAIT2: > - /* Wait PIPEXDSESSION from userland */ > - session->stat.idle_time++; > - if (session->stat.idle_time < PIPEX_CLOSE_TIMEOUT) > - continue; > - > - if (session->state == PIPEX_STATE_CLOSE_WAIT) > - LIST_REMOVE(session, state_list); > - session->state = PIPEX_STATE_CLOSED; > - /* FALLTHROUGH */ > + LIST_FOREACH(session, _session_list, session_list) { > + if 
(session->state != PIPEX_STATE_OPENED) > + continue; > + if (session->timeout_sec == 0) > + continue; > > - case PIPEX_STATE_CLOSED: > - pipex_destroy_session(session); > - break; > + session->stat.idle_time++; > + if (session->stat.idle_time < session->timeout_sec) > + continue; > > - default: > - break; > -
Re: pipex "idle-timeout" work with pppx(4).
Hi, On Wed, 12 Aug 2020 12:38:39 +0300 Vitaliy Makkoveev wrote: > We don't need to mark pppx(4) sessions because there is no special cases > for them. We just need to kill pppx(4) related "pr_timeout_sec != 0" > checks and call pipex_get_closed() by pppx_get_closed(). How do you implement that by calling pipex_get_closed() from pppx_get_closed()? PIPEXGCLOSED picks up the expired sessions which are associated with the character device (/dev/{pppx,pppac}0). In the pppac(4) case, the character device is the same object as the pppac interface. But for pppx(4) it is not the same: pipex_session has no direct reference to the device. This is why my diff was modifying pipex_get_closed().
Re: pipex "idle-timeout" work with pppx(4).
On Tue, 11 Aug 2020 23:06:45 +0300 Vitaliy Makkoveev wrote: > We removed `pipex{in,out}q'. So now we can destroy pppac(4) session just > like we do in pppx(4) case. Also there is no reason to allow > pipex_timer() to destroy sessions - userland will do this by > PIPEXDSESSION. This permit us to use existing pipex_get_closed() for > both pppac(4) and pppx(4) without any modifications. > > So, I propose pipex_close_session() and pipex_timer() be like below. It doesn't seem to fix "idle-timeout". > We simplify pppac(4) session destruction. We unify behavior with pppx(4) > - we killing session just now. There is no reason to modify > pipex_get_closed() and pipex_link_session(). pppx(4) related sessions > can be processed by pipex_timer(). There is no performance impact. We need to modify pppx_get_closed() to implement idle-timeout. > Do you like this? We can do two diffs. The first to unify destruction > and the second to re-enable in-kernel timeout for pppx(4) and revert man > pages modifications. I have no objection to your "unify destruction". I'll rebase my diff after that work.
Re: pipex "idle-timeout" work with pppx(4).
My diff is to make pppx(4) have the same "idle-timeout" functionality. I strongly think pppx(4) must have the same functionality as pppac(4), because I don't see any reason for them to differ. Your pseudo code suggests something else. Would you like to change the existing behavior of pppac(4)? If so, what is the problem you are concerned about? I'd like you to explain how it relates to my diff, or give the background of the code. On Tue, 11 Aug 2020 01:20:45 +0300 Vitaliy Makkoveev wrote: > > >> On 10 Aug 2020, at 19:53, Vitaliy Makkoveev wrote: >> >> We are doing all wrong :) >> >> We can just unlink pppx(4) related session from `pipex_session_list' if >> it's time expired. But since this unlinked session is still exists in >> pppx(4) layer we can access through pppx_get_closed() without any >> search. We should only add flag to session which identifies it as >> pppx(4) related. >> >> I hope you like this idea. >> >> cut begin >> Static void >> pipex_timer(void *ignored_arg) >> { >>struct pipex_session *session, *session_tmp; >> >>timeout_add_sec(_timer_ch, pipex_prune); >> >>NET_LOCK(); >>/* walk through */ >>LIST_FOREACH_SAFE(session, _session_list, session_list, >>session_tmp) { >>switch (session->state) { >>case PIPEX_STATE_OPENED: >>if (session->timeout_sec == 0) >>continue; >> >>session->stat.idle_time++; >>if (session->stat.idle_time < session->timeout_sec) >>continue; >> >> if (session->pppx_session) >> pipex_unlink_session(session); >> else >> pipex_notify_close_session(session); >>break; >> /* ...
*/ >> } >> >> pppx_get_closed(struct pppx_dev *pxd, struct pipex_session_list_req *req) >> { >> struct pppx_if *pxi; >> >> pxi = pppx_if_find(pxd, req->pdr_session_id, req->pdr_protocol); >> if (pxi == NULL) >> return (EINVAL); >> >> memset(req, 0, sizeof(*req)); >> if (session->state == PIPEX_STATE_CLOSED) { >> req->plr_ppp_id[req->plr_ppp_id_count++] = session->ppp_id; >> pppx_if_destroy(pxi); >> } >> >> return 0; >> } > > Sorry for noise. I should avoid to write pseudo code.
Re: pipex "idle-timeout" work with pppx(4).
Hi, On Mon, 10 Aug 2020 16:30:27 +0300 Vitaliy Makkoveev wrote: > On Mon, Aug 10, 2020 at 03:12:02PM +0900, YASUOKA Masahiko wrote: >> On Sun, 9 Aug 2020 20:03:50 +0300 >> Vitaliy Makkoveev wrote: >> > On Sun, Aug 09, 2020 at 06:20:13PM +0300, Vitaliy Makkoveev wrote: >> >> You propose to unlink pppx(4) related session which reached timeout. I'm >> >> ok with this direction. But I see no reason to rework _get_closed() >> >> routines. >> >> >> >> in pppac(4) case it's assumed what if session is not yet destroyed by >> >> garbage collector, it will be destroyed while we performing PIPEXGCLOSED >> >> command. We can make pppx(4) behavior the same and I propose to >> >> pppx_get_closed() be like below. >> >> >> >> Also, nothing requires to modify pipex_get_closed(). >> >> >> >> cut begin >> > >> > Sorry, I mean >> > >> > pppx_get_closed(struct pppx_dev *pxd, struct pipex_session_list_req *req) >> > { >> >struct pppx_if *pxi; >> > >> >memset(req, 0, sizeof(*req)); >> > >> >while ((pxi = LIST_FIRST(>pxd_pxis))) { >> >if (pxi->pxi_session->state == session->state = >> >PIPEX_STATE_CLOSED) { >> >req->plr_ppp_id[req->plr_ppp_id_count++] = >> >pxi->pxi_session->ppp_id; >> >pppx_if_destroy(pxi); >> >} >> >} >> > >> >return 0; >> > } >> >> Yes, the diff doesn't seem to be completed but this way also will work. >> >> Usually there is few CLOSED session even if there is a lot of session. >> Also there is no CLOSED session if idle-timeout is not configured. I >> avoided that way because I think checking all sessions' state to find >> such the few sessions is too expensive. 
>> >> A way I am suggesting: >> >> @@ -622,7 +625,7 @@ pipex_get_stat(struct pipex_session_stat >> >> Static int >> pipex_get_closed(struct pipex_session_list_req *req, >> -struct pipex_iface_context *iface) >> +int (*isowner)(void *, struct pipex_session *), void *ctx) >> { >> struct pipex_session *session, *session_tmp; >> >> @@ -630,7 +633,7 @@ pipex_get_closed(struct pipex_session_li >> bzero(req, sizeof(*req)); >> LIST_FOREACH_SAFE(session, _close_wait_list, state_list, >> session_tmp) { >> -if (session->pipex_iface != iface) >> +if (!isowner(ctx, session)) >> continue; >> req->plr_ppp_id[req->plr_ppp_id_count++] = session->ppp_id; >> LIST_REMOVE(session, state_list); >> >> uses pipex_close_wait_list which contains only sessions which is timed >> out. > > You are right. pipex_get_closed() walks through `pipex_close_wait_list' > which contains only CLOSE_WAIT sessions. > > According to npppd(8) code we do PIPEXGCLOSED related walkthrough once > per NPPPD_TIMER_TICK_IVAL seconds, which is defined as 4. Is this such > performance impact? It might be not so expensive for you. But why do you intend to use that extra CPU when you have a cheaper way? > Also who should destroy these sessions? It's assumed npppd(8) will > destroy them by l2tp_ctrl_timeout() and pptp_ctrl_timeout()? Excuse me > if I'm wrong, but who will destroy sessions in pppoe case? In usr.sbin/npppd/npppd/npppd.c: 1306 static void 1307 pipex_periodic(npppd *_this) 1308 { (snip) 1326 do { 1327 error = ioctl(devf, PIPEXGCLOSED, ); 1328 if (error) { 1329 if (errno != ENXIO) 1330 log_printf(LOG_WARNING, 1331 "PIPEXGCLOSED failed: %m"); 1332 break; 1333 } 1334 for (i = 0; i < req.plr_ppp_id_count; i++) { 1335 ppp_id = req.plr_ppp_id[i]; 1336 slist_add(, (void *)(uintptr_t)ppp_id); 1337 } 1338 } while (req.plr_flags & PIPEX_LISTREQ_MORE); ppp sessions which are closed by pipex(4) is inserted into "dlist". 
1350 /* Disconnect request */ 1351 slist_itr_first(); 1352 while (slist_itr_has_next()) { (snip) 1372 ppp_log(ppp, LOG_INFO, "Stop requested by the kernel"); 1373 /* TODO: PIPEX doesn't return the disconect reason */ 1374 #ifdef USE_NPPPD_RADIUS 1375 ppp_set_radius_terminate_cause(ppp, 1376 RADIUS_TERMNATE_CAUSE_IDLE_TIMEOUT); 1377 #endif 1378 ppp_stop(ppp, NULL); All of those ppp sessions are stopped at #1378. PPP finishes layer by layer, and then ppp_stop0() will be called. That function will call PIPEXDSESSION. I'd like to emphasize that npppd(8) takes responsibility for the creation/deletion of pipex sessions even when an idle timeout happens.
Re: pipex "idle-timeout" work with pppx(4).
Hi, Thank you for your review. On Sun, 9 Aug 2020 20:03:50 +0300 Vitaliy Makkoveev wrote: > On Sun, Aug 09, 2020 at 06:20:13PM +0300, Vitaliy Makkoveev wrote: >> You propose to unlink pppx(4) related session which reached timeout. I'm >> ok with this direction. But I see no reason to rework _get_closed() >> routines. >> >> in pppac(4) case it's assumed what if session is not yet destroyed by >> garbage collector, it will be destroyed while we performing PIPEXGCLOSED >> command. We can make pppx(4) behavior the same and I propose to >> pppx_get_closed() be like below. >> >> Also, nothing requires to modify pipex_get_closed(). >> >> cut begin > > Sorry, I mean > > pppx_get_closed(struct pppx_dev *pxd, struct pipex_session_list_req *req) > { > struct pppx_if *pxi; > > memset(req, 0, sizeof(*req)); > > while ((pxi = LIST_FIRST(>pxd_pxis))) { > if (pxi->pxi_session->state == session->state = > PIPEX_STATE_CLOSED) { > req->plr_ppp_id[req->plr_ppp_id_count++] = > pxi->pxi_session->ppp_id; > pppx_if_destroy(pxi); > } > } > > return 0; > } Yes, the diff doesn't seem to be completed but this way also will work. Usually there is few CLOSED session even if there is a lot of session. Also there is no CLOSED session if idle-timeout is not configured. I avoided that way because I think checking all sessions' state to find such the few sessions is too expensive. 
A way I am suggesting: @@ -622,7 +625,7 @@ pipex_get_stat(struct pipex_session_stat Static int pipex_get_closed(struct pipex_session_list_req *req, -struct pipex_iface_context *iface) +int (*isowner)(void *, struct pipex_session *), void *ctx) { struct pipex_session *session, *session_tmp; @@ -630,7 +633,7 @@ pipex_get_closed(struct pipex_session_li bzero(req, sizeof(*req)); LIST_FOREACH_SAFE(session, _close_wait_list, state_list, session_tmp) { - if (session->pipex_iface != iface) + if (!isowner(ctx, session)) continue; req->plr_ppp_id[req->plr_ppp_id_count++] = session->ppp_id; LIST_REMOVE(session, state_list); uses pipex_close_wait_list which contains only sessions which is timed out. >> Also I have one inlined comment within your diff. >> > @@ -430,6 +425,7 @@ pipex_link_session(struct pipex_session >> >struct pipex_iface_context *iface) >> > { >> >struct pipex_hash_head *chain; >> > + struct ifnet *ifp; >> > >> >NET_ASSERT_LOCKED(); >> > >> > @@ -442,6 +438,11 @@ pipex_link_session(struct pipex_session >> >session->pipex_iface = iface; >> >session->ifindex = iface->ifindex; >> > >> > + ifp = if_get(iface->ifindex); >> > + if (ifp != NULL && ifp->if_flags & IFF_POINTOPOINT) >> > + session->is_p2p = 1; >> > + if_put(ifp); >> > + >> >> I guess NULL `ifp' here exposes us a bug. I like to have assertion here. ok, I agree here. The diff is updated. 
Index: sys/net/if_pppx.c === RCS file: /cvs/src/sys/net/if_pppx.c,v retrieving revision 1.98 diff -u -p -r1.98 if_pppx.c --- sys/net/if_pppx.c 28 Jul 2020 09:53:36 - 1.98 +++ sys/net/if_pppx.c 10 Aug 2020 06:09:52 - @@ -185,6 +185,7 @@ int pppx_config_session(struct pppx_dev struct pipex_session_config_req *); intpppx_get_stat(struct pppx_dev *, struct pipex_session_stat_req *); +intpppx_is_owner(void *, struct pipex_session *); intpppx_get_closed(struct pppx_dev *, struct pipex_session_list_req *); intpppx_set_session_descr(struct pppx_dev *, @@ -645,14 +646,6 @@ pppx_add_session(struct pppx_dev *pxd, s struct in_ifaddr *ia; struct sockaddr_in ifaddr; - /* -* XXX: As long as `session' is allocated as part of a `pxi' -* it isn't possible to free it separately. So disallow -* the timeout feature until this is fixed. -*/ - if (req->pr_timeout_sec != 0) - return (EINVAL); - error = pipex_init_session(, req); if (error) return (error); @@ -812,12 +805,22 @@ pppx_get_stat(struct pppx_dev *pxd, stru } int -pppx_get_closed(struct pppx_dev *pxd, struct pipex_session_list_req *req) +pppx_is_owner(void *ctx, struct pipex_session *session) { - /* XXX: Only opened sessions exist for pppx(4) */ - memset(req, 0, sizeof(*req)); + struct pppx_dev *pxd = ctx; + struct pppx_if *pxi; - return 0; + pxi = pppx_if_find(pxd, session->session_id, session->protocol); + if (pxi != NULL) + return (1); + + return (0); +} + +int +pppx_get_closed(struct pppx_dev *pxd, struct pipex_session_list_req *req) +{ +
pipex "idle-timeout" work with pppx(4).
This diff makes pipex "idle-timeout" work with pppx(4). ok? Index: sys/net/if_pppx.c === RCS file: /disk/cvs/openbsd/src/sys/net/if_pppx.c,v retrieving revision 1.98 diff -u -p -r1.98 if_pppx.c --- sys/net/if_pppx.c 28 Jul 2020 09:53:36 - 1.98 +++ sys/net/if_pppx.c 9 Aug 2020 08:05:16 - @@ -185,6 +185,7 @@ int pppx_config_session(struct pppx_dev struct pipex_session_config_req *); intpppx_get_stat(struct pppx_dev *, struct pipex_session_stat_req *); +intpppx_is_owner(void *, struct pipex_session *); intpppx_get_closed(struct pppx_dev *, struct pipex_session_list_req *); intpppx_set_session_descr(struct pppx_dev *, @@ -645,14 +646,6 @@ pppx_add_session(struct pppx_dev *pxd, s struct in_ifaddr *ia; struct sockaddr_in ifaddr; - /* -* XXX: As long as `session' is allocated as part of a `pxi' -* it isn't possible to free it separately. So disallow -* the timeout feature until this is fixed. -*/ - if (req->pr_timeout_sec != 0) - return (EINVAL); - error = pipex_init_session(, req); if (error) return (error); @@ -812,12 +805,22 @@ pppx_get_stat(struct pppx_dev *pxd, stru } int -pppx_get_closed(struct pppx_dev *pxd, struct pipex_session_list_req *req) +pppx_is_owner(void *ctx, struct pipex_session *session) { - /* XXX: Only opened sessions exist for pppx(4) */ - memset(req, 0, sizeof(*req)); + struct pppx_dev *pxd = ctx; + struct pppx_if *pxi; - return 0; + pxi = pppx_if_find(pxd, session->session_id, session->protocol); + if (pxi != NULL) + return (1); + + return (0); +} + +int +pppx_get_closed(struct pppx_dev *pxd, struct pipex_session_list_req *req) +{ + return (pipex_get_closed(req, pppx_is_owner, pxd)); } int @@ -1059,6 +1062,7 @@ static intpppac_ioctl(struct ifnet *, u static int pppac_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); static voidpppac_start(struct ifnet *); +static int pppac_is_owner(void *, struct pipex_session *); static inline struct pppac_softc * pppac_lookup(dev_t dev) @@ -1251,6 +1255,16 @@ pppacwrite(dev_t dev, struct 
uio *uio, i } int +pppac_is_owner(void *ctx, struct pipex_session *session) +{ + struct pppac_softc *sc = ctx; + + if (session->ifindex == sc->sc_if.if_index) + return (1); + return (0); +} + +int pppacioctl(dev_t dev, u_long cmd, caddr_t data, int flags, struct proc *p) { struct pppac_softc *sc = pppac_lookup(dev); @@ -1264,6 +1278,13 @@ pppacioctl(dev_t dev, u_long cmd, caddr_ break; case FIONREAD: *(int *)data = mq_hdatalen(>sc_mq); + break; + + case PIPEXGCLOSED: + NET_LOCK(); + error = pipex_get_closed((struct pipex_session_list_req *)data, + pppac_is_owner, sc); + NET_UNLOCK(); break; default: Index: sys/net/pipex.c === RCS file: /disk/cvs/openbsd/src/sys/net/pipex.c,v retrieving revision 1.123 diff -u -p -r1.123 pipex.c --- sys/net/pipex.c 4 Aug 2020 09:32:05 - 1.123 +++ sys/net/pipex.c 9 Aug 2020 08:05:16 - @@ -240,11 +240,6 @@ pipex_ioctl(struct pipex_iface_context * pipex_iface); break; - case PIPEXGCLOSED: - ret = pipex_get_closed((struct pipex_session_list_req *)data, - pipex_iface); - break; - default: ret = ENOTTY; break; @@ -430,6 +425,7 @@ pipex_link_session(struct pipex_session struct pipex_iface_context *iface) { struct pipex_hash_head *chain; + struct ifnet *ifp; NET_ASSERT_LOCKED(); @@ -442,6 +438,11 @@ pipex_link_session(struct pipex_session session->pipex_iface = iface; session->ifindex = iface->ifindex; + ifp = if_get(iface->ifindex); + if (ifp != NULL && ifp->if_flags & IFF_POINTOPOINT) + session->is_p2p = 1; + if_put(ifp); + LIST_INSERT_HEAD(_session_list, session, session_list); chain = PIPEX_ID_HASHTABLE(session->session_id); LIST_INSERT_HEAD(chain, session, id_chain); @@ -469,6 +470,8 @@ pipex_unlink_session(struct pipex_sessio session->ifindex = 0; NET_ASSERT_LOCKED(); + if (session->state == PIPEX_STATE_CLOSED) + return; LIST_REMOVE(session, id_chain); #if defined(PIPEX_PPTP) || defined(PIPEX_L2TP) switch (session->protocol) { @@ -622,7 +625,7 @@ pipex_get_stat(struct pipex_session_stat Static int
Re: describe 'idle-timeout' exception in npppd.conf man page
On Sat, 8 Aug 2020 16:01:59 +0300 Vitaliy Makkoveev wrote: > On Sat, Aug 08, 2020 at 08:49:24PM +0900, YASUOKA Masahiko wrote: >> On Fri, 7 Aug 2020 22:19:05 +0300 >> Vitaliy Makkoveev wrote: >> > Some times ago we disabled in-kernel timeout for pppx(4) related >> > pipex(4) sessions. We did this for prevent use after free issue caused >> > by pipex_timer [1]. By default "idle-timeout" is not set in >> > npppd.conf(5) and I guess this is reason for we forgot to describe this >> > exception in npppd.conf(5). >> > >> > But looks like one user caught this [2]. So I propose to describe this >> > in BUGS section of npppd.conf(5). >> > >> > Also current "idle-timeout" description looks incorrect. If this option >> > is missing, there is not in-kernel timeout for this session, but >> > npppd(8) uses it's own timeout for. And we can't configure this value. >> > >> > YASUOKA, what do you think? May be we can kill in-kernel timeout feature >> > for pipex(4)?, and make npppd(8)'s idle timeout configurable by this >> > option? >> >> I think we should mention this to the man page until we fix it. >> So I'd like you to update the man page first. >> >> I'll try to review the problem. >> > > Thanks. I updated my diff with changes proposed by jmc@. Are you agree > with them? Yes. ok yasuoka >> > 1. >> > https://cvsweb.openbsd.org/src/sys/net/if_pppx.c?rev=1.78=text/x-cvsweb-markup >> > 2. https://marc.info/?l=openbsd-misc=159655468504864=2 >> > >> > >> > Index: usr.sbin/npppd/npppd/npppd.conf.5 >> > === >> > RCS file: /cvs/src/usr.sbin/npppd/npppd/npppd.conf.5,v >> > retrieving revision 1.27 >> > diff -u -p -r1.27 npppd.conf.5 >> > --- usr.sbin/npppd/npppd/npppd.conf.5 23 Apr 2020 21:10:54 - >> > 1.27 >> > +++ usr.sbin/npppd/npppd/npppd.conf.5 7 Aug 2020 19:17:00 - >> > @@ -699,3 +699,9 @@ The current version of >> > .Xr npppd 8 >> > does not support adding or removing tunnel settings or changing listener >> > settings (listen address, port and l2tp-ipsec-require). 
>> > +.Pp >> > +This time >> > +.Xr pppx 4 >> > +does not allow to create sessions with non null >> > +.Ic idle-timeout >> > +option. >> >
Re: describe 'idle-timeout' exception in npppd.conf man page
On Fri, 7 Aug 2020 22:19:05 +0300 Vitaliy Makkoveev wrote: > Some times ago we disabled in-kernel timeout for pppx(4) related > pipex(4) sessions. We did this for prevent use after free issue caused > by pipex_timer [1]. By default "idle-timeout" is not set in > npppd.conf(5) and I guess this is reason for we forgot to describe this > exception in npppd.conf(5). > > But looks like one user caught this [2]. So I propose to describe this > in BUGS section of npppd.conf(5). > > Also current "idle-timeout" description looks incorrect. If this option > is missing, there is not in-kernel timeout for this session, but > npppd(8) uses it's own timeout for. And we can't configure this value. > > YASUOKA, what do you think? May be we can kill in-kernel timeout feature > for pipex(4)?, and make npppd(8)'s idle timeout configurable by this > option? I think we should mention this to the man page until we fix it. So I'd like you to update the man page first. I'll try to review the problem. > 1. > https://cvsweb.openbsd.org/src/sys/net/if_pppx.c?rev=1.78=text/x-cvsweb-markup > 2. https://marc.info/?l=openbsd-misc=159655468504864=2 > > > Index: usr.sbin/npppd/npppd/npppd.conf.5 > === > RCS file: /cvs/src/usr.sbin/npppd/npppd/npppd.conf.5,v > retrieving revision 1.27 > diff -u -p -r1.27 npppd.conf.5 > --- usr.sbin/npppd/npppd/npppd.conf.5 23 Apr 2020 21:10:54 - 1.27 > +++ usr.sbin/npppd/npppd/npppd.conf.5 7 Aug 2020 19:17:00 - > @@ -699,3 +699,9 @@ The current version of > .Xr npppd 8 > does not support adding or removing tunnel settings or changing listener > settings (listen address, port and l2tp-ipsec-require). > +.Pp > +This time > +.Xr pppx 4 > +does not allow to create sessions with non null > +.Ic idle-timeout > +option.
Re: [PATCH] pipex(4): rework PPP input
Sorry for the delayed reply. On Wed, 27 May 2020 01:29:36 +0300 Sergey Ryazanov wrote: > On Tue, May 26, 2020 at 12:07 PM Vitaliy Makkoveev > wrote: >>> On 25 May 2020, at 22:04, Sergey Ryazanov wrote: >>> On Sat, May 23, 2020 at 3:07 PM Vitaliy Makkoveev >>> wrote: For example, each pipex session should have unique pair of `protocol’ and `session_id’. These values are passed from userland. While the only instance of npppd(8) uses pipex(4) this is not the problem. But you introduce the case while pipex(4) will be used by multiple independent userland programs. At least, I have interest how you handle this. >>> >>> This should not be a problem here. npppd(8) support server mode only. >>> While my work is to implement acceleration for client side of L2TP >>> connection. >> >> I guess they can coexist. Also you can have multiple connections to >> ppp servers simultaneously. > > With 16 bits long session id field, according to birthday problem to > reach 0.9 collision probability I need 549 simultaneous sessions. > Should I still be worried or I have a time to complete integration > work and then update UDP filter for love of the game? usr.sbin/npppd/l2tp/l2tp_local.h 79 #define L2TP_SESSION_ID_MASK 0x7fff npppd uses only session ids in the range 0-32767.
Re: pipex(4): kill pipexintr()
On Mon, 3 Aug 2020 23:36:09 +0300 Vitaliy Makkoveev wrote: > On Tue, Aug 04, 2020 at 01:26:14AM +0900, YASUOKA Masahiko wrote: >> Comments? > > You introduce `cookie' as > > cookie = session->protocol << 16 | session->session_id; > > also multicast sessions initialized as > > session->protocol = PIPEX_PROTO_NONE; > session->session_id = ifindex; > > `protocol' and `session_id' come from userland, so I like to have checks > like below. It's allow us to avoid `cookie' be broken while > `pr_session_id' exceeds 16 bit integer. Also userland should not pass > PIPEX_PROTO_NONE as `pr_protocol' because we shouldn't have multicast > and not multicast sessions with the same `cookie'. > > cut begin > > pipex_init_session(struct pipex_session **rsession, > struct pipex_session_req *req) > { > if (req->pr_protocol == PIPEX_PROTO_NONE) > return (EINVAL); pipex_init_session() has the same check already. 287 int 288 pipex_init_session(struct pipex_session **rsession, 289 struct pipex_session_req *req) 290 { (snip) 297 switch (req->pr_protocol) { 298 #ifdef PIPEX_PPPOE 299 case PIPEX_PROTO_PPPOE: (snip) 333 default: 334 return (EPROTONOSUPPORT); 335 } > > if (req->pr_session_id > 0x) > return (EINVAL); > > cut end req->pr_session_id can't be > 0x since it's uint16_t. > Also cookies introduce invalidation problem. Yes, it has low > probability, but we can have operation order like below: > > 1. enqueue session with `protocol' = 0xaa and `session_id' = 0xbb, and > `cookie' = 0xaabb > 2. kill this session > 3. create new session `protocol' = 0xaa and `session_id' = 0xbb > 4. this newly created session will be used by pipexintr() > > As I have seen while played with refcounters, session can be enqueued > more than 10 times... The diff makes the problem worse, but it could happen already if the session-id is reused. > Also It's not obvious that interface index will never exceed 16 bit > counter. It's unsigned int and may be underlay counter's resolution > will be expanded in future. 
So I like to have at least corresponding > assertion in pipex_iface_init(). Right. This is fixable with another unique number. > So, may be my first solution is the best here. And, as mpi@ pointed, > ipsec(4) should be reworked to allow parallelism. Does first mean killing the pipexintr? What I explained was wrong. I'm sorry about this. On Fri, 31 Jul 2020 09:36:32 +0900 (JST) YASUOKA Masahiko wrote: > A packet of L2TP/IPsec (encapsulated IP/PPP/L2TP/UDP/ESP/UDP/IP) is > processed like: > >ipv4_input > ... >udp_input > ipsec_common_input > esp_input >crypto_dispatch > => crypto_taskq_mp_safe > >kthread "crynlk" > crypto_invoke >... (*1) > crypto_done > esp_input_cb >ipsec_common_input_cb > ip_deliver >udp_input > pipex_l2tp_input >pipex_common_input > (*2) > pipex_ppp_input >pipex_mppe_input (*3) > pipex_ppp_input >pipex_ip_input > ipv4_input >... This should be kthread "crynlk" crypto_invoke ... (*1) crypto_done kthread "crypto" < another thread ipsec_input_cb < this is missed esp_input_cb ipsec_common_input_cb ip_deliver udp_input pipex_l2tp_input pipex_common_input (*2) pipex_ppp_input pipex_mppe_input (*3) pipex_ppp_input pipex_ip_input ipv4_input ... > At *2 there was a queue. "crynlk" is a busy thread, since it is doing > decryption at *1. I think it's better pipex input is be done by > another thread than crypto since it also has decryption at *3. This is false. *3 is done by another thread. It is the same if crypto driver is not CRYPTOCAP_F_MPSAFE. (crypto_invoke() is done by the caller's thread and the callback (ipsec_input_cb) is called by"crypto" thread.) So I have no actual reason to keep the queues. ok yasuoka for the diff which kills pipexintr.
Re: pipex(4): kill pipexintr()
On Sat, 1 Aug 2020 18:52:27 +0300 Vitaliy Makkoveev wrote: > On Sat, Aug 01, 2020 at 07:44:17PM +0900, YASUOKA Masahiko wrote: >> I'm not sure when it is broken, in old versions, it was assumed the >> pipex queues are empty when pipex_iface_stop() is called. The problem >> mvs@ found is the assumption is not true any more. >> >> pipex has a mechanism that delete a session when the queues are empty. >> >> 819 Static void >> 820 pipex_timer(void *ignored_arg) >> 821 { >> (snip) >> 854 case PIPEX_STATE_CLOSED: >> 855 /* >> 856 * mbuf queued in pipexinq or pipexoutq may >> have a >> 857 * refererce to this session. >> 858 */ >> 859 if (!mq_empty() || >> !mq_empty()) >> 860 continue; >> 861 >> 862 pipex_destroy_session(session); >> 863 break; >> >> I think using this is better. >> >> How about this? > > Unfortunately your diff is incorrect. It introduces memory leaks and > breaks pppx(4). Also it is incomplete. Thank you for your feedbacks. > We have multiple ways to kill pipex(sessions): > > 1. pppx(4) > > We have `struct pppx_if' which has pointer to corresponding session and > this session is accessed directly within pppx(4) layer. Since we can't > destroy `ppp_if' in pipex(4) layer we can't destroy these sessions by > pipex_timer(). The only way to destroy them is pppx_if_destroy() which: > > 1. unlink session by pipex_unlink_session() > 2. detach corresponding `ifnet' by if_detach() > 3. release session by pipex_rele_session() > > It's unsafe because mbuf queues can have references to this session. Yes. > 2. pppac(4) > > We have no direct access to corresponding sessions within pppac(4) > layer. Also there are multiple ways to do this: > > 1. pipex_ioctl() with `PIPEXSMODE' command. Underlay pipex_iface_stop() > walks through `pipex_session_list' and destroy sessions by > pipex_destroy_session() call. It's unsafe because we don't check queues. > > 2. pipex_ioctl() with `PIPEXDSESSION'. 
pipex_close_session() will change > session's state and pipex_timer() will kill this sessions later. This > is the only safe way. > > 3. pipex_iface_fini(). The same as `PIPEXSMODE', pipex_iface_stop() > kills sessions, Which is also unsafe. Also we have another use after > free issue: > > cut begin > > pipex_iface_fini(struct pipex_iface_context *pipex_iface) > { > pool_put(_session_pool, pipex_iface->multicast_session); > NET_LOCK(); > pipex_iface_stop(pipex_iface); > NET_UNLOCK(); > } > > cut end > > `multicast_session' should be protected too. It also can be pushed to > `pipexoutq'. Yes, I missed this point. > Also since this time pipexintr() and pipex_iface_fini() are > both serialized by KERNEL_LOCK() too we can't destroy `multicast_session' > which is in use by pipexintr(). But when we will drop KERNEL_LOCK() > around pipexintr() we can catch use after free issue here. I already did > diff for move this pool_put() under NET_LOCK(), but it was rejectedi by > mpi@ because: > > cut begin > pipex_iface_fini() should be called on the last reference of the > > descriptor. So this shouldn't be necessary. If there's an issue > > with the current order of the operations, we should certainly fix > > it differently. > cut end Yes, I understand what mpi@ is saying. But this is a separate story. > So I repeat it again: npppd(8) can be killed in every moment by SIGKILL > or by SIGSEGV and pppacclose() will be called and it will call > pipex_iface_fini(). `multicast_session' can be used in this moment by > pipexintr(). > > And no locks protect `multicast_session' itself. > > The two diffs I proposed in this thread solve problems caused by > pipexintr(). There are a lot of ways to solve the problems. The diff I sent few days ago is to destruct the pipex sessions in the pipex timer. As you pointed out it has some problems. Those problems can be fixed, but I'd suggest another way. I attached at last. The problem exposed is "use-after-free". 
Since I think this is not a problem of parallel processing, having reference counter seems too much for me. The diff is not to refer the session by a pointer, but by the id. The idea is come from IPsec tdb. Comments? diff --git a/sys/net/pipex.c b/sys/net/p
Re: pipex(4): kill pipexintr()
Hi, I'm not sure when it is broken, in old versions, it was assumed the pipex queues are empty when pipex_iface_stop() is called. The problem mvs@ found is the assumption is not true any more. pipex has a mechanism that delete a session when the queues are empty. 819 Static void 820 pipex_timer(void *ignored_arg) 821 { (snip) 854 case PIPEX_STATE_CLOSED: 855 /* 856 * mbuf queued in pipexinq or pipexoutq may have a 857 * refererce to this session. 858 */ 859 if (!mq_empty() || !mq_empty()) 860 continue; 861 862 pipex_destroy_session(session); 863 break; I think using this is better. How about this? diff --git a/sys/net/pipex.c b/sys/net/pipex.c index 2ad7757fee9..6fe14c400bf 100644 --- a/sys/net/pipex.c +++ b/sys/net/pipex.c @@ -190,7 +190,7 @@ pipex_iface_stop(struct pipex_iface_context *pipex_iface) LIST_FOREACH_SAFE(session, _session_list, session_list, session_tmp) { if (session->pipex_iface == pipex_iface) - pipex_destroy_session(session); + pipex_unlink_session(session); } } @@ -470,9 +470,16 @@ pipex_link_session(struct pipex_session *session, void pipex_unlink_session(struct pipex_session *session) { + struct radix_node *rn; + session->ifindex = 0; NET_ASSERT_LOCKED(); + if (!in_nullhost(session->ip_address.sin_addr)) { + rn = rn_delete(>ip_address, >ip_netmask, + pipex_rd_head4, (struct radix_node *)session); + KASSERT(rn != NULL); + } LIST_REMOVE(session, id_chain); #if defined(PIPEX_PPTP) || defined(PIPEX_L2TP) switch (session->protocol) { @@ -486,10 +493,6 @@ pipex_unlink_session(struct pipex_session *session) LIST_REMOVE(session, state_list); LIST_REMOVE(session, session_list); session->state = PIPEX_STATE_CLOSED; - - /* if final session is destroyed, stop timer */ - if (LIST_EMPTY(_session_list)) - pipex_timer_stop(); } Static int @@ -652,20 +655,16 @@ pipex_get_closed(struct pipex_session_list_req *req, Static int pipex_destroy_session(struct pipex_session *session) { - struct radix_node *rn; - /* remove from radix tree and hash chain */ 
NET_ASSERT_LOCKED(); - if (!in_nullhost(session->ip_address.sin_addr)) { - rn = rn_delete(>ip_address, >ip_netmask, - pipex_rd_head4, (struct radix_node *)session); - KASSERT(rn != NULL); - } - pipex_unlink_session(session); pipex_rele_session(session); + /* if final session is destroyed, stop timer */ + if (LIST_EMPTY(_session_list)) + pipex_timer_stop(); + return (0); } @@ -739,7 +738,8 @@ pipexintr(void) mq_delist(, ); while ((m = ml_dequeue()) != NULL) { pkt_session = m->m_pkthdr.ph_cookie; - if (pkt_session == NULL) { + if (pkt_session == NULL || + pkt_session->state == PIPEX_STATE_CLOSED) { m_freem(m); continue; } @@ -776,7 +776,8 @@ pipexintr(void) mq_delist(, ); while ((m = ml_dequeue()) != NULL) { pkt_session = m->m_pkthdr.ph_cookie; - if (pkt_session == NULL) { + if (pkt_session == NULL || + pkt_session->state == PIPEX_STATE_CLOSED) { m_freem(m); continue; }
Re: pipex(4): kill pipexintr()
On Thu, 30 Jul 2020 22:43:10 +0300 Vitaliy Makkoveev wrote: > On Thu, Jul 30, 2020 at 10:05:13PM +0900, YASUOKA Masahiko wrote: >> On Thu, 30 Jul 2020 15:34:09 +0300 >> Vitaliy Makkoveev wrote: >> > On Thu, Jul 30, 2020 at 09:13:46PM +0900, YASUOKA Masahiko wrote: >> >> If the diff removes the queue, then the pipex input routine is >> >> executed by the NIC's interrupt handler. >> >> >> >> The queues had been made to avoid that kind of situations. >> > >> > It's not enqueued in pppoe case. According pipex_pppoe_input() code we >> > call pipex_common_input() with `useq' argument set to '0', so we don't >> > enqueue mbuf(9) but pass it to pipex_ppp_input() which will pass it to >> > ipv{4,6}_input(). >> >> You are right. Sorry, I forgot about this which I did that by myself. > > I'm interesting the reason why you did that. I remember now: it was the first of the MP steps for pipex. At that time, I discussed it with mpi, and he suggested the following. 1. stop enqueueing packets for PPPoE 2. try not to take a kernel lock before calling gre_input(), then we can also stop enqueueing packets for PPTP(GRE) 3. for L2TP, keep the queue and change the netisr to an unlocked task
Re: pipex(4): kill pipexintr()
On Thu, 30 Jul 2020 22:43:10 +0300 Vitaliy Makkoveev wrote: > On Thu, Jul 30, 2020 at 10:05:13PM +0900, YASUOKA Masahiko wrote: >> On Thu, 30 Jul 2020 15:34:09 +0300 >> Vitaliy Makkoveev wrote: >> > On Thu, Jul 30, 2020 at 09:13:46PM +0900, YASUOKA Masahiko wrote: >> >> Hi, >> >> >> >> sys/net/if_ethersubr.c: >> >> 372 void >> >> 373 ether_input(struct ifnet *ifp, struct mbuf *m) >> >> (snip) >> >> 519 #if NPPPOE > 0 || defined(PIPEX) >> >> 520 case ETHERTYPE_PPPOEDISC: >> >> 521 case ETHERTYPE_PPPOE: >> >> 522 if (m->m_flags & (M_MCAST | M_BCAST)) >> >> 523 goto dropanyway; >> >> 524 #ifdef PIPEX >> >> 525 if (pipex_enable) { >> >> 526 struct pipex_session *session; >> >> 527 >> >> 528 if ((session = pipex_pppoe_lookup_session(m)) >> >> != NULL) { >> >> 529 pipex_pppoe_input(m, session); >> >> 530 return; >> >> 531 } >> >> 532 } >> >> 533 #endif >> >> >> >> previously a packet which branchces to #529 is enqueued. >> >> >> >> If the diff removes the queue, then the pipex input routine is >> >> executed by the NIC's interrupt handler. >> >> >> >> The queues had been made to avoid that kind of situations. >> > >> > It's not enqueued in pppoe case. According pipex_pppoe_input() code we >> > call pipex_common_input() with `useq' argument set to '0', so we don't >> > enqueue mbuf(9) but pass it to pipex_ppp_input() which will pass it to >> > ipv{4,6}_input(). >> >> You are right. Sorry, I forgot about this which I did that by myself. >> > > I'm interesting the reason why you did that. > >> >> Also I don't see a relation of the use-after-free problem and killing >> >> queues. Can't we fix the problem unless we kill the queues? >> > >> > Yes we can. Reference counters allow us to keep orphan sessions in these >> > queues without use after free issue. >> > >> > I will wait your commentaries current enqueuing before to do something. >> >> I have another concern. >> >> You might know, when L2TP/IPsec is used heavily, the crypto thread >> uses 100% of 1 CPU core. 
In that case, that thread becomes like >> below: >> >> crypto thread -> udp_userreq -> pipex_l2tp_input >> >> some clients are using MPPE(RC4 encryption) on CCP. It's not so >> light. >> >> How do we offload this for CPUs? I am thinking that "pipex" can have >> a dedicated thread. Do we have another scenario? >> > > I suppose you mean udp_input(). What is you call "crypto thread"? I did > a little backtrace but I didn't find this thread. > > ether_resolve > if_input_local > ipv4_input > ip_input_if > ip_ours > ip_deliver > udp_input (through pr_input) > pipex_l2tp_input > > ipi{,6}_mloopback > if_input_local > ipv4_input > ... > udp_input (through pr_input) > pipex_l2tp_input > > loinput > if_input_local > ipv4_input > ... > udp_input (through pr_input) > pipex_l2tp_input > > Also various pseudo drivers call ipv{4,6}_input() and underlay > udp_unput() too. > > Except nfs, we call udp_usrreq() through socket layer only. Do you mean > userland as "crypto thread"? Sorry, udp_usrreq() should be usr_input() and crypto thread meant a kthread for crypto_taskq_mp_safe, whose name is "crynlk" (see crypto_init()). A packet of L2TP/IPsec (encapsulated IP/PPP/L2TP/UDP/ESP/UDP/IP) is processed like: ipv4_input ... udp_input ipsec_common_input esp_input crypto_dispatch => crypto_taskq_mp_safe kthread "crynlk" crypto_invoke ... (*1) crypto_done esp_input_cb ipsec_common_input_cb ip_deliver udp_input pipex_l2tp_input pipex_common_input (*2) pipex_ppp_input pipex_mppe_input (*3) pipex_ppp_input pipex_ip_input
Re: pipex(4): kill pipexintr()
On Thu, 30 Jul 2020 15:34:09 +0300 Vitaliy Makkoveev wrote: > On Thu, Jul 30, 2020 at 09:13:46PM +0900, YASUOKA Masahiko wrote: >> Hi, >> >> sys/net/if_ethersubr.c: >> 372 void >> 373 ether_input(struct ifnet *ifp, struct mbuf *m) >> (snip) >> 519 #if NPPPOE > 0 || defined(PIPEX) >> 520 case ETHERTYPE_PPPOEDISC: >> 521 case ETHERTYPE_PPPOE: >> 522 if (m->m_flags & (M_MCAST | M_BCAST)) >> 523 goto dropanyway; >> 524 #ifdef PIPEX >> 525 if (pipex_enable) { >> 526 struct pipex_session *session; >> 527 >> 528 if ((session = pipex_pppoe_lookup_session(m)) != >> NULL) { >> 529 pipex_pppoe_input(m, session); >> 530 return; >> 531 } >> 532 } >> 533 #endif >> >> previously a packet which branchces to #529 is enqueued. >> >> If the diff removes the queue, then the pipex input routine is >> executed by the NIC's interrupt handler. >> >> The queues had been made to avoid that kind of situations. > > It's not enqueued in pppoe case. According pipex_pppoe_input() code we > call pipex_common_input() with `useq' argument set to '0', so we don't > enqueue mbuf(9) but pass it to pipex_ppp_input() which will pass it to > ipv{4,6}_input(). You are right. Sorry, I forgot about this which I did that by myself. >> Also I don't see a relation of the use-after-free problem and killing >> queues. Can't we fix the problem unless we kill the queues? > > Yes we can. Reference counters allow us to keep orphan sessions in these > queues without use after free issue. > > I will wait your commentaries current enqueuing before to do something. I have another concern. You might know, when L2TP/IPsec is used heavily, the crypto thread uses 100% of 1 CPU core. In that case, that thread becomes like below: crypto thread -> udp_userreq -> pipex_l2tp_input some clients are using MPPE(RC4 encryption) on CCP. It's not so light. How do we offload this for CPUs? I am thinking that "pipex" can have a dedicated thread. Do we have another scenario? --yasuoka
Re: pipex(4): kill pipexintr()
Hi, sys/net/if_ethersubr.c: 372 void 373 ether_input(struct ifnet *ifp, struct mbuf *m) (snip) 519 #if NPPPOE > 0 || defined(PIPEX) 520 case ETHERTYPE_PPPOEDISC: 521 case ETHERTYPE_PPPOE: 522 if (m->m_flags & (M_MCAST | M_BCAST)) 523 goto dropanyway; 524 #ifdef PIPEX 525 if (pipex_enable) { 526 struct pipex_session *session; 527 528 if ((session = pipex_pppoe_lookup_session(m)) != NULL) { 529 pipex_pppoe_input(m, session); 530 return; 531 } 532 } 533 #endif previously a packet which branchces to #529 is enqueued. If the diff removes the queue, then the pipex input routine is executed by the NIC's interrupt handler. The queues had been made to avoid that kind of situations. Also I don't see a relation of the use-after-free problem and killing queues. Can't we fix the problem unless we kill the queues? On Wed, 29 Jul 2020 23:04:36 +0300 Vitaliy Makkoveev wrote: > Now pipex(4) is fully covered by NET_LOCK() and this is documented. But > we still have an issue with pipex(4) session itself and I guess it's > time to fix it. > > We have `pipexinq' and `pipexoutq' mbuf(9) queues to store mbufs. Each > mbuf(9) passed to these queues stores the pointer to corresponding > session referenced as `m_pkthdr.ph_cookie'. We enqueue incoming mbufs for > pppx(4) and incoming and outgoing mbufs for pppac(4). But we don't > enqueue pppoe related mbufs. 
After packet was enqueued to corresponding > queue we call schednetisr() which just schedules netisr() to run: > > cut begin > > 780 pipex_ppp_enqueue(struct mbuf *m0, struct pipex_session *session, > 781 struct mbuf_queue *mq) > 782 { > 783 m0->m_pkthdr.ph_cookie = session; > 784 /* XXX need to support other protocols */ > 785 m0->m_pkthdr.ph_ppp_proto = PPP_IP; > 786 > 787 if (mq_enqueue(mq, m0) != 0) > 788 return (1); > 789 > 790 schednetisr(NETISR_PIPEX); > 791 > 792 return (0); > 793 } > > cut end > > Also we have pipex_timer() which should destroy session in safe way, but > it does this only for pppac(4) and only for sessions closed by > `PIPEXDSESSION' command: > > cut begin > > 812 pipex_timer(void *ignored_arg) > 813 { > /* skip */ > 846 case PIPEX_STATE_CLOSED: > 847 /* > 848 * mbuf queued in pipexinq or pipexoutq may have a > 849* refererce to this session. > 850 */ > 851 if (!mq_empty() || !mq_empty()) > 852 continue; > 853 > 854 pipex_destroy_session(session); > 855 break; > > cut end > > While we destroy sessions through pipex_rele_session() or through > pipex_iface_fini() or through `PIPEXSMODE' command we don't check > `pipexinq' and `pipexoutq' state. This means we can break them. > > It's not guaranteed that netisr() will start just after schednetisr() > call. This means we can destroy session, but corresponding mbuf(9) is > stored within `pipexinq' or `pipexoutq'. It's `m_pkthdr.ph_cookie' still > stores pointer to destroyed session and we have use after free issue. I > wonder why we didn't caught panic yet. > > I propose to kill `pipexinq', `pipexoutq' and pipexintr(). There is > absolutely no reason them to exist. This should not only fix issue > described above but simplifies code too. > > Other ways are to implement reference counters for session or walk > through mbuf(9) queues and kill corresponding mbufs. It doesn't make > sense to go these ways. 
> > Index: lib/libc/sys/sysctl.2 > === > RCS file: /cvs/src/lib/libc/sys/sysctl.2,v > retrieving revision 1.40 > diff -u -p -r1.40 sysctl.2 > --- lib/libc/sys/sysctl.2 17 May 2020 05:48:39 - 1.40 > +++ lib/libc/sys/sysctl.2 29 Jul 2020 13:47:40 - > @@ -2033,35 +2033,11 @@ The currently defined variable names are > .Bl -column "Third level name" "integer" "Changeable" -offset indent > .It Sy "Third level name" Ta Sy "Type" Ta Sy "Changeable" > .It Dv PIPEXCTL_ENABLE Ta integer Ta yes > -.It Dv PIPEXCTL_INQ Ta node Ta not applicable > -.It Dv PIPEXCTL_OUTQ Ta node Ta not applicable > .El > .Bl -tag -width "123456" > .It Dv PIPEXCTL_ENABLE > If set to 1, enable PIPEX processing. > The default is 0. > -.It Dv PIPEXCTL_INQ Pq Va net.pipex.inq > -Fourth level comprises an array of > -.Vt struct ifqueue > -structures containing information about the PIPEX packet input queue. > -The forth level names for the elements of > -.Vt struct ifqueue > -are the same as described in > -.Li ip.arpq > -in the > -.Dv PF_INET > -section. > -.It Dv PIPEXCTL_OUTQ Pq Va
Re: pf: route-to least-states
Hi, On Tue, 28 Jul 2020 18:54:48 +0200 Alexandr Nedvedicky wrote: > On Wed, Jul 29, 2020 at 01:22:48AM +0900, YASUOKA Masahiko wrote: >> Previous commit has a wrong part.. >> >> ok? >> >> Fix previous commit which referred wrong address. > > would it make sense to move the block, you've introduced earler > under the !PF_AZERO() branch just couple lines below. something > like this: > > 8<---8<---8<--8< > diff --git a/sys/net/pf_lb.c b/sys/net/pf_lb.c > index 510795a4d0b..f77d96a99ec 100644 > --- a/sys/net/pf_lb.c > +++ b/sys/net/pf_lb.c > @@ -322,13 +322,13 @@ pf_map_addr_sticky(sa_family_t af, struct pf_rule *r, > struct pf_addr *saddr, > return (-1); > } > > - if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_LEASTSTATES) { > - if (pf_map_addr_states_increase(af, rpool, naddr) == -1) > + if (!PF_AZERO(cached, af)) { > + pf_addrcpy(naddr, cached, af); > + if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_LEASTSTATES) > && > + ((pf_map_addr_states_increase(af, rpool, cached) == -1)) > return (-1); > } > > - if (!PF_AZERO(cached, af)) > - pf_addrcpy(naddr, cached, af); > if (pf_status.debug >= LOG_DEBUG) { > log(LOG_DEBUG, "pf: pf_map_addr: " > "src tracking (%u) maps ", type); > > 8<---8<---8<--8< > > It seems to me it would be better to bump number of states if and only if we > actually find some address in pool. Yes, I agree. ok? Fix previous commit which referred wrong address and returned wrong value. 
Index: sys/net/pf_lb.c === RCS file: /cvs/src/sys/net/pf_lb.c,v retrieving revision 1.66 diff -u -p -r1.66 pf_lb.c --- sys/net/pf_lb.c 28 Jul 2020 16:47:41 - 1.66 +++ sys/net/pf_lb.c 28 Jul 2020 17:01:34 - @@ -322,13 +322,13 @@ pf_map_addr_sticky(sa_family_t af, struc return (-1); } - if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_LEASTSTATES) { - if (pf_map_addr_states_increase(af, rpool, naddr) == -1) - return (-1); - } - if (!PF_AZERO(cached, af)) + if (!PF_AZERO(cached, af)) { pf_addrcpy(naddr, cached, af); + if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_LEASTSTATES && + pf_map_addr_states_increase(af, rpool, cached) == -1) + return (-1); + } if (pf_status.debug >= LOG_DEBUG) { log(LOG_DEBUG, "pf: pf_map_addr: " "src tracking (%u) maps ", type); @@ -651,7 +651,7 @@ pf_map_addr_states_increase(sa_family_t pf_print_host(naddr, 0, af); addlog(". Failed to increase count!\n"); } - return (1); + return (-1); } } else if (rpool->addr.type == PF_ADDR_DYNIFTL) { if (pfr_states_increase(rpool->addr.p.dyn->pfid_kt, @@ -663,7 +663,7 @@ pf_map_addr_states_increase(sa_family_t pf_print_host(naddr, 0, af); addlog(". Failed to increase count!\n"); } - return (1); + return (-1); } } return (0);
Re: pf: route-to least-states
Hi, Let me add another fix of previous. ok? Fix previous commit which referred wrong address and returned wrong value. Index: sys/net/pf_lb.c === RCS file: /cvs/src/sys/net/pf_lb.c,v retrieving revision 1.66 diff -u -p -r1.66 pf_lb.c --- sys/net/pf_lb.c 28 Jul 2020 16:47:41 - 1.66 +++ sys/net/pf_lb.c 28 Jul 2020 16:52:24 - @@ -323,7 +323,7 @@ pf_map_addr_sticky(sa_family_t af, struc } if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_LEASTSTATES) { - if (pf_map_addr_states_increase(af, rpool, naddr) == -1) + if (pf_map_addr_states_increase(af, rpool, cached) == -1) return (-1); } @@ -651,7 +651,7 @@ pf_map_addr_states_increase(sa_family_t pf_print_host(naddr, 0, af); addlog(". Failed to increase count!\n"); } - return (1); + return (-1); } } else if (rpool->addr.type == PF_ADDR_DYNIFTL) { if (pfr_states_increase(rpool->addr.p.dyn->pfid_kt, @@ -663,7 +663,7 @@ pf_map_addr_states_increase(sa_family_t pf_print_host(naddr, 0, af); addlog(". Failed to increase count!\n"); } - return (1); + return (-1); } } return (0);
Re: pf: route-to least-states
Hi, Previous commit has a wrong part.. ok? Fix previous commit which referred wrong address. Index: sys/net/pf_lb.c === RCS file: /cvs/src/sys/net/pf_lb.c,v retrieving revision 1.65 diff -u -p -r1.65 pf_lb.c --- sys/net/pf_lb.c 24 Jul 2020 14:06:33 - 1.65 +++ sys/net/pf_lb.c 28 Jul 2020 16:15:50 - @@ -323,7 +323,7 @@ pf_map_addr_sticky(sa_family_t af, struc } if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_LEASTSTATES) { - if (pf_map_addr_states_increase(af, rpool, naddr) == -1) + if (pf_map_addr_states_increase(af, rpool, cached) == -1) return (-1); }
relayd: set group and divert-reply
Hi, I'd like to run relayd as _relayd group always so that we can use "group _relayd" in a pf rule. This makes it possible to write a pf rule easily which is to match only connections from relayd(8). Also as for relayd.conf(5), I'd like to mention that "divert-reply" is required for "transparent forward" and add an example pf rule which uses "group _relayd". ok? Run relayd(8) as _relayd group user. Index: usr.sbin/relayd/relayd.c === RCS file: /cvs/src/usr.sbin/relayd/relayd.c,v retrieving revision 1.182 diff -u -p -r1.182 relayd.c --- usr.sbin/relayd/relayd.c15 Sep 2019 19:23:29 - 1.182 +++ usr.sbin/relayd/relayd.c26 Jul 2020 08:39:27 - @@ -201,6 +201,11 @@ main(int argc, char *argv[]) if ((ps->ps_pw = getpwnam(RELAYD_USER)) == NULL) errx(1, "unknown user %s", RELAYD_USER); + if (setgroups(1, >ps_pw->pw_gid) == -1 || + setresgid(ps->ps_pw->pw_gid, ps->ps_pw->pw_gid, ps->ps_pw->pw_gid) + == -1) + err(1, "unable to set group ids"); + log_init(debug, LOG_DAEMON); log_setverbose(verbose); Add a mention that "divert-reply" rule is required for "transparent forward" and add an example which uses "group _relayd" to match the outgoing connections. Index: usr.sbin/relayd/relayd.conf.5 === RCS file: /cvs/src/usr.sbin/relayd/relayd.conf.5,v retrieving revision 1.198 diff -u -p -r1.198 relayd.conf.5 --- usr.sbin/relayd/relayd.conf.5 1 Jul 2020 06:47:18 - 1.198 +++ usr.sbin/relayd/relayd.conf.5 26 Jul 2020 08:39:27 - @@ -622,6 +622,10 @@ Use the .Ic transparent keyword to enable fully-transparent mode; the source address of the client will be retained in this case. +For this case, +additional +.Xr pf 4 +rule with divert-reply option is required for the outgoing connection. 
.Pp The .Ic with tls @@ -1627,6 +1631,31 @@ relay tlsinspect { protocol httpfilter forward with tls to destination } +.Ed +.Pp +If you want to use fully-transparent mode, +you can add the +.Ic transparent +keyword to +.Ic forward +option: +.Bd -literal -offset indent +relay tlsinspect { + listen on 127.0.0.1 port 8443 tls + protocol httpfilter + transparent forward with tls to destination +} +.Ed +.Pp +And add a matching divert-reply rule in +.Xr pf.conf 5 . +You can use +.Dq group _relayd +to match only connections from +.Xr relayd 8 +precisely: +.Bd -literal -offset indent +pass out proto tcp to port 443 group _relayd divert-reply .Ed .Pp The next simple router configuration example can be used to run
Re: pf_remove_divert_state
Thanks, On Sat, 25 Jul 2020 15:00:07 +0200 Alexander Bluhm wrote: > On Sat, Jul 25, 2020 at 09:37:37PM +0900, YASUOKA Masahiko wrote: >> Is this part a reason why we have "divert-reply"? > > Yes. > > Divert rules pass packets to the local network stack. With divert-to > you specify the socket address. This works for incoming connections. > The divert-to address can be 127.0.0.1 or anything else with > SO_BINDANY. > > When you use SO_BINDANY for outgoing connections and you don't know > the addresses when writing pf.conf, use divert-reply. > > As dangling states interfere with new connections, I added the > divert state cleanup. This is especially necessary for DGRAM or > RAW sockets. Yes. My first message shows it is necessary for TCP. Also, my diff was totally wrong: it deletes the states regardless of whether they are for divert or not. >> > Is that not possible for you? >> >> It's possible. > > Fine, then use divert-reply instead of changing the semantics. I had thought it was hard to create a divert-reply rule for relayd's "transparent forward to destination" case. But I noticed tftp-proxy is using "group _tftp_proxy" to match connections only from the program precisely. I'll send diffs to do the same thing for relayd in a separate mail.
Re: pf_remove_divert_state
On Sat, 25 Jul 2020 13:29:57 +0200 Alexander Bluhm wrote: > On Sat, Jul 25, 2020 at 08:20:21PM +0900, YASUOKA Masahiko wrote: >> Currently SO_BINDANY is usable without any divert or divert-reply >> rule. > > This is why we have the divert-reply feature. Just mark the states > with that keyword when you want to use them with SO_BINDANY. Thanks, Let me clarify whether I understand correctly. | @@ -1410,9 +1410,7 @@ pf_remove_divert_state(struct pf_state_k | struct pf_state_item*si; | | TAILQ_FOREACH(si, >states, entry) { | - if (sk == si->s->key[PF_SK_STACK] && si->s->rule.ptr && | - (si->s->rule.ptr->divert.type == PF_DIVERT_TO || | - si->s->rule.ptr->divert.type == PF_DIVERT_REPLY)) { | + if (sk == si->s->key[PF_SK_STACK]) { | pf_remove_state(si->s); | break; | } | Is this part a reason why we have "divert-reply"? > See man setsockopt Yes, I have checked the API already. > Is that not possible for you? It's possible. --yasuoka
pf_remove_divert_state
Hi, # let me correct the previous mail, it has some typos. Currently SO_BINDANY is usable without any divert or divert-reply rule. pf reserves its associated PCB to its state when the packet is going out. This time, the pf rule is not required to have "divert" or "divert-reply" option. When receiving reverse direction packets, those packets are going to "ours" since they has the associated PCB. But when dropping the connection, the PCB is deleted but the state will not removed. Currently pf removes the state only if it is created by a rule with "divert-reply" or "divert" option. Otherwise the state is kept. As the result, following incoming packets for the connection will be forwarded by the state. They should not be forwarded since they were going to "ours". I think the state should be deleted even if it's created by a rule without "divert" or "divert-reply" option. The following diff will change this behavior. Also I attached a test procedure after the diff. ok? comments? Don't keep a state when associated PCB is delete regardless it's created without a "divert-to" or "divert-reply" rule. It might be created by SO_BINDANY. 
Index: sys/net/pf.c === RCS file: /cvs/src/sys/net/pf.c,v retrieving revision 1.1094 diff -u -p -r1.1094 pf.c --- sys/net/pf.c24 Jul 2020 18:17:15 - 1.1094 +++ sys/net/pf.c25 Jul 2020 07:39:19 - @@ -1410,9 +1410,7 @@ pf_remove_divert_state(struct pf_state_k struct pf_state_item*si; TAILQ_FOREACH(si, >states, entry) { - if (sk == si->s->key[PF_SK_STACK] && si->s->rule.ptr && - (si->s->rule.ptr->divert.type == PF_DIVERT_TO || - si->s->rule.ptr->divert.type == PF_DIVERT_REPLY)) { + if (sk == si->s->key[PF_SK_STACK]) { pf_remove_state(si->s); break; } network configuration: 192.168.0.101 -- 192.168.0.1 [OBJ] 10.0.0.1 --> 10.0.0.10 setup: ifconfig pair100 rdomain 10 ifconfig pair100 inet 192.168.0.1 ifconfig pair101 rdomain 11 patch pair100 ifconfig pair101 inet 192.168.0.101 ifconfig pair102 rdomain 10 ifconfig pair102 inet 10.0.0.1/24 ifconfig pair103 rdomain 12 patch pair102 ifconfig pair103 inet 10.0.0.101/24 route -T11 add default 192.168.0.1 /etc/pf.conf: pass on {pair100 pair101 pair102 pair103} match out on pair102 nat-to (pair102:0) block in on pair103 proto tcp to port 443 procedure: 1. run a server by scapy on 443/tcp on rdomain 12 $ doas route -T12 exec python test.py 2. connect to the server from OBJ (rdomain 10) $ doas route -T10 exec nc -vs 192.168.0.101 10.0.0.101 443 Connection to 10.0.0.101 443 port [tcp/https] succeeded! Ctrl-D $ close the connection by Ctrl-D immediately 3. see the packet capture on pair103 - You can see packets like below 19:28:51.822879 10.0.0.101.443 > 10.0.0.1.60956: . ack 1 win 8192 19:28:51.823559 192.168.0.101.22083 > 10.0.0.101.443: R 0:0(0) ack 1 win 0 (DF) [tos 0x10] - Since the pf state is kept, the packet "10.0.0.101.443 > 10.0.0.1.60956" is converted into "10.0.0.101.443 > 192.168.0.101.22083" by the state's NAT - but since the PCB doesn't exist, the packet is forwarded. 
- but the packet is blocked by default "block return" rule - "192.168.0.101.22083 > 10.0.0.101.443" is the result of "block return" -> 192.168.0.101 is NATed address. It should not appear on 10.0.0.0/24 network. teardown: ifconfig pair100 destroy ifconfig pair101 destroy ifconfig pair102 destroy ifconfig pair103 destroy test.py *** import time from scapy.all import * a=sniff(iface="pair102", count=1, filter="tcp and port 443") ip_src = a[0][IP].src ip_dst = a[0][IP].dst sport = a[0][TCP].sport dport = a[0][TCP].dport seq_nr = 5 ack_nr = a[0][TCP].seq + 1 a=sr1(IP(src=ip_dst, dst=ip_src)/ TCP(sport=dport, dport=sport, flags="SA", seq=seq_nr, ack=ack_nr, options=[('MSS', 1460)])) #ack_nr = a[0][TCP].seq + 1 # Send FIN and receive FIN+ACK seq_nr = seq_nr + 1 a=sr1(IP(src=ip_dst, dst=ip_src)/ TCP(sport=dport, dport=sport, flags="FA", seq=seq_nr, ack=ack_nr)) ack_nr = a[0][TCP].seq + 1 time.sleep(2) # Send ACK of FIN lastack = (IP(src=ip_dst, dst=ip_src)/ TCP(sport=dport, dport=sport, flags="A", seq=seq_nr, ack=ack_nr)) send(lastack) # Resend in 100 times for _ in range(100): time.sleep(2) send(lastack) ***
pf_remove_divert_state
Hi, Currently SO_BINDANY is usable without any divert or divert-reply rule. pf reserves its associated PCB to its state when the packet is going out. This time, the pf rule is not required to have "divert" or "divert-reply" option. When receiving reverse direction packets, those packets are going to "ours" since they has the associated PCB. But when dropping the connection, the PCB is deleted but the state will not removed. Currently pf removes the state only if it is created by a rule with "divert-reply" or "divert" option. Otherwise the state is kept. As the result, following incoming packets for the connection will be forwarded by the state. They should not be forwarded since they were going to "ours". I think the state should be deleted even if it's created by a rule without doesn't "divert" or "divert-reply" option. The following diff will change this behavior. Also I attached a test procedure after the diff. ok? comments? Don't keep a state when associated PCB is delete regardless it's created without a "divert-to" or "divert-reply" rule. It might be created by SO_BINDANY. 
Index: sys/net/pf.c === RCS file: /cvs/src/sys/net/pf.c,v retrieving revision 1.1094 diff -u -p -r1.1094 pf.c --- sys/net/pf.c24 Jul 2020 18:17:15 - 1.1094 +++ sys/net/pf.c25 Jul 2020 07:39:19 - @@ -1410,9 +1410,7 @@ pf_remove_divert_state(struct pf_state_k struct pf_state_item*si; TAILQ_FOREACH(si, >states, entry) { - if (sk == si->s->key[PF_SK_STACK] && si->s->rule.ptr && - (si->s->rule.ptr->divert.type == PF_DIVERT_TO || - si->s->rule.ptr->divert.type == PF_DIVERT_REPLY)) { + if (sk == si->s->key[PF_SK_STACK]) { pf_remove_state(si->s); break; } network configuration: 192.168.0.101 -- 192.168.0.1 [OBJ] 10.0.0.1 --> 10.0.0.10 setup: ifconfig pair100 rdomain 10 ifconfig pair100 inet 192.168.0.1 ifconfig pair101 rdomain 11 patch pair100 ifconfig pair101 inet 192.168.0.101 ifconfig pair102 rdomain 10 ifconfig pair102 inet 10.0.0.1/24 ifconfig pair103 rdomain 12 patch pair102 ifconfig pair103 inet 10.0.0.101/24 route -T11 add default 192.168.0.1 /etc/pf.conf: pass on {pair101 pair102 pair103 pair104} match out on pair102 nat-to (pair102:0) block in on pair103 proto tcp to port 443 procedure: 1. run a server by scapy on 443/tcp on rdomain 12 $ doas route -T12 exec python test.py 2. connect to the server from OBJ (rdomain 10) $ doas route -T10 exec nc -vs 192.168.0.101 10.0.0.101 443 Connection to 10.0.0.101 443 port [tcp/https] succeeded! Ctrl-D $ close the connection by Ctrl-D immediately 3. see the packet capture on pair103 - You can see packets like below 19:28:51.822879 10.0.0.101.443 > 10.0.0.1.60956: . ack 1 win 8192 19:28:51.823559 192.168.0.101.22083 > 10.0.0.101.443: R 0:0(0) ack 1 win 0 (DF) [tos 0x10] - Since the pf state is kept, the packet "10.0.0.101.443 > 10.0.0.1.60956" is converted into "10.0.0.101.443 > 192.168.0.101.22083" by the state's NAT - but since the PCB doesn't exist, the packet is forwarded. 
- but the packet is blocked by default "block return" rule - "192.168.0.101.22083 > 10.0.0.101.443" is the result of "block return" -> 192.168.0.101 is NATed address. It should not appear on 10.0.0.0/24 network. teardown: ifconfig pair100 destroy ifconfig pair101 destroy ifconfig pair102 destroy ifconfig pair103 destroy test.py *** import time from scapy.all import * a=sniff(iface="pair102", count=1, filter="tcp and port 443") ip_src = a[0][IP].src ip_dst = a[0][IP].dst sport = a[0][TCP].sport dport = a[0][TCP].dport seq_nr = 5 ack_nr = a[0][TCP].seq + 1 a=sr1(IP(src=ip_dst, dst=ip_src)/ TCP(sport=dport, dport=sport, flags="SA", seq=seq_nr, ack=ack_nr, options=[('MSS', 1460)])) #ack_nr = a[0][TCP].seq + 1 # Send FIN and receive FIN+ACK seq_nr = seq_nr + 1 a=sr1(IP(src=ip_dst, dst=ip_src)/ TCP(sport=dport, dport=sport, flags="FA", seq=seq_nr, ack=ack_nr)) ack_nr = a[0][TCP].seq + 1 time.sleep(2) # Send ACK of FIN lastack = (IP(src=ip_dst, dst=ip_src)/ TCP(sport=dport, dport=sport, flags="A", seq=seq_nr, ack=ack_nr)) send(lastack) # Resend in 100 times for _ in range(100): time.sleep(2) send(lastack) ***
carp: unicast carppeer and peer down
Hi, When a unicast address is specified for carppeer and the peer is down, sending out advertisement packets will fail. This failure is treated as an error of the sending host, so the error counter is incremented and carpdemote is incremented. I think this is not correct because the failure is not a fault of the sending host. ok? Don't treat it as an error if carppeer is a unicast address and the peer is down. Index: sys/netinet/ip_carp.c === RCS file: /cvs/src/sys/netinet/ip_carp.c,v retrieving revision 1.347 diff -u -p -r1.347 ip_carp.c --- sys/netinet/ip_carp.c 24 Jul 2020 18:17:15 - 1.347 +++ sys/netinet/ip_carp.c 25 Jul 2020 07:16:42 - @@ -1140,7 +1140,9 @@ carp_send_ad(struct carp_vhost_entry *vh error = ip_output(m, NULL, NULL, IP_RAWOUTPUT, >sc_imo, NULL, 0); - if (error) { + if (error && + /* when unicast, the peer's down is not our fault */ + !(!IN_MULTICAST(sc->sc_peer.s_addr) && error == EHOSTDOWN)){ if (error == ENOBUFS) carpstat_inc(carps_onomem); else
pfsync: comparing duration when "bulk-end"
Hi, pfsync does a "bulk update" just after boot, and I noticed it sometimes fails. When finishing the "bulk update", the duration in the "bulk-end" packet and our duration based on uptime are compared, but that comparison should be fixed. It must take into account that the values are rounded to whole seconds. ok? Take rounding to whole seconds into account when comparing the duration in the "bulk-end" packet and the duration based on our uptime. This fixes the problem that the carp demote count sometimes becomes 33 after reboot. Index: sys/net/if_pfsync.c === RCS file: /cvs/src/sys/net/if_pfsync.c,v retrieving revision 1.274 diff -u -p -r1.274 if_pfsync.c --- sys/net/if_pfsync.c 10 Jul 2020 13:26:42 - 1.274 +++ sys/net/if_pfsync.c 25 Jul 2020 05:09:47 - @@ -1169,8 +1169,7 @@ pfsync_in_bus(caddr_t buf, int len, int break; case PFSYNC_BUS_END: - if (getuptime() - ntohl(bus->endtime) >= - sc->sc_ureq_sent) { + if (ntohl(bus->endtime) <= getuptime() + 1 - sc->sc_ureq_sent) { /* that's it, we're happy */ sc->sc_ureq_sent = 0; sc->sc_bulk_tries = 0;
Re: pf: route-to {random,srchash} in an anchor
Hi, On Thu, 23 Jul 2020 18:44:43 +0200 Alexandr Nedvedicky wrote: > On Thu, Jul 23, 2020 at 08:01:18PM +0900, YASUOKA Masahiko wrote: >> Hi, >> >> Last month, I fixed the problem "route-to least-state" in an anchor >> didn't work. >> >> https://marc.info/?t=15911745782=1=2 >> >> I noticed the same problem happens on "random" and "srchash" as well. >> >> ok? > > change looks good. I have just one nit-pick comment. I leave decision > whether it's worth to adjust your diff or commit as-is up to you. > > see in-line further below. I can't remember why I used "null == false" logic, since I usually avoid using that. I'll commit the ajusted diff below. Index: sys/net/pf_lb.c === RCS file: /cvs/src/sys/net/pf_lb.c,v retrieving revision 1.65 diff -u -p -r1.65 pf_lb.c --- sys/net/pf_lb.c 24 Jul 2020 14:06:33 - 1.65 +++ sys/net/pf_lb.c 24 Jul 2020 14:13:42 - @@ -353,6 +353,7 @@ pf_map_addr(sa_family_t af, struct pf_ru struct pf_addr faddr; struct pf_addr *raddr = >addr.v.a.addr; struct pf_addr *rmask = >addr.v.a.mask; + struct pfr_ktable *kt; struct pfi_kif *kif; u_int64_tstates; u_int16_tweight; @@ -405,18 +406,17 @@ pf_map_addr(sa_family_t af, struct pf_ru pf_poolmask(naddr, raddr, rmask, saddr, af); break; case PF_POOL_RANDOM: - if (rpool->addr.type == PF_ADDR_TABLE) { - cnt = rpool->addr.p.tbl->pfrkt_cnt; - if (cnt == 0) - rpool->tblidx = 0; + if (rpool->addr.type == PF_ADDR_TABLE || + rpool->addr.type == PF_ADDR_DYNIFTL) { + if (rpool->addr.type == PF_ADDR_TABLE) + kt = rpool->addr.p.tbl; else - rpool->tblidx = (int)arc4random_uniform(cnt); - memset(>counter, 0, sizeof(rpool->counter)); - if (pfr_pool_get(rpool, , , af)) + kt = rpool->addr.p.dyn->pfid_kt; + kt = pfr_ktable_select_active(kt); + if (kt == NULL) return (1); - pf_addrcpy(naddr, >counter, af); - } else if (rpool->addr.type == PF_ADDR_DYNIFTL) { - cnt = rpool->addr.p.dyn->pfid_kt->pfrkt_cnt; + + cnt = kt->pfrkt_cnt; if (cnt == 0) rpool->tblidx = 0; else @@ -462,18 +462,18 @@ pf_map_addr(sa_family_t af, struct 
pf_ru case PF_POOL_SRCHASH: hashidx = pf_hash(saddr, (struct pf_addr *), >key, af); - if (rpool->addr.type == PF_ADDR_TABLE) { - cnt = rpool->addr.p.tbl->pfrkt_cnt; - if (cnt == 0) - rpool->tblidx = 0; + + if (rpool->addr.type == PF_ADDR_TABLE || + rpool->addr.type == PF_ADDR_DYNIFTL) { + if (rpool->addr.type == PF_ADDR_TABLE) + kt = rpool->addr.p.tbl; else - rpool->tblidx = (int)(hashidx % cnt); - memset(>counter, 0, sizeof(rpool->counter)); - if (pfr_pool_get(rpool, , , af)) + kt = rpool->addr.p.dyn->pfid_kt; + kt = pfr_ktable_select_active(kt); + if (kt == NULL) return (1); - pf_addrcpy(naddr, >counter, af); - } else if (rpool->addr.type == PF_ADDR_DYNIFTL) { - cnt = rpool->addr.p.dyn->pfid_kt->pfrkt_cnt; + + cnt = kt->pfrkt_cnt; if (cnt == 0) rpool->tblidx = 0; else Index: sys/net/pf_table.c === RCS file: /cvs/src/sys/net/pf_table.c,v retrieving revision 1.133 diff -u -p -r1.133 pf_table.c --- sys/net/pf_table.c 24 Jun 2020 22:03:43 - 1.133 +++ sys/net/pf_table.c 24 Jul 2020 14:13:42 - @@ -2108,9 +2108,8 @@ pfr_kentry_byaddr(struct pfr_ktable *kt, struct sockaddr_in6 tmp6; #endif /* INET6 */ - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) - kt = kt->pfrkt_root; - if
Re: pf: route-to least-states
Hi, Thank you for your review. On Fri, 24 Jul 2020 01:25:42 +0200 Alexandr Nedvedicky wrote: >> - interface is not selected properly if selected table entry specifies >> an interface. > > to be honest I don't quite understand what's going on here. > can you share some details of configuration/scenario, which > triggers the bug your diff is fixing? You seem to have understood the scenario correctly. > the part of your change, which I'm not able to figure out is > this single line: > >> +if (pf_map_addr_states_increase(af, rpool, naddr) == -1) >> +return (1); >> +/* revert the kif which was set by pfr_pool_get() */ >> +rpool->kif = kif; >> break; >> } > > your fix changes behavior, which is there since least-state > option has been introduced. I believe it does not matter > for case when route-to specifies single interface such as: > > route-to 192.168.1.10@em0 least-states > > I'm not sure what will happen in situation, when there are more interfaces > specified in combination with sticky-address: > > route-to {192.168.1.10@em0, 192.168.1.20@em1} last-states sticky-address Yes. This is a senario. > the resulting code does not look quite right with your diff applied: > > 602 } while (pf_match_addr(1, , rmask, >counter, > af) && > 603 (states > 0)); > 604 > 605 if (pf_map_addr_states_increase(af, rpool, naddr) == -1) > 606 return (1); > 607 /* revert the kif which was set by pfr_pool_get() */ > 608 rpool->kif = kif; > 609 break; > 610 } > 611 > 612 if (rpool->opts & PF_POOL_STICKYADDR) { > 613 if (sns[type] != NULL) { > 614 pf_remove_src_node(sns[type]); > 615 sns[type] = NULL; > 616 } > 617 if (pf_insert_src_node([type], r, type, af, saddr, > naddr, > 618 rpool->kif)) > 619 return (1); > 620 } > > > at line 608 new code reverts kif set by pfr_pool_get(). At line 617 > (executed when sticky-address is set) the original code passes kif chosen > be > pfr_pool_get(). You diff changes that behavior. So my question is simple: > is that intentional change? Yes. 
Let me simplify the block for "least-states". 535 case PF_POOL_LEASTSTATES: 539 pfr_pool_get(rpool); // first entry : 561 faddr = rpool->counter; //record as final : 557 load = rpool->states / rpool->weight; 563 naddr = rpool->counter; : 571 do { 572 rpool->counter++; 575 pfr_pool_get(rpool); /* next entry */ : 585 cload = rpool->states / rpool->weight; : : /* find lc minimum */ 591 if (cload < load) { 595 load = cload; 597 naddr = rpool->counter; 601 } 603 } while (raddr->counter != faddr); // loop until final The loop #571:606 is to find the minimum (least-states) entry. If the last entry is not the minimum, after the loop, rpool <= the last entry naddr <= the minimum entry Also see the pfr_pool_get(): 2272 int 2273 pfr_pool_get(struct pf_pool *rpool, struct pf_addr **raddr, 2274 struct pf_addr **rmask, sa_family_t af) 2275 { (snip) 2417 rpool->states = 0; 2418 if (ke->pfrke_counters != NULL) 2419 rpool->states = ke->pfrke_counters->states; 2420 switch (ke->pfrke_type) { 2421 case PFRKE_COST: 2422 rpool->weight = 2423 ((struct pfr_kentry_cost *)ke)->weight; 2424 /* FALLTHROUGH */ 2425 case PFRKE_ROUTE: 2426 rpool->kif = ((struct pfr_kentry_route *)ke)->kif; 2427 break; 2428 default: 2429 rpool->weight = 1; 2430 break; 2431 } Some fields of rpool (states, weight, kif) are set from the values of the selected table entry. So, | rpool <= the last entry | naddr <= the minimum entry rpool->kif is the interface of the last entry. It might be different from the interface of the minimum entry. The diff keeps the kif of the minimum entry, + kif = rpool->kif; and restores rpool->kif from it after the loop. + /* revert the kif which was set by pfr_pool_get() */ + rpool->kif = kif;
pf: route-to least-states
Hi, The diff fixes 2 problems of "least-states": - states whose address is selected by sticky-address is not counted for the number of states. - interface is not selected properly if selected table entry specifies an interface. ok? Increase state counter for least-states when the address is selected by sticky-address. Also fix the problem that the interface which is specified by the selected table entry is not used properly. Index: sys/net/pf_lb.c === RCS file: /disk/cvs/openbsd/src/sys/net/pf_lb.c,v retrieving revision 1.64 diff -u -p -r1.64 pf_lb.c --- sys/net/pf_lb.c 2 Jul 2019 09:04:53 - 1.64 +++ sys/net/pf_lb.c 23 Jul 2020 11:06:05 - @@ -97,6 +97,8 @@ u_int64_t pf_hash(struct pf_addr *, st int pf_get_sport(struct pf_pdesc *, struct pf_rule *, struct pf_addr *, u_int16_t *, u_int16_t, u_int16_t, struct pf_src_node **); +int pf_map_addr_states_increase(sa_family_t, + struct pf_pool *, struct pf_addr *); int pf_get_transaddr_af(struct pf_rule *, struct pf_pdesc *, struct pf_src_node **); int pf_map_addr_sticky(sa_family_t, struct pf_rule *, @@ -319,6 +321,12 @@ pf_map_addr_sticky(sa_family_t af, struc sns[type] = NULL; return (-1); } + + if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_LEASTSTATES) { + if (pf_map_addr_states_increase(af, rpool, naddr) == -1) + return (-1); + } + if (!PF_AZERO(cached, af)) pf_addrcpy(naddr, cached, af); if (pf_status.debug >= LOG_DEBUG) { @@ -345,6 +353,7 @@ pf_map_addr(sa_family_t af, struct pf_ru struct pf_addr faddr; struct pf_addr *raddr = >addr.v.a.addr; struct pf_addr *rmask = >addr.v.a.mask; + struct pfi_kif *kif; u_int64_tstates; u_int16_tweight; u_int64_tload; @@ -539,6 +548,7 @@ pf_map_addr(sa_family_t af, struct pf_ru states = rpool->states; weight = rpool->weight; + kif = rpool->kif; if ((rpool->addr.type == PF_ADDR_TABLE && rpool->addr.p.tbl->pfrkt_refcntcost > 0) || @@ -581,6 +591,7 @@ pf_map_addr(sa_family_t af, struct pf_ru if (cload < load) { states = rpool->states; weight = rpool->weight; + kif = rpool->kif; load 
= cload; pf_addrcpy(naddr, >counter, af); @@ -591,29 +602,10 @@ pf_map_addr(sa_family_t af, struct pf_ru } while (pf_match_addr(1, , rmask, >counter, af) && (states > 0)); - if (rpool->addr.type == PF_ADDR_TABLE) { - if (pfr_states_increase(rpool->addr.p.tbl, - naddr, af) == -1) { - if (pf_status.debug >= LOG_DEBUG) { - log(LOG_DEBUG,"pf: pf_map_addr: " - "selected address "); - pf_print_host(naddr, 0, af); - addlog(". Failed to increase count!\n"); - } - return (1); - } - } else if (rpool->addr.type == PF_ADDR_DYNIFTL) { - if (pfr_states_increase(rpool->addr.p.dyn->pfid_kt, - naddr, af) == -1) { - if (pf_status.debug >= LOG_DEBUG) { - log(LOG_DEBUG, "pf: pf_map_addr: " - "selected address "); - pf_print_host(naddr, 0, af); - addlog(". Failed to increase count!\n"); - } - return (1); - } - } + if (pf_map_addr_states_increase(af, rpool, naddr) == -1) + return (1); + /* revert the kif which was set by pfr_pool_get() */ + rpool->kif = kif; break; } @@ -642,6 +634,38 @@ pf_map_addr(sa_family_t af, struct pf_ru addlog("\n"); } + return (0); +} + +int +pf_map_addr_states_increase(sa_family_t af, struct pf_pool *rpool, +struct pf_addr *naddr) +{ + if (rpool->addr.type == PF_ADDR_TABLE) { + if (pfr_states_increase(rpool->addr.p.tbl, + naddr, af) == -1) { +
pf: route-to {random,srchash} in an anchor
Hi, Last month, I fixed the problem "route-to least-state" in an anchor didn't work. https://marc.info/?t=15911745782=1=2 I noticed the same problem happens on "random" and "srchash" as well. ok? Use the table on root always if current table is not active. Index: sys/net/pf_lb.c === RCS file: /disk/cvs/openbsd/src/sys/net/pf_lb.c,v retrieving revision 1.64 diff -u -p -r1.64 pf_lb.c --- sys/net/pf_lb.c 2 Jul 2019 09:04:53 - 1.64 +++ sys/net/pf_lb.c 23 Jul 2020 10:45:48 - @@ -345,6 +345,7 @@ pf_map_addr(sa_family_t af, struct pf_ru struct pf_addr faddr; struct pf_addr *raddr = >addr.v.a.addr; struct pf_addr *rmask = >addr.v.a.mask; + struct pfr_ktable *kt; u_int64_tstates; u_int16_tweight; u_int64_tload; @@ -396,18 +397,17 @@ pf_map_addr(sa_family_t af, struct pf_ru pf_poolmask(naddr, raddr, rmask, saddr, af); break; case PF_POOL_RANDOM: - if (rpool->addr.type == PF_ADDR_TABLE) { - cnt = rpool->addr.p.tbl->pfrkt_cnt; - if (cnt == 0) - rpool->tblidx = 0; + if (rpool->addr.type == PF_ADDR_TABLE || + rpool->addr.type == PF_ADDR_DYNIFTL) { + if (rpool->addr.type == PF_ADDR_TABLE) + kt = rpool->addr.p.tbl; else - rpool->tblidx = (int)arc4random_uniform(cnt); - memset(>counter, 0, sizeof(rpool->counter)); - if (pfr_pool_get(rpool, , , af)) + kt = rpool->addr.p.dyn->pfid_kt; + kt = pfr_ktable_select_active(kt); + if (!kt) return (1); - pf_addrcpy(naddr, >counter, af); - } else if (rpool->addr.type == PF_ADDR_DYNIFTL) { - cnt = rpool->addr.p.dyn->pfid_kt->pfrkt_cnt; + + cnt = kt->pfrkt_cnt; if (cnt == 0) rpool->tblidx = 0; else @@ -453,18 +453,18 @@ pf_map_addr(sa_family_t af, struct pf_ru case PF_POOL_SRCHASH: hashidx = pf_hash(saddr, (struct pf_addr *), >key, af); - if (rpool->addr.type == PF_ADDR_TABLE) { - cnt = rpool->addr.p.tbl->pfrkt_cnt; - if (cnt == 0) - rpool->tblidx = 0; + + if (rpool->addr.type == PF_ADDR_TABLE || + rpool->addr.type == PF_ADDR_DYNIFTL) { + if (rpool->addr.type == PF_ADDR_TABLE) + kt = rpool->addr.p.tbl; else - rpool->tblidx = (int)(hashidx % 
cnt); - memset(>counter, 0, sizeof(rpool->counter)); - if (pfr_pool_get(rpool, , , af)) + kt = rpool->addr.p.dyn->pfid_kt; + kt = pfr_ktable_select_active(kt); + if (!kt) return (1); - pf_addrcpy(naddr, >counter, af); - } else if (rpool->addr.type == PF_ADDR_DYNIFTL) { - cnt = rpool->addr.p.dyn->pfid_kt->pfrkt_cnt; + + cnt = kt->pfrkt_cnt; if (cnt == 0) rpool->tblidx = 0; else Index: sys/net/pf_table.c === RCS file: /disk/cvs/openbsd/src/sys/net/pf_table.c,v retrieving revision 1.133 diff -u -p -r1.133 pf_table.c --- sys/net/pf_table.c 24 Jun 2020 22:03:43 - 1.133 +++ sys/net/pf_table.c 23 Jul 2020 10:45:48 - @@ -2108,9 +2108,8 @@ pfr_kentry_byaddr(struct pfr_ktable *kt, struct sockaddr_in6 tmp6; #endif /* INET6 */ - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) - kt = kt->pfrkt_root; - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + kt = pfr_ktable_select_active(kt); + if (!kt) return (0); switch (af) { @@ -2153,9 +2152,8 @@ pfr_update_stats(struct pfr_ktable *kt, int dir_idx = (pd->dir == PF_OUT); int op_idx; - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) - kt = kt->pfrkt_root; - if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + kt = pfr_ktable_select_active(kt); + if (!kt) return; switch (af) { @@ -2308,9 +2306,8 @@ pfr_pool_get(struct pf_pool *rpool, stru
Re: receive interface for carp when real mac is used
The problem I was trying to fix has been fixed by dlg@'s commit today. https://marc.info/?l=openbsd-cvs=159538265604770=2 So the diff is not needed any more. This was pointed out by dlg@. Thanks, On Wed, 22 Jul 2020 19:24:32 +0900 (JST) YASUOKA Masahiko wrote: > Hi, > > Currently when using the real mac address for carp(4) interface, all > packets are treated as their receive interface is carp. This causes > some problems. > > For example, IPv6 ndp doesn't work on an interface which is used for > carpdev. Because it is assumed that reply packets are received with > the same interface which is used to send out the request. > > ok? > > When realmac is used for carp(4), don't pass the packets through the > interface since they are for the real interface. > > Index: sys/netinet/ip_carp.c > === > RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_carp.c,v > retrieving revision 1.345 > diff -u -p -r1.345 ip_carp.c > --- sys/netinet/ip_carp.c 21 May 2020 05:24:59 - 1.345 > +++ sys/netinet/ip_carp.c 22 Jul 2020 09:52:20 - > @@ -1418,6 +1418,14 @@ carp_input(struct ifnet *ifp0, struct mb > } > m_tag_prepend(m, mtag); > } > + > + /* > + * When carp is using realmac, since the matched MAC > + * address is for the real interface, the packets are > + * not for the carp interface. > + */ > + if (sc->sc_realmac) > + sc = NULL; > break; > } > } >
receive interface for carp when real mac is used
Hi, Currently, when using the real MAC address for a carp(4) interface, all packets are treated as if their receive interface were carp. This causes some problems. For example, IPv6 NDP doesn't work on an interface which is used as a carpdev, because it is assumed that reply packets are received on the same interface that was used to send out the request. ok? When realmac is used for carp(4), don't pass the packets through the interface since they are for the real interface. Index: sys/netinet/ip_carp.c === RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_carp.c,v retrieving revision 1.345 diff -u -p -r1.345 ip_carp.c --- sys/netinet/ip_carp.c 21 May 2020 05:24:59 - 1.345 +++ sys/netinet/ip_carp.c 22 Jul 2020 09:52:20 - @@ -1418,6 +1418,14 @@ carp_input(struct ifnet *ifp0, struct mb } m_tag_prepend(m, mtag); } + + /* +* When carp is using realmac, since the matched MAC +* address is for the real interface, the packets are +* not for the carp interface. +*/ + if (sc->sc_realmac) + sc = NULL; break; } }
Re: route add ::/0 ...
Let me update the diff. On Mon, 06 Jul 2020 17:54:30 +0900 (JST) YASUOKA Masahiko wrote: > On Tue, 30 Jun 2020 02:42:02 +0200 > Klemens Nanni wrote: >> On Tue, Jun 30, 2020 at 09:00:30AM +0900, YASUOKA Masahiko wrote: >>> inet_makenetandmask() had required another treatment. >>> >>> Also -prefixlen 0 for -inet has a bug >>> >>> % doas ./obj/route -T100 add -inet 0.0.0.0 -prefixlen 0 127.0.0.1 >>> add net 0.0.0.0: gateway 127.0.0.1 >>> % netstat -nrf inet -T 100 >>> Routing tables >>> >>> Internet: >>> DestinationGatewayFlags Refs Use Mtu Prio >>> Iface >>> 0.0.0.0/32 127.0.0.1 UGS00 32768 8 >>> lo100 >>> >>> /0 becomes /32. The diff following also fixes the problem. >> Yes, this looks correct to me; regress is also happy (again). >> >> OK kn > > Thanks, > > I'm going to commit the diff. ok or comments, are still welcome. > > > Stop using mask_addr() which trims trailing zeros of the netmask, set > family and length field. This fixes route(8) to handle "::/0" > properly. Also fix "route add -inet 0.0.0.0 -prefixlen 0 (gateway)" > to work properly. > > Index: sbin/route/route.c > === > RCS file: /cvs/src/sbin/route/route.c,v > retrieving revision 1.247 > diff -u -p -r1.247 route.c > --- sbin/route/route.c15 Jan 2020 10:26:25 - 1.247 > +++ sbin/route/route.c6 Jul 2020 08:45:06 - (snip) > @@ -781,12 +780,9 @@ inet_makenetandmask(u_int32_t net, struc > sin->sin_addr.s_addr = htonl(net); > sin = _mask.sin; > sin->sin_addr.s_addr = htonl(mask); > - sin->sin_len = 0; > - sin->sin_family = 0; > + sin->sin_family = AF_INET; > cp = (char *)(>sin_addr + 1); > - while (*--cp == '\0' && cp > (char *)sin) > - continue; > - sin->sin_len = 1 + cp - (char *)sin; > + sin->sin_len = sizeof(struct sockaddr_in); > } > > /* "cp" becomes unused. The updated diff removes "cp" as well.
Index: sbin/route/route.c === RCS file: /cvs/src/sbin/route/route.c,v retrieving revision 1.247 diff -u -p -r1.247 route.c --- sbin/route/route.c 15 Jan 2020 10:26:25 - 1.247 +++ sbin/route/route.c 6 Jul 2020 08:57:25 - @@ -107,7 +107,6 @@ void print_rtmsg(struct rt_msghdr *, in voidpmsg_common(struct rt_msghdr *); voidpmsg_addrs(char *, int); voidbprintf(FILE *, int, char *); -voidmask_addr(union sockunion *, union sockunion *, int); int getaddr(int, int, char *, struct hostent **); voidgetmplslabel(char *, int); int rtmsg(int, int, int, uint8_t); @@ -767,7 +766,6 @@ void inet_makenetandmask(u_int32_t net, struct sockaddr_in *sin, int bits) { u_int32_t mask; - char *cp; rtm_addrs |= RTA_NETMASK; if (bits == 0 && net == 0) @@ -781,12 +779,8 @@ inet_makenetandmask(u_int32_t net, struc sin->sin_addr.s_addr = htonl(net); sin = _mask.sin; sin->sin_addr.s_addr = htonl(mask); - sin->sin_len = 0; - sin->sin_family = 0; - cp = (char *)(>sin_addr + 1); - while (*--cp == '\0' && cp > (char *)sin) - continue; - sin->sin_len = 1 + cp - (char *)sin; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(struct sockaddr_in); } /* @@ -1001,7 +995,8 @@ prefixlen(int af, char *s) memset(_mask, 0, sizeof(so_mask)); so_mask.sin.sin_family = AF_INET; so_mask.sin.sin_len = sizeof(struct sockaddr_in); - so_mask.sin.sin_addr.s_addr = htonl(0x << (32 - len)); + if (len != 0) + so_mask.sin.sin_addr.s_addr = htonl(0x << (32 - len)); break; case AF_INET6: so_mask.sin6.sin6_family = AF_INET6; @@ -1088,8 +1083,6 @@ rtmsg(int cmd, int flags, int fmask, uin rtm.rtm_mpls = mpls_flags; rtm.rtm_hdrlen = sizeof(rtm); - if (rtm_addrs & RTA_NETMASK) - mask_addr(_dst, _mask, RTA_DST); /* store addresses in ascending order of RTA values */ NEXTADDR(RTA_DST, so_dst); NEXTADDR(RTA_GATEWAY, so_gate); @@ -1118,34 +,6 @@ rtmsg(int cmd, int flags, int fmask, uin } #undef rtm return (0); -} - -void -mask_addr(union sockunion *addr, union sockunion *mask, int which) -{ - int olen = mask->sa.sa_len; - char *cp1 = 
olen + (char *)mask, *cp2; - - for (mask-&g
Re: route add ::/0 ...
On Tue, 30 Jun 2020 02:42:02 +0200 Klemens Nanni wrote: > On Tue, Jun 30, 2020 at 09:00:30AM +0900, YASUOKA Masahiko wrote: >> inet_makenetandmask() had required another treatment. >> >> Also -prefixlen 0 for -inet has a bug >> >> % doas ./obj/route -T100 add -inet 0.0.0.0 -prefixlen 0 127.0.0.1 >> add net 0.0.0.0: gateway 127.0.0.1 >> % netstat -nrf inet -T 100 >> Routing tables >> >> Internet: >> DestinationGatewayFlags Refs Use Mtu Prio >> Iface >> 0.0.0.0/32 127.0.0.1 UGS00 32768 8 >> lo100 >> >> /0 becomes /32. The diff following also fixes the problem. > Yes, this looks correct to me; regress is also happy (again). > > OK kn Thanks, I'm going to commit the diff. OKs or comments are still welcome. Stop using mask_addr(), which trims trailing zeros of the netmask, and set the family and length fields. This fixes route(8) to handle "::/0" properly. Also fix "route add -inet 0.0.0.0 -prefixlen 0 (gateway)" to work properly. Index: sbin/route/route.c === RCS file: /cvs/src/sbin/route/route.c,v retrieving revision 1.247 diff -u -p -r1.247 route.c --- sbin/route/route.c 15 Jan 2020 10:26:25 - 1.247 +++ sbin/route/route.c 6 Jul 2020 08:45:06 - @@ -107,7 +107,6 @@ void print_rtmsg(struct rt_msghdr *, in voidpmsg_common(struct rt_msghdr *); voidpmsg_addrs(char *, int); voidbprintf(FILE *, int, char *); -voidmask_addr(union sockunion *, union sockunion *, int); int getaddr(int, int, char *, struct hostent **); voidgetmplslabel(char *, int); int rtmsg(int, int, int, uint8_t); @@ -781,12 +780,9 @@ inet_makenetandmask(u_int32_t net, struc sin->sin_addr.s_addr = htonl(net); sin = _mask.sin; sin->sin_addr.s_addr = htonl(mask); - sin->sin_len = 0; - sin->sin_family = 0; + sin->sin_family = AF_INET; cp = (char *)(>sin_addr + 1); - while (*--cp == '\0' && cp > (char *)sin) - continue; - sin->sin_len = 1 + cp - (char *)sin; + sin->sin_len = sizeof(struct sockaddr_in); } /* @@ -1001,7 +997,8 @@ prefixlen(int af, char *s) memset(_mask, 0, sizeof(so_mask)); so_mask.sin.sin_family = AF_INET; 
so_mask.sin.sin_len = sizeof(struct sockaddr_in); - so_mask.sin.sin_addr.s_addr = htonl(0x << (32 - len)); + if (len != 0) + so_mask.sin.sin_addr.s_addr = htonl(0x << (32 - len)); break; case AF_INET6: so_mask.sin6.sin6_family = AF_INET6; @@ -1088,8 +1085,6 @@ rtmsg(int cmd, int flags, int fmask, uin rtm.rtm_mpls = mpls_flags; rtm.rtm_hdrlen = sizeof(rtm); - if (rtm_addrs & RTA_NETMASK) - mask_addr(_dst, _mask, RTA_DST); /* store addresses in ascending order of RTA values */ NEXTADDR(RTA_DST, so_dst); NEXTADDR(RTA_GATEWAY, so_gate); @@ -1118,34 +1113,6 @@ rtmsg(int cmd, int flags, int fmask, uin } #undef rtm return (0); -} - -void -mask_addr(union sockunion *addr, union sockunion *mask, int which) -{ - int olen = mask->sa.sa_len; - char *cp1 = olen + (char *)mask, *cp2; - - for (mask->sa.sa_len = 0; cp1 > (char *)mask; ) - if (*--cp1 != '\0') { - mask->sa.sa_len = 1 + cp1 - (char *)mask; - break; - } - if ((rtm_addrs & which) == 0) - return; - switch (addr->sa.sa_family) { - case AF_INET: - case AF_INET6: - case AF_UNSPEC: - return; - } - cp1 = mask->sa.sa_len + 1 + (char *)addr; - cp2 = addr->sa.sa_len + 1 + (char *)addr; - while (cp2 > cp1) - *--cp2 = '\0'; - cp2 = mask->sa.sa_len + 1 + (char *)mask; - while (cp1 > addr->sa.sa_data) - *--cp1 &= *--cp2; } char *msgtypes[] = {
Re: route add ::/0 ...
On Mon, 29 Jun 2020 19:18:17 +0200 Klemens Nanni wrote: > On Mon, Jun 29, 2020 at 11:55:10PM +0900, YASUOKA Masahiko wrote: >> The function mask_addr() doesn't mask address for IPv4 and IPv6. Does >> any address family other than IPv4 or IPv6 require #1142:1148? The >> function seems to just trim the trailing zero. Is it neccesaary? And >> it causes the confusion on the kernel. How about deleting >> mask_addr()? >> >> The diff following also fixes the problem. > Removing it breaks IPv4 default routes: > > # ifconfig lo1 rdomain 1 127.1.1.1 > # ./obj/route -nT1 add 0.0.0.0/0 127.1.1.1 > add net 0.0.0.0/0: gateway 127.1.1.1: Invalid argument > # route -nT1 add 0.0.0.0/0 127.1.1.1 > add net 0.0.0.0/0: gateway 127.1.1.1 Thanks, inet_makenetandmask() had required another treatment. Also -prefixlen 0 for -inet has a bug % doas ./obj/route -T100 add -inet 0.0.0.0 -prefixlen 0 127.0.0.1 add net 0.0.0.0: gateway 127.0.0.1 % netstat -nrf inet -T 100 Routing tables Internet: DestinationGatewayFlags Refs Use Mtu Prio Iface 0.0.0.0/32 127.0.0.1 UGS00 32768 8 lo100 /0 becomes /32. The diff following also fixes the problem. 
diff --git a/sbin/route/route.c b/sbin/route/route.c index 9e43d8e89b6..532a918148d 100644 --- a/sbin/route/route.c +++ b/sbin/route/route.c @@ -107,7 +107,6 @@ void print_rtmsg(struct rt_msghdr *, int); voidpmsg_common(struct rt_msghdr *); voidpmsg_addrs(char *, int); voidbprintf(FILE *, int, char *); -voidmask_addr(union sockunion *, union sockunion *, int); int getaddr(int, int, char *, struct hostent **); voidgetmplslabel(char *, int); int rtmsg(int, int, int, uint8_t); @@ -781,12 +780,9 @@ inet_makenetandmask(u_int32_t net, struct sockaddr_in *sin, int bits) sin->sin_addr.s_addr = htonl(net); sin = _mask.sin; sin->sin_addr.s_addr = htonl(mask); - sin->sin_len = 0; - sin->sin_family = 0; + sin->sin_family = AF_INET; cp = (char *)(>sin_addr + 1); - while (*--cp == '\0' && cp > (char *)sin) - continue; - sin->sin_len = 1 + cp - (char *)sin; + sin->sin_len = sizeof(struct sockaddr_in); } /* @@ -1001,7 +997,8 @@ prefixlen(int af, char *s) memset(_mask, 0, sizeof(so_mask)); so_mask.sin.sin_family = AF_INET; so_mask.sin.sin_len = sizeof(struct sockaddr_in); - so_mask.sin.sin_addr.s_addr = htonl(0x << (32 - len)); + if (len != 0) + so_mask.sin.sin_addr.s_addr = htonl(0x << (32 - len)); break; case AF_INET6: so_mask.sin6.sin6_family = AF_INET6; @@ -1088,8 +1085,6 @@ rtmsg(int cmd, int flags, int fmask, uint8_t prio) rtm.rtm_mpls = mpls_flags; rtm.rtm_hdrlen = sizeof(rtm); - if (rtm_addrs & RTA_NETMASK) - mask_addr(_dst, _mask, RTA_DST); /* store addresses in ascending order of RTA values */ NEXTADDR(RTA_DST, so_dst); NEXTADDR(RTA_GATEWAY, so_gate); @@ -1120,34 +1115,6 @@ rtmsg(int cmd, int flags, int fmask, uint8_t prio) return (0); } -void -mask_addr(union sockunion *addr, union sockunion *mask, int which) -{ - int olen = mask->sa.sa_len; - char *cp1 = olen + (char *)mask, *cp2; - - for (mask->sa.sa_len = 0; cp1 > (char *)mask; ) - if (*--cp1 != '\0') { - mask->sa.sa_len = 1 + cp1 - (char *)mask; - break; - } - if ((rtm_addrs & which) == 0) - return; - switch 
(addr->sa.sa_family) { - case AF_INET: - case AF_INET6: - case AF_UNSPEC: - return; - } - cp1 = mask->sa.sa_len + 1 + (char *)addr; - cp2 = addr->sa.sa_len + 1 + (char *)addr; - while (cp2 > cp1) - *--cp2 = '\0'; - cp2 = mask->sa.sa_len + 1 + (char *)mask; - while (cp1 > addr->sa.sa_data) - *--cp1 &= *--cp2; -} - char *msgtypes[] = { "", "RTM_ADD: Add Route",
Re: route add ::/0 ...
On Mon, 29 Jun 2020 18:45:07 +0900 (JST) YASUOKA Masahiko wrote: > On Mon, 29 Jun 2020 10:12:23 +0200 > Martin Pieuchot wrote: >> On 28/06/20(Sun) 20:41, YASUOKA Masahiko wrote: >>> Hi, >>> >>> When "::/0" is used as "default", >>> >>> # route add ::/0 fe80::1%em0 >>> add net ::/0: gateway fe80::1%em0: Invalid argument >>> >>> route command trims the sockaddr to { .len = 2, .family = AF_INET6 } >>> for "::/0", but rtable_satoplen() refuses it. I think it should be >>> accepted. >> >> rtable_satoplen() is used in many places, not just in the socket parsing >> code used by route(8). I don't know what side effects can be introduced >> by this change. >> >> Why is IPv6 different from IPv4 when it comes to the default route? > > Diferent functions are used. route(8) uses inet_makenetandmask() to > create a sockaddr for IPv4 prefix length and uses prefixlen() for IPv6 > prefix length. "/0" results: > > IPv4 > { .len = 1, .family = 0, ... } > IPv6 > { .len = 2, .family = AF_INET6, ... } I'm sorry this is not correct. It is actually IPv6 { .len = 28, .family = AF_INET6, ... } > Next, route(8) uses mask_addr() to trim the tailing zeros. > > 1129 void > 1130 mask_addr(union sockunion *addr, union sockunion *mask, int which) > 1131 { > 1132 int olen = mask->sa.sa_len; > 1133 char *cp1 = olen + (char *)mask, *cp2; > 1134 > 1135 for (mask->sa.sa_len = 0; cp1 > (char *)mask; ) > 1136 if (*--cp1 != '\0') { > 1137 mask->sa.sa_len = 1 + cp1 - (char *)mask; > 1138 break; > 1139 } > > See #1135 carefully. As the results, the sockaddrs become: > > IPv4 > { .len = 0, .family = 0, ... } > IPv6 > { .len = 2, .family = AF_INET6, ... } I'm start wondering what the mask_addr() is for. 
1123 void 1124 mask_addr(union sockunion *addr, union sockunion *mask, int which) 1125 { 1126 int olen = mask->sa.sa_len; 1127 char *cp1 = olen + (char *)mask, *cp2; 1128 1129 for (mask->sa.sa_len = 0; cp1 > (char *)mask; ) 1130 if (*--cp1 != '\0') { 1131 mask->sa.sa_len = 1 + cp1 - (char *)mask; 1132 break; 1133 } 1134 if ((rtm_addrs & which) == 0) 1135 return; 1136 switch (addr->sa.sa_family) { 1137 case AF_INET: 1138 case AF_INET6: 1139 case AF_UNSPEC: 1140 return; 1141 } 1142 cp1 = mask->sa.sa_len + 1 + (char *)addr; 1143 cp2 = addr->sa.sa_len + 1 + (char *)addr; 1144 while (cp2 > cp1) 1145 *--cp2 = '\0'; 1146 cp2 = mask->sa.sa_len + 1 + (char *)mask; 1147 while (cp1 > addr->sa.sa_data) 1148 *--cp1 &= *--cp2; 1149 } The function mask_addr() doesn't mask address for IPv4 and IPv6. Does any address family other than IPv4 or IPv6 require #1142:1148? The function seems to just trim the trailing zero. Is it neccesaary? And it causes the confusion on the kernel. How about deleting mask_addr()? The diff following also fixes the problem. 
diff --git a/sbin/route/route.c b/sbin/route/route.c index 9e43d8e89b6..87f26e5c1e7 100644 --- a/sbin/route/route.c +++ b/sbin/route/route.c @@ -107,7 +107,6 @@ void print_rtmsg(struct rt_msghdr *, int); voidpmsg_common(struct rt_msghdr *); voidpmsg_addrs(char *, int); voidbprintf(FILE *, int, char *); -voidmask_addr(union sockunion *, union sockunion *, int); int getaddr(int, int, char *, struct hostent **); voidgetmplslabel(char *, int); int rtmsg(int, int, int, uint8_t); @@ -1088,8 +1087,6 @@ rtmsg(int cmd, int flags, int fmask, uint8_t prio) rtm.rtm_mpls = mpls_flags; rtm.rtm_hdrlen = sizeof(rtm); - if (rtm_addrs & RTA_NETMASK) - mask_addr(_dst, _mask, RTA_DST); /* store addresses in ascending order of RTA values */ NEXTADDR(RTA_DST, so_dst); NEXTADDR(RTA_GATEWAY, so_gate); @@ -1120,34 +1117,6 @@ rtmsg(int cmd, int flags, int fmask, uint8_t prio) return (0); } -void -mask_addr(union sockunion *addr, union sockunion *mask, int which) -{ - int olen = mask->sa.sa_len; - char *cp1 = olen + (char *)mask, *cp2; - - for (mask->sa.sa_len = 0; cp1 > (char *)mask; ) - if (*--cp1 != '\0') { - mask->sa.sa_len = 1 + cp1 - (char *)mask; - break; - } - if ((rtm_addrs & which) == 0) - return; -
Re: route add ::/0 ...
Hi, On Mon, 29 Jun 2020 10:12:23 +0200 Martin Pieuchot wrote: > On 28/06/20(Sun) 20:41, YASUOKA Masahiko wrote: >> Hi, >> >> When "::/0" is used as "default", >> >> # route add ::/0 fe80::1%em0 >> add net ::/0: gateway fe80::1%em0: Invalid argument >> >> route command trims the sockaddr to { .len = 2, .family = AF_INET6 } >> for "::/0", but rtable_satoplen() refuses it. I think it should be >> accepted. > > rtable_satoplen() is used in many places, not just in the socket parsing > code used by route(8). I don't know what side effects can be introduced > by this change. > > Why is IPv6 different from IPv4 when it comes to the default route? Diferent functions are used. route(8) uses inet_makenetandmask() to create a sockaddr for IPv4 prefix length and uses prefixlen() for IPv6 prefix length. "/0" results: IPv4 { .len = 1, .family = 0, ... } IPv6 { .len = 2, .family = AF_INET6, ... } Next, route(8) uses mask_addr() to trim the tailing zeros. 1129 void 1130 mask_addr(union sockunion *addr, union sockunion *mask, int which) 1131 { 1132 int olen = mask->sa.sa_len; 1133 char *cp1 = olen + (char *)mask, *cp2; 1134 1135 for (mask->sa.sa_len = 0; cp1 > (char *)mask; ) 1136 if (*--cp1 != '\0') { 1137 mask->sa.sa_len = 1 + cp1 - (char *)mask; 1138 break; 1139 } See #1135 carefully. As the results, the sockaddrs become: IPv4 { .len = 0, .family = 0, ... } IPv6 { .len = 2, .family = AF_INET6, ... } Yes, we can fix IPv6 case to have .len = 0 as well. But I thought kernel should accept both cases, since the representation for IPv6 didn't seem so bad for me. > Shouldn't we change route(8) to have a `sa_len' of 0? > > That would make the following true: > > mlen = mask->sa_len; > > /* Default route */ > if (mlen == 0) > return (0) > >> Allow sockaddr for prefix length be trimmed before the key(address) >> field. 
Actually "route" command trims at the address family field for >> "::/0" >> >> Index: sys/net/rtable.c >> === >> RCS file: /cvs/src/sys/net/rtable.c,v >> retrieving revision 1.69 >> diff -u -p -r1.69 rtable.c >> --- sys/net/rtable.c 21 Jun 2019 17:11:42 - 1.69 >> +++ sys/net/rtable.c 28 Jun 2020 11:30:54 - >> @@ -887,8 +887,8 @@ rtable_satoplen(sa_family_t af, struct s >> >> ap = (uint8_t *)((uint8_t *)mask) + dp->dom_rtoffset; >> ep = (uint8_t *)((uint8_t *)mask) + mlen; >> -if (ap > ep) >> -return (-1); >> +if (ap >= ep) >> +return (0); > > That means the kernel now silently ignore sockaddr short `sa_len'. Are > they supposed to be supported or are they symptoms of bugs? I have missed rtable_satoplen() is used by other functions. > >> /* Trim trailing zeroes. */ >> while (ap < ep && ep[-1] == 0) >
route add ::/0 ...
Hi, When "::/0" is used as "default", # route add ::/0 fe80::1%em0 add net ::/0: gateway fe80::1%em0: Invalid argument The route command trims the sockaddr to { .len = 2, .family = AF_INET6 } for "::/0", but rtable_satoplen() refuses it. I think it should be accepted. ok? Allow the sockaddr for the prefix length to be trimmed before the key (address) field. The "route" command actually trims at the address family field for "::/0". Index: sys/net/rtable.c === RCS file: /cvs/src/sys/net/rtable.c,v retrieving revision 1.69 diff -u -p -r1.69 rtable.c --- sys/net/rtable.c21 Jun 2019 17:11:42 - 1.69 +++ sys/net/rtable.c28 Jun 2020 11:30:54 - @@ -887,8 +887,8 @@ rtable_satoplen(sa_family_t af, struct s ap = (uint8_t *)((uint8_t *)mask) + dp->dom_rtoffset; ep = (uint8_t *)((uint8_t *)mask) + mlen; - if (ap > ep) - return (-1); + if (ap >= ep) + return (0); /* Trim trailing zeroes. */ while (ap < ep && ep[-1] == 0)
Re: pipex(4): prevent `state_list' corruption
Yes, this seems right. ok yasuoka On Thu, 18 Jun 2020 23:53:25 +0300 Vitaliy Makkoveev wrote: > While pppac(4) destroy sessions by pipex_iface_fini() or by > pipex_ioctl() with PIPEXSMODE command, some sessions can be linked to > `state_list'. This case is not checked and sessions will never be > unlinked and `state_list' will be broken after session's memory freeing. > > Diff below adds session removal from `state_list' in > pipex_unlink_session(). Also unlinked session `state' sets to > PIPEX_STATE_CLOSED like pipex_close_session() does. > > Index: sys/net/pipex.c > === > RCS file: /cvs/src/sys/net/pipex.c,v > retrieving revision 1.115 > diff -u -p -r1.115 pipex.c > --- sys/net/pipex.c 18 Jun 2020 14:20:12 - 1.115 > +++ sys/net/pipex.c 18 Jun 2020 16:37:44 - > @@ -473,8 +473,10 @@ pipex_unlink_session(struct pipex_sessio > break; > } > #endif > - > + if (session->state == PIPEX_STATE_CLOSE_WAIT) > + LIST_REMOVE(session, state_list); > LIST_REMOVE(session, session_list); > + session->state = PIPEX_STATE_CLOSED; > > /* if final session is destroyed, stop timer */ > if (LIST_EMPTY(_session_list)) >
Re: install npppd.conf with mode 0600
The line in etc/mtree/special should be updated as well. npppd.conf type=file mode=0640 uname=root gname=wheel other than that, ok yasuoka On Sun, 21 Jun 2020 16:48:44 +0300 Vitaliy Makkoveev wrote: > We installing `npppd-users' with uid:gid root:wheel and mode 0600 > because it consists sensitive data but mode for npppd.conf is 0640. > npppd.conf can also have radius passwords and nothing requires to allow > it be readable by group. So set it's permissions to 0600. > > Index: usr.sbin/npppd/Makefile > === > RCS file: /cvs/src/usr.sbin/npppd/Makefile,v > retrieving revision 1.6 > diff -u -p -r1.6 Makefile > --- usr.sbin/npppd/Makefile 14 Mar 2013 16:20:46 - 1.6 > +++ usr.sbin/npppd/Makefile 21 Jun 2020 13:37:50 - > @@ -6,7 +6,7 @@ > SUBDIR+= npppd > > distribution: > - ${INSTALL} -C -o root -g wheel -m 0640 ${.CURDIR}/npppd/npppd.conf \ > + ${INSTALL} -C -o root -g wheel -m 0600 ${.CURDIR}/npppd/npppd.conf \ > ${DESTDIR}/etc/npppd/npppd.conf > ${INSTALL} -C -o root -g wheel -m 0600 ${.CURDIR}/npppd/npppd-users \ > ${DESTDIR}/etc/npppd/npppd-users
Re: pf "route-to least-state" in an anchor doesn't work
Hello, On Wed, 3 Jun 2020 23:30:56 +0200 Alexandr Nedvedicky wrote: > I'm OK with your change. Thank you for your review and comment. > However I would like to ask you to do yet another test. I wonder if things > will eventually work on unfixed PF if rules will be constructed as follows: > > pfctl -a test -t LB -T add 10.0.0.11@pair102 > > echo 'pass in on rdomain 102 quick proto tcp to 10.0.0.101 port 8080 \ > keep state ( sloppy ) route-to \ > least-states sticky-address' |pfctl -a test -f - > > echo 'anchor test' | pfctl -f - > > pfctl -e > > I suspect the bug you've found and fixed happens when pfctl loads rules > from pf.conf. I think the steps above will take a different route > through the code, which avoids pfr_ina_define() (a.k.a. transactions). I've tested it before the diff and after. Both tests were OK. # pfctl -sr -a test pass in quick on rdomain 102 inet proto tcp from any to 10.0.0.101 port = 8080 flags S/SA keep state (sloppy) route-to least-states sticky-address # pfctl -a test -tLB -Tshow 10.0.0.11@pair102 # $ doas route -T 101 exec telnet 10.0.0.101 8080 Trying 10.0.0.101... Connected to 10.0.0.101. Escape character is '^]'. ^] telnet> close Connection closed. $ > I don't have a test system readily available and I'm just curious > if anything changes or not. Thanks for finding that for me. > > As I've said I think your change should go in. > > OK sashan Thanks,
pf "route-to least-state" in an anchor doesn't work
Hi, pf.conf: anchor { pass in on rdomain 102 quick proto tcp to 10.0.0.101 port 8080 \ keep state ( sloppy ) route-to \ least-states sticky-address } table { 10.0.0.11@pair102 } this doesn't work. All packets going to 10.0.0.101 are dropped with 'no-route'. The problem doesn't happen if the pass rule is moved to outside of the anchor or uses "round-robin" instead of "least-states". In sys/net/pf_lb.c: 594 if (rpool->addr.type == PF_ADDR_TABLE) { 595 if (pfr_states_increase(rpool->addr.p.tbl, 596 naddr, af) == -1) { 597 if (pf_status.debug >= LOG_DEBUG) { 598 log(LOG_DEBUG,"pf: pf_map_addr: " 599 "selected address "); 600 pf_print_host(naddr, 0, af); 601 addlog(". Failed to increase count!\n"); 602 } 603 return (1); 604 } This chunk is to increase the counter for "least-state". The packets are dropped here because pfr_states_increase() returns -1. pfr_states_increase() uses pfr_kentry_byaddr(), and pfr_kentry_byaddr() uses pfr_lookup_addr() to look up a kentry in the table. pfr_lookup_addr() never succeeds for the above case, because it doesn't care about using global (root) tables from rules in an anchor. All functions other than pfr_lookup_addr() which look up a kentry from the table seem to take care of that. I thought that pfr_lookup_addr() is a local function used for ioctl to create tables and manage their members. So the diff keeps it untouched. Instead, the diff replaces the body of pfr_kentry_byaddr() by the logic of pfr_match_addr(). * * * Test 1. prepare network ifconfig pair101 rdomain 101 10.0.0.1/24 ifconfig pair102 rdomain 102 10.0.0.10/24 ifconfig pair102 alias 10.0.0.101/24 ifconfig pair103 rdomain 103 10.0.0.11/24 ifconfig pair104 rdomain 100 patch pair101 up ifconfig pair105 rdomain 100 patch pair102 up ifconfig pair106 rdomain 100 patch pair103 up ifconfig lo103 127.0.0.1/8 ifconfig lo103 alias 10.0.0.101/24 ifconfig bridge100 add pair104 ifconfig bridge100 add pair105 ifconfig bridge100 add pair106 up 2. 
setup pf.conf anchor { pass in on rdomain 102 quick proto tcp to 10.0.0.101 port 8080 \ keep state ( sloppy ) route-to \ least-states sticky-address } table { 10.0.0.11@pair102 } 3. start a daemon on 8080/tcp on #103 doas route -T 103 exec nc -l 8080 4. try to connect to it from #101 doas route -T 101 exec telnet 10.0.0.101 8080 - test OK if the connection is established 5. teardown ifconfig pair106 destroy ifconfig pair105 destroy ifconfig pair104 destroy ifconfig pair103 destroy ifconfig pair102 destroy ifconfig pair101 destroy ifconfig bridge100 destroy * * * ok? Fix pfr_kentry_byaddr() to be used for a rule in an anchor. It couldn't find an entry if its table is attached a table on the root. This fixes the problem "route-to least-states" doesn't work. The problem is found by IIJ. Index: sys/net/pf_table.c === RCS file: /cvs/src/sys/net/pf_table.c,v retrieving revision 1.131 diff -u -p -r1.131 pf_table.c --- sys/net/pf_table.c 8 Jul 2019 17:49:57 - 1.131 +++ sys/net/pf_table.c 3 Jun 2020 07:21:27 - @@ -2085,11 +2085,28 @@ int pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af) { struct pfr_kentry *ke = NULL; + int match; + + ke = pfr_kentry_byaddr(kt, a, af, 0); + + match = (ke && !(ke->pfrke_flags & PFRKE_FLAG_NOT)); + if (match) + kt->pfrkt_match++; + else + kt->pfrkt_nomatch++; + + return (match); +} + +struct pfr_kentry * +pfr_kentry_byaddr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af, +int exact) +{ + struct pfr_kentry *ke = NULL; struct sockaddr_in tmp4; #ifdef INET6 struct sockaddr_in6 tmp6; #endif /* INET6 */ - int match; if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) kt = kt->pfrkt_root; @@ -2116,12 +2133,10 @@ pfr_match_addr(struct pfr_ktable *kt, st default: unhandled_af(af); } - match = (ke && !(ke->pfrke_flags & PFRKE_FLAG_NOT)); - if (match) - kt->pfrkt_match++; - else - kt->pfrkt_nomatch++; - return (match); + if (exact && ke && KENTRY_NETWORK(ke)) + ke = NULL; + + return (ke); } void @@ 
-2497,39 +2512,6 @@ pfr_states_decrease(struct pfr_ktable *k "pfr_states_decrease: states-- when states <= 0");
Re: diff: init efifb even if VGA is probed.
On Thu, 28 May 2020 12:31:31 +0200 (CEST) Mark Kettenis wrote: >> Date: Thu, 28 May 2020 17:01:48 +0900 (JST) >> From: YASUOKA Masahiko >> >> Hi, >> >> I'd like to conclude this issue. >> >> On Fri, 21 Feb 2020 14:09:07 +0900 (JST) >> YASUOKA Masahiko wrote: >> > I am testing a new hardware, HPE DL20 Gen10. >> > >> > When efiboot starts the kernel, the video display becomes distorted >> > and never recovered until CPU reset. >> > >> > Our kernel tries to initialized console twice, first trial is done >> > before getting boot info and second trial is done after getting boot >> > info. Since EFI framebuffer needs "boot info", it is initialized on >> > second trial. >> > >> > On HPE DL20 Gen10, probing vga is succeeded on first trial, the kernel >> > selects vga for the console, but actually it is broken. On usual >> > machines which boot with EFI, the problem doesn't happen since they >> > have no vga. >> >> If we have a way to detect whether the machine has VGA. ACPI >> FADT_NO_VGA was a candidate. But that bit is cleard both on my "HPE >> DL20 Gen10" and Andrew Daugherity's Dell PowerEdge R230. Also the >> problem newly posted at misc@ (*) might be the same problem. >> >> (*) https://marc.info/?l=openbsd-misc=159064773219779=2 >> >> I think having the diff folowing is the best for this momemnt. >> The diff does: >> >> - move cninit() after parsing bootinfo >> - give up the debug message which can be enabled if BOOTINFO_DEBUG is >> defined >> >> ok? > > I suspect we have to accept that there is too much broken hardware out > there. Finally we might have no way other than having a configuration in boot.conf... > There is no real reason to drop the debug messages. OK, the debug messages are reverted. > I'd prefer to call cninit() directly from init_x86_64, so I'd just > move the call immediately after the block that calls getbootinfo(). > And emove the call from getbootinfo() of course. I think the last diff already satisfied these things. 
>> @@ -1395,11 +1395,6 @@ init_x86_64(paddr_t first_avail) >> i8254_startclock(); >> >> /* >> - * Attach the glass console early in case we need to display a panic. >> - */ >> -cninit(); >> - >> -/* >> * Initialize PAGE_SIZE-dependent variables. >> */ >> uvm_setpagesize(); >> @@ -1421,6 +1416,8 @@ init_x86_64(paddr_t first_avail) >> } else >> panic("invalid /boot"); >> >> +cninit(); >> + >> /* >> * Memory on the AMD64 port is described by three different things. >> * A hidden line which calls getbootinfo() is at just before second chunk. The updated diff was created with "-U 4" to clarify this. Alternatively, are you suggesting getbootinfo(bootinfo, bootinfo_size); + cninit(); } else panic("invalid /boot"); ? Both is OK for me. How about this? Index: sys/arch/amd64/amd64/machdep.c === RCS file: /cvs/src/sys/arch/amd64/amd64/machdep.c,v retrieving revision 1.264 diff -u -p -U4 -r1.264 machdep.c --- sys/arch/amd64/amd64/machdep.c 16 May 2020 14:44:44 - 1.264 +++ sys/arch/amd64/amd64/machdep.c 28 May 2020 11:34:39 - @@ -1394,13 +1394,8 @@ init_x86_64(paddr_t first_avail) i8254_startclock(); /* -* Attach the glass console early in case we need to display a panic. -*/ - cninit(); - - /* * Initialize PAGE_SIZE-dependent variables. */ uvm_setpagesize(); @@ -1420,8 +1415,10 @@ init_x86_64(paddr_t first_avail) getbootinfo(bootinfo, bootinfo_size); } else panic("invalid /boot"); + cninit(); + /* * Memory on the AMD64 port is described by three different things. * * 1. biosbasemem - This is outdated, and should really only be used to @@ -1926,10 +1923,8 @@ getbootinfo(char *bootinfo, int bootinfo bootarg32_t *q; bios_ddb_t *bios_ddb; bios_bootduid_t *bios_bootduid; bios_bootsr_t *bios_bootsr; - int docninit = 0; - #undef BOOTINFO_DEBUG #ifdef BOOTINFO_DEBUG printf("bootargv:"); #endif @@ -1982,11 +1977,8 @@ getbootinfo(char *bootinfo, int bootinfo comconsunit = unit; comconsaddr = consaddr; comconsrate = cdp->conspeed;
Re: diff: init efifb even if VGA is probed.
On Thu, 28 May 2020 17:01:48 +0900 (JST) YASUOKA Masahiko wrote: > Hi, > > I'd like to conclude this issue. > > On Fri, 21 Feb 2020 14:09:07 +0900 (JST) > YASUOKA Masahiko wrote: >> I am testing a new hardware, HPE DL20 Gen10. >> >> When efiboot starts the kernel, the video display becomes distorted >> and never recovered until CPU reset. >> >> Our kernel tries to initialized console twice, first trial is done >> before getting boot info and second trial is done after getting boot >> info. Since EFI framebuffer needs "boot info", it is initialized on >> second trial. >> >> On HPE DL20 Gen10, probing vga is succeeded on first trial, the kernel >> selects vga for the console, but actually it is broken. On usual >> machines which boot with EFI, the problem doesn't happen since they >> have no vga. > > If we have a way to detect whether the machine has VGA. ACPI > FADT_NO_VGA was a candidate. But that bit is cleard both on my "HPE > DL20 Gen10" and Andrew Daugherity's Dell PowerEdge R230. Also the > problem newly posted at misc@ (*) might be the same problem. Above paragraph may be unclear. Let me update it by the following paragraph. If we have a way to detect whether the machine has VGA, we thought we can select VGA or EFI framebuffer safely. ACPI FADT_NO_VGA was a candidate. But the bit is cleared both on my "HPE DL20 Gen10" and Andrew Daugherity's Dell PowerEdge R230. Also the problem newly posted at misc@ (*) might be the same problem. > (*) https://marc.info/?l=openbsd-misc=159064773219779=2 > > I think having the diff folowing is the best for this momemnt. > The diff does: > > - move cninit() after parsing bootinfo > - give up the debug message which can be enabled if BOOTINFO_DEBUG is > defined > > ok? 
> > Index: sys/arch/amd64/amd64/machdep.c > === > RCS file: /disk/cvs/openbsd/src/sys/arch/amd64/amd64/machdep.c,v > retrieving revision 1.264 > diff -u -p -r1.264 machdep.c > --- sys/arch/amd64/amd64/machdep.c16 May 2020 14:44:44 - 1.264 > +++ sys/arch/amd64/amd64/machdep.c28 May 2020 07:40:14 - > @@ -1395,11 +1395,6 @@ init_x86_64(paddr_t first_avail) > i8254_startclock(); > > /* > - * Attach the glass console early in case we need to display a panic. > - */ > - cninit(); > - > - /* >* Initialize PAGE_SIZE-dependent variables. >*/ > uvm_setpagesize(); > @@ -1421,6 +1416,8 @@ init_x86_64(paddr_t first_avail) > } else > panic("invalid /boot"); > > + cninit(); > + > /* > * Memory on the AMD64 port is described by three different things. > * > @@ -1927,12 +1924,6 @@ getbootinfo(char *bootinfo, int bootinfo > bios_ddb_t *bios_ddb; > bios_bootduid_t *bios_bootduid; > bios_bootsr_t *bios_bootsr; > - int docninit = 0; > - > -#undef BOOTINFO_DEBUG > -#ifdef BOOTINFO_DEBUG > - printf("bootargv:"); > -#endif > > for (q = (bootarg32_t *)bootinfo; > (q->ba_type != BOOTARG_END) && > @@ -1942,24 +1933,15 @@ getbootinfo(char *bootinfo, int bootinfo > switch (q->ba_type) { > case BOOTARG_MEMMAP: > bios_memmap = (bios_memmap_t *)q->ba_arg; > -#ifdef BOOTINFO_DEBUG > - printf(" memmap %p", bios_memmap); > -#endif > break; > case BOOTARG_DISKINFO: > bios_diskinfo = (bios_diskinfo_t *)q->ba_arg; > -#ifdef BOOTINFO_DEBUG > - printf(" diskinfo %p", bios_diskinfo); > -#endif > break; > case BOOTARG_APMINFO: > /* generated by i386 boot loader */ > break; > case BOOTARG_CKSUMLEN: > bios_cksumlen = *(u_int32_t *)q->ba_arg; > -#ifdef BOOTINFO_DEBUG > - printf(" cksumlen %d", bios_cksumlen); > -#endif > break; > case BOOTARG_PCIINFO: > /* generated by i386 boot loader */ > @@ -1983,15 +1965,8 @@ getbootinfo(char *bootinfo, int bootinfo > comconsaddr = consaddr; > comconsrate = cdp->conspeed; > comconsiot = X86_BUS_SPACE_IO; > - > - /* Probe the serial port this time. */ > - docninit+
Re: diff: init efifb even if VGA is probed.
Hi, I'd like to conclude this issue. On Fri, 21 Feb 2020 14:09:07 +0900 (JST) YASUOKA Masahiko wrote: > I am testing a new hardware, HPE DL20 Gen10. > > When efiboot starts the kernel, the video display becomes distorted > and never recovered until CPU reset. > > Our kernel tries to initialized console twice, first trial is done > before getting boot info and second trial is done after getting boot > info. Since EFI framebuffer needs "boot info", it is initialized on > second trial. > > On HPE DL20 Gen10, probing vga is succeeded on first trial, the kernel > selects vga for the console, but actually it is broken. On usual > machines which boot with EFI, the problem doesn't happen since they > have no vga. If we had a way to detect whether the machine has VGA, we could select VGA or the EFI framebuffer safely. ACPI FADT_NO_VGA was a candidate, but that bit is cleared both on my "HPE DL20 Gen10" and on Andrew Daugherity's Dell PowerEdge R230. Also, the problem newly posted at misc@ (*) might be the same problem. (*) https://marc.info/?l=openbsd-misc=159064773219779=2 I think the following diff is the best option for the moment. The diff does: - move cninit() after parsing bootinfo - give up the debug message which can be enabled if BOOTINFO_DEBUG is defined ok? Index: sys/arch/amd64/amd64/machdep.c === RCS file: /disk/cvs/openbsd/src/sys/arch/amd64/amd64/machdep.c,v retrieving revision 1.264 diff -u -p -r1.264 machdep.c --- sys/arch/amd64/amd64/machdep.c 16 May 2020 14:44:44 - 1.264 +++ sys/arch/amd64/amd64/machdep.c 28 May 2020 07:40:14 - @@ -1395,11 +1395,6 @@ init_x86_64(paddr_t first_avail) i8254_startclock(); /* -* Attach the glass console early in case we need to display a panic. -*/ - cninit(); - - /* * Initialize PAGE_SIZE-dependent variables. */ uvm_setpagesize(); @@ -1421,6 +1416,8 @@ init_x86_64(paddr_t first_avail) } else panic("invalid /boot"); + cninit(); + /* * Memory on the AMD64 port is described by three different things. 
* @@ -1927,12 +1924,6 @@ getbootinfo(char *bootinfo, int bootinfo bios_ddb_t *bios_ddb; bios_bootduid_t *bios_bootduid; bios_bootsr_t *bios_bootsr; - int docninit = 0; - -#undef BOOTINFO_DEBUG -#ifdef BOOTINFO_DEBUG - printf("bootargv:"); -#endif for (q = (bootarg32_t *)bootinfo; (q->ba_type != BOOTARG_END) && @@ -1942,24 +1933,15 @@ getbootinfo(char *bootinfo, int bootinfo switch (q->ba_type) { case BOOTARG_MEMMAP: bios_memmap = (bios_memmap_t *)q->ba_arg; -#ifdef BOOTINFO_DEBUG - printf(" memmap %p", bios_memmap); -#endif break; case BOOTARG_DISKINFO: bios_diskinfo = (bios_diskinfo_t *)q->ba_arg; -#ifdef BOOTINFO_DEBUG - printf(" diskinfo %p", bios_diskinfo); -#endif break; case BOOTARG_APMINFO: /* generated by i386 boot loader */ break; case BOOTARG_CKSUMLEN: bios_cksumlen = *(u_int32_t *)q->ba_arg; -#ifdef BOOTINFO_DEBUG - printf(" cksumlen %d", bios_cksumlen); -#endif break; case BOOTARG_PCIINFO: /* generated by i386 boot loader */ @@ -1983,15 +1965,8 @@ getbootinfo(char *bootinfo, int bootinfo comconsaddr = consaddr; comconsrate = cdp->conspeed; comconsiot = X86_BUS_SPACE_IO; - - /* Probe the serial port this time. */ - docninit++; } #endif -#ifdef BOOTINFO_DEBUG - printf(" console 0x%x:%d", - cdp->consdev, cdp->conspeed); -#endif } break; case BOOTARG_BOOTMAC: @@ -2023,8 +1998,6 @@ getbootinfo(char *bootinfo, int bootinfo case BOOTARG_EFIINFO: bios_efiinfo = (bios_efiinfo_t *)q->ba_arg; - if (bios_efiinfo->fb_addr != 0) - docninit++; break; case BOOTARG_UCODE: @@ -2032,18 +2005,9 @@ getbootinfo(char *bootinfo, int bootinfo break; default: -#ifdef BOOTINFO_DEBUG - printf(" unsupported arg (%d) %p", q->ba_type, - q->ba_arg); -#endif break; } } - if (docninit > 0) - cninit(); -#ifdef BOOTINFO_DEBUG - printf("\n"); -#endif } int
fix pppac(4) without pipex
Hi, The following diff fixes panics when using pppac(4) with "pipex no". Index: sys/net/if_pppx.c === RCS file: /cvs/src/sys/net/if_pppx.c,v retrieving revision 1.83 diff -u -p -r1.83 if_pppx.c --- sys/net/if_pppx.c 10 Apr 2020 07:36:52 - 1.83 +++ sys/net/if_pppx.c 12 Apr 2020 06:12:35 - @@ -344,7 +344,7 @@ pppxwrite(dev_t dev, struct uio *uio, in if (m == NULL) return (ENOBUFS); mlen = MHLEN; - if (uio->uio_resid >= MINCLSIZE) { + if (uio->uio_resid > MHLEN) { MCLGET(m, M_DONTWAIT); if (!(m->m_flags & M_EXT)) { m_free(m); @@ -1368,7 +1368,7 @@ pppacwrite(dev_t dev, struct uio *uio, i if (m == NULL) return (ENOMEM); - if (uio->uio_resid > MINCLSIZE) { + if (uio->uio_resid > MHLEN) { m_clget(m, M_WAITOK, uio->uio_resid); if (!ISSET(m->m_flags, M_EXT)) { m_free(m);
Re: pipex(4) fix: check session existence before creation
ok yasuoka On Mon, 6 Apr 2020 19:54:20 +0300 Vitaliy Makkoveev wrote: > Deny to create pipex_session which is already exist. Newly created > session will be placed to list head so the caller of > pipex_*_lookup_session() will receive wrong session. > > Index: sys/net/if_pppx.c > === > RCS file: /cvs/src/sys/net/if_pppx.c,v > retrieving revision 1.79 > diff -u -p -r1.79 if_pppx.c > --- sys/net/if_pppx.c 6 Apr 2020 12:31:30 - 1.79 > +++ sys/net/if_pppx.c 6 Apr 2020 13:47:26 - > @@ -719,6 +719,11 @@ pppx_add_session(struct pppx_dev *pxd, s > return (EPROTONOSUPPORT); > } > > + session = pipex_lookup_by_session_id(req->pr_protocol, > + req->pr_session_id); > + if (session) > + return (EEXIST); > + > pxi = pool_get(pppx_if_pl, PR_WAITOK | PR_ZERO); > if (pxi == NULL) > return (ENOMEM); > Index: sys/net/pipex.c > === > RCS file: /cvs/src/sys/net/pipex.c,v > retrieving revision 1.112 > diff -u -p -r1.112 pipex.c > --- sys/net/pipex.c 6 Apr 2020 13:14:04 - 1.112 > +++ sys/net/pipex.c 6 Apr 2020 13:47:33 - > @@ -312,6 +312,11 @@ pipex_add_session(struct pipex_session_r > return (EPROTONOSUPPORT); > } > > + session = pipex_lookup_by_session_id(req->pr_protocol, > + req->pr_session_id); > + if (session) > + return (EEXIST); > + > /* prepare a new session */ > session = pool_get(_session_pool, PR_WAITOK | PR_ZERO); > session->state = PIPEX_STATE_OPENED; >
Re: Prevent memory corruption by pipex_timer()
Hi, Sorry for my silence. ok yasuoka for the daemon part. On Wed, 1 Apr 2020 09:27:10 +0200 Martin Pieuchot wrote: > On 31/03/20(Tue) 23:16, Vitaliy Makkoveev wrote: >> On Tue, Mar 31, 2020 at 06:15:46PM +0200, Martin Pieuchot wrote: >> > [...] >> > Well better fix npppd(8), no? Not crashing the kernel is priority 1. >> I made patch for npppd(8) too. I include it to this email below, without >> starting new thread, ok? If ioctl(PIPEXASESSION) failed and error != >> ENXIO it means that pipex is enabled and session creation failed so down >> this connection. > > Thanks, I committed the kernel part. I'm waiting to see if other devs > want to comment on the daemon part. > >> > Then if the daemon has a bug, should the kernel work around it? >> In most common cases should :( > > That's an opinion. There's no true or false answer. Working around it > has obvious advantages but it doesn't make us fix existing bug and instead > force us to maintain the work around. > > It is argued that the "failing hard" model, as it is practised in OpenBSD > software development, has the advantage of resulting in simpler code because > every layer is responsible for handling errors and doesn't pile workaround. > > This bug is a nice example. Thanks for the report! If you could submit > your refactoring in a new thread, I'd love to look at it. >
Re: diff: init efifb even if VGA is probed.
Hi, Thank you for your test and feedback. On Fri, 6 Mar 2020 16:38:24 -0600 Andrew Daugherity wrote: > On Sun, Mar 1, 2020 at 10:41 PM YASUOKA Masahiko wrote: >> >> Hi, >> >> The problems you are pointing seem to be the same problem. >> >> > I'll try to test this diff next week if I can schedule some downtime. >> >> Test is appreciated. >> >> Also I'd like to know the result of >> >> hexdump -C /var/db/acpi/FACP.1 >> >> when "Load Legacy Video Option ROM" setting is disabled. > > I just tested a -current kernel built yesterday with that diff (your > post on Feb. 20), but unfortunately it does not fix the issue on my > hardware. As before, if "Load Legacy Video Option ROM" is disabled, > output is squished to a purple line and when devices are initialized, > vga1 is the wsdisplay0 device: I see, first diff didn't fix the problem on your machine. > vga1 at pci7 dev 0 function 0 "Matrox MGA G200eR" rev 0x01 > wsdisplay0 at vga1 mux 1: console (80x25, vt100 emulation) > wsdisplay0: screen 1-5 added (80x25, vt100 emulation) > efifb0 at mainbus0: 1280x1024, 32bpp > wsdisplay at efifb0 not configured > > vs. with the legacy video ROM setting: > > "Matrox MGA G200eR" rev 0x01 at pci7 dev 0 function 0 not configured > efifb0 at mainbus0: 1024x768, 32bpp > wsdisplay0 at efifb0 mux 1 > wsdisplay0: screen 0-5 added (std, vt100 emulation) > > I'm using a serial console, if it matters. Hmm... I just noticed that > with the legacy ROM setting disabled, both wsdisplay0 at vga1 mux > 1/wskbd0 at ukbd0 *and* com1 claim to be the console. With the > setting enabled (and efifb working), only com1 is listed as console. > > I haven't tried any of the later diffs as I'm not sure which are still > recommended. The last diff should fix the problem since it will initialize efifb before initializing VGA without condition. https://marc.info/?l=openbsd-tech=158280719421562=2 > The FACP.1 table does not change when the "Load Legacy Video Option > ROM" setting is changed. 
Here is its hexdump: > andrew@gsc-lb1:~/acpidump$ hexdump -C legacy-2.8.1/FACP.1 > 46 41 43 50 0c 01 00 00 05 62 44 45 4c 4c 20 20 |FACP.bDELL | > 0010 50 45 5f 53 43 33 20 20 00 00 00 00 44 45 4c 4c |PE_SC3 DELL| > 0020 01 00 00 00 00 30 f8 8e 00 b0 fc 8e 00 04 09 00 |.0..| > 0030 b2 00 00 00 f0 f1 f2 00 00 18 00 00 00 00 00 00 || > 0040 04 18 00 00 00 00 00 00 50 18 00 00 08 18 00 00 |P...| > 0050 80 18 00 00 00 00 00 00 04 02 01 04 20 00 10 00 | ...| > 0060 65 00 e9 03 00 00 00 00 01 03 0d 00 32 11 00 00 |e...2...| > 0070 a5 86 00 00 01 08 00 01 f9 0c 00 00 00 00 00 00 || > 0080 06 00 00 00 00 00 00 00 00 00 00 00 00 b0 fc 8e || > 0090 00 00 00 00 01 20 00 02 00 18 00 00 00 00 00 00 |. ..| > 00a0 01 00 00 02 00 00 00 00 00 00 00 00 01 10 00 02 || > 00b0 04 18 00 00 00 00 00 00 01 00 00 02 00 00 00 00 || > 00c0 00 00 00 00 01 08 00 01 50 18 00 00 00 00 00 00 |P...| > 00d0 01 20 00 03 08 18 00 00 00 00 00 00 01 00 00 01 |. ..| > 00e0 80 18 00 00 00 00 00 00 01 00 00 01 00 00 00 00 || > 00f0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 || > 0100 00 00 00 00 00 00 00 00 00 00 00 00 || > 010c This was to check whether using "VGA Not Present" bit is useful on your machine. "Boot IA-PC Boot Architecture Flags" is 0x6D:6E = 0x0011, LEGACY_DEVICES bit is set, "VGA Not Present" is cleared. This means the bit isn't set as I expected, it isn't useful to know existance of VGA. > The only ACPI change made by toggling that option is the DMAR.25 > table. Here are both versions: > Legacy Video ROM enabled: > 44 4d 41 52 90 00 00 00 01 83 49 4e 54 45 4c 20 |DMAR..INTEL | > 0010 47 4e 4c 52 00 00 00 00 01 00 00 00 49 4e 54 4c |GNLRINTL| > 0020 01 00 00 00 26 01 00 00 00 00 00 00 00 00 00 00 |&...| > 0030 00 00 20 00 01 00 00 00 00 00 d9 fe 00 00 00 00 |.. .| > 0040 03 08 00 00 02 f0 1f 00 04 08 00 00 00 00 1f 00 || > 0050 01 00 20 00 00 00 00 00 00 b0 ba 7c 00 00 00 00 |.. 
|| > 0060 ff 2f bb 84 00 00 00 00 01 08 00 00 00 01 00 00 |./..| > 0070 01 00 20 00 00 00 00 00 00 10 31 8e 00 00 00 00 |.. ...1.| > 0080 ff 0f 33 8e 00 00 00 00 01 08 00 00 00 00 14 00 |..3.| > 0090 > and disabled: >