selecting proper GOP when there are multiple GOPs

2022-05-02 Thread YASUOKA Masahiko
Hello,

The below diff originally posted by Alexei K. on bugs@:

  Garbled screen when booting with UEFI
  https://marc.info/?l=openbsd-bugs&m=165087969227708&w=2

The same problem has been reported periodically, and we have asked
people to use "machine gop" to work around it.  But the diff from
Alexei seems to be the proper way to fix it.

I've tested it on some of my machines, including an HPE DL20 Gen10
which has a virtual video and serial console.

I'd like to commit the diff and ask people to test it.

ok?

Index: sys/arch/amd64/stand/efiboot/efiboot.c
===
RCS file: /disk/cvs/openbsd/src/sys/arch/amd64/stand/efiboot/efiboot.c,v
retrieving revision 1.38
diff -u -p -r1.38 efiboot.c
--- sys/arch/amd64/stand/efiboot/efiboot.c  7 Jun 2021 00:04:20 -   
1.38
+++ sys/arch/amd64/stand/efiboot/efiboot.c  2 May 2022 07:53:38 -
@@ -424,8 +424,9 @@ efi_memprobe_internal(void)
 /***
  * Console
  ***/
-static SIMPLE_TEXT_OUTPUT_INTERFACE *conout = NULL;
-static SIMPLE_INPUT_INTERFACE   *conin;
+static SIMPLE_TEXT_OUTPUT_INTERFACE*conout = NULL;
+static SIMPLE_INPUT_INTERFACE  *conin;
+static EFI_GRAPHICS_OUTPUT *gop = NULL;
 static EFI_GUID con_guid
= EFI_CONSOLE_CONTROL_PROTOCOL_GUID;
 static EFI_GUID gop_guid
@@ -444,6 +445,30 @@ efi_video_init(void)
int  i, mode80x25, mode100x31;
UINTNcols, rows;
EFI_STATUS   status;
+   EFI_HANDLE  *handles;
+   UINTNnhandles;
+   EFI_GRAPHICS_OUTPUT *first_gop = NULL;
+   EFI_DEVICE_PATH *devp_test = NULL;
+
+   status = BS->LocateHandleBuffer(ByProtocol, _guid, NULL, ,
+   );
+   if (status != EFI_SUCCESS)
+   panic("BS->LocateHandleBuffer() returns %d", status);
+   for (i = 0; i < nhandles; i++) {
+   status = BS->HandleProtocol(handles[i], _guid,
+   (void **));
+   if (first_gop == NULL)
+   first_gop = gop;
+   status = BS->HandleProtocol(handles[i], _guid,
+   (void **)_test);
+   if (status == EFI_SUCCESS)
+   break;
+   }
+   if (status != EFI_SUCCESS)
+   gop = first_gop;
+   if (gop == NULL)
+   panic("no gop found");
+   BS->FreePool(handles);
 
conout = ST->ConOut;
status = BS->LocateProtocol(_guid, NULL, (void **));
@@ -808,7 +833,6 @@ efi_com_putc(dev_t dev, int c)
  */
 static EFI_GUID acpi_guid = ACPI_20_TABLE_GUID;
 static EFI_GUID smbios_guid = SMBIOS_TABLE_GUID;
-static EFI_GRAPHICS_OUTPUT *gop;
 static int  gopmode = -1;
 
 #defineefi_guidcmp(_a, _b) memcmp((_a), (_b), sizeof(EFI_GUID))
@@ -853,57 +877,54 @@ efi_makebootargs(void)
/*
 * Frame buffer
 */
-   status = BS->LocateProtocol(_guid, NULL, (void **));
-   if (!EFI_ERROR(status)) {
-   if (gopmode < 0) {
-   for (i = 0; i < gop->Mode->MaxMode; i++) {
-   status = gop->QueryMode(gop, i, , );
-   if (EFI_ERROR(status))
-   continue;
-   gopsiz = gopi->HorizontalResolution *
-   gopi->VerticalResolution;
-   if (gopsiz > bestsiz) {
-   gopmode = i;
-   bestsiz = gopsiz;
-   }
+   if (gopmode < 0) {
+   for (i = 0; i < gop->Mode->MaxMode; i++) {
+   status = gop->QueryMode(gop, i, , );
+   if (EFI_ERROR(status))
+   continue;
+   gopsiz = gopi->HorizontalResolution *
+   gopi->VerticalResolution;
+   if (gopsiz > bestsiz) {
+   gopmode = i;
+   bestsiz = gopsiz;
}
}
-   if (gopmode >= 0 && gopmode != gop->Mode->Mode) {
-   curmode = gop->Mode->Mode;
-   if (efi_gop_setmode(gopmode) != EFI_SUCCESS)
-   (void)efi_gop_setmode(curmode);
-   }
-
-   gopi = gop->Mode->Info;
-   switch (gopi->PixelFormat) {
-   case PixelBlueGreenRedReserved8BitPerColor:
-   ei->fb_red_mask  = 

Re: wg(4): 'Address already in use' when wgrtable is changed

2022-04-01 Thread YASUOKA Masahiko
Hi,

On Tue, 29 Mar 2022 17:28:23 +0900
Yuichiro NAITO  wrote:
> There is one thing I'm worrying about.
> Ifconfig doesn't show wgrtable value with your patch.
> In my use case as follows, it seems that setting `wgrtable 1` is
> ignored.
> 
> ```
> # route -T1 add default `cat /etc/mygate`
> # ifconfig wg0 create wgport 7111 wgkey `cat /etc/mykey.wg0`
> # ifconfig wg0 up
> # ifconfig wg0 wgrtable 1
> # ifconfig wg0
> wg0: flags=80c3 mtu 1420
> index 6 priority 0 llprio 3
> wgport 7111
> wgpubkey e/CYTG1RGqT4jmrY0Fom8cAdtOWP7F/gBVwamyINRlg=
> groups: wg
> ```

Thank you for pointing this out.

In this case, wg0 is binding 7111/udp on rdomain 0.  So I have
supposed ignoring "wgrtable 1" is correct.  But if we configure
wgrtable when creating,

 % doas ifconfig wg0 create wgport 7111 wgrtable 1 wgkey `openssl rand -base64 
32` up
 % doas ifconfig wg0 
 wg0: flags=80c3 mtu 1420
 index 13 priority 0 llprio 3
 wgport 7111
 wgrtable 1
 wgpubkey /4v4hsi426MsVZojJ0rwRvk8kK0jSckjcU2Z1L/k5W8=
 groups: wg
 % 

It displays "wgrtable 1".  And actually

 % netstat -T0 -naf inet | grep 7111 
 % netstat -T1 -naf inet | grep 7111 
 udp  0  0  *.7111 *.*   
 % 

it binds 7111/udp on rtable 1.

So I start wondering why binding 7111/udp on table 1 fails with
EADDRINUSE when 7111/udp on rtable 0 is used.

> On 3/28/22 15:59, YASUOKA Masahiko wrote:
>> On Mon, 28 Mar 2022 15:20:02 +0900
>> Yuichiro NAITO  wrote:
>>> Thanks for the explanation.
>>> I understand how your patch works.
>>>
>>> I want to ask the goal of your patch.
>>> It seems just removing 'Address already in use' message.
>>> Is my guessing right?
>> Yes.  There is nothing to do, since the command is to bind the same
>> port, protocol, and domain of prevous.
>> The code seems to do such the skip already, but it lacks consideration
>> for rtable_l2(rtable) != rtable case.
>> 
>>> On 3/28/22 14:01, YASUOKA Masahiko wrote:
>>>> Hi,
>>>> On Mon, 28 Mar 2022 12:12:39 +0900
>>>> Yuichiro NAITO  wrote:
>>>>> On 3/27/22 18:25, YASUOKA Masahiko wrote:
>>>>>> Hi,
>>>>>> On Wed, 9 Mar 2022 15:28:44 +0900
>>>>>> Yuichiro NAITO  wrote:
>>>>>>> I see 'Address already in use' message,
>>>>>>> when I change wgrtable for a running wg interface.
>>>>>>> It doesn't make sense to me.
>>>>>>>
>>>>>>> It can be reproduced by the following command sequence.
>>>>>>>
>>>>>>> ```
>>>>>>> # route -T1 add default `cat /etc/mygate`
>>>>>>> # ifconfig wg0 create wgport 7111 wgkey `cat /etc/mykey.wg0`
>>>>>>> # ifconfig wg0 up
>>>>>>> # ifconfig wg0 wgrtable 1
>>>>>>> ifconfig: SIOCSWG: Address already in use
>>>>>>> ```
>>>>>>>
>>>>>>> When I down wg0 interface before changing wgrtable,
>>>>>>> It succeeds and no messages are shown.
>>>>>>>
>>>>>>> I investigated the reason why 'Address already in use' is shown.
>>>>>>>
>>>>>>> If wgrtable is specified by ifconfig argument,
>>>>>>> `wg_ioctl_set` function in `sys/net/if_wg.c` is called.
>>>>>>>
>>>>>>> And if the wg interface is running, `wg_bind` function is called.
>>>>>>> `wg_bind` creates new sockets (IPv4 and 6) and replace them from old
>>>>>>> ones.
>>>>>>>
>>>>>>> If only wgrtable is changed, `wg_bind` binds as same port as existing
>>>>>>> sockets.
>>>>>>> So 'Address already in use' is shown.
>>>>>>>
>>>>>>> Here is a simple patch to close existing sockets before `wg_bind`.
>>>>>>> It works for me but I'm not 100% sure this is right fix.
>>>>>>>
>>>>>>> Any other ideas?
>>>>>>>
>>>>>>> ```
>>>>>>> diff --git a/sys/net/if_wg.c b/sys/net/if_wg.c
>>>>>>> index 4dae3e3c976..0159664fb34 100644
>>>>>>> --- a/sys/net/if_wg.c
>>>>>>> +++ b/sys/net/if_wg.c
>>>>>>> @@ -2253,11 +2253,14 @@ wg_ioctl_set(struct wg_softc *sc, struct
>>>

Re: wg(4): 'Address already in use' when wgrtable is changed

2022-03-28 Thread YASUOKA Masahiko
On Mon, 28 Mar 2022 15:20:02 +0900
Yuichiro NAITO  wrote:
> Thanks for the explanation.
> I understand how your patch works.
> 
> I want to ask the goal of your patch.
> It seems just removing 'Address already in use' message.
> Is my guessing right?

Yes.  There is nothing to do, since the command is to bind the same
port, protocol, and domain as the previous one.

The code seems to do such a skip already, but it lacks consideration
for the rtable_l2(rtable) != rtable case.

> On 3/28/22 14:01, YASUOKA Masahiko wrote:
>> Hi,
>> On Mon, 28 Mar 2022 12:12:39 +0900
>> Yuichiro NAITO  wrote:
>>> On 3/27/22 18:25, YASUOKA Masahiko wrote:
>>>> Hi,
>>>> On Wed, 9 Mar 2022 15:28:44 +0900
>>>> Yuichiro NAITO  wrote:
>>>>> I see 'Address already in use' message,
>>>>> when I change wgrtable for a running wg interface.
>>>>> It doesn't make sense to me.
>>>>>
>>>>> It can be reproduced by the following command sequence.
>>>>>
>>>>> ```
>>>>> # route -T1 add default `cat /etc/mygate`
>>>>> # ifconfig wg0 create wgport 7111 wgkey `cat /etc/mykey.wg0`
>>>>> # ifconfig wg0 up
>>>>> # ifconfig wg0 wgrtable 1
>>>>> ifconfig: SIOCSWG: Address already in use
>>>>> ```
>>>>>
>>>>> When I down wg0 interface before changing wgrtable,
>>>>> It succeeds and no messages are shown.
>>>>>
>>>>> I investigated the reason why 'Address already in use' is shown.
>>>>>
>>>>> If wgrtable is specified by ifconfig argument,
>>>>> `wg_ioctl_set` function in `sys/net/if_wg.c` is called.
>>>>>
>>>>> And if the wg interface is running, `wg_bind` function is called.
>>>>> `wg_bind` creates new sockets (IPv4 and 6) and replace them from old
>>>>> ones.
>>>>>
>>>>> If only wgrtable is changed, `wg_bind` binds as same port as existing
>>>>> sockets.
>>>>> So 'Address already in use' is shown.
>>>>>
>>>>> Here is a simple patch to close existing sockets before `wg_bind`.
>>>>> It works for me but I'm not 100% sure this is right fix.
>>>>>
>>>>> Any other ideas?
>>>>>
>>>>> ```
>>>>> diff --git a/sys/net/if_wg.c b/sys/net/if_wg.c
>>>>> index 4dae3e3c976..0159664fb34 100644
>>>>> --- a/sys/net/if_wg.c
>>>>> +++ b/sys/net/if_wg.c
>>>>> @@ -2253,11 +2253,14 @@ wg_ioctl_set(struct wg_softc *sc, struct
>>>>> wg_data_io *data)
>>>>>   if (port != sc->sc_udp_port || rtable != sc->sc_udp_rtable) {
>>>>>   TAILQ_FOREACH(peer, >sc_peer_seq, p_seq_entry)
>>>>>   wg_peer_clear_src(peer);
>>>>>
>>>>> - if (sc->sc_if.if_flags & IFF_RUNNING)
>>>>> + if (sc->sc_if.if_flags & IFF_RUNNING) {
>>>>> + if (port == sc->sc_udp_port)
>>>>> + wg_unbind(sc);
>>>>>   if ((ret = wg_bind(sc, , )) != 0)
>>>>>   goto error;
>>>>> + }
>>>>>
>>>>>   sc->sc_udp_port = port;
>>>>>   sc->sc_udp_rtable = rtable;
>>>>>   }
>>>>> ```
>>>> If rdomain 1 exists, the error will not shown.
>>>># ifconfig vether0 rdomain 1 up
>>>># ifconfig wg0 create wgport 7111 wgkey `openssl rand -base64 32` up
>>>># ifconfig wg0 wgrtable 1
>>>>#
>>>
>>> Yes, if rdomain 1 is created before `ifconfig wg0 wgrtable 1`,
>>> setting wgrtable succeeds and there is no problem.
>>>
>>>> In the case which you reported to, it is supposed that rtable 1 exists
>>>> but rdomain 1 doesn't exist.
>>>> Even when "wgtable 1" is configured, becase there is no dedicated
>>>> rdomain, rdomain 0 will be used to bind the UDP port.
>>>
>>> Exactly, it's the case that I reported and want to fix.
>>>
>>>> So what wg(4) should do for this case is "nothing".
>>>
>>> I'm a little bit confused.
>>> As you said, I can confirm your patch doesn't set wgrtable in my use
>>> case.
>>> It is not the result that I wanted

Re: wg(4): 'Address already in use' when wgrtable is changed

2022-03-27 Thread YASUOKA Masahiko
Hi,

On Mon, 28 Mar 2022 12:12:39 +0900
Yuichiro NAITO  wrote:
> On 3/27/22 18:25, YASUOKA Masahiko wrote:
>> Hi,
>> On Wed, 9 Mar 2022 15:28:44 +0900
>> Yuichiro NAITO  wrote:
>>> I see 'Address already in use' message,
>>> when I change wgrtable for a running wg interface.
>>> It doesn't make sense to me.
>>>
>>> It can be reproduced by the following command sequence.
>>>
>>> ```
>>> # route -T1 add default `cat /etc/mygate`
>>> # ifconfig wg0 create wgport 7111 wgkey `cat /etc/mykey.wg0`
>>> # ifconfig wg0 up
>>> # ifconfig wg0 wgrtable 1
>>> ifconfig: SIOCSWG: Address already in use
>>> ```
>>>
>>> When I down wg0 interface before changing wgrtable,
>>> It succeeds and no messages are shown.
>>>
>>> I investigated the reason why 'Address already in use' is shown.
>>>
>>> If wgrtable is specified by ifconfig argument,
>>> `wg_ioctl_set` function in `sys/net/if_wg.c` is called.
>>>
>>> And if the wg interface is running, `wg_bind` function is called.
>>> `wg_bind` creates new sockets (IPv4 and 6) and replace them from old
>>> ones.
>>>
>>> If only wgrtable is changed, `wg_bind` binds as same port as existing
>>> sockets.
>>> So 'Address already in use' is shown.
>>>
>>> Here is a simple patch to close existing sockets before `wg_bind`.
>>> It works for me but I'm not 100% sure this is right fix.
>>>
>>> Any other ideas?
>>>
>>> ```
>>> diff --git a/sys/net/if_wg.c b/sys/net/if_wg.c
>>> index 4dae3e3c976..0159664fb34 100644
>>> --- a/sys/net/if_wg.c
>>> +++ b/sys/net/if_wg.c
>>> @@ -2253,11 +2253,14 @@ wg_ioctl_set(struct wg_softc *sc, struct
>>> wg_data_io *data)
>>> if (port != sc->sc_udp_port || rtable != sc->sc_udp_rtable) {
>>> TAILQ_FOREACH(peer, >sc_peer_seq, p_seq_entry)
>>> wg_peer_clear_src(peer);
>>>
>>> -   if (sc->sc_if.if_flags & IFF_RUNNING)
>>> +   if (sc->sc_if.if_flags & IFF_RUNNING) {
>>> +   if (port == sc->sc_udp_port)
>>> +   wg_unbind(sc);
>>> if ((ret = wg_bind(sc, , )) != 0)
>>> goto error;
>>> +   }
>>>
>>> sc->sc_udp_port = port;
>>> sc->sc_udp_rtable = rtable;
>>> }
>>> ```
>> If rdomain 1 exists, the error will not shown.
>>   # ifconfig vether0 rdomain 1 up
>>   # ifconfig wg0 create wgport 7111 wgkey `openssl rand -base64 32` up
>>   # ifconfig wg0 wgrtable 1
>>   #
> 
> Yes, if rdomain 1 is created before `ifconfig wg0 wgrtable 1`,
> setting wgrtable succeeds and there is no problem.
> 
>> In the case which you reported to, it is supposed that rtable 1 exists
>> but rdomain 1 doesn't exist.
>> Even when "wgtable 1" is configured, becase there is no dedicated
>> rdomain, rdomain 0 will be used to bind the UDP port.
> 
> Exactly, it's the case that I reported and want to fix.
> 
>> So what wg(4) should do for this case is "nothing".
> 
> I'm a little bit confused.
> As you said, I can confirm your patch doesn't set wgrtable in my use
> case.
> It is not the result that I wanted.

   # ifconfig wg0 create wgport 7111 wgkey `openssl rand -base64 32` up
   -> bind 7111/udp on rdomain 0  (1)

is expected. (1)

   # ifconfig wg0 wgrtable 1
   -> bind 7111/udp on rdomain 0  (2)

is expected, since there is no "rdomain 1".

Trying to do both (1) and (2) causes EADDRINUSE, since (2) would bind
the same port, proto, and domain again.  The latest diff skips (2)
properly.

Previous

>>   -  if (port != sc->sc_udp_port || rtable != sc->sc_udp_rtable) {

"rtable != sc->sc_udp_rtable" was wrong since rdomain for rtable may
not exist.  This is the cause of EADDRINUSE.


>> So the diff is updated.
>> ok?
>> Index: sys/net/if_wg.c
>> ===
>> RCS file: /disk/cvs/openbsd/src/sys/net/if_wg.c,v
>> retrieving revision 1.22
>> diff -u -p -r1.22 if_wg.c
>> --- sys/net/if_wg.c  22 Feb 2022 01:15:02 -  1.22
>> +++ sys/net/if_wg.c  27 Mar 2022 09:17:08 -
>> @@ -2250,7 +2250,8 @@ wg_ioctl_set(struct wg_softc *sc, struct
>>  else
>>  rtable = sc->sc_udp_rtable;
>>   -  if (port != sc->sc_udp_port || rtable != sc->sc_udp_rtable) {
>> +if (port != sc->sc_udp_port ||
>> +rtable_l2(rtable) != sc->sc_udp_rtable) {
>>  TAILQ_FOREACH(peer, >sc_peer_seq, p_seq_entry)
>>  wg_peer_clear_src(peer);
>>   
> 
> -- 
> Yuichiro NAITO (naito.yuich...@gmail.com)
> 



Re: wg(4): 'Address already in use' when wgrtable is changed

2022-03-27 Thread YASUOKA Masahiko
Hi,

On Wed, 9 Mar 2022 15:28:44 +0900
Yuichiro NAITO  wrote:
> I see 'Address already in use' message,
> when I change wgrtable for a running wg interface.
> It doesn't make sense to me.
> 
> It can be reproduced by the following command sequence.
> 
> ```
> # route -T1 add default `cat /etc/mygate`
> # ifconfig wg0 create wgport 7111 wgkey `cat /etc/mykey.wg0`
> # ifconfig wg0 up
> # ifconfig wg0 wgrtable 1
> ifconfig: SIOCSWG: Address already in use
> ```
> 
> When I down wg0 interface before changing wgrtable,
> It succeeds and no messages are shown.
> 
> I investigated the reason why 'Address already in use' is shown.
> 
> If wgrtable is specified by ifconfig argument,
> `wg_ioctl_set` function in `sys/net/if_wg.c` is called.
> 
> And if the wg interface is running, `wg_bind` function is called.
> `wg_bind` creates new sockets (IPv4 and 6) and replace them from old
> ones.
> 
> If only wgrtable is changed, `wg_bind` binds as same port as existing
> sockets.
> So 'Address already in use' is shown.
> 
> Here is a simple patch to close existing sockets before `wg_bind`.
> It works for me but I'm not 100% sure this is right fix.
> 
> Any other ideas?
> 
> ```
> diff --git a/sys/net/if_wg.c b/sys/net/if_wg.c
> index 4dae3e3c976..0159664fb34 100644
> --- a/sys/net/if_wg.c
> +++ b/sys/net/if_wg.c
> @@ -2253,11 +2253,14 @@ wg_ioctl_set(struct wg_softc *sc, struct
> wg_data_io *data)
>   if (port != sc->sc_udp_port || rtable != sc->sc_udp_rtable) {
>   TAILQ_FOREACH(peer, >sc_peer_seq, p_seq_entry)
>   wg_peer_clear_src(peer);
> 
> - if (sc->sc_if.if_flags & IFF_RUNNING)
> + if (sc->sc_if.if_flags & IFF_RUNNING) {
> + if (port == sc->sc_udp_port)
> + wg_unbind(sc);
>   if ((ret = wg_bind(sc, , )) != 0)
>   goto error;
> + }
> 
>   sc->sc_udp_port = port;
>   sc->sc_udp_rtable = rtable;
>   }
> ```

If rdomain 1 exists, the error will not be shown.

 # ifconfig vether0 rdomain 1 up
 # ifconfig wg0 create wgport 7111 wgkey `openssl rand -base64 32` up
 # ifconfig wg0 wgrtable 1
 # 

In the case you reported, rtable 1 presumably exists but rdomain 1
doesn't exist.

Even when "wgrtable 1" is configured, because there is no dedicated
rdomain, rdomain 0 will be used to bind the UDP port.

So what wg(4) should do for this case is "nothing".

So the diff is updated.

ok?

Index: sys/net/if_wg.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/if_wg.c,v
retrieving revision 1.22
diff -u -p -r1.22 if_wg.c
--- sys/net/if_wg.c 22 Feb 2022 01:15:02 -  1.22
+++ sys/net/if_wg.c 27 Mar 2022 09:17:08 -
@@ -2250,7 +2250,8 @@ wg_ioctl_set(struct wg_softc *sc, struct
else
rtable = sc->sc_udp_rtable;
 
-   if (port != sc->sc_udp_port || rtable != sc->sc_udp_rtable) {
+   if (port != sc->sc_udp_port ||
+   rtable_l2(rtable) != sc->sc_udp_rtable) {
TAILQ_FOREACH(peer, >sc_peer_seq, p_seq_entry)
wg_peer_clear_src(peer);
 



Re: parallel ip forwarding

2021-12-30 Thread YASUOKA Masahiko
Hi,

On Sat, 25 Dec 2021 21:50:47 +0300
Vitaliy Makkoveev  wrote:
> On Fri, Dec 24, 2021 at 12:50:23PM +0100, Alexander Bluhm wrote:
>> On Fri, Dec 24, 2021 at 04:16:28PM +0900, YASUOKA Masahiko wrote:
>> > > - npppd l2pt ipsecflowinfo is not MP safe
>> > 
>> > Does this mean the things we are discussing on the "Fix
>> > ipsp_spd_lookup() for transport mode" thread?  I wonder if there is
>> > another issue.
>> 
>> In this mail thread I was concerned about things might get worse.
>> 
>> Currently I see these problems:
>> 
>> tdb_free() will be called with a shared netlock.  From there
>> ipsp_ids_free() is called.
>> 
>> if (--ids->id_refcount > 0)
>> return;
>> 
>> This ref count needs to be atomic.
>> 
>> if (LIST_EMPTY(_ids_gc_list))
>> timeout_add_sec(_ids_gc_timeout, 1);
>> LIST_INSERT_HEAD(_ids_gc_list, ids, id_gc_list);
>> 
>> And some mutex should protect ipsp_ids_gc_list.

Thanks, I think I understand the problem now.

> The diff below adds `ipsec_flows_mtx' mutex(9) to protect `ipsp_ids_*'
> list and trees. ipsp_ids_lookup() returns `ids' with bumped reference
> counter.

This direction seems good.

One thing, I found a problem.

> Index: sys/netinet/ip_spd.c
> ===
> RCS file: /cvs/src/sys/netinet/ip_spd.c,v
> retrieving revision 1.110
> diff -u -p -r1.110 ip_spd.c
> --- sys/netinet/ip_spd.c  16 Dec 2021 15:38:03 -  1.110
> +++ sys/netinet/ip_spd.c  25 Dec 2021 18:34:22 -
> @@ -418,6 +418,7 @@ ipsp_spd_lookup(struct mbuf *m, int af, 
>   /* Cached entry is good. */
>   error = ipsp_spd_inp(m, inp, ipo, tdbout);
>   mtx_leave(_tdb_mtx);
> + ipsp_ids_free(ids);
>   return error;
>  
>nomatchout:
> @@ -452,6 +453,7 @@ ipsp_spd_lookup(struct mbuf *m, int af, 
>   dignore ?  : >ipo_dst,
>   ipo->ipo_sproto, ids ? ids: ipo->ipo_ids,
>   >ipo_addr, >ipo_mask);
> + ipsp_ids_free(ids);
>   mtx_enter(_tdb_mtx);
>   if ((tdbp_new != NULL) &&
>   (tdbp_new->tdb_flags & TDBF_DELETED)) {

ids will remain unfreed, since there are some code paths which don't
pass through the above lines.

I tried to fix that, but adding a lot of ipsp_ids_free() calls looks
like a mess.  Instead, how about changing ipsp_spd_lookup() to take a
"struct ipsec_ids *ids" as an argument and letting the caller take
responsibility for the ids?

Index: sys/net/if_bridge.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/if_bridge.c,v
retrieving revision 1.362
diff -u -p -r1.362 if_bridge.c
--- sys/net/if_bridge.c 23 Dec 2021 12:21:48 -  1.362
+++ sys/net/if_bridge.c 30 Dec 2021 08:12:18 -
@@ -1595,7 +1595,7 @@ bridge_ipsec(struct ifnet *ifp, struct e
}
} else { /* Outgoing from the bridge. */
error = ipsp_spd_lookup(m, af, hlen, IPSP_DIRECTION_OUT,
-   NULL, NULL, , 0);
+   NULL, NULL, , NULL);
if (error == 0 && tdb != NULL) {
/*
 * We don't need to do loop detection, the
Index: sys/net/if_veb.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/if_veb.c,v
retrieving revision 1.21
diff -u -p -r1.21 if_veb.c
--- sys/net/if_veb.c8 Nov 2021 04:15:46 -   1.21
+++ sys/net/if_veb.c30 Dec 2021 08:12:18 -
@@ -746,7 +746,7 @@ veb_ipsec_proto_out(struct mbuf *m, sa_f
 #endif
 
tdb = ipsp_spd_lookup(m, af, iphlen, , IPSP_DIRECTION_OUT,
-   NULL, NULL, 0);
+   NULL, NULL, NULL);
if (tdb == NULL)
return (m);
 
Index: sys/netinet/ip_ipsp.c
===
RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_ipsp.c,v
retrieving revision 1.267
diff -u -p -r1.267 ip_ipsp.c
--- sys/netinet/ip_ipsp.c   20 Dec 2021 15:59:09 -  1.267
+++ sys/netinet/ip_ipsp.c   30 Dec 2021 08:12:18 -
@@ -47,6 +47,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -84,6 +86,13 @@ void tdb_hashstats(void);
do { } while (0)
 #endif
 
+/*
+ * Locks used to protect global data and struct members:
+ * F   ipsec_flows_mtx
+ */
+
+struct mutex ipsec_flows_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
+
 inttdb_rehash(void);
 void   tdb_tim

Re: parallel ip forwarding

2021-12-23 Thread YASUOKA Masahiko
Hello,

On Fri, 24 Dec 2021 00:55:04 +0100
Alexander Bluhm  wrote:
> On Fri, Dec 03, 2021 at 08:35:45PM +0100, Alexander Bluhm wrote:
>> Note that IPsec still has the workaround to disable multiple queues.
> 
> I think we can remove the ipsec_in_use workaround now.  The IPsec
> path is protected with the kernel lock.
> 
> There are some issues left:
> - npppd l2pt ipsecflowinfo is not MP safe

Does this mean the things we are discussing on the "Fix
ipsp_spd_lookup() for transport mode" thread?  I wonder if there is
another issue.

> - the acquire SA feature is not MP safe
> - Hrvoje has seen a panic with sasync



Re: Fix ipsp_spd_lookup() for transport mode

2021-12-23 Thread YASUOKA Masahiko
Hi,

On Mon, 20 Dec 2021 13:20:46 +0100
Alexander Bluhm  wrote:
> On Tue, Dec 14, 2021 at 06:25:20PM +0900, YASUOKA Masahiko wrote:
>> Yes, if there is another better idea, it will be welcome.
>> For this moment, the diff is the best idea for me.
> 
> Sorry, no better idea.  I have no experience with l2tp.  Codewise
> the diff looks fine, but I don't understand the consequences.

Thank you for your review and comments.

>> +if (tdbflow != NULL)
>> +rn = rn_lookup((caddr_t)>tdb_filter,
>> +(caddr_t)>tdb_filtermask, rnh);
> 
> Does rn_lookup() modify the radix tree?  It looks like rn_lookup ->
> rn_addmask -> rn_insert() does that.  This will make it impossible
> to make IPsec MP capable.  The radix tree is not MP safe, art has
> been implemented as an alternative.  An ipsp_spd_lookup() should
> not modify the flows.  It is strange that a function named rn_lookup()
> does modifications.  Did I miss something?

rn_lookup() doesn't make any modification.  rn_lookup() calls
rn_addmask() with second argument search=1.

 183 /* return a perfect match if m_arg is set, else do a regular rn_match */
 184 struct radix_node *
 185 rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head)
 186 {
 187 struct radix_node *x, *tm;
 188 caddr_t netmask = 0;
 189 
 190 if (m_arg) {
 191 tm = rn_addmask(m_arg, 1, head->rnh_treetop->rn_off);

and then rn_addmask()

 416 struct radix_node *
 417 rn_addmask(void *n_arg, int search, int skip)
 418 {
 (snip)
 449 if (tm || search)
 450 return (tm);
 451 tm = malloc(max_keylen + 2 * sizeof(*tm), M_RTABLE, M_NOWAIT | 
M_ZERO);
 452 if (tm == NULL)
 453 return (0);
 454 saved_tm = tm;
 455 netmask = cp = (caddr_t)(tm + 2);
 456 memcpy(cp, addmask_key, mlen);
 457 tm = rn_insert(cp, mask_rnhead, , tm);

returns at #449-450 before calling rn_insert().  It seems that
rn_addmask() performs only read-only operations when "search" is set.

> Why do you call rn_lookup() here?

Because rn_match() doesn't take a mask and returns the best match.

For example, if there are multiple peers behind a NAT, flows like
the ones below can be configured at the same time.

  (a) Windows:  REMOTE_IP:1701/udp <=> LOCAL_IP:1701/udp
  (b) Linux:REMOTE_IP:ANY/udp  <=> LOCAL_IP:1701/udp

If the source port of a packet from the Linux peer is 1701, rn_match()
will return (a) for it, and then ipsp_spd_lookup() will fail to verify
that the given tdb matches the policy.

Policies can be created with wildcards (any port, any protocol), and
they are then compared with a packet whose port and protocol are
concrete.

Since rn_match() is meant to find the best match, it can't find a
wildcard policy properly if there is a non-wildcard policy which is
overlapped by the wildcard.

So the diff uses rn_lookup() to find the correct policy.


> Could we add the masks earlier when the flows are added?
> 
>> +else if (tdbp != NULL)
>> +rn = rn_lookup((caddr_t)>tdb_filter,
>> +(caddr_t)>tdb_filtermask, rnh);
> 
> What are the consequences of this chunk for regular IPsec?

I have thought about that again.  Now I realize the problem exists
only for transport mode.  For tunnel mode, since best match is always
preferred, rn_lookup() should be used.  I'll update the diff so that it
uses rn_lookup() for transport mode only.

>>  /* Match source/dest IDs. */
>> -if (ipo->ipo_ids)
>> -if (tdbp->tdb_ids == NULL ||
>> -!ipsp_ids_match(ipo->ipo_ids, 
>> tdbp->tdb_ids))
>> +if (ipo->ipo_ids != NULL) {
>> +if ((tdbp->tdb_flags & TDBF_TUNNELING) == 0 &&
>> +(tdbp->tdb_flags & TDBF_UDPENCAP) != 0) {
>> +/*
>> + * Skip IDs check for transport mode
>> + * with NAT-T.  Multiple clients (IDs)
>> + * can use a same policy.
>> + */
>> +} else if (tdbp->tdb_ids == NULL &&
>> +!ipsp_ids_match(ipo->ipo_ids,
>> +tdbp->tdb_ids))
>>  goto nomatchin;
>> +}
> 
> This was added to make IPsec/l2tp work in rev 1.85.  And now you
> change it to make it work.  I wish markus@ or mikeb@ could give a
> clue.

At the change of 1.85, "ipsec-id bundles" is intr

Re: Fix ipsp_spd_lookup() for transport mode

2021-12-14 Thread YASUOKA Masahiko
Hi,

On Tue, 14 Dec 2021 01:20:49 +0100
Alexander Bluhm  wrote:
> I don't know much about l2tp, pipex or npppd.  So I cannot say if
> the new logic is correct.  But I guess you have tested that.

Yes, I've tested some L2TP/IPsec cases already.

> The tdb mutex and ref counting looks correct.
> 
>> +struct tdb *tdb, *tdblocal = NULL;
> 
> The variable names tdb and tdbp are used very inconsistently within
> IPsec.  Don't use both.  I think tdbp and a tdbflow are sufficient.

Ok,

> 
>> +if (ipsecflowinfo != 0)
>> +ids = ipsp_ids_lookup(ipsecflowinfo);
> 
> Can you move that to the place where it is needed?

Yes,

> Perhaps it is easier to understand this way:
> 
>   if (ipsecflowinfo != 0) {

Sure.  Let me update the diff.

> It is hard to say whether the new
> rn_lookup(tdbp->tdb_filter/tdbp->tdb_filtermask) changes existing
> IPsec behavior for setups without l2tp.

I believe it causes no regression on other setups.
But I'll look at it more carefully and test the other setups.

> Do we need it there?

Yes.  If there is a better idea, it will be welcome.
For the moment, this diff is the best idea I have.

> I never ran into problems patching the correct policy.

Index: sys/netinet/ip_ipsp.c
===
RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_ipsp.c,v
retrieving revision 1.264
diff -u -p -r1.264 ip_ipsp.c
--- sys/netinet/ip_ipsp.c   11 Dec 2021 16:33:47 -  1.264
+++ sys/netinet/ip_ipsp.c   14 Dec 2021 06:32:07 -
@@ -91,6 +91,8 @@ void  tdb_soft_timeout(void *);
 void   tdb_soft_firstuse(void *);
 inttdb_hash(u_int32_t, union sockaddr_union *, u_int8_t);
 void   tdb_dodelete(struct tdb *, int locked);
+intsockaddr_encap_match(struct sockaddr_encap *,
+   struct sockaddr_encap *, struct sockaddr_encap *);
 
 int ipsec_in_use = 0;
 u_int64_t ipsec_last_added = 0;
@@ -510,6 +512,78 @@ gettdbbysrc(u_int rdomain, union sockadd
tdb_ref(tdbp);
mtx_leave(_sadb_mtx);
return tdbp;
+}
+
+/*
+ * Get an SA given the flow, the direction, the security protocol type, and
+ * the desired IDs.
+ */
+struct tdb *
+gettdbbyflow(u_int rdomain, int direction, struct sockaddr_encap *senflow,
+u_int8_t sproto, struct ipsec_ids *ids)
+{
+   u_int32_t hashval;
+   struct tdb *tdbp;
+   union sockaddr_union srcdst;
+
+   if (ids == NULL)/* ids is mandatory */
+   return NULL;
+
+   memset(, 0, sizeof(srcdst));
+   switch (senflow->sen_type) {
+   case SENT_IP4:
+   srcdst.sin.sin_len = sizeof(srcdst.sin);
+   srcdst.sin.sin_family = AF_INET;
+   if (direction == IPSP_DIRECTION_OUT)
+   srcdst.sin.sin_addr = senflow->Sen.Sip4.Dst;
+   else
+   srcdst.sin.sin_addr = senflow->Sen.Sip4.Src;
+   break;
+   case SENT_IP6:
+   srcdst.sin6.sin6_len = sizeof(srcdst.sin6);
+   srcdst.sin6.sin6_family = AF_INET6;
+   if (direction == IPSP_DIRECTION_OUT)
+   srcdst.sin6.sin6_addr = senflow->Sen.Sip6.Dst;
+   else
+   srcdst.sin6.sin6_addr = senflow->Sen.Sip6.Src;
+   break;
+   }
+
+   mtx_enter(_sadb_mtx);
+   hashval = tdb_hash(0, , sproto);
+
+   for (tdbp = tdbdst[hashval]; tdbp != NULL; tdbp = tdbp->tdb_dnext)
+   if (tdbp->tdb_sproto == sproto &&
+   tdbp->tdb_rdomain == rdomain &&
+   (tdbp->tdb_flags & TDBF_INVALID) == 0 &&
+   ipsp_ids_match(ids, tdbp->tdb_ids) &&
+   ((direction == IPSP_DIRECTION_OUT &&
+   !memcmp(>tdb_dst, , srcdst.sa.sa_len)) ||
+   (direction == IPSP_DIRECTION_IN &&
+   !memcmp(>tdb_src, , srcdst.sa.sa_len {
+   if (sockaddr_encap_match(>tdb_filter,
+   >tdb_filtermask, senflow))
+   break;
+   }
+
+   tdb_ref(tdbp);
+   mtx_leave(_sadb_mtx);
+   return tdbp;
+}
+
+int
+sockaddr_encap_match(struct sockaddr_encap *addr, struct sockaddr_encap *mask,
+struct sockaddr_encap *dest)
+{
+   size_t  off;
+
+   for (off = offsetof(struct sockaddr_encap, sen_type);
+   off < dest->sen_len; off++) {
+   if ((*((u_char *)addr + off) & *((u_char *)mask + off)) !=
+   (*((u_char *)dest + off) & *((u_char *)mask + off)))
+   break;
+   }
+   return (off < dest->sen_len)? 0 : 1;
 }
 
 #ifdef DDB
Index: sys/netinet/ip_ipsp.h
===
RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_ipsp.h,v
retrieving revision 1.230
diff -u -p -r1.230 ip_ipsp.h
--- sys/netinet/ip_ipsp.h   11 Dec 2021 16:33:47 -  1.230
+++ 

Re: Fix ipsp_spd_lookup() for transport mode

2021-12-01 Thread YASUOKA Masahiko
On Wed, 1 Dec 2021 00:27:06 +0100
Alexander Bluhm  wrote:
> On Tue, Nov 30, 2021 at 05:53:34PM +0300, Vitaliy Makkoveev wrote:
>> Hi,
>> 
>> This question is mostly for bluhm@. Should gettdbbyflow() grab an
>> extra reference on the returned `tdbp' like the other gettdb*() do? I'm
>> pointing this out because we are not going to rely on the netlock when
>> dereferencing `tdbp'.
> 
> Yes.  Call tdb_ref(tdbp) within the tdb_sadb_mtx mutex.
> 
> The interesting question is when to unref it.  You use the same
> variable for the tdb parameter and the tdb from gettdbbyflow().
> Tracking when you don't use the new TDB anymore, gets tricky.

Let me update the diff.  It grabs a reference now.

Also the diff fixes gettdbbyflow().  Comparing ids was missing.


Index: sys/netinet/ip_ipsp.c
===
RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_ipsp.c,v
retrieving revision 1.258
diff -u -p -r1.258 ip_ipsp.c
--- sys/netinet/ip_ipsp.c   29 Nov 2021 19:19:00 -  1.258
+++ sys/netinet/ip_ipsp.c   1 Dec 2021 12:19:53 -
@@ -90,6 +90,8 @@ void  tdb_firstuse(void *);
 void   tdb_soft_timeout(void *);
 void   tdb_soft_firstuse(void *);
 inttdb_hash(u_int32_t, union sockaddr_union *, u_int8_t);
+intsockaddr_encap_match(struct sockaddr_encap *,
+   struct sockaddr_encap *, struct sockaddr_encap *);
 
 int ipsec_in_use = 0;
 u_int64_t ipsec_last_added = 0;
@@ -507,6 +509,78 @@ gettdbbysrc(u_int rdomain, union sockadd
tdb_ref(tdbp);
mtx_leave(_sadb_mtx);
return tdbp;
+}
+
+/*
+ * Get an SA given the flow, the direction, the security protocol type, and
+ * the desired IDs.
+ */
+struct tdb *
+gettdbbyflow(u_int rdomain, int direction, struct sockaddr_encap *senflow,
+u_int8_t sproto, struct ipsec_ids *ids)
+{
+   u_int32_t hashval;
+   struct tdb *tdbp;
+   union sockaddr_union srcdst;
+
+   if (ids == NULL)/* ids is mandatory */
+   return NULL;
+
+   memset(, 0, sizeof(srcdst));
+   switch (senflow->sen_type) {
+   case SENT_IP4:
+   srcdst.sin.sin_len = sizeof(srcdst.sin);
+   srcdst.sin.sin_family = AF_INET;
+   if (direction == IPSP_DIRECTION_OUT)
+   srcdst.sin.sin_addr = senflow->Sen.Sip4.Dst;
+   else
+   srcdst.sin.sin_addr = senflow->Sen.Sip4.Src;
+   break;
+   case SENT_IP6:
+   srcdst.sin6.sin6_len = sizeof(srcdst.sin6);
+   srcdst.sin6.sin6_family = AF_INET6;
+   if (direction == IPSP_DIRECTION_OUT)
+   srcdst.sin6.sin6_addr = senflow->Sen.Sip6.Dst;
+   else
+   srcdst.sin6.sin6_addr = senflow->Sen.Sip6.Src;
+   break;
+   }
+
+   mtx_enter(_sadb_mtx);
+   hashval = tdb_hash(0, , sproto);
+
+   for (tdbp = tdbdst[hashval]; tdbp != NULL; tdbp = tdbp->tdb_dnext)
+   if (tdbp->tdb_sproto == sproto &&
+   tdbp->tdb_rdomain == rdomain &&
+   (tdbp->tdb_flags & TDBF_INVALID) == 0 &&
+   ipsp_ids_match(ids, tdbp->tdb_ids) &&
+   ((direction == IPSP_DIRECTION_OUT &&
+   !memcmp(>tdb_dst, , srcdst.sa.sa_len)) ||
+   (direction == IPSP_DIRECTION_IN &&
+   !memcmp(>tdb_src, , srcdst.sa.sa_len {
+   if (sockaddr_encap_match(>tdb_filter,
+   >tdb_filtermask, senflow))
+   break;
+   }
+
+   tdb_ref(tdbp);
+   mtx_leave(_sadb_mtx);
+   return tdbp;
+}
+
+int
+sockaddr_encap_match(struct sockaddr_encap *addr, struct sockaddr_encap *mask,
+struct sockaddr_encap *dest)
+{
+   size_t  off;
+
+   for (off = offsetof(struct sockaddr_encap, sen_type);
+   off < dest->sen_len; off++) {
+   if ((*((u_char *)addr + off) & *((u_char *)mask + off)) !=
+   (*((u_char *)dest + off) & *((u_char *)mask + off)))
+   break;
+   }
+   return (off < dest->sen_len)? 0 : 1;
 }
 
 #ifdef DDB
Index: sys/netinet/ip_ipsp.h
===
RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_ipsp.h,v
retrieving revision 1.224
diff -u -p -r1.224 ip_ipsp.h
--- sys/netinet/ip_ipsp.h   30 Nov 2021 13:17:43 -  1.224
+++ sys/netinet/ip_ipsp.h   1 Dec 2021 12:19:53 -
@@ -565,6 +565,8 @@ struct  tdb *gettdbbysrcdst_dir(u_int, u_
union sockaddr_union *, u_int8_t, int);
 #define gettdbbysrcdst(a,b,c,d,e) gettdbbysrcdst_dir((a),(b),(c),(d),(e),0)
 #define gettdbbysrcdst_rev(a,b,c,d,e) gettdbbysrcdst_dir((a),(b),(c),(d),(e),1)
+struct tdb *gettdbbyflow(u_int, int, struct sockaddr_encap *, u_int8_t,
+   struct ipsec_ids *);
 void   

Re: Fix ipsp_spd_lookup() for transport mode

2021-11-29 Thread YASUOKA Masahiko
Hi,

Let me update the diff.  The previous one has a problem in ipsp_spd_lookup(),
which uses "rn" without initialization.

On Sat, 20 Nov 2021 21:44:20 +0900 (JST)
YASUOKA Masahiko  wrote:
> On Wed, 12 May 2021 19:11:09 +0900 (JST)
> YASUOKA Masahiko  wrote:
>> Radek reported a problem to misc@ that multiple Windows clients behind
>> a NAT cannot use a L2TP/IPsec server simultaneously.
>> 
>> https://marc.info/?t=16099681611=1=2
>> 
>> There are two problems.  First is pipex(4) doesn't pass the proper
>> ipsecflowinfo to ip_output().  Second is the IPsec policy check which
>> is done by ipsp_spd_lookup() returns -1 (EINVAL) if the given tdb is
>> not cached.  This happens when its flow is shared by another tdb (for
>> another client of the same NAT).
> 
> This problem is not fixed yet.  The diff for the second problem was
> not committed in.  It was to fix the check in ipsp_spd_lookup() by
> making an IPsec policy have a list of IDs.
> 
> Also my colleague Kawai pointed out there is another problem if there
> is a Linux client along with Windows clients behind a NAT.  Windows
> uses 1701/udp for its local ID, but the Linux uses ANY/udp for its
> local ID.
> 
> In the situation, policies will be overlapped.
> 
>   (a) Windows:  REMOTE_IP:1701/udp <=> LOCAL_IP:1701/udp
>   (b) Linux:REMOTE_IP:ANY/udp  <=> LOCAL_IP:1701/udp
>   
> Since we use a radix tree for the policies, when rn_match() is used to
> find a policy, as it's best match, (b) is never selected.
> 
> Let me update the diff.
> 
> As for the incoming, we know which tdb is used.  The diff uses the
> tdb to find the proper policy.
> 
> As for the outgoing, other than using "ipsecflowinfo" there is no way
> to select a proper policy.  So only when "ipsecflowinfo" is used, get
> a tdb from the packet flow and the IDs (retrieved by the
> ipsecflowinfo), then we can find the proper policy by the tdb.
> 
> Also the diff skips the IDs check against the policy only if it is
> transport mode and using NAT-T.  When NAT-T is used, a policy for
> transport mode is shared by multiple clients which have different
> IDs, so checking the IDs is difficult and I think the other checks
> are enough.
> 
> ok?  comments?
> 
> Fix some problems when accepting IPsec transport mode connections from
> multiple clients behind a NAT.  In the situation, policies can be
> overlapped, but previously could not choose a proper policy both for
> incoming and outgoing.  To solve this problem, use
> tdb->tdb_filter{,mask} to find a proper policy for incoming and find the
> tdb by the given ipsecflowinfo and use it for outgoing.  Also skip
> checking IDs of the policy since a policy is shared by multiple clients
> in the situation.

Index: sys/netinet/ip_ipsp.c
===
RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_ipsp.c,v
retrieving revision 1.258
diff -u -p -r1.258 ip_ipsp.c
--- sys/netinet/ip_ipsp.c   29 Nov 2021 19:19:00 -  1.258
+++ sys/netinet/ip_ipsp.c   30 Nov 2021 04:44:48 -
@@ -90,6 +90,8 @@ void  tdb_firstuse(void *);
 void   tdb_soft_timeout(void *);
 void   tdb_soft_firstuse(void *);
 inttdb_hash(u_int32_t, union sockaddr_union *, u_int8_t);
+intsockaddr_encap_match(struct sockaddr_encap *,
+   struct sockaddr_encap *, struct sockaddr_encap *);
 
 int ipsec_in_use = 0;
 u_int64_t ipsec_last_added = 0;
@@ -507,6 +509,76 @@ gettdbbysrc(u_int rdomain, union sockadd
tdb_ref(tdbp);
mtx_leave(_sadb_mtx);
return tdbp;
+}
+
+/*
+ * Get an SA given the flow, the direction, the security protocol type, and
+ * the desired IDs.
+ */
+struct tdb *
+gettdbbyflow(u_int rdomain, int direction, struct sockaddr_encap *senflow,
+u_int8_t sproto, struct ipsec_ids *ids)
+{
+   u_int32_t hashval;
+   struct tdb *tdbp;
+   union sockaddr_union srcdst;
+
+   if (ids == NULL)/* ids is mandatory */
+   return NULL;
+
+   memset(, 0, sizeof(srcdst));
+   switch (senflow->sen_type) {
+   case SENT_IP4:
+   srcdst.sin.sin_len = sizeof(srcdst.sin);
+   srcdst.sin.sin_family = AF_INET;
+   if (direction == IPSP_DIRECTION_OUT)
+   srcdst.sin.sin_addr = senflow->Sen.Sip4.Dst;
+   else
+   srcdst.sin.sin_addr = senflow->Sen.Sip4.Src;
+   break;
+   case SENT_IP6:
+   srcdst.sin6.sin6_len = sizeof(srcdst.sin6);
+   srcdst.sin6.sin6_family = AF_INET6;
+   if (direction == IPSP_DIRECTION_OUT)
+   srcdst.sin6.sin6_addr = senflow->Sen.Sip6.Dst;
+   el

Fix ipsp_spd_lookup() for transport mode (was Re: Fix IPsec NAT-T for L2TP/IPsec)

2021-11-20 Thread YASUOKA Masahiko
Hi,

On Wed, 12 May 2021 19:11:09 +0900 (JST)
YASUOKA Masahiko  wrote:
> Radek reported a problem to misc@ that multiple Windows clients behind
> a NAT cannot use a L2TP/IPsec server simultaneously.
> 
> https://marc.info/?t=16099681611=1=2
> 
> There are two problems.  First is pipex(4) doesn't pass the proper
> ipsecflowinfo to ip_output().  Second is the IPsec policy check which
> is done by ipsp_spd_lookup() returns -1 (EINVAL) if the given tdb is
> not cached.  This happens when its flow is shared by another tdb (for
> another client of the same NAT).

This problem is not fixed yet.  The diff for the second problem was
not committed in.  It was to fix the check in ipsp_spd_lookup() by
making an IPsec policy have a list of IDs.

Also my colleague Kawai pointed out there is another problem if there
is a Linux client along with Windows clients behind a NAT.  Windows
uses 1701/udp for its local ID, but the Linux uses ANY/udp for its
local ID.

In the situation, policies will be overlapped.

  (a) Windows:  REMOTE_IP:1701/udp <=> LOCAL_IP:1701/udp
  (b) Linux:REMOTE_IP:ANY/udp  <=> LOCAL_IP:1701/udp
  
Since we use a radix tree for the policies, when rn_match() is used to
find a policy, as it's best match, (b) is never selected.

Let me update the diff.

As for the incoming, we know which tdb is used.  The diff uses the
tdb to find the proper policy.

As for the outgoing, other than using "ipsecflowinfo" there is no way
to select a proper policy.  So only when "ipsecflowinfo" is used, get
a tdb from the packet flow and the IDs (retrieved by the
ipsecflowinfo), then we can find the proper policy by the tdb.

Also the diff skips the IDs check against the policy only if it is
transport mode and using NAT-T.  When NAT-T is used, a policy for
transport mode is shared by multiple clients which have different
IDs, so checking the IDs is difficult and I think the other checks
are enough.

ok?  comments?

Fix some problems when accepting IPsec transport mode connections from
multiple clients behind a NAT.  In the situation, policies can be
overlapped, but previously could not choose a proper policy both for
incoming and outgoing.  To solve this problem, use
tdb->tdb_filter{,mask} to find a proper policy for incoming and find the
tdb by the given ipsecflowinfo and use it for outgoing.  Also skip
checking IDs of the policy since a policy is shared by multiple clients
in the situation.

Index: sys/netinet/ip_ipsp.c
===
RCS file: /cvs/src/sys/netinet/ip_ipsp.c,v
retrieving revision 1.251
diff -u -p -r1.251 ip_ipsp.c
--- sys/netinet/ip_ipsp.c   18 Nov 2021 11:04:10 -  1.251
+++ sys/netinet/ip_ipsp.c   20 Nov 2021 12:42:36 -
@@ -91,6 +91,8 @@ void  tdb_firstuse(void *);
 void   tdb_soft_timeout(void *);
 void   tdb_soft_firstuse(void *);
 inttdb_hash(u_int32_t, union sockaddr_union *, u_int8_t);
+intsockaddr_encap_match(struct sockaddr_encap *,
+   struct sockaddr_encap *, struct sockaddr_encap *);
 
 int ipsec_in_use = 0;
 u_int64_t ipsec_last_added = 0;
@@ -501,6 +503,76 @@ gettdbbysrc(u_int rdomain, union sockadd
 
mtx_leave(_sadb_mtx);
return tdbp;
+}
+
+/*
+ * Get an SA given the flow, the direction, the security protocol type, and
+ * the desired IDs.
+ */
+struct tdb *
+gettdbbyflow(u_int rdomain, int direction, struct sockaddr_encap *senflow,
+u_int8_t sproto, struct ipsec_ids *ids)
+{
+   u_int32_t hashval;
+   struct tdb *tdbp;
+   union sockaddr_union srcdst;
+
+   if (ids == NULL)/* ids is mandatory */
+   return NULL;
+
+   memset(, 0, sizeof(srcdst));
+   switch (senflow->sen_type) {
+   case SENT_IP4:
+   srcdst.sin.sin_len = sizeof(srcdst.sin);
+   srcdst.sin.sin_family = AF_INET;
+   if (direction == IPSP_DIRECTION_OUT)
+   srcdst.sin.sin_addr = senflow->Sen.Sip4.Dst;
+   else
+   srcdst.sin.sin_addr = senflow->Sen.Sip4.Src;
+   break;
+   case SENT_IP6:
+   srcdst.sin6.sin6_len = sizeof(srcdst.sin6);
+   srcdst.sin6.sin6_family = AF_INET6;
+   if (direction == IPSP_DIRECTION_OUT)
+   srcdst.sin6.sin6_addr = senflow->Sen.Sip6.Dst;
+   else
+   srcdst.sin6.sin6_addr = senflow->Sen.Sip6.Src;
+   break;
+   }
+
+   mtx_enter(_sadb_mtx);
+   hashval = tdb_hash(0, , sproto);
+
+   for (tdbp = tdbdst[hashval]; tdbp != NULL; tdbp = tdbp->tdb_dnext)
+   if (tdbp->tdb_sproto == sproto &&
+   tdbp->tdb_rdomain == rdomain &&
+   (tdbp->tdb_flags & TDBF_INVALID) == 0 &&
+   ((directi

Re: diff: ipsec.conf(5), clarify "aes" accepts 128:256 bits

2021-11-02 Thread YASUOKA Masahiko
Hi,

On Tue, 2 Nov 2021 07:03:43 +
Jason McIntyre  wrote:
> On Tue, Nov 02, 2021 at 12:02:07PM +0900, YASUOKA Masahiko wrote:
>> I'd like to clarify "aes" in ipsec.conf accepts 128:256 bits.
>> 
>> sbin/ipsecctl/ike.c:
>> 201 case ENCXF_AES:
>> 202 enc_alg = "AES";
>> 203 key_length = "128,128:256";
>> 204 break;
>> 
>> 
>> ok?
>> 
>> Clarify "aes" will accept keys whose length is in the 128:256 bits range.
>> 
> 
> i notice that the enc lists in ipsec.conf.5 and iked.conf.5 differ.
> aren't they supposed to be in sync?
> 
> for example, iked.conf.5 doesn't mention "aes" or "aesctr". also the
> *-gmac and *-gcm-12 discrepancy.

As for "aes", *only isakmpd(8)* supports "aes" keyword or having a
range for the key length.  So there isn't need to sync it to
iked.conf.5.

Also I believe "aesctr" is to support 160:288 range for key length, but
the implementation doesn't seem to be completed.  I have another plan to
handle this separately, then I'll update the man page.


Other than the key length range, it seems there are some differences
between iked.conf.5 and ipsec.conf.5.

1. "-gcm-12" 
   missing this in ipsec.conf.5 is ok since isakmpd(8) doesn't support
   it yet.  (It is actually an alias ID for "-gcm" though.)

2. "-gmac" and "null"
   iked.conf.5 has a separate list for them to clarify they don't do
   encryption.  Applied the same to isakmpd.conf.5.

3. "chacha20-poly1305"
   It is missing in ipsec.conf.5.

4. explanation of "[IKE only]" or "[phase 2]"
   It is missing in ipsec.conf.5.  Copied the section from iked.conf
   and modified it.

5. explanation of "keysize" for AES-CTR and so on
   The explanation in ipsec.conf.5 is better.  Copied that to
   iked.conf.5.

6. "cast"
   ipsecctl(8) program doesn't support "cast" keyword actually,
   it supports "cast128" instead.  Correct "cast" to "cast128"


ok?

Index: sbin/iked/iked.conf.5
===
RCS file: /cvs/src/sbin/iked/iked.conf.5,v
retrieving revision 1.87
diff -u -p -r1.87 iked.conf.5
--- sbin/iked/iked.conf.5   26 Oct 2021 17:31:22 -  1.87
+++ sbin/iked/iked.conf.5   3 Nov 2021 05:42:48 -
@@ -998,9 +998,9 @@ keyword.
 3DES requires 24 bytes to form its 168-bit key.
 This is because the most significant bit of each byte is used for parity.
 .Pp
-The keysize of AES-CTR is actually 128-bit.
+The keysize of AES-CTR can be 128, 192, or 256 bits.
 However as well as the key, a 32-bit nonce has to be supplied.
-Thus 160 bits of key material have to be supplied.
+Thus 160, 224, or 288 bits of key material, respectively, have to be supplied.
 The same applies to AES-GCM, AES-GMAC and Chacha20-Poly1305,
 however in the latter case the keysize is 256 bit.
 .Pp
Index: sbin/ipsecctl/ipsec.conf.5
===
RCS file: /cvs/src/sbin/ipsecctl/ipsec.conf.5,v
retrieving revision 1.160
diff -u -p -r1.160 ipsec.conf.5
--- sbin/ipsecctl/ipsec.conf.5  22 Oct 2021 12:30:54 -  1.160
+++ sbin/ipsecctl/ipsec.conf.5  3 Nov 2021 05:42:49 -
@@ -637,10 +637,10 @@ keyword:
 The following cipher types are permitted with the
 .Ic enc
 keyword:
-.Bl -column "aes-128-gmac" "Key Length" "Description" -offset indent
+.Bl -column "chacha20-poly1305" "128-256 bits" "Description" -offset indent
 .It Em "Cipher" Ta Em "Key Length" Ta ""
 .It Li 3des Ta "168 bits" Ta ""
-.It Li aes Ta "128 bits" Ta ""
+.It Li aes Ta "128-256 bits" Ta ""
 .It Li aes-128 Ta "128 bits" Ta ""
 .It Li aes-192 Ta "192 bits" Ta ""
 .It Li aes-256 Ta "256 bits" Ta ""
@@ -651,21 +651,37 @@ keyword:
 .It Li aes-128-gcm Ta "160 bits" Ta "[phase 2 only, IKE only]"
 .It Li aes-192-gcm Ta "224 bits" Ta "[phase 2 only, IKE only]"
 .It Li aes-256-gcm Ta "288 bits" Ta "[phase 2 only, IKE only]"
+.It Li blowfish Ta "160 bits" Ta ""
+.It Li cast128 Ta "128 bits" Ta ""
+.It Li chacha20-poly1305 Ta "288 bits" Ta ""
+.El
+.Pp
+The following cipher types provide only authentication, not encryption:
+.Bl -column "chacha20-poly1305" "128-256 bits" "Description" -offset indent
 .It Li aes-128-gmac Ta "160 bits" Ta "[phase 2 only, IKE only]"
 .It Li aes-192-g

diff: isakmpd.conf.5, clarify ANY can be used for some params

2021-11-01 Thread YASUOKA Masahiko
ok?

Clarify that ANY can be used for several parameters of IPsec transform.

Index: sbin/isakmpd/isakmpd.conf.5
===
RCS file: /cvs/src/sbin/isakmpd/isakmpd.conf.5,v
retrieving revision 1.135
diff -u -p -r1.135 isakmpd.conf.5
--- sbin/isakmpd/isakmpd.conf.5 17 Apr 2018 12:13:29 -  1.135
+++ sbin/isakmpd/isakmpd.conf.5 2 Nov 2021 02:57:23 -
@@ -726,7 +726,7 @@ See below.
 Parameters for IPsec transform configuration
 .Bl -tag -width Ds
 .It Em AUTHENTICATION_ALGORITHM
-The optional authentication algorithm in the case of this
+The optional authentication algorithm or ANY in the case of this
 being an ESP transform.
 .It Em ENCAPSULATION_MODE
 The encapsulation mode as given by the RFCs.
@@ -745,7 +745,8 @@ List of lifetimes, each element is a
 .Aq Sy Lifetime
 section name.
 .It Em TRANSFORM_ID
-The transform ID as given by the RFCs.
+The transform ID as given by the RFCs, or ANY to denote that any
+transform proposed will be accepted.
 .El
 .It Aq Sy IPsec-ID
 Parameters for IPsec ID configuration



diff: ipsec.conf(5), clarify "aes" accepts 128:256 bits

2021-11-01 Thread YASUOKA Masahiko
I'd like to clarify "aes" in ipsec.conf accepts 128:256 bits.

sbin/ipsecctl/ike.c:
201 case ENCXF_AES:
202 enc_alg = "AES";
203 key_length = "128,128:256";
204 break;


ok?

Clarify "aes" will accept keys whose length is in the 128:256 bits range.

Index: sbin/ipsecctl/ipsec.conf.5
===
RCS file: /cvs/src/sbin/ipsecctl/ipsec.conf.5,v
retrieving revision 1.160
diff -u -p -r1.160 ipsec.conf.5
--- sbin/ipsecctl/ipsec.conf.5  22 Oct 2021 12:30:54 -  1.160
+++ sbin/ipsecctl/ipsec.conf.5  2 Nov 2021 02:58:13 -
@@ -637,10 +637,10 @@ keyword:
 The following cipher types are permitted with the
 .Ic enc
 keyword:
-.Bl -column "aes-128-gmac" "Key Length" "Description" -offset indent
+.Bl -column "aes-128-gmac" "128-256 bits" "Description" -offset indent
 .It Em "Cipher" Ta Em "Key Length" Ta ""
 .It Li 3des Ta "168 bits" Ta ""
-.It Li aes Ta "128 bits" Ta ""
+.It Li aes Ta "128-256 bits" Ta ""
 .It Li aes-128 Ta "128 bits" Ta ""
 .It Li aes-192 Ta "192 bits" Ta ""
 .It Li aes-256 Ta "256 bits" Ta ""



Re: Exit status of pkg_add

2021-10-18 Thread YASUOKA Masahiko
Hi,

# drop ccing misc@

The diff seems ok for me.

ok to commit it in?

On Tue, 19 Oct 2021 10:42:04 +0900
Yuichiro NAITO  wrote:
> Following patch changes pkg_add to return a error code,
> if a package name is wrong.
> 
> diff --git a/usr.sbin/pkg_add/OpenBSD/AddDelete.pm
> b/usr.sbin/pkg_add/OpenBSD/AddDelete.pm
> index 7a968cbf05d..39bee874ff1 100644
> --- a/usr.sbin/pkg_add/OpenBSD/AddDelete.pm
> +++ b/usr.sbin/pkg_add/OpenBSD/AddDelete.pm
> @@ -403,12 +403,13 @@ sub check_root
>  sub choose_location
>  {
>   my ($state, $name, $list, $is_quirks) = @_;
>   if (@$list == 0) {
>   if (!$is_quirks) {
>   $state->errsay("Can't find #1", $name);
> + $state->{bad}++;
>   $state->run_quirks(
>   sub {
>   my $quirks = shift;
>   $quirks->filter_obsolete([$name], $state);
>   });
>   }
> 
> Is it OK?
> 
> On 10/18/21 16:53, Yuichiro NAITO wrote:
>> Hi, I have a question about exit status of pkg_add command.
>> When I wrote a package install script which included typo in a package
>> name
>> (of course it's my fault), the script didn't stop in spite of `set
>> -e`.
>> Because pkg_add command returns 0 even if a package name is wrong.
>> Is this exit status intended or design policy of pkg_add command?
>> If not, I want a error status getting returned.
>> It will save my time to look for a typo or similar bug.
>> I can't see 'EXIT STATUS' section in the pkg_add manual of OpenBSD
>> 7.0.
>> So, I e-mailed this question.
>> 
> 
> -- 
> Yuichiro NAITO (naito.yuich...@gmail.com)
> 



Re: Fix IPsec NAT-T for L2TP/IPsec

2021-05-12 Thread YASUOKA Masahiko
On Wed, 12 May 2021 19:11:09 +0900 (JST)
YASUOKA Masahiko  wrote:
> Radek reported a problem to misc@ that multiple Windows clients behind
> a NAT cannot use a L2TP/IPsec server simultaneously.
> 
> https://marc.info/?t=16099681611=1=2
> 
> There are two problems.  First is pipex(4) doesn't pass the proper
> ipsecflowinfo to ip_output().  Second is the IPsec policy check which
> is done by ipsp_spd_lookup() returns -1 (EINVAL) if the given tdb is
> not cached.  This happens when its flow is shared by another tdb (for
> another client of the same NAT).
> 
> The following 2 diffs fix these problems.
> 
> comment?
> ok?
> 
> diff #1
> 
> Fix IPsec NAT-T work with pipex.

The original diff #1 used m_tag to specify the ipsecflowinfo.

I noticed "ph_cookie" is usable instead of the m_tag.  It seems simpler.

Is it better?

Index: sys/net/if_etherip.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/if_etherip.c,v
retrieving revision 1.48
diff -u -p -r1.48 if_etherip.c
--- sys/net/if_etherip.c9 Jan 2021 21:00:58 -   1.48
+++ sys/net/if_etherip.c12 May 2021 23:29:41 -
@@ -547,7 +547,7 @@ ip_etherip_output(struct ifnet *ifp, str
etheripstat_pkt(etherips_opackets, etherips_obytes, m->m_pkthdr.len -
(sizeof(struct ip) + sizeof(struct etherip_header)));
 
-   ip_send(m);
+   ip_send(m, 0);
 
return (0);
 }
Index: sys/net/if_gif.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/if_gif.c,v
retrieving revision 1.132
diff -u -p -r1.132 if_gif.c
--- sys/net/if_gif.c20 Feb 2021 04:58:29 -  1.132
+++ sys/net/if_gif.c12 May 2021 23:29:45 -
@@ -340,7 +340,7 @@ gif_send(struct gif_softc *sc, struct mb
ip->ip_src = sc->sc_tunnel.t_src4;
ip->ip_dst = sc->sc_tunnel.t_dst4;
 
-   ip_send(m);
+   ip_send(m, 0);
break;
}
 #ifdef INET6
Index: sys/net/if_gre.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/if_gre.c,v
retrieving revision 1.171
diff -u -p -r1.171 if_gre.c
--- sys/net/if_gre.c10 Mar 2021 10:21:47 -  1.171
+++ sys/net/if_gre.c12 May 2021 23:29:52 -
@@ -1999,7 +1999,7 @@ gre_ip_output(const struct gre_tunnel *t
 
switch (tunnel->t_af) {
case AF_INET:
-   ip_send(m);
+   ip_send(m, 0);
break;
 #ifdef INET6
case AF_INET6:
Index: sys/net/pf.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/pf.c,v
retrieving revision 1.1116
diff -u -p -r1.1116 pf.c
--- sys/net/pf.c27 Apr 2021 09:38:29 -  1.1116
+++ sys/net/pf.c12 May 2021 23:29:56 -
@@ -2896,7 +2896,7 @@ pf_send_tcp(const struct pf_rule *r, sa_
 
switch (af) {
case AF_INET:
-   ip_send(m);
+   ip_send(m, 0);
break;
 #ifdef INET6
case AF_INET6:
Index: sys/net/pipex.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/pipex.c,v
retrieving revision 1.132
diff -u -p -r1.132 pipex.c
--- sys/net/pipex.c 10 Mar 2021 10:21:48 -  1.132
+++ sys/net/pipex.c 12 May 2021 23:31:24 -
@@ -1258,7 +1258,7 @@ pipex_pptp_output(struct mbuf *m0, struc
gre->flags = htons(gre->flags);
 
m0->m_pkthdr.ph_ifidx = session->ifindex;
-   ip_send(m0);
+   ip_send(m0, 0);
if (len > 0) {  /* network layer only */
/* countup statistics */
session->stat.opackets++;
@@ -1704,7 +1704,7 @@ pipex_l2tp_output(struct mbuf *m0, struc
ip->ip_tos = 0;
ip->ip_off = 0;
 
-   ip_send(m0);
+   ip_send(m0, session->proto.l2tp.ipsecflowinfo);
break;
 #ifdef INET6
case AF_INET6:
Index: sys/netinet/ip_icmp.c
===
RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_icmp.c,v
retrieving revision 1.186
diff -u -p -r1.186 ip_icmp.c
--- sys/netinet/ip_icmp.c   30 Mar 2021 08:37:10 -  1.186
+++ sys/netinet/ip_icmp.c   12 May 2021 23:31:57 -
@@ -860,7 +860,7 @@ icmp_send(struct mbuf *m, struct mbuf *o
ipstat_inc(ips_localout);
ip_send_raw(m);
} else
-   ip_send(m);
+   ip_send(m, 0);
 }
 
 u_int32_t
Index: sys/netinet/ip_input.c
===
RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_input.c,v
retrieving revision 1.359
diff -u -p -r1.359 ip_input.c
--- sys/netinet/ip_input.c  30 Apr 2021 13:52:48 -  1.359
+++ sys/n

Re: Fix IPsec NAT-T for L2TP/IPsec

2021-05-12 Thread YASUOKA Masahiko
On Wed, 12 May 2021 19:15:29 +0300
Vitaliy Makkoveev  wrote:
>> On 12 May 2021, at 18:42, YASUOKA Masahiko  wrote:
>> On Wed, 12 May 2021 17:26:51 +0300
>> Vitaliy Makkoveev  wrote:
>>> On Wed, May 12, 2021 at 07:11:09PM +0900, YASUOKA Masahiko wrote:
>>>> Radek reported a problem to misc@ that multiple Windows clients behind a 
>>>> NAT
>>>> cannot use a L2TP/IPsec server simultaneously.
>>>> 
>>>> https://marc.info/?t=16099681611=1=2
>>>> 
>>>> There are two problems.  First is pipex(4) doesn't pass the proper
>>>> ipsecflowinfo to ip_output().  Second is the IPsec policy check which is
>>>> done by ipsp_spd_lookup() returns -1 (EINVAL) if the given tdb is not
>>>> cached.  This happens when its flow is shared by another tdb (for another
>>>> client of the same NAT).
>>>> 
>>>> The following 2 diffs fix these problems.
>>>> 
>>>> comment?
>>>> ok?
>>>> 
>>> 
>>> Hi.
>>> 
>>> I have two comments for the diff 1:
>>> 
>>> 1. You should add PACKET_TAG_IPSEC_FLOWINFO description to
>>>m_tag_get(9).
>>> 2. You introduced mbuf(9) leak in pipex_l2tp_output() error path. I
>>>   pointed the place in your diff.
>> 
>> Good catch.  Thanks.
>> 
> 
> m_freem(9) accepts NULL so this check before is redundant.

Yes,

> It seems to me that "Used by the IPv4 stack to specify the IPsec flow
> of an output IP packet. The tag contains a u_int32_t identifying the
> IPsec flow.” is enough. Anyway it’s better to ask jmc@.

Ok,

> Also I like to remove PACKET_TAG_PIPEX with separate diff.

I removed PACKET_TAG_PIPEX separately.

Let me update the diff.

Index: sys/net/pipex.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/pipex.c,v
retrieving revision 1.132
diff -u -p -r1.132 pipex.c
--- sys/net/pipex.c 10 Mar 2021 10:21:48 -  1.132
+++ sys/net/pipex.c 12 May 2021 23:18:52 -
@@ -1628,6 +1628,7 @@ pipex_l2tp_output(struct mbuf *m0, struc
 #ifdef INET6
struct ip6_hdr *ip6;
 #endif
+   struct m_tag *mtag;
 
hlen = sizeof(struct pipex_l2tp_header) +
((pipex_session_is_l2tp_data_sequencing_on(session))
@@ -1704,6 +1705,15 @@ pipex_l2tp_output(struct mbuf *m0, struc
ip->ip_tos = 0;
ip->ip_off = 0;
 
+   if (session->proto.l2tp.ipsecflowinfo > 0) {
+   if ((mtag = m_tag_get(PACKET_TAG_IPSEC_FLOWINFO,
+   sizeof(u_int32_t), M_NOWAIT)) == NULL)
+   goto drop;
+   *(u_int32_t *)(mtag + 1) =
+   session->proto.l2tp.ipsecflowinfo;
+   m_tag_prepend(m0, mtag);
+   }
+
ip_send(m0);
break;
 #ifdef INET6
@@ -1733,6 +1743,7 @@ pipex_l2tp_output(struct mbuf *m0, struc
 
return;
 drop:
+   m_freem(m0);
session->stat.oerrors++;
 }
 
Index: sys/netinet/ip_input.c
===
RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_input.c,v
retrieving revision 1.359
diff -u -p -r1.359 ip_input.c
--- sys/netinet/ip_input.c  30 Apr 2021 13:52:48 -  1.359
+++ sys/netinet/ip_input.c  12 May 2021 23:18:52 -
@@ -1790,6 +1790,8 @@ ip_send_do_dispatch(void *xmq, int flags
struct mbuf_queue *mq = xmq;
struct mbuf *m;
struct mbuf_list ml;
+   struct m_tag *mtag;
+   u_int32_t ipsecflowinfo = 0;
 
mq_delist(mq, );
if (ml_empty())
@@ -1797,7 +1799,12 @@ ip_send_do_dispatch(void *xmq, int flags
 
NET_LOCK();
while ((m = ml_dequeue()) != NULL) {
-   ip_output(m, NULL, NULL, flags, NULL, NULL, 0);
+   if ((mtag = m_tag_find(m, PACKET_TAG_IPSEC_FLOWINFO, NULL))
+   != NULL) {
+   ipsecflowinfo = *(u_int32_t *)(mtag + 1);
+   m_tag_delete(m, mtag);
+   }
+   ip_output(m, NULL, NULL, flags, NULL, NULL, ipsecflowinfo);
}
NET_UNLOCK();
 }
Index: sys/sys/mbuf.h
===
RCS file: /disk/cvs/openbsd/src/sys/sys/mbuf.h,v
retrieving revision 1.252
diff -u -p -r1.252 mbuf.h
--- sys/sys/mbuf.h  25 Feb 2021 02:43:31 -  1.252
+++ sys/sys/mbuf.h  12 May 2021 23:18:52 -
@@ -469,6 +469,7 @@ struct m_tag *m_tag_next(struct mbuf *, 
 /* Packet tag types */
 #define PACKET_TAG_IPSEC_IN_DONE   0x0001  /* IPsec applied, in */
 #define PACKET_TAG_IPSEC_OUT_DONE  0x0002  /* IPsec applied, out */
+#define PACKET_TAG_IPSEC_FLOWINFO   

Re: Fix IPsec NAT-T for L2TP/IPsec

2021-05-12 Thread YASUOKA Masahiko
On Wed, 12 May 2021 17:26:51 +0300
Vitaliy Makkoveev  wrote:
> On Wed, May 12, 2021 at 07:11:09PM +0900, YASUOKA Masahiko wrote:
>> Radek reported a problem to misc@ that multiple Windows clients behind a NAT
>> cannot use a L2TP/IPsec server simultaneously.
>> 
>> https://marc.info/?t=16099681611=1=2
>> 
>> There are two problems.  First is pipex(4) doesn't pass the proper
>> ipsecflowinfo to ip_output().  Second is the IPsec policy check which is
>> done by ipsp_spd_lookup() returns -1 (EINVAL) if the given tdb is not
>> cached.  This happens when its flow is shared by another tdb (for another
>> client of the same NAT).
>> 
>> The following 2 diffs fix these problems.
>> 
>> comment?
>> ok?
>> 
> 
> Hi.
> 
> I have two comments for the diff 1:
> 
> 1. You should add PACKET_TAG_IPSEC_FLOWINFO description to
> m_tag_get(9).
> 2. You introduced mbuf(9) leak in pipex_l2tp_output() error path. I
>pointed the place in your diff.

Good catch.  Thanks.


Let me update the diff.

Index: sys/net/pipex.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/pipex.c,v
retrieving revision 1.132
diff -u -p -r1.132 pipex.c
--- sys/net/pipex.c 10 Mar 2021 10:21:48 -  1.132
+++ sys/net/pipex.c 12 May 2021 15:33:33 -
@@ -1628,6 +1628,7 @@ pipex_l2tp_output(struct mbuf *m0, struc
 #ifdef INET6
struct ip6_hdr *ip6;
 #endif
+   struct m_tag *mtag;
 
hlen = sizeof(struct pipex_l2tp_header) +
((pipex_session_is_l2tp_data_sequencing_on(session))
@@ -1704,6 +1705,15 @@ pipex_l2tp_output(struct mbuf *m0, struc
ip->ip_tos = 0;
ip->ip_off = 0;
 
+   if (session->proto.l2tp.ipsecflowinfo > 0) {
+   if ((mtag = m_tag_get(PACKET_TAG_IPSEC_FLOWINFO,
+   sizeof(u_int32_t), M_NOWAIT)) == NULL)
+   goto drop;
+   *(u_int32_t *)(mtag + 1) =
+   session->proto.l2tp.ipsecflowinfo;
+   m_tag_prepend(m0, mtag);
+   }
+
ip_send(m0);
break;
 #ifdef INET6
@@ -1733,6 +1743,8 @@ pipex_l2tp_output(struct mbuf *m0, struc
 
return;
 drop:
+   if (m0 != NULL)
+   m_freem(m0);
session->stat.oerrors++;
 }
 
Index: sys/netinet/ip_input.c
===
RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_input.c,v
retrieving revision 1.359
diff -u -p -r1.359 ip_input.c
--- sys/netinet/ip_input.c  30 Apr 2021 13:52:48 -  1.359
+++ sys/netinet/ip_input.c  12 May 2021 15:31:52 -
@@ -1790,6 +1790,8 @@ ip_send_do_dispatch(void *xmq, int flags
struct mbuf_queue *mq = xmq;
struct mbuf *m;
struct mbuf_list ml;
+   struct m_tag *mtag;
+   u_int32_t ipsecflowinfo = 0;
 
mq_delist(mq, );
if (ml_empty())
@@ -1797,7 +1799,12 @@ ip_send_do_dispatch(void *xmq, int flags
 
NET_LOCK();
while ((m = ml_dequeue()) != NULL) {
-   ip_output(m, NULL, NULL, flags, NULL, NULL, 0);
+   if ((mtag = m_tag_find(m, PACKET_TAG_IPSEC_FLOWINFO, NULL))
+   != NULL) {
+   ipsecflowinfo = *(u_int32_t *)(mtag + 1);
+   m_tag_delete(m, mtag);
+   }
+   ip_output(m, NULL, NULL, flags, NULL, NULL, ipsecflowinfo);
}
NET_UNLOCK();
 }
Index: sys/sys/mbuf.h
===
RCS file: /disk/cvs/openbsd/src/sys/sys/mbuf.h,v
retrieving revision 1.252
diff -u -p -r1.252 mbuf.h
--- sys/sys/mbuf.h  25 Feb 2021 02:43:31 -  1.252
+++ sys/sys/mbuf.h  12 May 2021 15:31:52 -
@@ -469,6 +469,7 @@ struct m_tag *m_tag_next(struct mbuf *, 
 /* Packet tag types */
 #define PACKET_TAG_IPSEC_IN_DONE   0x0001  /* IPsec applied, in */
 #define PACKET_TAG_IPSEC_OUT_DONE  0x0002  /* IPsec applied, out */
+#define PACKET_TAG_IPSEC_FLOWINFO  0x0004  /* IPsec flowinfo */
 #define PACKET_TAG_WIREGUARD   0x0040  /* WireGuard data */
 #define PACKET_TAG_GRE 0x0080  /* GRE processing done */
 #define PACKET_TAG_DLT 0x0100 /* data link layer type */
@@ -479,7 +480,7 @@ struct m_tag *m_tag_next(struct mbuf *, 
 #define PACKET_TAG_CARP_BAL_IP 0x4000  /* carp(4) ip balanced marker */
 
 #define MTAG_BITS \
-("\20\1IPSEC_IN_DONE\2IPSEC_OUT_DONE\3IPSEC_IN_CRYPTO_DONE" \
+("\20\1IPSEC_IN_DONE\2IPSEC_OUT_DONE\3IPSEC_FLOWINFO" \
 "\4IPSEC_OUT_CRYPTO_NEEDED\5IPSEC_PENDING_TDB\6BRIDGE\7WG\10GRE\11DLT" \
 "\12PF_DIVERT\14PF_REASSEMBLED\15SRCROUTE\16TUNNEL\17CARP_BAL_IP")
 
Index: share/man/man9

Fix IPsec NAT-T for L2TP/IPsec

2021-05-12 Thread YASUOKA Masahiko

Hi,

Radek reported a problem to misc@ that multiple Windows clients 
behind a NAT cannot use a L2TP/IPsec server simultaneously.


https://marc.info/?t=16099681611=1=2

There are two problems.  The first is that pipex(4) doesn't pass the
proper ipsecflowinfo to ip_output().  The second is that the IPsec
policy check, which is done by ipsp_spd_lookup(), returns -1 (EINVAL)
if the given tdb is not cached.  This happens when its flow is shared
by another tdb (for another client behind the same NAT).


The following 2 diffs fix these problems.

comment?
ok?

diff #1

Fix IPsec NAT-T work with pipex.

Index: sys/net/pipex.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/pipex.c,v
retrieving revision 1.132
diff -u -p -r1.132 pipex.c
--- sys/net/pipex.c 10 Mar 2021 10:21:48 -  1.132
+++ sys/net/pipex.c 12 May 2021 09:38:32 -
@@ -1628,6 +1628,7 @@ pipex_l2tp_output(struct mbuf *m0, struc
 #ifdef INET6
struct ip6_hdr *ip6;
 #endif
+   struct m_tag *mtag;

hlen = sizeof(struct pipex_l2tp_header) +
((pipex_session_is_l2tp_data_sequencing_on(session))
@@ -1703,6 +1704,15 @@ pipex_l2tp_output(struct mbuf *m0, struc
ip->ip_ttl = MAXTTL;
ip->ip_tos = 0;
ip->ip_off = 0;
+
+   if (session->proto.l2tp.ipsecflowinfo > 0) {
+			if ((mtag = 
m_tag_get(PACKET_TAG_IPSEC_FLOWINFO,

+   sizeof(u_int32_t), M_NOWAIT)) == NULL)
+   goto drop;
+   *(u_int32_t *)(mtag + 1) =
+   session->proto.l2tp.ipsecflowinfo;
+   m_tag_prepend(m0, mtag);
+   }

ip_send(m0);
break;
Index: sys/netinet/ip_input.c
===
RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_input.c,v
retrieving revision 1.359
diff -u -p -r1.359 ip_input.c
--- sys/netinet/ip_input.c  30 Apr 2021 13:52:48 -  1.359
+++ sys/netinet/ip_input.c  12 May 2021 09:38:32 -
@@ -1790,6 +1790,8 @@ ip_send_do_dispatch(void *xmq, int flags
struct mbuf_queue *mq = xmq;
struct mbuf *m;
struct mbuf_list ml;
+   struct m_tag *mtag;
+   u_int32_t ipsecflowinfo = 0;

mq_delist(mq, &ml);
if (ml_empty(&ml))
@@ -1797,7 +1799,12 @@ ip_send_do_dispatch(void *xmq, int flags

NET_LOCK();
while ((m = ml_dequeue(&ml)) != NULL) {
-   ip_output(m, NULL, NULL, flags, NULL, NULL, 0);
+		if ((mtag = m_tag_find(m, PACKET_TAG_IPSEC_FLOWINFO, 
NULL))

+   != NULL) {
+   ipsecflowinfo = *(u_int32_t *)(mtag + 1);
+   m_tag_delete(m, mtag);
+   }
+		ip_output(m, NULL, NULL, flags, NULL, NULL, 
ipsecflowinfo);

}
NET_UNLOCK();
 }
Index: sys/sys/mbuf.h
===
RCS file: /disk/cvs/openbsd/src/sys/sys/mbuf.h,v
retrieving revision 1.252
diff -u -p -r1.252 mbuf.h
--- sys/sys/mbuf.h  25 Feb 2021 02:43:31 -  1.252
+++ sys/sys/mbuf.h  12 May 2021 09:38:32 -
@@ -469,6 +469,7 @@ struct m_tag *m_tag_next(struct mbuf *,
 /* Packet tag types */
 #define PACKET_TAG_IPSEC_IN_DONE	0x0001  /* IPsec applied, in 
*/
 #define PACKET_TAG_IPSEC_OUT_DONE	0x0002  /* IPsec applied, out 
*/

+#define PACKET_TAG_IPSEC_FLOWINFO  0x0004  /* IPsec flowinfo */
 #define PACKET_TAG_WIREGUARD   0x0040  /* WireGuard data */
 #define PACKET_TAG_GRE			0x0080  /* GRE 
processing done */
 #define PACKET_TAG_DLT			0x0100 /* data link 
layer type */

@@ -479,7 +480,7 @@ struct m_tag *m_tag_next(struct mbuf *,
 #define PACKET_TAG_CARP_BAL_IP		0x4000  /* carp(4) ip 
balanced marker */


 #define MTAG_BITS \
-("\20\1IPSEC_IN_DONE\2IPSEC_OUT_DONE\3IPSEC_IN_CRYPTO_DONE" \
+("\20\1IPSEC_IN_DONE\2IPSEC_OUT_DONE\3IPSEC_FLOWINFO" \
 
"\4IPSEC_OUT_CRYPTO_NEEDED\5IPSEC_PENDING_TDB\6BRIDGE\7WG\10GRE\11DLT" 
\
 
"\12PF_DIVERT\14PF_REASSEMBLED\15SRCROUTE\16TUNNEL\17CARP_BAL_IP")





diff #2

Make the IPsec flow able to have multiple `ipsec_ids' so that
ipsp_spd_lookup() can check whether the `ipsec_ids' of the given tdb
belongs to a flow shared by multiple clients behind a NAT.

Index: sys/net/pfkeyv2.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/pfkeyv2.c,v
retrieving revision 1.211
diff -u -p -r1.211 pfkeyv2.c
--- sys/net/pfkeyv2.c   4 May 2021 09:28:04 -   1.211
+++ sys/net/pfkeyv2.c   12 May 2021 10:07:11 -
@@ -1106,6 +1106,7 @@ pfkeyv2_send(struct socket *so, void *me
int i, j, rval = 0, mode = PFKEYV2_SENDMESSAGE_BROADCAST;
int delflag = 0;
struct sockaddr_encap encapdst, encapnetmask;
+   struct ipsec_ids *ids, *ids0;
struct ipsec_policy *ipo;
struct ipsec_acquire *ipa;
struct 

Re: monotonic time going back by wrong skews

2021-04-06 Thread YASUOKA Masahiko
Hi,

I'm sorry.  I sent the wrong diff to people.  The results from
giovanni@ and mcmer seem wrong.  I suppose stu@ used the correct
diff.

giovanni and mcmer, can you test with the correct diff again?

I attached the correct diff at the end of this mail.

I'm sorry again.

On Tue, 6 Apr 2021 09:21:40 +0200
Giovanni Bechis  wrote:
> On Mon, Apr 05, 2021 at 07:14:49PM +0900, YASUOKA Masahiko wrote:
>> Hi,
>> 
>> > Another issue that I see is that people have not reported, at least
> [...]
>> > publicly, that this runs fine on their normal OpenBSD machines.
>> 
>> Some dmesgs posted on public lists seems to have the same problem.
>> 
>> https://marc.info/?l=openbsd-bugs=2=1=disabling+user+TSC=b
>> https://marc.info/?l=openbsd-tech=2=1=disabling+user+TSC=b
>> https://marc.info/?l=openbsd-ports=2=1=disabling+user+TSC=b
>> 
>> For example,
>> 
>> https://marc.info/?l=openbsd-bugs=161618496905444=2
>> 
>> |Subject:wg(4) crash
>> |From:   Stuart Henderson 
>> |bios0: vendor Dell Inc. version "2.9.0" date 12/06/2019
>> |bios0: Dell Inc. PowerEdge R620
>> |cpu1: disabling user TSC (skew=135)
>> |cpu1: smt 0, core 0, package 1
>> 
>> https://marc.info/?l=openbsd-ports=161306073708427=2
>> |Subject:Re: sysutils/nut README APC over USB device chgrp/chmod
>> |From:   Marcus MERIGHI 
>> |bios0: vendor American Megatrends Inc. version "3.1" date 06/07/2018
>> |cpu11: disabling user TSC (skew=240)
>> |cpu11: smt 0, core 3, package 1
>> 
>> these 2 are real machine and using 2 CPU sockets.
>> 
>> https://marc.info/?l=openbsd-ports=161562278114172=2
>> |Subject:ruby27 vs Puppet
>> |From:   Giovanni Bechis 
>> |bios0: vendor Phoenix Technologies LTD version "6.00" date 12/12/2018
>> |bios0: VMware, Inc. VMware Virtual Platform
>> |cpu1: disabling user TSC (skew=-12705)
>> 
>> VMware.  seems the same problem of mine.
>> 
>> I'll ask people to do the same test which cheloha@ write in previous
>> mail.
>> 
> Attached my data and dmesg produced by the script on my VMware vm.
> 
>  Cheers
>   Giovanni

Index: sys/arch/amd64/amd64/tsc.c
===
RCS file: /var/cvs/openbsd/src/sys/arch/amd64/amd64/tsc.c,v
retrieving revision 1.23
diff -u -p -r1.23 tsc.c
--- sys/arch/amd64/amd64/tsc.c  23 Feb 2021 04:44:30 -  1.23
+++ sys/arch/amd64/amd64/tsc.c  5 Apr 2021 10:28:00 -
@@ -311,16 +311,42 @@ tsc_read_bp(struct cpu_info *ci, uint64_
*aptscp = tsc_sync_val;
 }
 
+#define TSC_SYNC_NTIMES 1000
+
+static int tsc_difs[MAXCPUS][TSC_SYNC_NTIMES];
+
+void
+tsc_debug(void)
+{
+   int i, cpuid = curcpu()->ci_cpuid;
+
+   for (i = 0; i < TSC_SYNC_NTIMES; i++) {
+   if (i % 10 == 0)
+   printf("%5d", tsc_difs[cpuid][i]);
+   else
+   printf(" %5d", tsc_difs[cpuid][i]);
+   if (i % 10 == 9)
+   printf("\n");
+   }
+   printf("\n");
+}
+
 void
 tsc_sync_bp(struct cpu_info *ci)
 {
+   int i, mindif = INT_MAX, dif;
uint64_t bptsc, aptsc;
 
-   tsc_read_bp(ci, &bptsc, &aptsc); /* discarded - cache effects */
-   tsc_read_bp(ci, &bptsc, &aptsc);
+   for (i = 0; i < TSC_SYNC_NTIMES; i++) {
+   tsc_read_bp(ci, &bptsc, &aptsc);
+   dif = bptsc - aptsc;
+   if (abs(dif) < abs(mindif))
+   mindif = dif;
+   tsc_difs[ci->ci_cpuid][i] = dif;
+   }
 
/* Compute final value to adjust for skew. */
-   ci->ci_tsc_skew = bptsc - aptsc;
+   ci->ci_tsc_skew = mindif;
 }
 
 /*
@@ -351,8 +377,10 @@ tsc_post_ap(struct cpu_info *ci)
 void
 tsc_sync_ap(struct cpu_info *ci)
 {
-   tsc_post_ap(ci);
-   tsc_post_ap(ci);
+   int i;
+
+   for (i = 0; i < TSC_SYNC_NTIMES; i++)
+   tsc_post_ap(ci);
 }
 
 void



Re: monotonic time going back by wrong skews

2021-04-05 Thread YASUOKA Masahiko
On Mon, 5 Apr 2021 14:24:03 +0200 (CEST)
Mark Kettenis  wrote:
>> Date: Mon, 05 Apr 2021 19:14:49 +0900 (JST)
>> From: YASUOKA Masahiko 
>> 
>> Hi,
>> 
>> On Mon, 5 Apr 2021 10:43:00 +0300
>> Paul Irofti  wrote:
>> > On 05.04.2021 06:13, Scott Cheloha wrote:
>> >> On Mon, Mar 29, 2021 at 02:00:01PM +0900, YASUOKA Masahiko wrote:
>> >>> On Thu, 25 Mar 2021 19:41:35 +0100 (CET)
>> >>> Mark Kettenis  wrote:
>> >>>>> From: Scott Cheloha 
>> >>>>> Date: Thu, 25 Mar 2021 13:18:04 -0500
>> >>>>>> On Wed, Mar 24, 2021 at 05:40:21PM +0900, YASUOKA Masahiko wrote:
>> >>>>> Which diff did you apply?  Yasuoka provided two diffs.
>> >>>>>
>> >>>>> In any case, ignore this diff:
>> >>>>>
>> >>>>>> diff --git a/sys/arch/amd64/amd64/tsc.c b/sys/arch/amd64/amd64/tsc.c
>> >>>>>> index 238a5a068e1..3b951a8b5a3 100644
>> >>>>>> --- a/sys/arch/amd64/amd64/tsc.c
>> >>>>>> +++ b/sys/arch/amd64/amd64/tsc.c
>> >>>>>> @@ -212,7 +212,8 @@ cpu_recalibrate_tsc(struct timecounter *tc)
>> >>>>>> u_int
>> >>>>>> tsc_get_timecount(struct timecounter *tc)
>> >>>>>> {
>> >>>>>> - return rdtsc_lfence() + curcpu()->ci_tsc_skew;
>> >>>>>> + //return rdtsc_lfence() + curcpu()->ci_tsc_skew;
>> >>>>>> + return rdtsc_lfence();
>> >>>>>> }
>> >>>>>>
>> >>>>>> void
>> >>>>>
>> >>>>>
>> >>>>> We don't want to discard the skews, that's wrong.
>> >>>
>> >>> I'm sorry for the confusion.
>> >> No problem.
>> >> 
>> >>>>> The reason it "fixes" Yasuoka's problem is because the real skews
>> >>>>> on the ESXi VMs in question are probably close to zero but our
>> >>>>> synchronization algorithm is picking huge (wrong) skews due to
>> >>>>> some other variable interfering with our measurement.
>> >>>>
>> >>>> Right.  If a VM exit happens while we're doing our measurement, you'll
>> >>>> see a significant delay.  And a guest OS can't prevent those from
>> >>>> happening.  But even on real hardware SMM mode may interfere with our
>> >>>> measurement.
>> >>>
>> >>> For machines like the ESXi VMs, the measurement seems to have to
>> >>> exclude such delayed values as outliers.  I think taking a lot of
>> >>> samples and choice the minimum is a good enough way for the purpose.
>> >>>
>> >>> I updated the diff.
>> >>>
>> >>> - delete lines for debug
>> >>> - make tsc quality lower if skew is not good enough
>> >>> - reduce difference from NetBSD
>> >>>
>> >>> comment? ok?
>> >> If more iterations fixes your problem, great.  It isn't going to make
>> >> things worse for machines with sync'd TSCs, makes the TSC usable on
>> >> another class of machine, and is relatively cheap.
>> >> This is ok cheloha@.
>> >> You need another ok, though.
>> > 
>> > 
>> > The diff is obviously fine. But it is still a heuristic with no real
>> > motivation except for this particular ESXi VM case. So my question
>> > about why we choose the minimum instead of the median or the mean has
>> > not been answered.
>> 
>> Because median or mean is affected by outliers.  We actually see
>> some outliers in samples from the VMware.
>> 
>> I suppose there is a better measure, but I currently have no idea and have
>> not used that kind of measure in the kernel.  On the other hand, finding
>> the minimum is very simple.
> 
> Using the median should take care of the outliers though.

You are right.  I misunderstood the meaning.

> I'm not at all convinced that taking the absolute value of the
> difference makes sense.  It probably works in this case since the
> actual skew on your VM is zero.  So measurements close to zero are
> "good".  But what if the skew isn't zero?  Take for example an AP that
> is running ahead of the BP by 5000 ticks.  In that case, the right
> value for the skew is -5000.  But now imagine that the BP gets
> "interrupted"

Re: monotonic time going back by wrong skews

2021-04-05 Thread YASUOKA Masahiko
Hi,

On Mon, 5 Apr 2021 10:43:00 +0300
Paul Irofti  wrote:
> On 05.04.2021 06:13, Scott Cheloha wrote:
>> On Mon, Mar 29, 2021 at 02:00:01PM +0900, YASUOKA Masahiko wrote:
>>> On Thu, 25 Mar 2021 19:41:35 +0100 (CET)
>>> Mark Kettenis  wrote:
>>>>> From: Scott Cheloha 
>>>>> Date: Thu, 25 Mar 2021 13:18:04 -0500
>>>>>> On Wed, Mar 24, 2021 at 05:40:21PM +0900, YASUOKA Masahiko wrote:
>>>>> Which diff did you apply?  Yasuoka provided two diffs.
>>>>>
>>>>> In any case, ignore this diff:
>>>>>
>>>>>> diff --git a/sys/arch/amd64/amd64/tsc.c b/sys/arch/amd64/amd64/tsc.c
>>>>>> index 238a5a068e1..3b951a8b5a3 100644
>>>>>> --- a/sys/arch/amd64/amd64/tsc.c
>>>>>> +++ b/sys/arch/amd64/amd64/tsc.c
>>>>>> @@ -212,7 +212,8 @@ cpu_recalibrate_tsc(struct timecounter *tc)
>>>>>> u_int
>>>>>> tsc_get_timecount(struct timecounter *tc)
>>>>>> {
>>>>>> -return rdtsc_lfence() + curcpu()->ci_tsc_skew;
>>>>>> +//return rdtsc_lfence() + curcpu()->ci_tsc_skew;
>>>>>> +return rdtsc_lfence();
>>>>>> }
>>>>>>
>>>>>> void
>>>>>
>>>>>
>>>>> We don't want to discard the skews, that's wrong.
>>>
>>> I'm sorry for the confusion.
>> No problem.
>> 
>>>>> The reason it "fixes" Yasuoka's problem is because the real skews
>>>>> on the ESXi VMs in question are probably close to zero but our
>>>>> synchronization algorithm is picking huge (wrong) skews due to
>>>>> some other variable interfering with our measurement.
>>>>
>>>> Right.  If a VM exit happens while we're doing our measurement, you'll
>>>> see a significant delay.  And a guest OS can't prevent those from
>>>> happening.  But even on real hardware SMM mode may interfere with our
>>>> measurement.
>>>
>>> For machines like the ESXi VMs, the measurement seems to have to
>>> exclude such delayed values as outliers.  I think taking a lot of
>>> samples and choice the minimum is a good enough way for the purpose.
>>>
>>> I updated the diff.
>>>
>>> - delete lines for debug
>>> - make tsc quality lower if skew is not good enough
>>> - reduce difference from NetBSD
>>>
>>> comment? ok?
>> If more iterations fixes your problem, great.  It isn't going to make
>> things worse for machines with sync'd TSCs, makes the TSC usable on
>> another class of machine, and is relatively cheap.
>> This is ok cheloha@.
>> You need another ok, though.
> 
> 
> The diff is obviously fine. But it is still a heuristic with no real
> motivation except for this particular ESXi VM case. So my question
> about why we choose the minimum instead of the median or the mean has
> not been answered.

Because median or mean is affected by outliers.  We actually see
some outliers in samples from the VMware.

I suppose there is a better measure, but I currently have no idea and have
not used that kind of measure in the kernel.  On the other hand, finding
the minimum is very simple.

> Another issue that I see is that people have not reported, at least
> publicly, that this runs fine on their normal OpenBSD machines.

Some dmesgs posted on public lists seem to have the same problem.

https://marc.info/?l=openbsd-bugs=2=1=disabling+user+TSC=b
https://marc.info/?l=openbsd-tech=2=1=disabling+user+TSC=b
https://marc.info/?l=openbsd-ports=2=1=disabling+user+TSC=b

For example,

https://marc.info/?l=openbsd-bugs=161618496905444=2

|Subject:wg(4) crash
|From:   Stuart Henderson 
|bios0: vendor Dell Inc. version "2.9.0" date 12/06/2019
|bios0: Dell Inc. PowerEdge R620
|cpu1: disabling user TSC (skew=135)
|cpu1: smt 0, core 0, package 1

https://marc.info/?l=openbsd-ports=161306073708427=2
|Subject:Re: sysutils/nut README APC over USB device chgrp/chmod
|From:   Marcus MERIGHI 
|bios0: vendor American Megatrends Inc. version "3.1" date 06/07/2018
|cpu11: disabling user TSC (skew=240)
|cpu11: smt 0, core 3, package 1

These 2 are real machines using 2 CPU sockets.

https://marc.info/?l=openbsd-ports=161562278114172=2
|Subject:ruby27 vs Puppet
|From:   Giovanni Bechis 
|bios0: vendor Phoenix Technologies LTD version "6.00" date 12/12/2018
|bios0: VMware, Inc. VMware Virtual Platform
|cpu1: disabling user TSC (skew=-12705)

VMware.  This seems to be the same problem as mine.

I'll ask people to do the same test

Re: monotonic time going back by wrong skews

2021-03-28 Thread YASUOKA Masahiko
On Thu, 25 Mar 2021 19:41:35 +0100 (CET)
Mark Kettenis  wrote:
>> From: Scott Cheloha 
>> Date: Thu, 25 Mar 2021 13:18:04 -0500
>> > On Wed, Mar 24, 2021 at 05:40:21PM +0900, YASUOKA Masahiko wrote:
>> Which diff did you apply?  Yasuoka provided two diffs.
>> 
>> In any case, ignore this diff:
>> 
>> > diff --git a/sys/arch/amd64/amd64/tsc.c b/sys/arch/amd64/amd64/tsc.c
>> > index 238a5a068e1..3b951a8b5a3 100644
>> > --- a/sys/arch/amd64/amd64/tsc.c
>> > +++ b/sys/arch/amd64/amd64/tsc.c
>> > @@ -212,7 +212,8 @@ cpu_recalibrate_tsc(struct timecounter *tc)
>> > u_int
>> > tsc_get_timecount(struct timecounter *tc)
>> > {
>> > -  return rdtsc_lfence() + curcpu()->ci_tsc_skew;
>> > +  //return rdtsc_lfence() + curcpu()->ci_tsc_skew;
>> > +  return rdtsc_lfence();
>> > }
>> > 
>> > void
>> 
>> 
>> We don't want to discard the skews, that's wrong.

I'm sorry for the confusion.

>> The reason it "fixes" Yasuoka's problem is because the real skews
>> on the ESXi VMs in question are probably close to zero but our
>> synchronization algorithm is picking huge (wrong) skews due to
>> some other variable interfering with our measurement.
> 
> Right.  If a VM exit happens while we're doing our measurement, you'll
> see a significant delay.  And a guest OS can't prevent those from
> happening.  But even on real hardware SMM mode may interfere with our
> measurement.

For machines like the ESXi VMs, the measurement seems to have to
exclude such delayed values as outliers.  I think taking a lot of
samples and choosing the minimum is a good enough way for the purpose.

I updated the diff.

- delete lines for debug
- make tsc quality lower if skew is not good enough
- reduce difference from NetBSD

comment? ok?

Index: sys/arch/amd64//amd64/tsc.c
===
RCS file: /disk/cvs/openbsd/src/sys/arch/amd64/amd64/tsc.c,v
retrieving revision 1.23
diff -u -p -r1.23 tsc.c
--- sys/arch/amd64//amd64/tsc.c 23 Feb 2021 04:44:30 -  1.23
+++ sys/arch/amd64//amd64/tsc.c 29 Mar 2021 04:18:31 -
@@ -38,6 +38,7 @@ int   tsc_is_invariant;
 
 #define TSC_DRIFT_MAX   250
 #define TSC_SKEW_MAX   100
+#define TSC_SYNC_ROUNDS 1000
 int64_t tsc_drift_observed;
 
 volatile int64_t   tsc_sync_val;
@@ -235,6 +236,7 @@ tsc_timecounter_init(struct cpu_info *ci
printf("%s: disabling user TSC (skew=%lld)\n",
ci->ci_dev->dv_xname, (long long)ci->ci_tsc_skew);
tsc_timecounter.tc_user = 0;
+   tsc_timecounter.tc_quality = -1000;
}
 
if (!(ci->ci_flags & CPUF_PRIMARY) ||
@@ -314,13 +316,19 @@ tsc_read_bp(struct cpu_info *ci, uint64_
 void
 tsc_sync_bp(struct cpu_info *ci)
 {
+   int i, val, diff;
uint64_t bptsc, aptsc;
 
-   tsc_read_bp(ci, &bptsc, &aptsc); /* discarded - cache effects */
-   tsc_read_bp(ci, &bptsc, &aptsc);
+   val = INT_MAX;
+   for (i = 0; i < TSC_SYNC_ROUNDS; i++) {
+   tsc_read_bp(ci, &bptsc, &aptsc);
+   diff = bptsc - aptsc;
+   if (abs(diff) < abs(val))
+   val = diff;
+   }
 
/* Compute final value to adjust for skew. */
-   ci->ci_tsc_skew = bptsc - aptsc;
+   ci->ci_tsc_skew = val;
 }
 
 /*
@@ -351,8 +359,10 @@ tsc_post_ap(struct cpu_info *ci)
 void
 tsc_sync_ap(struct cpu_info *ci)
 {
-   tsc_post_ap(ci);
-   tsc_post_ap(ci);
+   int i;
+
+   for (i = 0; i < TSC_SYNC_ROUNDS; i++)
+   tsc_post_ap(ci);
 }
 
 void



Re: fyi: get HP EliteBook 830 G7/G8 booting

2021-03-26 Thread YASUOKA Masahiko
On Fri, 26 Mar 2021 12:12:44 +0100 (CET)
Mark Kettenis  wrote:
>> Date: Fri, 26 Mar 2021 19:43:23 +0900 (JST)
>> From: YASUOKA Masahiko 
>> 
>> Hi,
>> 
>> On Fri, 26 Mar 2021 09:30:43 +0100
>> Jan Klemkow  wrote:
>> > If you want to boot OpenBSD on an HP EliteBook 830 G7/G8, the bootloader
>> > will hang while loading the kernel.  Because, the UEFI loads the
>> > bootloader on the same place in memory, where the bootloader will copy
>> > the kernel.  We are unable to load the kernel on arbitrary memory.
>> > Thus, the following diff will help you, to get OpenBSD running on these
>> > machines.  It moves the hardcoded Kernel address to a free place.
>> 
>> The openbsd efiboot copies the kernel to that place after
>> ExitBootServices().
>> 
>> sys/arch/amd64/stand/efiboot/exec_i386.c
>> 152 /*
>> 153  * Move the loaded kernel image to the usual place after 
>> calling
>> 154  * ExitBootServices().
>> 155  */
>> 156 #ifdef __amd64__
>> 157 protect_writeable(marks[MARK_START] + delta,
>> 158 marks[MARK_END] - marks[MARK_START]);
>> 159 #endif
>> 160 memmove((void *)marks[MARK_START] + delta, (void 
>> *)marks[MARK_START],
>> 161 marks[MARK_END] - marks[MARK_START]);
>> 162 for (i = 0; i < MARK_MAX; i++)
>> 163 marks[i] += delta;
>> 164 
>> 165 #ifdef __amd64__
>> 166 (*run_i386)((u_long)run_i386, entry, howto, bootdev, 
>> BOOTARG_APIVER,
>> 167 marks[MARK_END], extmem, cnvmem, ac, (intptr_t)av);
>> 
>> 
>> I think it should work without the ld.script change..
> 
> The (likely) problem is that the memmove() on line 160 is overwriting
> the bootloader code itself.
> 
> There are essentially two ways to fix this:
> 
> 1. Have the bootloader relocate itself to an address that doesn't
>conflict with the kernel to be loaded.
> 
> 2. Make it possible for the kernel to be loaded at a (somewhat)
>arbitrary physical address.
> 
> In my view #2 is the way forward.  There are other reasons why that
> would be beneficial as it would make it less predictable at which
> physical address the kernel code lives which could prevent some
> attacks that use the direct map.
> 
> #2 is also the approach taken by the EFIBOOT on armv7 and arm64.  On
> arm64 for example, EFIBOOT loads the kernel into a 64MB memory block
> that is aligned on a 2MB boundary.  The kernel then figures out its
> load address based on that and and patches things up accordingly.

In this scenario, what efiboot should do is just jump to "start64"
(the 64-bit entry point) of the kernel, and everything else is done
after start64?

> mlarkin@ was doing some work to change how we load the amd64 kernel.
> His approach was to let the bootloader build the initial page tables
> and jump into the kernel in 64-bit mode with the MMU enabled.  That
> was more focussed on running the kernel at a randomized virtual
> address.  But it should be fairly easy to make it run at a different
> physical address as well this way.  Unfortunately that effort was
> mostly focussed on the legacy bootloader.



Re: fyi: get HP EliteBook 830 G7/G8 booting

2021-03-26 Thread YASUOKA Masahiko
Hi,

On Fri, 26 Mar 2021 09:30:43 +0100
Jan Klemkow  wrote:
> If you want to boot OpenBSD on an HP EliteBook 830 G7/G8, the bootloader
> will hang while loading the kernel.  Because, the UEFI loads the
> bootloader on the same place in memory, where the bootloader will copy
> the kernel.  We are unable to load the kernel on arbitrary memory.
> Thus, the following diff will help you, to get OpenBSD running on these
> machines.  It moves the hardcoded Kernel address to a free place.

The openbsd efiboot copies the kernel to that place after
ExitBootServices().

sys/arch/amd64/stand/efiboot/exec_i386.c
152 /*
153  * Move the loaded kernel image to the usual place after calling
154  * ExitBootServices().
155  */
156 #ifdef __amd64__
157 protect_writeable(marks[MARK_START] + delta,
158 marks[MARK_END] - marks[MARK_START]);
159 #endif
160 memmove((void *)marks[MARK_START] + delta, (void 
*)marks[MARK_START],
161 marks[MARK_END] - marks[MARK_START]);
162 for (i = 0; i < MARK_MAX; i++)
163 marks[i] += delta;
164 
165 #ifdef __amd64__
166 (*run_i386)((u_long)run_i386, entry, howto, bootdev, 
BOOTARG_APIVER,
167 marks[MARK_END], extmem, cnvmem, ac, (intptr_t)av);


I think it should work without the ld.script change..



Re: monotonic time going back by wrong skews

2021-03-24 Thread YASUOKA Masahiko
Hi,

> Second, why is taking the minimum value the optimal choice? I would
> assume an average would be better. Basically if you have a sequency
> like 900, 900, 900, 900, 0, 900, 900, 900 you pick 0 which could lead
> to some problems, right? Or am I missing something?"

Skews on VMware

>> -8445 -6643 -52183 0-3-4-7   -11-5 0
>>-11-9-5-3-4-3-7 8-5-6
>> -5-9-3-9-7-1-5-5-9-2
>> -6-4-6-4   -11-8-3-4-8-1
>> -9-1-8 1-8 6-5-4 2-2
>> -8-3-1-5-2-2 1 2-2-9
>>-12 0-9-2-2-5-2 1 2 0


The first 3 seem to be strange.  Also there is such a value in the
middle of the sampling.

>>  9-1   -10 50505-1 2 6   -11 2-2

I suppose such values should be excluded.

Also I did same test on my VAIO.  It seems more constant than VMware.
Full result is attached at last.

Is it possible that the calculation code is being affected by the
CPU scheduler of its hypervisor?

Thanks,

On Wed, 24 Mar 2021 13:04:32 +0200
Paul Irofti  wrote:
> Hi,
> 
> Thank you for taking this to tech@ as requested!
> 
> I will reproduce here what I replied to Yasouka and Scott (which I
> think proposed taking the minimum skew value) in private.
> 
> "First, thank you very much for the in-depth analysis. I would suggest
> you take this to a public forum like tech@ so that we can keep the
> discussion opened and civilized.
> 
> I remember when I wrote the CPU synchronization code, that I tried
> doing sampling but it had some issues that now I don't remember of. So
> let us try this on real hardware too. This is another argument for
> moving this to tech@.
> 
> Second, why is taking the minimum value the optimal choice? I would
> assume an average would be better. Basically if you have a sequency
> like 900, 900, 900, 900, 0, 900, 900, 900 you pick 0 which could lead
> to some problems, right? Or am I missing something?"
> 
> So could people give the minimum skew approach a spin on real machines
> to see if there are any issues popping up?
> 
> All the best,
> Paul
> 
> On 3/24/21 10:40 AM, YASUOKA Masahiko wrote:
>> Hi,
>> I hit a problem which is caused by going back of monotonic time.  It
>> happens on hosts on VMware ESXi.
>> I wrote the program which repeats the problem.
>>   % cc -o monotime monotime.c -lpthread
>>   % ./monotime
>>   194964 Starting
>>   562210 Starting
>>   483046 Starting
>>   148865 Starting
>>   148865 Back 991.808048665 => 991.007447931
>>   562210 Back 991.808048885 => 991.007448224
>>   483046 Back 991.808049115 => 991.007449172
>>   148865 Stopped
>>   562210 Stopped
>>   483046 Stopped
>>   194964 Stopped
>>   % uname -a
>>   OpenBSD yasuoka-ob-c.tokyo.iiji.jp 6.8 GENERIC.MP#5 amd64
>>   % sysctl kern.version
>>   kern.version=OpenBSD 6.8 (GENERIC.MP) #5: Mon Feb 22 04:36:10 MST 2021
>>   
>> r...@syspatch-68-amd64.openbsd.org:/usr/src/sys/arch/amd64/compile/GENERIC.MP
>>   %
>> monotime.c
>> 
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>> #define NTHREAD  4
>> #define NTRY 5
>> void *
>> start(void *dummy)
>> {
>>  int i;
>>  struct timespec ts0, ts1;
>>  printf("%d Starting\n", (int)getthrid());
>>  clock_gettime(CLOCK_MONOTONIC, );
>>  for (i = 0; i < NTRY; i++) {
>>  clock_gettime(CLOCK_MONOTONIC, );
>>  if (timespeccmp(, , <=)) {
>>  ts0 = ts1;
>>  continue;
>>  }
>>  printf("%d Back %lld.%09lu => %lld.%09lu\n",
>>  (int)getthrid(), ts0.tv_sec, ts0.tv_nsec, ts1.tv_sec,
>>  ts1.tv_nsec);
>>  break;
>>  }
>>  printf("%d Stopped\n", (int)getthrid());
>>  return (NULL);
>> }
>> int
>> main(int argc, char *argv[])
>> {
>>  int i, n = NTHREAD;
>>  pthread_t *threads;
>>  threads = calloc(n, sizeof(pthread_t));
>>  for (i = 0; i < n; i++)
>>  pthread_create([i], NULL, start, NULL);
>>  for (i = 0; i < n; i++)
>>  pthread_join(threads[i], NULL);
>> }
>> 
>> The machine has 4 vCPUs and showing the following message on boot.
>>cpu1: disabling user TSC

monotonic time going back by wrong skews

2021-03-24 Thread YASUOKA Masahiko
Hi,

I hit a problem caused by monotonic time going backwards.  It
happens on hosts on VMware ESXi.

I wrote a program which reproduces the problem.

 % cc -o monotime monotime.c -lpthread
 % ./monotime
 194964 Starting
 562210 Starting
 483046 Starting
 148865 Starting
 148865 Back 991.808048665 => 991.007447931
 562210 Back 991.808048885 => 991.007448224
 483046 Back 991.808049115 => 991.007449172
 148865 Stopped
 562210 Stopped
 483046 Stopped
 194964 Stopped
 % uname -a
 OpenBSD yasuoka-ob-c.tokyo.iiji.jp 6.8 GENERIC.MP#5 amd64
 % sysctl kern.version
 kern.version=OpenBSD 6.8 (GENERIC.MP) #5: Mon Feb 22 04:36:10 MST 2021
 
r...@syspatch-68-amd64.openbsd.org:/usr/src/sys/arch/amd64/compile/GENERIC.MP
 %

monotime.c

#include <sys/types.h>
#include <sys/time.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

#define NTHREAD 4
#define NTRY 5

void *
start(void *dummy)
{
int i;
struct timespec ts0, ts1;

printf("%d Starting\n", (int)getthrid());
clock_gettime(CLOCK_MONOTONIC, );

for (i = 0; i < NTRY; i++) {
clock_gettime(CLOCK_MONOTONIC, );
if (timespeccmp(, , <=)) {
ts0 = ts1;
continue;
}
printf("%d Back %lld.%09lu => %lld.%09lu\n",
(int)getthrid(), ts0.tv_sec, ts0.tv_nsec, ts1.tv_sec,
ts1.tv_nsec);
break;
}
printf("%d Stopped\n", (int)getthrid());

return (NULL);
}

/*
 * Spawn NTHREAD threads running start() and wait for all of them.
 *
 * Returns 0 when every thread was created and joined, 1 when the thread
 * array could not be allocated or a thread could not be created.
 */
int
main(int argc, char *argv[])
{
	int i, created, n = NTHREAD;
	pthread_t *threads;

	threads = calloc(n, sizeof(pthread_t));
	if (threads == NULL)
		return (1);

	/* Stop at the first failure so only valid thread ids are joined. */
	for (created = 0; created < n; created++) {
		if (pthread_create(&threads[created], NULL, start, NULL) != 0)
			break;
	}
	for (i = 0; i < created; i++)
		pthread_join(threads[i], NULL);

	free(threads);

	return (created == n ? 0 : 1);
}


The machine has 4 vCPUs and showing the following message on boot.

  cpu1: disabling user TSC (skew=-5310)
  cpu2: disabling user TSC (skew=-5335)
  cpu3: disabling user TSC (skew=-7386)

This means "user TSC" is disabled because the TSC of cpu{1,2,3} is much
delayed relative to cpu0.

Simply ignoring the skews with the following diff seems to work around
this problem.

diff --git a/sys/arch/amd64/amd64/tsc.c b/sys/arch/amd64/amd64/tsc.c
index 238a5a068e1..3b951a8b5a3 100644
--- a/sys/arch/amd64/amd64/tsc.c
+++ b/sys/arch/amd64/amd64/tsc.c
@@ -212,7 +212,8 @@ cpu_recalibrate_tsc(struct timecounter *tc)
 u_int
 tsc_get_timecount(struct timecounter *tc)
 {
-   return rdtsc_lfence() + curcpu()->ci_tsc_skew;
+   //return rdtsc_lfence() + curcpu()->ci_tsc_skew;
+   return rdtsc_lfence();
 }
 
 void

So I suppose the skews are not calculated properly.  Also I found
NetBSD changed the skew calculation so that it checks 1000 times and
takes the minimum value.

  https://github.com/NetBSD/src/commit/1dec05c1ae197b4acfc7038e49dfddabcbed0dff
  https://github.com/NetBSD/src/commit/66d76b89792bac1c71cd5507ba62b08ad02129ef


I checked skews on the machine by the following debug code.

diff --git a/sys/arch/amd64/amd64/tsc.c b/sys/arch/amd64/amd64/tsc.c
index 238a5a068e1..83e835e4f82 100644
--- a/sys/arch/amd64/amd64/tsc.c
+++ b/sys/arch/amd64/amd64/tsc.c
@@ -302,16 +302,42 @@ tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, 
uint64_t *aptscp)
*aptscp = tsc_sync_val;
 }
 
+#define TSC_SYNC_NTIMES 1000
+
+static int tsc_difs[MAXCPUS][TSC_SYNC_NTIMES];
+
+void
+tsc_debug(void)
+{
+   int i, cpuid = curcpu()->ci_cpuid;
+
+   for (i = 0; i < TSC_SYNC_NTIMES; i++) {
+   if (i % 10 == 0)
+   printf("%5d", tsc_difs[cpuid][i]);
+   else
+   printf(" %5d", tsc_difs[cpuid][i]);
+   if (i % 10 == 9)
+   printf("\n");
+   }
+   printf("\n");
+}
+
 void
 tsc_sync_bp(struct cpu_info *ci)
 {
+   int i, mindif = INT_MAX, dif;
uint64_t bptsc, aptsc;
 
-   tsc_read_bp(ci, &bptsc, &aptsc); /* discarded - cache effects */
-   tsc_read_bp(ci, &bptsc, &aptsc);
+   for (i = 0; i < TSC_SYNC_NTIMES; i++) {
+   tsc_read_bp(ci, &bptsc, &aptsc);
+   dif = bptsc - aptsc;
+   if (abs(dif) < abs(mindif))
+   mindif = dif;
+   tsc_difs[ci->ci_cpuid][i] = dif;
+   }
 
/* Compute final value to adjust for skew. */
-   ci->ci_tsc_skew = bptsc - aptsc;
+   ci->ci_tsc_skew = mindif;
 }
 
 /*
@@ -342,8 +368,10 @@ tsc_post_ap(struct cpu_info *ci)
 void
 tsc_sync_ap(struct cpu_info *ci)
 {
-   tsc_post_ap(ci);
-   tsc_post_ap(ci);
+   int i;
+
+   for (i = 0; i < TSC_SYNC_NTIMES; i++)
+   tsc_post_ap(ci);
 }
 
 void


Stopped at  db_enter+0x10:  popq%rbp
ddb{0}> machine ddbcpu 1
Stopped at  x86_ipi_db+0x12:leave
ddb{1}> call tsc_debug
-8445 -6643 -52183 0-3-4-7   -11-5 0
  -11-9-5-3-4-3-7 8-5-6
   -5-9-3-9-7-1-5

Re: diff: efiboot: alignment for media which has IoAlign > 1

2021-03-10 Thread YASUOKA Masahiko
On Wed, 10 Mar 2021 13:15:58 +0100 (CET)
Mark Kettenis  wrote:
>> On Wed, 10 Mar 2021 20:35:41 +0900 (JST)
>> YASUOKA Masahiko  wrote:
>> > efiboot cannot load the kernel properly on some machines if booted
>> > from CD-ROM.  In that case boot fails with a message like follow:
>> > 
>> >booting cd0a:. [359648read symbols: Unknown error: code 255
>> > 
>> > As far as Asou and my test, this happens on hosts on VMware ESXi 6.7,
>> > 7.0 and asou's physical machine.
>> > 
>> > The problem happens because efiboot calls ReadBlocks function with an
>> > unaligned pointer for medias which requires an aligned pointer.  When
>> > efiboot loads a kernel, the pointer becomes unaligned since there is
>> > an ELF section located at unaligned place in CD-ROM.  Previously our
>> > kernel didn't have such a section but it does after switching lld as
>> > the default linker.
>> > 
>> > For test, let me show sample commands which creates a bootable cdrom
>> > image for EFI:
>> > 
>> > mkdir -p efiboot/EFI/BOOT
>> > cp /usr/mdec/BOOTX64.EFI efiboot/EFI/BOOT
>> > makefs -M 1m -m 1m -t msdos -o fat_type=12,sectors_per_cluster=1 \
>> > efiboot.img efiboot
>> > mkdir -p cd-dir/etc
>> > cp bsd.rd cd-dir/
>> > echo "set image bsd.rd" > cd-dir/etc/boot.conf
>> > makefs -t cd9660 -o 
>> > 'rockridge,bootimage=i386;/usr/mdec/cdbr,no-emul-boot,allow-multidot,bootimage=efi;efiboot.img,no-emul-boot'
>> >  \
>> >boot.iso cd-dir
>> > 
>> > the diff is to fix the problem.
>> > 
>> > ok?
> 
> Maybe it is better to always bounce through an aligned buffer?  That
> would make the code a little bit slower but a lot simpler.  And the
> overhead of doing the copy should be small compared to the actual I/O.

Indeed.  It became much simpler.  In my tests on ESXi 7.0, a VAIO, and
qemu, I did not notice any significant performance regression.

ok?

Index: sys/arch/amd64/stand/efiboot/efidev.c
===
RCS file: /var/cvs/openbsd/src/sys/arch/amd64/stand/efiboot/efidev.c,v
retrieving revision 1.32
diff -u -p -r1.32 efidev.c
--- sys/arch/amd64/stand/efiboot/efidev.c   9 Dec 2020 18:10:18 -   
1.32
+++ sys/arch/amd64/stand/efiboot/efidev.c   11 Mar 2021 05:59:41 -
@@ -84,10 +84,10 @@ efid_init(struct diskinfo *dip, void *ha
 static EFI_STATUS
 efid_io(int rw, efi_diskinfo_t ed, u_int off, int nsect, void *buf)
 {
-   u_intblks, lba, i_lblks, i_tblks, i_nblks;
+   u_intblks, start, end;
EFI_STATUS   status = EFI_SUCCESS;
-   static u_char   *iblk = NULL;
-   static u_int iblksz = 0;
+   static u_char   *ibuf = NULL;
+   static u_int ibufsz = 0;
 
/* block count of the intrisic block size in DEV_BSIZE */
blks = EFI_BLKSPERSEC(ed);
@@ -95,90 +95,46 @@ efid_io(int rw, efi_diskinfo_t ed, u_int
/* block size < 512.  HP Stream 13 actually has such a disk. */
return (EFI_UNSUPPORTED);
 
-   /* leading and trailing unaligned blocks in intrisic block */
-   i_lblks = ((off % blks) == 0)? 0 : blks - (off % blks);
-   i_tblks = (nsect > i_lblks)? (off + nsect) % blks : 0;
-
-   /* aligned blocks in intrisic block */
-   i_nblks = (nsect > i_lblks + i_tblks)? nsect - (i_lblks + i_tblks) : 0;
-
-   lba = (off + i_lblks) / blks;
-
-   /* allocate the space for reading unaligned blocks */
-   if (ed->blkio->Media->BlockSize != DEV_BSIZE) {
-   if (iblk && iblksz < ed->blkio->Media->BlockSize) {
-   free(iblk, iblksz);
-   iblk = NULL;
-   }
-   if (iblk == NULL) {
-   iblk = alloc(ed->blkio->Media->BlockSize);
-   iblksz = ed->blkio->Media->BlockSize;
-   }
+   start = off / blks;
+   end = (off + nsect + blks - 1) / blks;
+   /*
+* Prepare a buffer to use an aligned memory always that might be
+* required by some medias
+*/
+   if (ibuf && ibufsz < (end - start) * ed->blkio->Media->BlockSize) {
+   free(ibuf, ibufsz);
+   ibuf = NULL;
+   }
+   if (ibuf == NULL) {
+   ibufsz = (end - start) * ed->blkio->Media->BlockSize;
+   ibuf = alloc(ibufsz);
}
+
switch (rw) {
case F_READ:
-   if (i_lblks > 0) {
-   status = EFI_CALL(ed->blkio->ReadBlocks,
-   ed->blkio

Re: diff: efiboot: alignment for media which has IoAlign > 1

2021-03-10 Thread YASUOKA Masahiko
Sorry for making noise, let me update the diff.

> + if (ed->blkio->Media->IoAlign > 1 &&
> + ((UINTN)buf + i_lblks * DEV_BSIZE)
> + % ed->blkio->Media->IoAlign == 0)

The first condition was reversed.

On Wed, 10 Mar 2021 20:35:41 +0900 (JST)
YASUOKA Masahiko  wrote:
> efiboot cannot load the kernel properly on some machines if booted
> from CD-ROM.  In that case boot fails with a message like follow:
> 
>booting cd0a:. [359648read symbols: Unknown error: code 255
> 
> As far as Asou and my test, this happens on hosts on VMware ESXi 6.7,
> 7.0 and asou's physical machine.
> 
> The problem happens because efiboot calls ReadBlocks function with an
> unaligned pointer for medias which requires an aligned pointer.  When
> efiboot loads a kernel, the pointer becomes unaligned since there is
> an ELF section located at unaligned place in CD-ROM.  Previously our
> kernel didn't have such a section but it does after switching lld as
> the default linker.
> 
> For test, let me show sample commands which creates a bootable cdrom
> image for EFI:
> 
> mkdir -p efiboot/EFI/BOOT
> cp /usr/mdec/BOOTX64.EFI efiboot/EFI/BOOT
> makefs -M 1m -m 1m -t msdos -o fat_type=12,sectors_per_cluster=1 \
> efiboot.img efiboot
> mkdir -p cd-dir/etc
> cp bsd.rd cd-dir/
> echo "set image bsd.rd" > cd-dir/etc/boot.conf
> makefs -t cd9660 -o 
> 'rockridge,bootimage=i386;/usr/mdec/cdbr,no-emul-boot,allow-multidot,bootimage=efi;efiboot.img,no-emul-boot'
>  \
>   boot.iso cd-dir
> 
> the diff is to fix the problem.
> 
> ok?

Index: sys/arch/amd64/stand/efiboot/efidev.c
===
RCS file: /disk/cvs/openbsd/src/sys/arch/amd64/stand/efiboot/efidev.c,v
retrieving revision 1.32
diff -u -p -r1.32 efidev.c
--- sys/arch/amd64/stand/efiboot/efidev.c   9 Dec 2020 18:10:18 -   
1.32
+++ sys/arch/amd64/stand/efiboot/efidev.c   10 Mar 2021 11:41:39 -
@@ -84,7 +84,7 @@ efid_init(struct diskinfo *dip, void *ha
 static EFI_STATUS
 efid_io(int rw, efi_diskinfo_t ed, u_int off, int nsect, void *buf)
 {
-   u_intblks, lba, i_lblks, i_tblks, i_nblks;
+   u_inti, blks, lba, i_lblks, i_tblks, i_nblks;
EFI_STATUS   status = EFI_SUCCESS;
static u_char   *iblk = NULL;
static u_int iblksz = 0;
@@ -127,10 +127,29 @@ efid_io(int rw, efi_diskinfo_t ed, u_int
min(nsect, i_lblks) * DEV_BSIZE);
}
if (i_nblks > 0) {
-   status = EFI_CALL(ed->blkio->ReadBlocks,
-   ed->blkio, ed->mediaid, lba,
-   ed->blkio->Media->BlockSize * (i_nblks / blks),
-   buf + (i_lblks * DEV_BSIZE));
+   /*
+* Pass the buffer directly to the EFI function only if
+* the buffer is properly aligned as the media requires
+*/
+   if (ed->blkio->Media->IoAlign <= 1 ||
+   ((UINTN)buf + i_lblks * DEV_BSIZE)
+   % ed->blkio->Media->IoAlign == 0)
+   status = EFI_CALL(ed->blkio->ReadBlocks,
+   ed->blkio, ed->mediaid, lba,
+   ed->blkio->Media->BlockSize * (i_nblks /
+   blks), buf + i_lblks * DEV_BSIZE);
+   else {
+   for (i = 0; i < i_nblks; i += blks) {
+   status = EFI_CALL(ed->blkio->ReadBlocks,
+   ed->blkio, ed->mediaid,
+   lba + i / blks,
+   ed->blkio->Media->BlockSize, iblk);
+   if (EFI_ERROR(status))
+   break;
+   memcpy(buf + i * DEV_BSIZE, iblk,
+   ed->blkio->Media->BlockSize);
+   }
+   }
if (EFI_ERROR(status))
goto on_eio;
}
@@ -160,10 +179,30 @@ efid_io(int rw, efi_diskinfo_t ed, u_int
ed->blkio->Media->BlockSize, iblk);
}
if (i_nblks > 0) {
-   status = EFI_CALL(ed->blkio->WriteBlocks,
-   ed->blkio, ed->mediaid, lba,
-

diff: efiboot: alignment for media which has IoAlign > 1

2021-03-10 Thread YASUOKA Masahiko
Hi,

efiboot cannot load the kernel properly on some machines if booted
from CD-ROM.  In that case boot fails with a message like the following:

   booting cd0a:. [359648read symbols: Unknown error: code 255

As far as Asou and I have tested, this happens on hosts on VMware ESXi
6.7 and 7.0, and on Asou's physical machine.

The problem happens because efiboot calls the ReadBlocks function with
an unaligned pointer for media that require an aligned pointer.  When
efiboot loads a kernel, the pointer becomes unaligned since there is
an ELF section located at an unaligned offset on the CD-ROM.  Previously
our kernel didn't have such a section, but it does after switching to
lld as the default linker.

For test, let me show sample commands which creates a bootable cdrom
image for EFI:

mkdir -p efiboot/EFI/BOOT
cp /usr/mdec/BOOTX64.EFI efiboot/EFI/BOOT
makefs -M 1m -m 1m -t msdos -o fat_type=12,sectors_per_cluster=1 \
efiboot.img efiboot
mkdir -p cd-dir/etc
cp bsd.rd cd-dir/
echo "set image bsd.rd" > cd-dir/etc/boot.conf
makefs -t cd9660 -o 
'rockridge,bootimage=i386;/usr/mdec/cdbr,no-emul-boot,allow-multidot,bootimage=efi;efiboot.img,no-emul-boot'
 \
boot.iso cd-dir

the diff is to fix the problem.

ok?

Index: sys/arch/amd64/stand/efiboot/efidev.c
===
RCS file: /disk/cvs/openbsd/src/sys/arch/amd64/stand/efiboot/efidev.c,v
retrieving revision 1.32
diff -u -p -r1.32 efidev.c
--- sys/arch/amd64/stand/efiboot/efidev.c   9 Dec 2020 18:10:18 -   
1.32
+++ sys/arch/amd64/stand/efiboot/efidev.c   10 Mar 2021 10:58:35 -
@@ -84,7 +84,7 @@ efid_init(struct diskinfo *dip, void *ha
 static EFI_STATUS
 efid_io(int rw, efi_diskinfo_t ed, u_int off, int nsect, void *buf)
 {
-   u_intblks, lba, i_lblks, i_tblks, i_nblks;
+   u_inti, blks, lba, i_lblks, i_tblks, i_nblks;
EFI_STATUS   status = EFI_SUCCESS;
static u_char   *iblk = NULL;
static u_int iblksz = 0;
@@ -127,10 +127,29 @@ efid_io(int rw, efi_diskinfo_t ed, u_int
min(nsect, i_lblks) * DEV_BSIZE);
}
if (i_nblks > 0) {
-   status = EFI_CALL(ed->blkio->ReadBlocks,
-   ed->blkio, ed->mediaid, lba,
-   ed->blkio->Media->BlockSize * (i_nblks / blks),
-   buf + (i_lblks * DEV_BSIZE));
+   /*
+* Pass the buffer directly to the EFI function only if
+* the buffer is properly aligned as the media requires
+*/
+   if (ed->blkio->Media->IoAlign > 1 &&
+   ((UINTN)buf + i_lblks * DEV_BSIZE)
+   % ed->blkio->Media->IoAlign == 0)
+   status = EFI_CALL(ed->blkio->ReadBlocks,
+   ed->blkio, ed->mediaid, lba,
+   ed->blkio->Media->BlockSize * (i_nblks /
+   blks), buf + i_lblks * DEV_BSIZE);
+   else {
+   for (i = 0; i < i_nblks; i += blks) {
+   status = EFI_CALL(ed->blkio->ReadBlocks,
+   ed->blkio, ed->mediaid,
+   lba + i / blks,
+   ed->blkio->Media->BlockSize, iblk);
+   if (EFI_ERROR(status))
+   break;
+   memcpy(buf + i * DEV_BSIZE, iblk,
+   ed->blkio->Media->BlockSize);
+   }
+   }
if (EFI_ERROR(status))
goto on_eio;
}
@@ -160,10 +179,30 @@ efid_io(int rw, efi_diskinfo_t ed, u_int
ed->blkio->Media->BlockSize, iblk);
}
if (i_nblks > 0) {
-   status = EFI_CALL(ed->blkio->WriteBlocks,
-   ed->blkio, ed->mediaid, lba,
-   ed->blkio->Media->BlockSize * (i_nblks / blks),
-   buf + (i_lblks * DEV_BSIZE));
+   /*
+* Pass the buffer directly to the EFI function only if
+* the buffer is properly aligned as the media requires
+*/
+   if (ed->blkio->Media->IoAlign > 1 &&
+   ((UINTN)buf + i_lblks * DEV_BSIZE)
+   % ed->blkio->Media->IoAlign == 0)
+   status = EFI_CALL(ed->blkio->WriteBlocks,
+   ed->blkio, ed->mediaid, lba,
+   

Re: 2 diffs for dev/acpi/dsdt.c

2021-02-27 Thread YASUOKA Masahiko
Hi,

Let me update "diff #2".

On Fri, 26 Feb 2021 13:42:32 +0900 (JST)
YASUOKA Masahiko  wrote:
> My vaio repeatedly crashed by "Data modified on freelist"(*1) or other
> memory corruptions.  After my long time debug, I found the route cause
> is a handling of references of LocalX, like the following:
> 
> If ((SMRW (0x0B, 0x16, 0x21, RefOf (Local0)) == Zero))
> 
> In the called control method, "RefOf (Local1)" is referred as Arg3, is
> stored a value like the following:
> 
> Arg3 = \_SB.PCI0.LPCB.EC0.SMD0
> 
> In aml_store(), lvalue is reset if lvalue is a LocalX.  But since that
> was done before resolving the reference, lvalue was not reset if
> lvalue is a reference of LocalX.
> 
> diff #1 fixes that problem.  It resets lvalue after resolving
> references.
> 
> ok?
> 
> diff #2 adds aml_die() if any memory corruption occurs when creating
> field in a buffer.  This actually happens on my vaio (pro pk 14) if
> diff #1 is not applied.
> 
> ok?
> 
> diff #1
> 
> Index: sys/dev/acpi/dsdt.c
> ===
> RCS file: /var/cvs/openbsd/src/sys/dev/acpi/dsdt.c,v
> retrieving revision 1.257
> diff -u -p -r1.257 dsdt.c
> --- sys/dev/acpi/dsdt.c   17 Dec 2020 17:57:19 -  1.257
> +++ sys/dev/acpi/dsdt.c   26 Feb 2021 04:12:03 -
> @@ -2961,11 +2961,11 @@ aml_store(struct aml_scope *scope, struc
>   aml_rwfield(rhs, 0, rhs->v_field.bitlen, , ACPI_IOREAD);
>   rhs = 
>   }
> +
> + lhs = aml_gettgt(lhs, AMLOP_STORE);
>   /* Store to LocalX: free value */
>   if (lhs->stack >= AMLOP_LOCAL0 && lhs->stack <= AMLOP_LOCAL7)
>   aml_freevalue(lhs);
> -
> - lhs = aml_gettgt(lhs, AMLOP_STORE);
>   switch (lhs->type) {
>   case AML_OBJTYPE_UNINITIALIZED:
>   aml_copyvalue(lhs, rhs);
> 
> diff #2
> 
> Index: sys/dev/acpi/dsdt.c
> ===
> RCS file: /var/cvs/openbsd/src/sys/dev/acpi/dsdt.c,v
> retrieving revision 1.257
> diff -u -p -r1.257 dsdt.c
> --- sys/dev/acpi/dsdt.c   17 Dec 2020 17:57:19 -  1.257
> +++ sys/dev/acpi/dsdt.c   26 Feb 2021 04:33:21 -
> @@ -2742,11 +2742,17 @@ aml_rwfield(struct aml_value *fld, int b
>   } else if (mode == ACPI_IOREAD) {
>   /* bufferfield:read */
>   _aml_setvalue(val, AML_OBJTYPE_INTEGER, 0, 0);
> + if (ref1->length < aml_bytepos(fld->v_field.bitpos) +
> + aml_bytelen(fld->v_field.bitlen))
> + aml_die("bufferfield:read out of range");
>   aml_bufcpy(>v_integer, 0, ref1->v_buffer,
>   fld->v_field.bitpos, fld->v_field.bitlen);
>   } else {
>   /* bufferfield:write */
>   val = aml_convert(val, AML_OBJTYPE_INTEGER, -1);
> + if (ref1->length < aml_bytepos(fld->v_field.bitpos) +
> + aml_bytelen(fld->v_field.bitlen))
> + aml_die("bufferfield:write out of range");
>   aml_bufcpy(ref1->v_buffer, fld->v_field.bitpos, >v_integer,
>   0, fld->v_field.bitlen);
>   aml_delref(, "wrbuffld");

It's better to die when creating a field which refers to out-of-range
memory.

ok?

Index: sys/dev/acpi/dsdt.c
===
RCS file: /disk/cvs/openbsd/src/sys/dev/acpi/dsdt.c,v
retrieving revision 1.257
diff -u -p -r1.257 dsdt.c
--- sys/dev/acpi/dsdt.c 17 Dec 2020 17:57:19 -  1.257
+++ sys/dev/acpi/dsdt.c 27 Feb 2021 09:58:31 -
@@ -2790,6 +2790,11 @@ aml_createfield(struct aml_value *field,
data->type != AML_OBJTYPE_BUFFER)
data = aml_convert(data, AML_OBJTYPE_BUFFER, -1);
 
+   if (field->type == AML_OBJTYPE_BUFFERFIELD &&
+   data->length < aml_bytepos(bpos) + aml_bytelen(blen))
+   aml_die("%s(%s) out of range\n", aml_mnem(opcode, 0),
+   aml_nodename(field->node));
+
field->v_field.type = opcode;
field->v_field.bitpos = bpos;
field->v_field.bitlen = blen;



2 diffs for dev/acpi/dsdt.c

2021-02-25 Thread YASUOKA Masahiko
Hi,

My VAIO repeatedly crashed with "Data modified on freelist"(*1) or other
memory corruptions.  After a long debugging session, I found the root
cause is the handling of references to LocalX, like the following:

If ((SMRW (0x0B, 0x16, 0x21, RefOf (Local0)) == Zero))

In the called control method, "RefOf (Local0)" is referred to as Arg3,
and a value is stored to it like the following:

Arg3 = \_SB.PCI0.LPCB.EC0.SMD0

In aml_store(), lvalue is reset if lvalue is a LocalX.  But since that
was done before resolving the reference, lvalue was not reset if
lvalue is a reference of LocalX.

diff #1 fixes that problem.  It resets lvalue after resolving
references.

ok?

diff #2 adds aml_die() if any memory corruption occurs when creating
field in a buffer.  This actually happens on my vaio (pro pk 14) if
diff #1 is not applied.

ok?

diff #1

Index: sys/dev/acpi/dsdt.c
===
RCS file: /var/cvs/openbsd/src/sys/dev/acpi/dsdt.c,v
retrieving revision 1.257
diff -u -p -r1.257 dsdt.c
--- sys/dev/acpi/dsdt.c 17 Dec 2020 17:57:19 -  1.257
+++ sys/dev/acpi/dsdt.c 26 Feb 2021 04:12:03 -
@@ -2961,11 +2961,11 @@ aml_store(struct aml_scope *scope, struc
aml_rwfield(rhs, 0, rhs->v_field.bitlen, , ACPI_IOREAD);
rhs = 
}
+
+   lhs = aml_gettgt(lhs, AMLOP_STORE);
/* Store to LocalX: free value */
if (lhs->stack >= AMLOP_LOCAL0 && lhs->stack <= AMLOP_LOCAL7)
aml_freevalue(lhs);
-
-   lhs = aml_gettgt(lhs, AMLOP_STORE);
switch (lhs->type) {
case AML_OBJTYPE_UNINITIALIZED:
aml_copyvalue(lhs, rhs);

diff #2

Index: sys/dev/acpi/dsdt.c
===
RCS file: /var/cvs/openbsd/src/sys/dev/acpi/dsdt.c,v
retrieving revision 1.257
diff -u -p -r1.257 dsdt.c
--- sys/dev/acpi/dsdt.c 17 Dec 2020 17:57:19 -  1.257
+++ sys/dev/acpi/dsdt.c 26 Feb 2021 04:33:21 -
@@ -2742,11 +2742,17 @@ aml_rwfield(struct aml_value *fld, int b
} else if (mode == ACPI_IOREAD) {
/* bufferfield:read */
_aml_setvalue(val, AML_OBJTYPE_INTEGER, 0, 0);
+   if (ref1->length < aml_bytepos(fld->v_field.bitpos) +
+   aml_bytelen(fld->v_field.bitlen))
+   aml_die("bufferfield:read out of range");
aml_bufcpy(>v_integer, 0, ref1->v_buffer,
fld->v_field.bitpos, fld->v_field.bitlen);
} else {
/* bufferfield:write */
val = aml_convert(val, AML_OBJTYPE_INTEGER, -1);
+   if (ref1->length < aml_bytepos(fld->v_field.bitpos) +
+   aml_bytelen(fld->v_field.bitlen))
+   aml_die("bufferfield:write out of range");
aml_bufcpy(ref1->v_buffer, fld->v_field.bitpos, >v_integer,
0, fld->v_field.bitlen);
aml_delref(, "wrbuffld");


*1 example console log

Data modified on freelist: word -35183627074926 of object 
0x824a3060 size 0x10 previous type temp (invalid addr 
0x8027023e55f0)
uvm_fault(0x81f63958, 0x8027023e55f8, 0, 1) -> e
kernel: page fault trap, code=0
Stopped at  malloc+0x482:   movq0x8(%r14),%rcx
Running script...
ddb{0}> malloc(10,91,5) at malloc+0x482

i915_gem_do_execbuffer(802ab078,80ee0c00,8000337a7970,820ca000,0)
 at i915_gem_do_execbuffer+0xa52

i915_gem_execbuffer2_ioctl(802ab078,8000337a7970,80ee0c00) 
at i915_gem_execbuffer2_ioctl+0x144
drmioctl(15700,80406469,8000337a7970,3,8000336a8798) at 
drmioctl+0xd8

VOP_IOCTL(fd8227abbeb0,80406469,8000337a7970,3,fd826bd1dd88,8000336a8798)
 at VOP_IOCTL+0x55
vn_ioctl(fd82282ee8e8,80406469,8000337a7970,8000336a8798) at 
vn_ioctl+0x64
sys_ioctl(8000336a8798,8000337a7a80,8000337a7ae0) at 
sys_ioctl+0x3c2
syscall(8000337a7b50) at syscall+0x389
Xsyscall(6,36,0,36,80406469,7f7f5c00) at Xsyscall+0x128
end of kernel
end trace frame: 0x7f7f5bd0, count: -9



Re: pppac(4): remove `sc_dead' logic

2021-02-10 Thread YASUOKA Masahiko
ok yasuoka

Thanks,

On Tue, 9 Feb 2021 12:06:08 +0300
Vitaliy Makkoveev  wrote:
> `sc_dead' is used to prevent pppac_ioctl() from being called on a dying
> pppac(4) interface. But now if_detach() makes the dying `ifp' inaccessible
> and waits for references which are in use. This logic is not required
> anymore. Also I moved if_detach() before klist_invalidate() to prevent the
> case where pppac_qstart() bumps `sc_rsel'.
> 
> Index: sys/net/if_pppx.c
> ===
> RCS file: /cvs/src/sys/net/if_pppx.c,v
> retrieving revision 1.108
> diff -u -p -r1.108 if_pppx.c
> --- sys/net/if_pppx.c 1 Feb 2021 07:46:55 -   1.108
> +++ sys/net/if_pppx.c 9 Feb 2021 09:05:23 -
> @@ -930,7 +930,6 @@ RBT_GENERATE(pppx_ifs, pppx_if, pxi_entr
>  
>  struct pppac_softc {
>   struct ifnetsc_if;
> - unsigned intsc_dead;/* [N] */
>   dev_t   sc_dev; /* [I] */
>   LIST_ENTRY(pppac_softc)
>   sc_entry;   /* [K] */
> @@ -1305,17 +1304,16 @@ pppacclose(dev_t dev, int flags, int mod
>   int s;
>  
>   NET_LOCK();
> - sc->sc_dead = 1;
>   CLR(ifp->if_flags, IFF_RUNNING);
>   NET_UNLOCK();
>  
> + if_detach(ifp);
> +
>   s = splhigh();
>   klist_invalidate(>sc_rsel.si_note);
>   klist_invalidate(>sc_wsel.si_note);
>   splx(s);
>  
> - if_detach(ifp);
> -
>   pool_put(_session_pool, sc->sc_multicast_session);
>   NET_LOCK();
>   pipex_destroy_all_sessions(sc);
> @@ -1330,12 +1328,8 @@ pppacclose(dev_t dev, int flags, int mod
>  static int
>  pppac_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
>  {
> - struct pppac_softc *sc = ifp->if_softc;
>   /* struct ifreq *ifr = (struct ifreq *)data; */
>   int error = 0;
> -
> - if (sc->sc_dead)
> - return (ENXIO);
>  
>   switch (cmd) {
>   case SIOCSIFADDR:



Re: npppd(8)/pppac(4): remove dummy TUNSIFMODE ioctl(2) call

2021-01-31 Thread YASUOKA Masahiko
Yes,

ok yasuoka

On Fri, 29 Jan 2021 14:32:39 +0300
Vitaliy Makkoveev  wrote:
> Since OpenBSD 6.7 npppd(8) can't work over tun(4) anymore. I propose to
> remove dummy TUNSIFMODE ioctl(2) call.
> 
> Index: sys/net/if_pppx.c
> ===
> RCS file: /cvs/src/sys/net/if_pppx.c,v
> retrieving revision 1.106
> diff -u -p -r1.106 if_pppx.c
> --- sys/net/if_pppx.c 25 Dec 2020 12:59:53 -  1.106
> +++ sys/net/if_pppx.c 29 Jan 2021 11:10:40 -
> @@ -920,12 +920,6 @@ pppx_if_ioctl(struct ifnet *ifp, u_long 
>  RBT_GENERATE(pppx_ifs, pppx_if, pxi_entry, pppx_if_cmp);
>  
>  /*
> - * pppac(4) - PPP Access Concentrator interface
> - */
> -
> -#include 
> -
> -/*
>   * Locks used to protect struct members and global data
>   *   I   immutable after creation
>   *   K   kernel lock
> @@ -1188,9 +1182,6 @@ pppacioctl(dev_t dev, u_long cmd, caddr_
>  
>   NET_LOCK();
>   switch (cmd) {
> - case TUNSIFMODE: /* make npppd happy */
> - break;
> -
>   case FIONBIO:
>   break;
>   case FIONREAD:
> Index: usr.sbin/npppd/npppd/npppd_iface.c
> ===
> RCS file: /cvs/src/usr.sbin/npppd/npppd/npppd_iface.c,v
> retrieving revision 1.14
> diff -u -p -r1.14 npppd_iface.c
> --- usr.sbin/npppd/npppd/npppd_iface.c2 Jan 2021 13:15:15 -   
> 1.14
> +++ usr.sbin/npppd/npppd/npppd_iface.c29 Jan 2021 11:10:41 -
> @@ -275,7 +275,6 @@ npppd_iface_reinit(npppd_iface *_this, s
>  int
>  npppd_iface_start(npppd_iface *_this)
>  {
> - int x;
>   charbuf[PATH_MAX];
>  
>   NPPPD_IFACE_ASSERT(_this != NULL);
> @@ -285,16 +284,6 @@ npppd_iface_start(npppd_iface *_this)
>   if ((_this->devf = priv_open(buf, O_RDWR | O_NONBLOCK)) < 0) {
>   npppd_iface_log(_this, LOG_ERR, "open(%s) failed: %m", buf);
>   goto fail;
> - }
> -
> - if (_this->using_pppx == 0) {
> - x = IFF_BROADCAST;
> - if (ioctl(_this->devf, TUNSIFMODE, ) != 0) {
> - npppd_iface_log(_this, LOG_ERR,
> - "ioctl(TUNSIFMODE=IFF_BROADCAST) failed "
> - "in %s(): %m", __func__);
> - goto fail;
> - }
>   }
>  
>   event_set(&_this->ev, _this->devf, EV_READ | EV_PERSIST,
> 



Re: Wireguard: can't remove multiple peers at once.

2021-01-13 Thread YASUOKA Masahiko
Hi,

On Thu, 14 Jan 2021 08:54:36 +0900
Yuichiro NAITO  wrote:
> Does anybody please review my code?
> 
> Yasuoka-san is a colleague of mine at work.
> So, he is interested in this topic. That’s why I CCed him on this mail.
> I don’t mean he is a reviewer.
> 
>> 2021/01/12 11:27、Yuichiro NAITO のメール:
>> I have set up multiple peers in a wg0 interface,
>> and tried to remove more than one peers at once.
>> Ifconfig(1) only removes the first peer.
>> 
>> Command line was like following.
>> 
>> ```
>> # ifconfig wg0 -wgpeer  -wgpeer  -wgpeer 
>> ```
>> 
>> Only  was removed.
>> 
>> I think the next peer pointer isn't calculated when removing a peer
>> in the wg_ioctl_set() function in sys/net/if_wg.c.
>> 
>> I have tried following patch that can fix this problem.

Yes, the diff seems good.

I made the following whitespace change.

> @@ -2333,6 +2333,11 @@ wg_ioctl_set(struct wg_softc *sc, struct wg_data_io 
> *data)
>   }
> 
>   peer_p = (struct wg_peer_io *)aip_p;
> + continue;
> + next_peer:
> + aip_p = _p->p_aips[0];
> + aip_p += peer_o.p_aips_count;
> + peer_p = (struct wg_peer_io *)aip_p;
>   }
> 
> error:

It seems we prefer putting goto labels at the beginning of the line.


ok?

Fix wg(4) ioctl to be able to handle multiple wgpeers.
Diff from Yuichiro NAITO.

Index: sys/net/if_wg.c
===
RCS file: /cvs/src/sys/net/if_wg.c,v
retrieving revision 1.14
diff -u -p -r1.14 if_wg.c
--- sys/net/if_wg.c 1 Sep 2020 19:06:59 -   1.14
+++ sys/net/if_wg.c 14 Jan 2021 07:26:48 -
@@ -2270,7 +2270,7 @@ wg_ioctl_set(struct wg_softc *sc, struct
 
/* Peer must have public key */
if (!(peer_o.p_flags & WG_PEER_HAS_PUBLIC))
-   continue;
+   goto next_peer;
 
/* 0 = latest protocol, 1 = this protocol */
if (peer_o.p_protocol_version != 0) {
@@ -2283,7 +2283,7 @@ wg_ioctl_set(struct wg_softc *sc, struct
/* Get local public and check that peer key doesn't match */
if (noise_local_keys(>sc_local, public, NULL) == 0 &&
bcmp(public, peer_o.p_public, WG_KEY_SIZE) == 0)
-   continue;
+   goto next_peer;
 
/* Lookup peer, or create if it doesn't exist */
if ((peer = wg_peer_lookup(sc, peer_o.p_public)) == NULL) {
@@ -2291,7 +2291,7 @@ wg_ioctl_set(struct wg_softc *sc, struct
 * Also, don't create a new one if we only want to
 * update. */
if (peer_o.p_flags & (WG_PEER_REMOVE|WG_PEER_UPDATE))
-   continue;
+   goto next_peer;
 
if ((peer = wg_peer_create(sc,
peer_o.p_public)) == NULL) {
@@ -2303,7 +2303,7 @@ wg_ioctl_set(struct wg_softc *sc, struct
/* Remove peer and continue if specified */
if (peer_o.p_flags & WG_PEER_REMOVE) {
wg_peer_destroy(peer);
-   continue;
+   goto next_peer;
}
 
if (peer_o.p_flags & WG_PEER_HAS_ENDPOINT)
@@ -2332,6 +2332,11 @@ wg_ioctl_set(struct wg_softc *sc, struct
aip_p++;
}
 
+   peer_p = (struct wg_peer_io *)aip_p;
+   continue;
+next_peer:
+   aip_p = _p->p_aips[0];
+   aip_p += peer_o.p_aips_count;
peer_p = (struct wg_peer_io *)aip_p;
}
 



Re: pipex(4)/npppd(8): remove dummy PIPEX{G,S}MODE ioctl(2) calls

2021-01-02 Thread YASUOKA Masahiko
Yes,

ok yasuoka

On Wed, 30 Dec 2020 03:02:55 +0300
Vitaliy Makkoveev  wrote:
> At this time the pipex(4) related ioctl(2) calls PIPEX{S,G}MODE are just
> dummies that were kept for backward compatibility reasons. The diff below
> removes them.
> 
> ok?
> 
> Index: share/man/man4/pipex.4
> ===
> RCS file: /cvs/src/share/man/man4/pipex.4,v
> retrieving revision 1.13
> diff -u -p -r1.13 pipex.4
> --- share/man/man4/pipex.49 Aug 2020 14:35:31 -   1.13
> +++ share/man/man4/pipex.429 Dec 2020 23:51:57 -
> @@ -57,20 +57,6 @@ or
>  devices.
>  The added requests are as follows:
>  .Bl -tag -width Ds
> -.It Dv PIPEXGMODEFa "int *"
> -Get the devices's
> -.Nm
> -operation mode.
> -1 to enable
> -.Nm
> -on this device; 0 to disable.
> -.It Dv PIPEXSMODEFa "int *"
> -Set the device's
> -.Nm
> -operation mode.
> -1 to enable
> -.Nm
> -on this device; 0 to disable.
>  .It Dv PIPEXASESSION Fa "struct pipex_session_req *"
>  Add a new PPP session to be handled by
>  .Nm .
> Index: sys/net/pipex.c
> ===
> RCS file: /cvs/src/sys/net/pipex.c,v
> retrieving revision 1.127
> diff -u -p -r1.127 pipex.c
> --- sys/net/pipex.c   30 Aug 2020 19:48:16 -  1.127
> +++ sys/net/pipex.c   29 Dec 2020 23:51:59 -
> @@ -163,13 +163,6 @@ pipex_ioctl(void *ownersc, u_long cmd, c
>  
>   NET_ASSERT_LOCKED();
>   switch (cmd) {
> - case PIPEXSMODE:
> - break;
> -
> - case PIPEXGMODE:
> - *(int *)data = 1;
> - break;
> -
>   case PIPEXCSESSION:
>   ret = pipex_config_session(
>   (struct pipex_session_config_req *)data, ownersc);
> Index: sys/net/pipex.h
> ===
> RCS file: /cvs/src/sys/net/pipex.h,v
> retrieving revision 1.28
> diff -u -p -r1.28 pipex.h
> --- sys/net/pipex.h   27 Aug 2020 10:47:52 -  1.28
> +++ sys/net/pipex.h   29 Dec 2020 23:51:59 -
> @@ -165,8 +165,6 @@ struct pipex_session_descr_req {
>  
>  
>  /* PIPEX ioctls */
> -#define PIPEXSMODE   _IOW ('p',  1, int)
> -#define PIPEXGMODE   _IOR ('p',  2, int)
>  #define PIPEXASESSION_IOW ('p',  3, struct pipex_session_req)
>  #define PIPEXDSESSION_IOWR('p',  4, struct pipex_session_close_req)
>  #define PIPEXCSESSION_IOW ('p',  5, struct pipex_session_config_req)
> Index: usr.sbin/npppd/npppd/npppd_iface.c
> ===
> RCS file: /cvs/src/usr.sbin/npppd/npppd/npppd_iface.c,v
> retrieving revision 1.13
> diff -u -p -r1.13 npppd_iface.c
> --- usr.sbin/npppd/npppd/npppd_iface.c5 Dec 2015 16:10:31 -   
> 1.13
> +++ usr.sbin/npppd/npppd/npppd_iface.c29 Dec 2020 23:52:00 -
> @@ -96,11 +96,6 @@ static void  npppd_iface_io_event_handle
>  static int   npppd_iface_log (npppd_iface *, int, const char *, ...)
>   __printflike(3,4);
>  
> -#ifdef USE_NPPPD_PIPEX
> -static int npppd_iface_pipex_enable(npppd_iface *_this);
> -static int npppd_iface_pipex_disable(npppd_iface *_this);
> -#endif /* USE_NPPPD_PIPEX */
> -
>  
>  /** initialize npppd_iface */
>  void
> @@ -311,12 +306,7 @@ npppd_iface_start(npppd_iface *_this)
>   goto fail;
>   }
>  
> -#ifdef USE_NPPPD_PIPEX
> - if (npppd_iface_pipex_enable(_this) != 0) {
> - log_printf(LOG_WARNING,
> - "npppd_iface_pipex_enable() failed: %m");
> - }
> -#else
> +#ifndef USE_NPPPD_PIPEX
>   if (_this->using_pppx) {
>   npppd_iface_log(_this, LOG_ERR,
>   "pipex is required when using pppx interface");
> @@ -358,13 +348,6 @@ npppd_iface_stop(npppd_iface *_this)
>   in_host_route_delete(&_this->ip4addr, );
>   }
>   if (_this->devf >= 0) {
> -#ifdef USE_NPPPD_PIPEX
> - if (npppd_iface_pipex_disable(_this) != 0) {
> - log_printf(LOG_CRIT,
> - "npppd_iface_pipex_disable() failed: %m");
> - }
> -#endif /* USE_NPPPD_PIPEX */
> -
>   event_del(&_this->ev);
>   close(_this->devf);
>   npppd_iface_log(_this, LOG_INFO, "Stopped");
> @@ -381,32 +364,6 @@ npppd_iface_fini(npppd_iface *_this)
>   NPPPD_IFACE_ASSERT(_this != NULL);
>   _this->initialized = 0;
>  }
> -
> -
> -/***
> - * PIPEX related functions
> - ***/
> -#ifdef USE_NPPPD_PIPEX
> -
> -/** enable PIPEX on PPPAC interface */
> -int
> -npppd_iface_pipex_enable(npppd_iface *_this)
> -{
> - int enable = 1;
> -
> - return ioctl(_this->devf, PIPEXSMODE, );
> -}
> -
> -/** disable PIPEX on PPPAC interface */
> -int
> -npppd_iface_pipex_disable(npppd_iface *_this)
> -{
> - int disable = 0;
> -
> - 

Re: diff: pfctl: error message for nonexisting rtable

2020-09-17 Thread YASUOKA Masahiko
the condition was reversed.

ok?
Index: parse.y
===
RCS file: /cvs/src/sbin/pfctl/parse.y,v
retrieving revision 1.702
diff -u -p -r1.702 parse.y
--- parse.y 17 Sep 2020 10:09:43 -  1.702
+++ parse.y 17 Sep 2020 14:23:42 -
@@ -1216,7 +1216,7 @@ antispoof_opt : LABEL label   {
if ($2 < 0 || $2 > RT_TABLEID_MAX) {
yyerror("invalid rtable id");
YYERROR;
-   } else if (lookup_rtable($2) >= 1) {
+   } else if (lookup_rtable($2) < 1) {
yyerror("rtable %lld does not exist", $2);
YYERROR;
}
@@ -2003,7 +2003,7 @@ filter_opt: USER uids {
if ($2 < 0 || $2 > RT_TABLEID_MAX) {
yyerror("invalid rtable id");
YYERROR;
-   } else if (lookup_rtable($2) >= 1) {
+   } else if (lookup_rtable($2) < 1) {
yyerror("rtable %lld does not exist", $2);
YYERROR;
}



Re: diff: pfctl: error message for nonexisting rtable

2020-09-17 Thread YASUOKA Masahiko
Hi,

I just committed yours.

Thanks,

On Wed, 16 Sep 2020 16:07:40 +0200
Klemens Nanni  wrote:
> On Wed, Sep 16, 2020 at 07:49:19PM +0900, YASUOKA Masahiko wrote:
>> New diff is using -1 for ENOENT.
>> 
>> Also domainid == 0 is a valid domain id, but previous diff cannot make
>> a cache of it since 0 is the default value.  So new diff is doing
>> 
>> -static u_int found[RT_TABLEID_MAX+1];
>> +static struct {
>> +int  found;
>> +int  domainid;
>> +}rtables[RT_TABLEID_MAX+1];
>> 
>> to distinguish the default 0 and domainid 0.
> This looks more complicated than it needs to be, but I also don't want
> to bikeshed it;  given that the parser is happy with this and we plan to
> remove this code altogether anyway in the next release cycle:  OK kn.
> 
> Alternatively, here's a much simpler diff resembling what I had in mind.
> Feel free to commit this instead (with my OK), give me an OK for it or
> go ahead with yours.
> 
> It uses the same function and reflects the fact that every rdomain is a
> rtable but not every rtable is also a rdomain (your choice of `domainid'
> seems inconsistent with that).
> 
> Index: parse.y
> ===
> RCS file: /cvs/src/sbin/pfctl/parse.y,v
> retrieving revision 1.701
> diff -u -p -r1.701 parse.y
> --- parse.y   28 Jan 2020 15:40:35 -  1.701
> +++ parse.y   16 Sep 2020 13:58:23 -
> @@ -392,7 +392,7 @@ intinvalid_redirect(struct node_host *
>  u_int16_t parseicmpspec(char *, sa_family_t);
>  int   kw_casecmp(const void *, const void *);
>  int   map_tos(char *string, int *);
> -int   rdomain_exists(u_int);
> +int   lookup_rtable(u_int);
>  int   filteropts_to_rule(struct pf_rule *, struct filter_opts *);
>  
>  TAILQ_HEAD(loadanchorshead, loadanchors)
> @@ -1216,6 +1216,9 @@ antispoof_opt   : LABEL label   {
>   if ($2 < 0 || $2 > RT_TABLEID_MAX) {
>   yyerror("invalid rtable id");
>   YYERROR;
> + } else if (lookup_rtable($2) >= 1) {
> + yyerror("rtable %lld does not exist", $2);
> + YYERROR;
>   }
>   antispoof_opts.rtableid = $2;
>   }
> @@ -2000,6 +2003,9 @@ filter_opt  : USER uids {
>   if ($2 < 0 || $2 > RT_TABLEID_MAX) {
>   yyerror("invalid rtable id");
>   YYERROR;
> + } else if (lookup_rtable($2) >= 1) {
> + yyerror("rtable %lld does not exist", $2);
> + YYERROR;
>   }
>   filter_opts.rtableid = $2;
>   }
> @@ -2475,7 +2481,7 @@ if_item : STRING{
>   | RDOMAIN NUMBER{
>   if ($2 < 0 || $2 > RT_TABLEID_MAX)
>   yyerror("rdomain %lld outside range", $2);
> - else if (rdomain_exists($2) != 1)
> + else if (lookup_rtable($2) != 2)
>   yyerror("rdomain %lld does not exist", $2);
>  
>   $$ = calloc(1, sizeof(struct node_if));
> @@ -5868,37 +5874,38 @@ map_tos(char *s, int *val)
>  }
>  
>  int
> -rdomain_exists(u_int rdomain)
> +lookup_rtable(u_int rtableid)
>  {
>   size_t   len;
>   struct rt_tableinfo  info;
>   int  mib[6];
>   static u_int found[RT_TABLEID_MAX+1];
>  
> - if (found[rdomain] == 1)
> - return 1;
> + if (found[rtableid])
> + return found[rtableid];
>  
>   mib[0] = CTL_NET;
>   mib[1] = PF_ROUTE;
>   mib[2] = 0;
>   mib[3] = 0;
>   mib[4] = NET_RT_TABLE;
> - mib[5] = rdomain;
> + mib[5] = rtableid;
>  
>   len = sizeof(info);
>   if (sysctl(mib, 6, , , NULL, 0) == -1) {
>   if (errno == ENOENT) {
>   /* table nonexistent */
> + found[rtableid] = 0;
>   return 0;
>   }
>   err(1, "%s", __func__);
>   }
> - if (info.rti_domainid == rdomain) {
> - found[rdomain] = 1;
> - return 1;
> + if (info.rti_domainid == rtableid) {
> + found[rtableid] = 2;
> + return 2;
>   }
> - /* rdomain is a table, but not an rdomain */
> - return 0;
> + found[rtableid] = 1;
> + return 1;
>  }
>  
>  int



Re: diff: pfctl: error message for nonexisting rtable

2020-09-16 Thread YASUOKA Masahiko
Hi,

On Wed, 16 Sep 2020 12:04:55 +0200
Klemens Nanni  wrote:
> Using the function verb would read a bit clearer/more intuitive,
> i.e.

Yes, "if (!rtable_exists($2))" seems better.

>> @@ -5887,17 +5897,37 @@ rdomain_exists(u_int rdomain)
>>  
>>  len = sizeof(info);
>>  if (sysctl(mib, 6, , , NULL, 0) == -1) {
>> -if (errno == ENOENT) {
>> +if (errno == ENOENT)
>>  /* table nonexistent */
>> -return 0;
>> -}
>> -err(1, "%s", __func__);
>> -}
>> -if (info.rti_domainid == rdomain) {
>> -found[rdomain] = 1;
>> +domainid[rdomain] = RT_TABLEID_MAX;
> This does not look correct, RT_TABLEID_MAX (255) is the biggest *valid*
> id, so you cannot use it to denote a nonexistent routing table.

Good catch.  Thanks,

> Perhaps use `static int domainid[RT_TABLEID_MAX+1]' and `-1' to reflect
> ENOENT?

New diff is using -1 for ENOENT.

Also domainid == 0 is a valid domain id, but previous diff cannot make
a cache of it since 0 is the default value.  So new diff is doing

-   static u_int found[RT_TABLEID_MAX+1];
+   static struct {
+   int  found;
+   int  domainid;
+   }rtables[RT_TABLEID_MAX+1];

to distinguish the default 0 and domainid 0.

ok?


Make pfctl check if the rtable really exists when parsing the config.

Index: sbin/pfctl/parse.y
===
RCS file: /cvs/src/sbin/pfctl/parse.y,v
retrieving revision 1.701
diff -u -p -r1.701 parse.y
--- sbin/pfctl/parse.y  28 Jan 2020 15:40:35 -  1.701
+++ sbin/pfctl/parse.y  16 Sep 2020 10:40:25 -
@@ -392,7 +392,9 @@ int  invalid_redirect(struct node_host *
 u_int16_t parseicmpspec(char *, sa_family_t);
 int kw_casecmp(const void *, const void *);
 int map_tos(char *string, int *);
+int get_domainid(u_int);
 int rdomain_exists(u_int);
+int rtable_exists(u_int);
 int filteropts_to_rule(struct pf_rule *, struct filter_opts *);
 
 TAILQ_HEAD(loadanchorshead, loadanchors)
@@ -1217,6 +1219,10 @@ antispoof_opt: LABEL label   {
yyerror("invalid rtable id");
YYERROR;
}
+   else if (!rtable_exists($2)) {
+   yyerror("rtable %lld does not exist", $2);
+   YYERROR;
+   }
antispoof_opts.rtableid = $2;
}
;
@@ -2001,6 +2007,10 @@ filter_opt   : USER uids {
yyerror("invalid rtable id");
YYERROR;
}
+   else if (!rtable_exists($2)) {
+   yyerror("rtable %lld does not exist", $2);
+   YYERROR;
+   }
filter_opts.rtableid = $2;
}
| DIVERTTO STRING PORT portplain {
@@ -2475,7 +2485,7 @@ if_item   : STRING{
| RDOMAIN NUMBER{
if ($2 < 0 || $2 > RT_TABLEID_MAX)
yyerror("rdomain %lld outside range", $2);
-   else if (rdomain_exists($2) != 1)
+   else if (!rdomain_exists($2))
yyerror("rdomain %lld does not exist", $2);
 
$$ = calloc(1, sizeof(struct node_if));
@@ -5868,36 +5878,60 @@ map_tos(char *s, int *val)
 }
 
 int
-rdomain_exists(u_int rdomain)
+get_domainid(u_int rtable)
 {
size_t   len;
struct rt_tableinfo  info;
int  mib[6];
-   static u_int found[RT_TABLEID_MAX+1];
+   static struct {
+   int  found;
+   int  domainid;
+   }rtables[RT_TABLEID_MAX+1];
 
-   if (found[rdomain] == 1)
-   return 1;
+   if (rtables[rtable].found)
+   return rtables[rtable].domainid;
 
mib[0] = CTL_NET;
mib[1] = PF_ROUTE;
mib[2] = 0;
mib[3] = 0;
mib[4] = NET_RT_TABLE;
-   mib[5] = rdomain;
+   mib[5] = rtable;
 
len = sizeof(info);
if (sysctl(mib, 6, , , NULL, 0) == -1) {
-   if (errno == ENOENT) {
+   if (errno == ENOENT)
/* table nonexistent */
-   return 0;
-   }
-   err(1, "%s", __func__);
-   }
-   if (info.rti_domainid == rdomain) {
-   found[rdomain] = 1;
+   rtables[rtable].domainid = -1;
+   else
+   err(1, "%s", __func__);
+   } else
+   rtables[rtable].domainid = info.rti_domainid;
+   

Re: diff: pfctl: error message for nonexisting rtable

2020-09-16 Thread YASUOKA Masahiko
Hi,

So, it seems we need more code and testing for the pf(4) part.

Let me continue this separately.

On Mon, 14 Sep 2020 11:07:53 +0200
Klemens Nanni  wrote:
> On Mon, Sep 14, 2020 at 02:09:27PM +0900, YASUOKA Masahiko wrote:
>> Make pfctl check if the rtable really exists when parsing the config.
> I concur, but you can do this with less (duplicated) code.
> 
> Instead of copying rdomain_exists() into rtable_exists() with the
> `rti_domainid' check omitted, tweak (and rename) rdomain_exists() into
> returning the information whether the given ID is just an rtable.
> 
> rdomain_exists() merges the "invalid id" and "id is an rtable but not
> an rdomain" cases - make those separate return codes, check/adjust
> existing callers and use it for your new checks.

Yes, I could reduce the code.  Thanks,

ok?


Make pfctl check if the rtable really exists when parsing the config.

Index: sbin/pfctl/parse.y
===
RCS file: /cvs/src/sbin/pfctl/parse.y,v
retrieving revision 1.701
diff -u -p -r1.701 parse.y
--- sbin/pfctl/parse.y  28 Jan 2020 15:40:35 -  1.701
+++ sbin/pfctl/parse.y  16 Sep 2020 09:11:21 -
@@ -392,7 +392,9 @@ int  invalid_redirect(struct node_host *
 u_int16_t parseicmpspec(char *, sa_family_t);
 int kw_casecmp(const void *, const void *);
 int map_tos(char *string, int *);
+int get_domainid(u_int);
 int rdomain_exists(u_int);
+int rtable_exists(u_int);
 int filteropts_to_rule(struct pf_rule *, struct filter_opts *);
 
 TAILQ_HEAD(loadanchorshead, loadanchors)
@@ -1217,6 +1219,10 @@ antispoof_opt: LABEL label   {
yyerror("invalid rtable id");
YYERROR;
}
+   else if (rtable_exists($2) != 1) {
+   yyerror("rtable %lld does not exist", $2);
+   YYERROR;
+   }
antispoof_opts.rtableid = $2;
}
;
@@ -2001,6 +2007,10 @@ filter_opt   : USER uids {
yyerror("invalid rtable id");
YYERROR;
}
+   else if (rtable_exists($2) != 1) {
+   yyerror("rtable %lld does not exist", $2);
+   YYERROR;
+   }
filter_opts.rtableid = $2;
}
| DIVERTTO STRING PORT portplain {
@@ -5868,15 +5878,15 @@ map_tos(char *s, int *val)
 }
 
 int
-rdomain_exists(u_int rdomain)
+get_domainid(u_int rdomain)
 {
size_t   len;
struct rt_tableinfo  info;
int  mib[6];
-   static u_int found[RT_TABLEID_MAX+1];
+   static u_int domainid[RT_TABLEID_MAX+1];
 
-   if (found[rdomain] == 1)
-   return 1;
+   if (domainid[rdomain] != 0)
+   return domainid[rdomain];
 
mib[0] = CTL_NET;
mib[1] = PF_ROUTE;
@@ -5887,17 +5897,37 @@ rdomain_exists(u_int rdomain)
 
len = sizeof(info);
if (sysctl(mib, 6, , , NULL, 0) == -1) {
-   if (errno == ENOENT) {
+   if (errno == ENOENT)
/* table nonexistent */
-   return 0;
-   }
-   err(1, "%s", __func__);
-   }
-   if (info.rti_domainid == rdomain) {
-   found[rdomain] = 1;
+   domainid[rdomain] = RT_TABLEID_MAX;
+   else
+   err(1, "%s", __func__);
+   } else
+   domainid[rdomain] = info.rti_domainid;
+
+   return domainid[rdomain];
+}
+
+int
+rdomain_exists(u_int rdomain)
+{
+   int domainid;
+
+   domainid = get_domainid(rdomain);
+   if (domainid == rdomain)
return 1;
-   }
/* rdomain is a table, but not an rdomain */
+   return 0;
+}
+
+int
+rtable_exists(u_int rtable)
+{
+   int domainid;
+
+   domainid = get_domainid(rtable);
+   if (domainid < RT_TABLEID_MAX)
+   return 1;
return 0;
 }
 



Re: diff: pfctl: error message for nonexisting rtable

2020-09-14 Thread YASUOKA Masahiko
Hi,

On Tue, 15 Sep 2020 02:31:24 +0200
Klemens Nanni  wrote:
> On Tue, Sep 15, 2020 at 12:30:35AM +0200, Klemens Nanni wrote:
>> Actually, that should just work regardless of whether the routing
>> domain exists at ruleset creation time;  just like it is the case with
>> interface names/groups which may come and go at runtime without
>> requiring changes to the ruleset.
>> 
>> Rules on nonexistent interfaces won't match, routing domains (and
>> ultimately routing tables) should behave the same, I think.
>> 
>> Here's a diff that does this for routing domains allowing me to always
>> use `on rdomain 5' - I've tested it with a few example rulesets and
>> behaviour is as expected.
>> 
>> It will need more eye balling and I am not pushing such changes before
>> release, but if that is a general direction we agree, your proposed
>> `rtable' fix could move along and become just as flexible instead.
> More on this:
> 
>   # ifconfig lo1 rdomain 1
>   # echo pass on rdomain 1 | pfctl -f-
>   # ifconfig lo1 destroy
>   # pfctl -sr 
>  
>   pass on rdomain 1 all flags S/SA
> 
> The ruleset stays valid and continues to work as soon as routing domain
> `1' reappears, there is no reason to require existence of it at ruleset
> creation;  this is safe because routing domains are just normative
> numbers, there's no further state when it comes to filtering - either
> the id on the packet matches the number in the ruleset or it doesn't.
> 
> Routing tables however are more involved as they can be used to *alter*
> a packet's flow in pf.conf(5), so requiring them to be present at
> ruleset creation makes sense to guarantee that pf will only ever change
> routing table ids to valid ones.

It's not clear to me why a non-existing rdomain is accepted but a
non-existing rtable is rejected.  I suppose we can make pf(4)
handle a packet for a non-existing routing table as if the routing
table were empty.

> Routing domains can be deleted, but that doesn't invalidate rules like
> `on rdomain 1', which simply won't match when the given id does not
> exist.
> 
> Routing tables however cannot be deleted, they get moved to the default
> routing domain whenever their corresponding routing domain disappears;
> this is in line with only ever loading valid routing table ids into pf.
> 
> So unless I missed something, that ruleset creation (`pfctl -f ...')
> is the only occasion pf actually needs to validate routing table ids:
> they are guaranteed to always exist from then on.
> 
> Given this, my diff looks fine as is and should not change `rtable'
> behaviour - YASUOKA's diff is also fine as is and actually implements
> the validity check I just mentioned, obsoleting my initial feedback.



diff: pfctl: error message for nonexisting rtable

2020-09-13 Thread YASUOKA Masahiko
Hi,

A pf rule with "on rdomain n" where rdomain n does not exist causes the

  /etc/pf.conf:XXX: rdomain n does not exist

error.  But a rule with "rtable n" where rtable n does not exist causes the

  pfctl: DIOCADDRULE: Device busy

error.  It is hard to find the cause from this error message.

  /etc/pf.conf:XXX: rtable n does not exist

would be better.

ok?


Make pfctl check if the rtable really exists when parsing the config.

Index: sbin/pfctl/parse.y
===
RCS file: /cvs/src/sbin/pfctl/parse.y,v
retrieving revision 1.701
diff -u -p -r1.701 parse.y
--- sbin/pfctl/parse.y  28 Jan 2020 15:40:35 -  1.701
+++ sbin/pfctl/parse.y  14 Sep 2020 04:54:39 -
@@ -393,6 +393,7 @@ u_int16_t parseicmpspec(char *, sa_famil
 int kw_casecmp(const void *, const void *);
 int map_tos(char *string, int *);
 int rdomain_exists(u_int);
+int rtable_exists(u_int);
 int filteropts_to_rule(struct pf_rule *, struct filter_opts *);
 
 TAILQ_HEAD(loadanchorshead, loadanchors)
@@ -1217,6 +1218,10 @@ antispoof_opt: LABEL label   {
yyerror("invalid rtable id");
YYERROR;
}
+   else if (rtable_exists($2) != 1) {
+   yyerror("rtable %lld does not exist", $2);
+   YYERROR;
+   }
antispoof_opts.rtableid = $2;
}
;
@@ -2001,6 +2006,10 @@ filter_opt   : USER uids {
yyerror("invalid rtable id");
YYERROR;
}
+   else if (rtable_exists($2) != 1) {
+   yyerror("rtable %lld does not exist", $2);
+   YYERROR;
+   }
filter_opts.rtableid = $2;
}
| DIVERTTO STRING PORT portplain {
@@ -5899,6 +5908,36 @@ rdomain_exists(u_int rdomain)
}
/* rdomain is a table, but not an rdomain */
return 0;
+}
+
+int
+rtable_exists(u_int rtable)
+{
+   size_t   len;
+   struct rt_tableinfo  info;
+   int  mib[6];
+   static u_int found[RT_TABLEID_MAX+1];
+
+   if (found[rtable] == 1)
+   return 1;
+
+   mib[0] = CTL_NET;
+   mib[1] = PF_ROUTE;
+   mib[2] = 0;
+   mib[3] = 0;
+   mib[4] = NET_RT_TABLE;
+   mib[5] = rtable;
+
+   len = sizeof(info);
+   if (sysctl(mib, 6, , , NULL, 0) == -1) {
+   if (errno == ENOENT) {
+   /* table nonexistent */
+   return 0;
+   }
+   err(1, "%s", __func__);
+   }
+   found[rtable] = 1;
+   return 1;
 }
 
 int



Re: httpd: use the original uri for REQUEST_URI

2020-09-11 Thread YASUOKA Masahiko
Anyone?

This is a tiny change but makes httpd(8) more correct.
The diff is not so complicated.

On Thu, 03 Sep 2020 13:09:49 +0900 (JST)
YASUOKA Masahiko  wrote:
> Let me update the diff.  The previous one doesn't have error handling
> for when strdup() fails.
> 
> On Thu, 03 Sep 2020 13:02:51 +0900 (JST)
> YASUOKA Masahiko  wrote:
>> The diff makes REQUEST_URI in FastCGI become the original request
>> URI.  Currently it is a URL which is URL-decoded and canonicalized.
>> I could not find a specification of REQUEST_URI, but I suppose it is
>> the URI in the HTTP request.  Apache httpd and nginx use the original
>> URI for it.
>> 
>> ok?
>> 
>> 
>> Use the original requested URI for REQUEST_URI.
> 
> Index: usr.sbin/httpd/http.h
> ===
> RCS file: /cvs/src/usr.sbin/httpd/http.h,v
> retrieving revision 1.15
> diff -u -p -r1.15 http.h
> --- usr.sbin/httpd/http.h 8 May 2019 21:41:06 -   1.15
> +++ usr.sbin/httpd/http.h 3 Sep 2020 04:09:26 -
> @@ -246,6 +246,7 @@ struct http_descriptor {
>   /* Rewritten path and query remain NULL if not used */
>   char*http_path_alias;
>   char*http_query_alias;
> + char*http_path_orig;
>  
>   /* A tree of headers and attached lists for repeated headers. */
>   struct kv   *http_lastheader;
> Index: usr.sbin/httpd/server_fcgi.c
> ===
> RCS file: /cvs/src/usr.sbin/httpd/server_fcgi.c,v
> retrieving revision 1.83
> diff -u -p -r1.83 server_fcgi.c
> --- usr.sbin/httpd/server_fcgi.c  24 Aug 2020 15:49:11 -  1.83
> +++ usr.sbin/httpd/server_fcgi.c  3 Sep 2020 04:09:26 -
> @@ -299,13 +299,13 @@ server_fcgi(struct httpd *env, struct cl
>   }
>  
>   if (!desc->http_query) {
> - if (fcgi_add_param(, "REQUEST_URI", desc->http_path,
> + if (fcgi_add_param(, "REQUEST_URI", desc->http_path_orig,
>   clt) == -1) {
>   errstr = "failed to encode param";
>   goto fail;
>   }
>   } else {
> - if (asprintf(, "%s?%s", desc->http_path,
> + if (asprintf(, "%s?%s", desc->http_path_orig,
>   desc->http_query) == -1) {
>   errstr = "failed to encode param";
>   goto fail;
> Index: usr.sbin/httpd/server_http.c
> ===
> RCS file: /cvs/src/usr.sbin/httpd/server_http.c,v
> retrieving revision 1.140
> diff -u -p -r1.140 server_http.c
> --- usr.sbin/httpd/server_http.c  3 Aug 2020 10:59:53 -   1.140
> +++ usr.sbin/httpd/server_http.c  3 Sep 2020 04:09:26 -
> @@ -100,6 +100,8 @@ server_httpdesc_free(struct http_descrip
>  
>   free(desc->http_path);
>   desc->http_path = NULL;
> + free(desc->http_path_orig);
> + desc->http_path_orig = NULL;
>   free(desc->http_path_alias);
>   desc->http_path_alias = NULL;
>   free(desc->http_query);
> @@ -1204,9 +1206,13 @@ server_response(struct httpd *httpd, str
>   char*hostval, *query;
>   const char  *errstr = NULL;
>  
> - /* Decode the URL */
> + /* Preserve original path */
>   if (desc->http_path == NULL ||
> - url_decode(desc->http_path) == NULL)
> + (desc->http_path_orig = strdup(desc->http_path)) == NULL)
> + goto fail;
> +
> + /* Decode the URL */
> + if (url_decode(desc->http_path) == NULL)
>   goto fail;
>  
>   /* Canonicalize the request path */



Re: httpd: use the original uri for REQUEST_URI

2020-09-02 Thread YASUOKA Masahiko
Let me update the diff.  The previous one doesn't have error handling
for when strdup() fails.

On Thu, 03 Sep 2020 13:02:51 +0900 (JST)
YASUOKA Masahiko  wrote:
> The diff makes REQUEST_URI in FastCGI become the original request
> URI.  Currently it is a URL which is URL-decoded and canonicalized.
> I could not find a specification of REQUEST_URI, but I suppose it is
> the URI in the HTTP request.  Apache httpd and nginx use the original
> URI for it.
> 
> ok?
> 
> 
> Use the original requested URI for REQUEST_URI.

Index: usr.sbin/httpd/http.h
===
RCS file: /cvs/src/usr.sbin/httpd/http.h,v
retrieving revision 1.15
diff -u -p -r1.15 http.h
--- usr.sbin/httpd/http.h   8 May 2019 21:41:06 -   1.15
+++ usr.sbin/httpd/http.h   3 Sep 2020 04:09:26 -
@@ -246,6 +246,7 @@ struct http_descriptor {
/* Rewritten path and query remain NULL if not used */
char*http_path_alias;
char*http_query_alias;
+   char*http_path_orig;
 
/* A tree of headers and attached lists for repeated headers. */
struct kv   *http_lastheader;
Index: usr.sbin/httpd/server_fcgi.c
===
RCS file: /cvs/src/usr.sbin/httpd/server_fcgi.c,v
retrieving revision 1.83
diff -u -p -r1.83 server_fcgi.c
--- usr.sbin/httpd/server_fcgi.c24 Aug 2020 15:49:11 -  1.83
+++ usr.sbin/httpd/server_fcgi.c3 Sep 2020 04:09:26 -
@@ -299,13 +299,13 @@ server_fcgi(struct httpd *env, struct cl
}
 
if (!desc->http_query) {
-   if (fcgi_add_param(, "REQUEST_URI", desc->http_path,
+   if (fcgi_add_param(, "REQUEST_URI", desc->http_path_orig,
clt) == -1) {
errstr = "failed to encode param";
goto fail;
}
} else {
-   if (asprintf(, "%s?%s", desc->http_path,
+   if (asprintf(, "%s?%s", desc->http_path_orig,
desc->http_query) == -1) {
errstr = "failed to encode param";
goto fail;
Index: usr.sbin/httpd/server_http.c
===
RCS file: /cvs/src/usr.sbin/httpd/server_http.c,v
retrieving revision 1.140
diff -u -p -r1.140 server_http.c
--- usr.sbin/httpd/server_http.c3 Aug 2020 10:59:53 -   1.140
+++ usr.sbin/httpd/server_http.c3 Sep 2020 04:09:26 -
@@ -100,6 +100,8 @@ server_httpdesc_free(struct http_descrip
 
free(desc->http_path);
desc->http_path = NULL;
+   free(desc->http_path_orig);
+   desc->http_path_orig = NULL;
free(desc->http_path_alias);
desc->http_path_alias = NULL;
free(desc->http_query);
@@ -1204,9 +1206,13 @@ server_response(struct httpd *httpd, str
char*hostval, *query;
const char  *errstr = NULL;
 
-   /* Decode the URL */
+   /* Preserve original path */
if (desc->http_path == NULL ||
-   url_decode(desc->http_path) == NULL)
+   (desc->http_path_orig = strdup(desc->http_path)) == NULL)
+   goto fail;
+
+   /* Decode the URL */
+   if (url_decode(desc->http_path) == NULL)
goto fail;
 
/* Canonicalize the request path */



httpd: use the original uri for REQUEST_URI

2020-09-02 Thread YASUOKA Masahiko
The diff makes REQUEST_URI in FastCGI become the original request
URI.  Currently it is a URL which is URL-decoded and canonicalized.
I could not find a specification of REQUEST_URI, but I suppose it is
the URI in the HTTP request.  Apache httpd and nginx use the original
URI for it.

ok?


Use the original requested URI for REQUEST_URI.

Index: usr.sbin/httpd/http.h
===
RCS file: /cvs/src/usr.sbin/httpd/http.h,v
retrieving revision 1.15
diff -u -p -r1.15 http.h
--- usr.sbin/httpd/http.h   8 May 2019 21:41:06 -   1.15
+++ usr.sbin/httpd/http.h   3 Sep 2020 04:00:49 -
@@ -246,6 +246,7 @@ struct http_descriptor {
/* Rewritten path and query remain NULL if not used */
char*http_path_alias;
char*http_query_alias;
+   char*http_path_orig;
 
/* A tree of headers and attached lists for repeated headers. */
struct kv   *http_lastheader;
Index: usr.sbin/httpd/server_fcgi.c
===
RCS file: /cvs/src/usr.sbin/httpd/server_fcgi.c,v
retrieving revision 1.83
diff -u -p -r1.83 server_fcgi.c
--- usr.sbin/httpd/server_fcgi.c24 Aug 2020 15:49:11 -  1.83
+++ usr.sbin/httpd/server_fcgi.c3 Sep 2020 04:00:49 -
@@ -299,13 +299,13 @@ server_fcgi(struct httpd *env, struct cl
}
 
if (!desc->http_query) {
-   if (fcgi_add_param(, "REQUEST_URI", desc->http_path,
+   if (fcgi_add_param(, "REQUEST_URI", desc->http_path_orig,
clt) == -1) {
errstr = "failed to encode param";
goto fail;
}
} else {
-   if (asprintf(, "%s?%s", desc->http_path,
+   if (asprintf(, "%s?%s", desc->http_path_orig,
desc->http_query) == -1) {
errstr = "failed to encode param";
goto fail;
Index: usr.sbin/httpd/server_http.c
===
RCS file: /cvs/src/usr.sbin/httpd/server_http.c,v
retrieving revision 1.140
diff -u -p -r1.140 server_http.c
--- usr.sbin/httpd/server_http.c3 Aug 2020 10:59:53 -   1.140
+++ usr.sbin/httpd/server_http.c3 Sep 2020 04:00:49 -
@@ -100,6 +100,8 @@ server_httpdesc_free(struct http_descrip
 
free(desc->http_path);
desc->http_path = NULL;
+   free(desc->http_path_orig);
+   desc->http_path_orig = NULL;
free(desc->http_path_alias);
desc->http_path_alias = NULL;
free(desc->http_query);
@@ -1203,6 +1205,10 @@ server_response(struct httpd *httpd, str
int  portval = -1, ret;
char*hostval, *query;
const char  *errstr = NULL;
+
+   /* preserve original path */
+   if (desc->http_path != NULL)
+   desc->http_path_orig = strdup(desc->http_path);
 
/* Decode the URL */
if (desc->http_path == NULL ||



Re: Make pipex more common for pppac and pppx

2020-08-26 Thread YASUOKA Masahiko
On Mon, 24 Aug 2020 20:07:48 +0300
Vitaliy Makkoveev  wrote:
> I pointed some comments inline.

Thanks,

>> +case PIPEXASESSION:
>> +{
>> +struct pipex_session_req *req =
>> +(struct pipex_session_req *)data;
>> +if ((error = pipex_init_session(, req)) != 0)
>> +break;
>> +error = pipex_link_session(session, >sc_if, sc);
>> +break;
>> +}
> 
> If pipex_link_session() fails `session' will be leaked.

Yes, it's a good catch.

>> +case PIPEXDSESSION:
>> +{
>> +struct pipex_session_close_req *req =
>> +(struct pipex_session_close_req *)data;
>> +session = pipex_lookup_by_session_id(req->pcr_protocol,
>> +req->pcr_session_id);
>> +if (session == NULL || session->ifindex != sc->sc_if.if_index) {
> 
> Can you compare with `session->ownersc' instead of `ifindex' like other
> code does? For consistency with other code.

Yes, it's better.

> What about to introduce pppac_{add,del}_session() and move related code
> into them?

Also I agreed.

> Also I see no such reason to kill pipex_{add,destroy}_session() because
> they play with `pipex_rd_head{4,6}' and you don't need newly introduced
> `session->is_pppx' which you use only once for that reason. 

pipex_{add,destroy}_session() should be killed since they are only for pppac.
I think such functions should have a "pppac_" prefix and be placed in if_pppx.c.
Also I'd like to move the pipex_rd_head{4,6} things to pppac_{add,del}_session as
a next step.  Yes, we might be able to kill is_pppx.  But I'd like to discuss
that as a next step as well.


I'd like to commit this for now, and continue further discussion afterwards.

ok?

Index: sys/net/if_pppx.c
===
RCS file: /cvs/src/sys/net/if_pppx.c,v
retrieving revision 1.101
diff -u -p -r1.101 if_pppx.c
--- sys/net/if_pppx.c   14 Aug 2020 11:05:38 -  1.101
+++ sys/net/if_pppx.c   26 Aug 2020 06:25:34 -
@@ -163,7 +163,6 @@ struct pppx_if {
struct ifnetpxi_if;
struct pppx_dev *pxi_dev;   /* [I] */
struct pipex_session*pxi_session;   /* [I] */
-   struct pipex_iface_context  pxi_ifcontext;  /* [N] */
 };
 
 static inline int
@@ -181,12 +180,6 @@ intpppx_add_session(struct pppx_dev *,
struct pipex_session_req *);
 intpppx_del_session(struct pppx_dev *,
struct pipex_session_close_req *);
-intpppx_config_session(struct pppx_dev *,
-   struct pipex_session_config_req *);
-intpppx_get_stat(struct pppx_dev *,
-   struct pipex_session_stat_req *);
-intpppx_get_closed(struct pppx_dev *,
-   struct pipex_session_list_req *);
 intpppx_set_session_descr(struct pppx_dev *,
struct pipex_session_descr_req *);
 
@@ -424,17 +417,6 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t
 
NET_LOCK();
switch (cmd) {
-   case PIPEXSMODE:
-   /*
-* npppd always enables on open, and only disables before
-* closing. we cheat and let open and close do that, so lie
-* to npppd.
-*/
-   break;
-   case PIPEXGMODE:
-   *(int *)addr = 1;
-   break;
-
case PIPEXASESSION:
error = pppx_add_session(pxd,
(struct pipex_session_req *)addr);
@@ -445,21 +427,6 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t
(struct pipex_session_close_req *)addr);
break;
 
-   case PIPEXCSESSION:
-   error = pppx_config_session(pxd,
-   (struct pipex_session_config_req *)addr);
-   break;
-
-   case PIPEXGSTAT:
-   error = pppx_get_stat(pxd,
-   (struct pipex_session_stat_req *)addr);
-   break;
-
-   case PIPEXGCLOSED:
-   error = pppx_get_closed(pxd,
-   (struct pipex_session_list_req *)addr);
-   break;
-
case PIPEXSIFDESCR:
error = pppx_set_session_descr(pxd,
(struct pipex_session_descr_req *)addr);
@@ -472,7 +439,7 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t
break;
 
default:
-   error = ENOTTY;
+   error = pipex_ioctl(pxd, cmd, addr);
break;
}
NET_UNLOCK();
@@ -741,11 +708,7 @@ pppx_add_session(struct pppx_dev *pxd, s
if_addrhooks_run(ifp);
}
 
-   /* fake a pipex interface context */
-   pxi->pxi_ifcontext.ifindex = ifp->if_index;
-   pxi->pxi_ifcontext.pipexmode = PIPEX_ENABLED;
-
-   error = pipex_link_session(session, >pxi_ifcontext);
+   error = pipex_link_session(session, 

Re: Make pipex more common for pppac and pppx

2020-08-19 Thread YASUOKA Masahiko
Hi,

Thank you for your comments.

On Mon, 17 Aug 2020 00:15:08 +0300
Vitaliy Makkoveev  wrote:
> I like your idea to kill `pipex_iface_context'. I had been trying to keep
> it myself and this was the wrong way. Could you rework your diff to be
> against the recent sources?

I'm sorry the diff was for the old version.

>> @@ -1122,8 +1051,11 @@ pppacopen(dev_t dev, int flags, int mode, struct proc 
>> *p)
>>  #if NBPFILTER > 0
>>  bpfattach(>if_bpf, ifp, DLT_LOOP, sizeof(uint32_t));
>>  #endif
>> -
>> -pipex_iface_init(>sc_pipex_iface, ifp->if_index);
>> +/* virtual pipex_session entry for multicast */
>> +session = pool_get(_session_pool, PR_WAITOK | PR_ZERO);
>> +session->is_multicast = 1;
>> +session->ifindex = ifp->if_index;
>> +sc->sc_multicast_session = session;
>>  
> Interface index is not required for multicast session, because it's
> never used. Also I like to alloc `sc_multicast_session' before
> if_attach().

The diff used `ifindex' to select all sessions associated with the
same pppac(4).  But the latest diff uses `ownersc' instead for the
same purpose.  Also the allocation was moved to an earlier part of the
function.

>> @@ -1382,7 +1340,10 @@ pppacclose(dev_t dev, int flags, int mode, struct 
>> proc *p)
>>  klist_invalidate(>sc_wsel.si_note);
>>  splx(s);
>>  
>> -pipex_iface_fini(>sc_pipex_iface);
>> +pool_put(_session_pool, sc->sc_multicast_session);
>> +NET_LOCK();
>> +pipex_destroy_all_sessions(sc);
>> +NET_UNLOCK();
>>  
>>  if_detach(ifp);
> 
> The recent sources have pppac(4) with an unlocked start routine. I'd like
> you to detach `ifp' before destroying `sc_multicast_session'.

The lines were moved after if_detach().

I'll test this more on this weekend, then I'll ask ok for this.

Index: sys/net/if_pppx.c
===
RCS file: /cvs/src/sys/net/if_pppx.c,v
retrieving revision 1.101
diff -u -p -r1.101 if_pppx.c
--- sys/net/if_pppx.c   14 Aug 2020 11:05:38 -  1.101
+++ sys/net/if_pppx.c   20 Aug 2020 05:19:55 -
@@ -163,7 +163,6 @@ struct pppx_if {
struct ifnetpxi_if;
struct pppx_dev *pxi_dev;   /* [I] */
struct pipex_session*pxi_session;   /* [I] */
-   struct pipex_iface_context  pxi_ifcontext;  /* [N] */
 };
 
 static inline int
@@ -181,12 +180,6 @@ intpppx_add_session(struct pppx_dev *,
struct pipex_session_req *);
 intpppx_del_session(struct pppx_dev *,
struct pipex_session_close_req *);
-intpppx_config_session(struct pppx_dev *,
-   struct pipex_session_config_req *);
-intpppx_get_stat(struct pppx_dev *,
-   struct pipex_session_stat_req *);
-intpppx_get_closed(struct pppx_dev *,
-   struct pipex_session_list_req *);
 intpppx_set_session_descr(struct pppx_dev *,
struct pipex_session_descr_req *);
 
@@ -424,17 +417,6 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t
 
NET_LOCK();
switch (cmd) {
-   case PIPEXSMODE:
-   /*
-* npppd always enables on open, and only disables before
-* closing. we cheat and let open and close do that, so lie
-* to npppd.
-*/
-   break;
-   case PIPEXGMODE:
-   *(int *)addr = 1;
-   break;
-
case PIPEXASESSION:
error = pppx_add_session(pxd,
(struct pipex_session_req *)addr);
@@ -445,21 +427,6 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t
(struct pipex_session_close_req *)addr);
break;
 
-   case PIPEXCSESSION:
-   error = pppx_config_session(pxd,
-   (struct pipex_session_config_req *)addr);
-   break;
-
-   case PIPEXGSTAT:
-   error = pppx_get_stat(pxd,
-   (struct pipex_session_stat_req *)addr);
-   break;
-
-   case PIPEXGCLOSED:
-   error = pppx_get_closed(pxd,
-   (struct pipex_session_list_req *)addr);
-   break;
-
case PIPEXSIFDESCR:
error = pppx_set_session_descr(pxd,
(struct pipex_session_descr_req *)addr);
@@ -472,7 +439,7 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t
break;
 
default:
-   error = ENOTTY;
+   error = pipex_ioctl(pxd, cmd, addr);
break;
}
NET_UNLOCK();
@@ -741,11 +708,7 @@ pppx_add_session(struct pppx_dev *pxd, s
if_addrhooks_run(ifp);
}
 
-   /* fake a pipex interface context */
-   pxi->pxi_ifcontext.ifindex = ifp->if_index;
-   pxi->pxi_ifcontext.pipexmode = PIPEX_ENABLED;
-
-   error = pipex_link_session(session, >pxi_ifcontext);
+   error = 

Re: Make pipex more common for pppac and pppx

2020-08-15 Thread YASUOKA Masahiko
Let me update the diff.  A bug was found during testing.

diff --git a/sys/net/if_pppx.c b/sys/net/if_pppx.c
index 62b85bc34af..6d3de6973bd 100644
--- a/sys/net/if_pppx.c
+++ b/sys/net/if_pppx.c
@@ -163,7 +163,6 @@ struct pppx_if {
struct ifnetpxi_if;
struct pppx_dev *pxi_dev;   /* [I] */
struct pipex_session*pxi_session;   /* [I] */
-   struct pipex_iface_context  pxi_ifcontext;  /* [N] */
 };
 
 static inline int
@@ -181,12 +180,6 @@ intpppx_add_session(struct pppx_dev *,
struct pipex_session_req *);
 intpppx_del_session(struct pppx_dev *,
struct pipex_session_close_req *);
-intpppx_config_session(struct pppx_dev *,
-   struct pipex_session_config_req *);
-intpppx_get_stat(struct pppx_dev *,
-   struct pipex_session_stat_req *);
-intpppx_get_closed(struct pppx_dev *,
-   struct pipex_session_list_req *);
 intpppx_set_session_descr(struct pppx_dev *,
struct pipex_session_descr_req *);
 
@@ -424,17 +417,6 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, 
struct proc *p)
 
NET_LOCK();
switch (cmd) {
-   case PIPEXSMODE:
-   /*
-* npppd always enables on open, and only disables before
-* closing. we cheat and let open and close do that, so lie
-* to npppd.
-*/
-   break;
-   case PIPEXGMODE:
-   *(int *)addr = 1;
-   break;
-
case PIPEXASESSION:
error = pppx_add_session(pxd,
(struct pipex_session_req *)addr);
@@ -445,21 +427,6 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, 
struct proc *p)
(struct pipex_session_close_req *)addr);
break;
 
-   case PIPEXCSESSION:
-   error = pppx_config_session(pxd,
-   (struct pipex_session_config_req *)addr);
-   break;
-
-   case PIPEXGSTAT:
-   error = pppx_get_stat(pxd,
-   (struct pipex_session_stat_req *)addr);
-   break;
-
-   case PIPEXGCLOSED:
-   error = pppx_get_closed(pxd,
-   (struct pipex_session_list_req *)addr);
-   break;
-
case PIPEXSIFDESCR:
error = pppx_set_session_descr(pxd,
(struct pipex_session_descr_req *)addr);
@@ -472,7 +439,7 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, 
struct proc *p)
break;
 
default:
-   error = ENOTTY;
+   error = pipex_ioctl(pxd, cmd, addr);
break;
}
NET_UNLOCK();
@@ -742,11 +709,7 @@ pppx_add_session(struct pppx_dev *pxd, struct 
pipex_session_req *req)
if_addrhooks_run(ifp);
}
 
-   /* fake a pipex interface context */
-   pxi->pxi_ifcontext.ifindex = ifp->if_index;
-   pxi->pxi_ifcontext.pipexmode = PIPEX_ENABLED;
-
-   error = pipex_link_session(session, >pxi_ifcontext);
+   error = pipex_link_session(session, ifp, pxd);
if (error)
goto detach;
 
@@ -786,40 +749,6 @@ pppx_del_session(struct pppx_dev *pxd, struct 
pipex_session_close_req *req)
return (0);
 }
 
-int
-pppx_config_session(struct pppx_dev *pxd,
-struct pipex_session_config_req *req)
-{
-   struct pppx_if *pxi;
-
-   pxi = pppx_if_find(pxd, req->pcr_session_id, req->pcr_protocol);
-   if (pxi == NULL)
-   return (EINVAL);
-
-   return pipex_config_session(req, >pxi_ifcontext);
-}
-
-int
-pppx_get_stat(struct pppx_dev *pxd, struct pipex_session_stat_req *req)
-{
-   struct pppx_if *pxi;
-
-   pxi = pppx_if_find(pxd, req->psr_session_id, req->psr_protocol);
-   if (pxi == NULL)
-   return (EINVAL);
-
-   return pipex_get_stat(req, >pxi_ifcontext);
-}
-
-int
-pppx_get_closed(struct pppx_dev *pxd, struct pipex_session_list_req *req)
-{
-   /* XXX: Only opened sessions exist for pppx(4) */
-   memset(req, 0, sizeof(*req));
-
-   return 0;
-}
-
 int
 pppx_set_session_descr(struct pppx_dev *pxd,
 struct pipex_session_descr_req *req)
@@ -1022,9 +951,8 @@ struct pppac_softc {
struct selinfo  sc_rsel;
struct mutexsc_wsel_mtx;
struct selinfo  sc_wsel;
-
-   struct pipex_iface_context
-   sc_pipex_iface;
+   struct pipex_session
+   *sc_multicast_session;
 
struct mbuf_queue
sc_mq;
@@ -1084,6 +1012,7 @@ pppacopen(dev_t dev, int flags, int mode, struct proc *p)
 {
struct pppac_softc *sc;
struct ifnet *ifp;
+   struct pipex_session *session;
 
sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
if (pppac_lookup(dev) != NULL) {
@@ 

Make pipex more common for pppac and pppx

2020-08-15 Thread YASUOKA Masahiko
This diff makes pipex become more common for pppac and pppx.

- Delete "pipex_iface_context".

   It was created before pppx existed, and this causes some
   confusion.  For example, session->pipex_iface is the device context
   for pppac(4), but it is not for pppx(4).

623 Static int
624 pipex_get_closed(struct pipex_session_list_req *req,
625 struct pipex_iface_context *iface)
626 {
627 struct pipex_session *session, *session_tmp;
628 
629 NET_ASSERT_LOCKED();
630 bzero(req, sizeof(*req));
631 LIST_FOREACH_SAFE(session, _close_wait_list, state_list,
632 session_tmp) {
633 if (session->pipex_iface != iface)
634 continue;

  At line 633, it is used to verify the ownership.  But PIPEXGCLOSED is
  meant to get all closed sessions associated with the *device* (not the
  interface).  So we need another way to verify the owner.

  - The diff adds "void *ownersc" to session for it.
  - PIPEXGCLOSED for pppx is actually broken.  The diff fixes this.

- pipex_iface_context has a dummy session for multicast and it's not
  used by pppx(4).  The diff moves all multicast things to pppac local.

- Also session creation and deletion for pppac cannot be used by
  pppx.  Move them to pppac local.

- Make PIPEX{S,G}MODE dummy.  I'd like to delete them afterward.

The diff is still under review and test.

comment?

diff --git a/sys/net/if_pppx.c b/sys/net/if_pppx.c
index 62b85bc34af..6d3de6973bd 100644
--- a/sys/net/if_pppx.c
+++ b/sys/net/if_pppx.c
@@ -163,7 +163,6 @@ struct pppx_if {
struct ifnetpxi_if;
struct pppx_dev *pxi_dev;   /* [I] */
struct pipex_session*pxi_session;   /* [I] */
-   struct pipex_iface_context  pxi_ifcontext;  /* [N] */
 };
 
 static inline int
@@ -181,12 +180,6 @@ intpppx_add_session(struct pppx_dev *,
struct pipex_session_req *);
 intpppx_del_session(struct pppx_dev *,
struct pipex_session_close_req *);
-intpppx_config_session(struct pppx_dev *,
-   struct pipex_session_config_req *);
-intpppx_get_stat(struct pppx_dev *,
-   struct pipex_session_stat_req *);
-intpppx_get_closed(struct pppx_dev *,
-   struct pipex_session_list_req *);
 intpppx_set_session_descr(struct pppx_dev *,
struct pipex_session_descr_req *);
 
@@ -424,17 +417,6 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, 
struct proc *p)
 
NET_LOCK();
switch (cmd) {
-   case PIPEXSMODE:
-   /*
-* npppd always enables on open, and only disables before
-* closing. we cheat and let open and close do that, so lie
-* to npppd.
-*/
-   break;
-   case PIPEXGMODE:
-   *(int *)addr = 1;
-   break;
-
case PIPEXASESSION:
error = pppx_add_session(pxd,
(struct pipex_session_req *)addr);
@@ -445,21 +427,6 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, 
struct proc *p)
(struct pipex_session_close_req *)addr);
break;
 
-   case PIPEXCSESSION:
-   error = pppx_config_session(pxd,
-   (struct pipex_session_config_req *)addr);
-   break;
-
-   case PIPEXGSTAT:
-   error = pppx_get_stat(pxd,
-   (struct pipex_session_stat_req *)addr);
-   break;
-
-   case PIPEXGCLOSED:
-   error = pppx_get_closed(pxd,
-   (struct pipex_session_list_req *)addr);
-   break;
-
case PIPEXSIFDESCR:
error = pppx_set_session_descr(pxd,
(struct pipex_session_descr_req *)addr);
@@ -472,7 +439,7 @@ pppxioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, 
struct proc *p)
break;
 
default:
-   error = ENOTTY;
+   error = pipex_ioctl(pxd, cmd, addr);
break;
}
NET_UNLOCK();
@@ -742,11 +709,7 @@ pppx_add_session(struct pppx_dev *pxd, struct 
pipex_session_req *req)
if_addrhooks_run(ifp);
}
 
-   /* fake a pipex interface context */
-   pxi->pxi_ifcontext.ifindex = ifp->if_index;
-   pxi->pxi_ifcontext.pipexmode = PIPEX_ENABLED;
-
-   error = pipex_link_session(session, >pxi_ifcontext);
+   error = pipex_link_session(session, ifp, pxd);
if (error)
goto detach;
 
@@ -786,40 +749,6 @@ pppx_del_session(struct pppx_dev *pxd, struct 
pipex_session_close_req *req)
return (0);
 }
 
-int
-pppx_config_session(struct pppx_dev *pxd,
-struct pipex_session_config_req *req)
-{
-   struct pppx_if *pxi;
-
-   pxi = pppx_if_find(pxd, 

Re: pppac(4): destroy sessions the same way as pppx(4) does

2020-08-14 Thread YASUOKA Masahiko
On Wed, 12 Aug 2020 12:26:22 +0300
Vitaliy Makkoveev  wrote:
> We destroy pppx(4) related sessions while we performing PIPEXDSESSION
> command. But with pppac(4) we set session's state to
> PIPEX_STATE_CLOSE_WAIT2 and we wait garbage collector to do destruction.

pppac's PIPEXDSESSION sets the state to PIPEX_STATE_CLOSED.  This is to
wait until pipex{in,out}q becomes empty.

> We removed `pipex{in,out}q'. So we can safe destroy session in any time.
> I propose to make pppac(4) session destruction path the same as pppx(4)
> does. Now we destroy them while performing PIPEXDSESSION commad too.

Yes.  I agree with this point.

> Also there is no in-kernel garbage collector for pppac(4) sessions.
> yasuoka@ pointed me that npppd(8) should kill expired sessions.
> 
> This not only makes pppac(4) closer to pppx(4) but simplify code and
> allow us to make safe pppx(4) session processing by pipex_timer().
> So this is preparation step to restore in-kernel timeout for pppx(4)
> too.

Below, I am asking to keep the timeout behavior.  There is a bug for
pppx(4), but it has been working for pppac(4) for a long time.  If you
really want to change the behavior, please provide a reason.  I don't
have a strong opinion, but I don't want to change the behavior without
a reason.

> Index: sys/net/pipex.c
> ===
> RCS file: /cvs/src/sys/net/pipex.c,v
> retrieving revision 1.124
> diff -u -p -r1.124 pipex.c
> --- sys/net/pipex.c   12 Aug 2020 08:41:39 -  1.124
> +++ sys/net/pipex.c   12 Aug 2020 09:07:12 -
> @@ -536,29 +536,6 @@ out:
>   return error;
>  }
>  
> -int
> -pipex_notify_close_session(struct pipex_session *session)
> -{
> - NET_ASSERT_LOCKED();
> - session->state = PIPEX_STATE_CLOSE_WAIT;
> - session->stat.idle_time = 0;
> - LIST_INSERT_HEAD(_close_wait_list, session, state_list);
> -
> - return (0);
> -}
> -

Unrelated but ok.

> -int
> -pipex_notify_close_session_all(void)
> -{
> - struct pipex_session *session;
> -
> - NET_ASSERT_LOCKED();
> - LIST_FOREACH(session, _session_list, session_list)
> - if (session->state == PIPEX_STATE_OPENED)
> - pipex_notify_close_session(session);
> - return (0);
> -}
> -

Unrelated but ok.  Since it's not used.

>  Static int
>  pipex_close_session(struct pipex_session_close_req *req,
>  struct pipex_iface_context *iface)
> @@ -573,13 +550,9 @@ pipex_close_session(struct pipex_session
>   if (session->pipex_iface != iface)
>   return (EINVAL);
>  
> - /* remove from close_wait list */
> - if (session->state == PIPEX_STATE_CLOSE_WAIT)
> - LIST_REMOVE(session, state_list);
> -

This must be kept.  Userland may issue PIPEXDSESSION before PIPEXGCLOSED
for this session.

>   /* get statistics before destroy the session */
>   req->pcr_stat = session->stat;
> - session->state = PIPEX_STATE_CLOSED;
> + pipex_destroy_session(session);
>  
>   return (0);
>  }

ok

> @@ -739,47 +712,25 @@ pipex_timer_stop(void)
>  Static void
>  pipex_timer(void *ignored_arg)
>  {
> - struct pipex_session *session, *session_tmp;
> + struct pipex_session *session;
>  
>   timeout_add_sec(_timer_ch, pipex_prune);
>  
>   NET_LOCK();
>   /* walk through */
> - LIST_FOREACH_SAFE(session, _session_list, session_list,
> - session_tmp) {
> - switch (session->state) {
> - case PIPEX_STATE_OPENED:
> - if (session->timeout_sec == 0)
> - continue;
> -
> - session->stat.idle_time++;
> - if (session->stat.idle_time < session->timeout_sec)
> - continue;
> -
> - pipex_notify_close_session(session);
> - break;
> -
> - case PIPEX_STATE_CLOSE_WAIT:
> - case PIPEX_STATE_CLOSE_WAIT2:
> - /* Wait PIPEXDSESSION from userland */
> - session->stat.idle_time++;
> - if (session->stat.idle_time < PIPEX_CLOSE_TIMEOUT)
> - continue;
> -
> - if (session->state == PIPEX_STATE_CLOSE_WAIT)
> - LIST_REMOVE(session, state_list);
> - session->state = PIPEX_STATE_CLOSED;
> - /* FALLTHROUGH */
> + LIST_FOREACH(session, _session_list, session_list) {
> + if (session->state != PIPEX_STATE_OPENED)
> + continue;
> + if (session->timeout_sec == 0)
> + continue;
>  
> - case PIPEX_STATE_CLOSED:
> - pipex_destroy_session(session);
> - break;
> + session->stat.idle_time++;
> + if (session->stat.idle_time < session->timeout_sec)
> + continue;
>  
> - default:
> - break;
> -  

Re: pipex "idle-timeout" work with pppx(4).

2020-08-12 Thread YASUOKA Masahiko
Hi,

On Wed, 12 Aug 2020 12:38:39 +0300
Vitaliy Makkoveev  wrote:
> We don't need to mark pppx(4) sessions because there is no special cases
> for them. We just need to kill pppx(4) related "pr_timeout_sec != 0"
> checks and call pipex_get_closed() by pppx_get_closed().

How would you implement that by calling pipex_get_closed() from
pppx_get_closed()?


PIPEXGCLOSED is meant to pick up expired sessions which are associated
with the character device (/dev/{pppx,pppac}0).  In the pppac(4) case, the
character device is the same object as the pppac interface.  But for
pppx(4) it is not the same: pipex_session has no direct reference to the
device.  This is why my diff was modifying pipex_get_closed().



Re: pipex "idle-timeout" work with pppx(4).

2020-08-11 Thread YASUOKA Masahiko
On Tue, 11 Aug 2020 23:06:45 +0300
Vitaliy Makkoveev  wrote:
> We removed `pipex{in,out}q'. So now we can destroy pppac(4) session just
> like we do in pppx(4) case. Also there is no reason to allow
> pipex_timer() to destroy sessions - userland will do this by
> PIPEXDSESSION. This permit us to use existing pipex_get_closed() for
> both pppac(4) and pppx(4) without any modifications.
> 
> So, I propose pipex_close_session() and pipex_timer() be like below.

It doesn't seem to fix "idle-timeout".

> We simplify pppac(4) session destruction. We unify behavior with pppx(4)
> - we killing session just now. There is no reason to modify
> pipex_get_closed() and pipex_link_session(). pppx(4) related sessions
> can be processed by pipex_timer(). There is no performance impact.

We need to modify pppx_get_closed() to implement idle-timeout.

> Do you like this? We can do two diffs. The first to unify destruction
> and the second to re-enable in-kernel timeout for pppx(4) and revert man
> pages modifications.

I have no objection to your "unify destruction".

I'll rebase my diff after that work.



Re: pipex "idle-timeout" work with pppx(4).

2020-08-11 Thread YASUOKA Masahiko


My diff is to make pppx(4) have the same "idle-timeout"
functionality.  I strongly think pppx(4) must have the same
functionality as pppac(4) because I don't see any reason to have
any difference between pppx(4) and pppac(4).

Your pseudo code is suggesting another thing.  Would you like to
change the existing behavior of pppac(4)?  If so, what is the problem you
are concerned about?  I'd like you to explain how it relates to my diff,
or the background of the code.

On Tue, 11 Aug 2020 01:20:45 +0300
Vitaliy Makkoveev  wrote:
> 
> 
>> On 10 Aug 2020, at 19:53, Vitaliy Makkoveev  wrote:
>> 
>> We are doing all wrong :)
>> 
>> We can just unlink pppx(4) related session from `pipex_session_list' if
>> it's time expired. But since this unlinked session is still exists in
>> pppx(4) layer we can access through pppx_get_closed() without any
>> search. We should only add flag to session which identifies it as
>> pppx(4) related.
>> 
>> I hope you like this idea.
>> 
>>  cut begin 
>> Static void
>> pipex_timer(void *ignored_arg)
>> {
>>struct pipex_session *session, *session_tmp;
>> 
>>timeout_add_sec(_timer_ch, pipex_prune);
>> 
>>NET_LOCK();
>>/* walk through */
>>LIST_FOREACH_SAFE(session, _session_list, session_list,
>>session_tmp) {
>>switch (session->state) {
>>case PIPEX_STATE_OPENED:
>>if (session->timeout_sec == 0)
>>continue;
>> 
>>session->stat.idle_time++;
>>if (session->stat.idle_time < session->timeout_sec)
>>continue;
>> 
>>  if (session->pppx_session)
>>  pipex_unlink_session(session);
>>  else
>>  pipex_notify_close_session(session);
>>break;
>>  /* ... */
>> }
>> 
>> pppx_get_closed(struct pppx_dev *pxd, struct pipex_session_list_req *req)
>> {
>>  struct pppx_if *pxi;
>> 
>>  pxi = pppx_if_find(pxd, req->pdr_session_id, req->pdr_protocol);
>>  if (pxi == NULL)
>>  return (EINVAL);
>> 
>>  memset(req, 0, sizeof(*req));
>>  if (session->state == PIPEX_STATE_CLOSED) {
>>  req->plr_ppp_id[req->plr_ppp_id_count++] = session->ppp_id;
>>  pppx_if_destroy(pxi);   
>>  }
>> 
>>  return 0;
>> }
> 
> Sorry for noise. I should avoid to write pseudo code.



Re: pipex "idle-timeout" work with pppx(4).

2020-08-11 Thread YASUOKA Masahiko
Hi,

On Mon, 10 Aug 2020 16:30:27 +0300
Vitaliy Makkoveev  wrote:
> On Mon, Aug 10, 2020 at 03:12:02PM +0900, YASUOKA Masahiko wrote:
>> On Sun, 9 Aug 2020 20:03:50 +0300
>> Vitaliy Makkoveev  wrote:
>> > On Sun, Aug 09, 2020 at 06:20:13PM +0300, Vitaliy Makkoveev wrote:
>> >> You propose to unlink pppx(4) related session which reached timeout. I'm
>> >> ok with this direction. But I see no reason to rework _get_closed()
>> >> routines.
>> >> 
>> >> in pppac(4) case it's assumed what if session is not yet destroyed by
>> >> garbage collector, it will be destroyed while we performing PIPEXGCLOSED
>> >> command. We can make pppx(4) behavior the same and I propose to
>> >> pppx_get_closed() be like below. 
>> >> 
>> >> Also, nothing requires to modify pipex_get_closed(). 
>> >> 
>> >>  cut begin 
>> > 
>> > Sorry, I mean
>> > 
>> > pppx_get_closed(struct pppx_dev *pxd, struct pipex_session_list_req *req)
>> > {
>> >struct pppx_if  *pxi;
>> > 
>> >memset(req, 0, sizeof(*req));
>> > 
>> >while ((pxi = LIST_FIRST(>pxd_pxis))) {
>> >if (pxi->pxi_session->state == session->state =
>> >PIPEX_STATE_CLOSED) {
>> >req->plr_ppp_id[req->plr_ppp_id_count++] =
>> >pxi->pxi_session->ppp_id;
>> >pppx_if_destroy(pxi);
>> >}
>> >}
>> > 
>> >return 0;
>> > }
>> 
>> Yes, the diff doesn't seem to be completed but this way also will work.
>> 
>> Usually there is few CLOSED session even if there is a lot of session.
>> Also there is no CLOSED session if idle-timeout is not configured.  I
>> avoided that way because I think checking all sessions' state to find
>> such the few sessions is too expensive.
>> 
>> A way I am suggesting:
>> 
>> @@ -622,7 +625,7 @@ pipex_get_stat(struct pipex_session_stat
>>  
>>  Static int
>>  pipex_get_closed(struct pipex_session_list_req *req,
>> -struct pipex_iface_context *iface)
>> +int (*isowner)(void *, struct pipex_session *), void *ctx)
>>  {
>>  struct pipex_session *session, *session_tmp;
>>  
>> @@ -630,7 +633,7 @@ pipex_get_closed(struct pipex_session_li
>>  bzero(req, sizeof(*req));
>>  LIST_FOREACH_SAFE(session, _close_wait_list, state_list,
>>  session_tmp) {
>> -if (session->pipex_iface != iface)
>> +if (!isowner(ctx, session))
>>  continue;
>>  req->plr_ppp_id[req->plr_ppp_id_count++] = session->ppp_id;
>>  LIST_REMOVE(session, state_list);
>> 
>> uses pipex_close_wait_list which contains only sessions which is timed
>> out.
> 
> You are right. pipex_get_closed() walks through `pipex_close_wait_list'
> which contains only CLOSE_WAIT sessions.
> 
> According to npppd(8) code we do PIPEXGCLOSED related walkthrough once
> per NPPPD_TIMER_TICK_IVAL seconds, which is defined as 4. Is this such
> performance impact?

It might not be so expensive for you.  But why do you intend to use
that extra CPU when you have a cheaper way?

> Also who should destroy these sessions? It's assumed npppd(8) will
> destroy them by l2tp_ctrl_timeout() and pptp_ctrl_timeout()? Excuse me
> if I'm wrong, but who will destroy sessions in pppoe case?

In usr.sbin/npppd/npppd/npppd.c:

1306 static void
1307 pipex_periodic(npppd *_this)
1308 {
(snip)
1326 do {
1327 error = ioctl(devf, PIPEXGCLOSED, );
1328 if (error) {
1329 if (errno != ENXIO)
1330 log_printf(LOG_WARNING,
1331 "PIPEXGCLOSED failed: %m");
1332 break;
1333 }
1334 for (i = 0; i < req.plr_ppp_id_count; i++) {
1335 ppp_id = req.plr_ppp_id[i];
1336 slist_add(, (void 
*)(uintptr_t)ppp_id);
1337 }
1338 } while (req.plr_flags & PIPEX_LISTREQ_MORE);

ppp sessions which are closed by pipex(4) are inserted into "dlist".

1350 /* Disconnect request */
1351 slist_itr_first();
1352 while (slist_itr_has_next()) {
(snip)
1372 ppp_log(ppp, LOG_INFO, "Stop requested by the kernel");
1373 /* TODO: PIPEX doesn't return the disconect reason */
1374 #ifdef USE_NPPPD_RADIUS
1375 ppp_set_radius_terminate_cause(ppp,
1376 RADIUS_TERMNATE_CAUSE_IDLE_TIMEOUT);
1377 #endif
1378 ppp_stop(ppp, NULL);

All ppp sessions are stopped at line 1378.  As PPP finishes layer by
layer, ppp_stop0() will be called.  That function will call PIPEXDSESSION.

I'd like to emphasize that npppd(8) takes responsibility for pipex
sessions' creation/deletion even when an idle timeout happens.



Re: pipex "idle-timeout" work with pppx(4).

2020-08-10 Thread YASUOKA Masahiko
Hi,

Thank you for your review.

On Sun, 9 Aug 2020 20:03:50 +0300
Vitaliy Makkoveev  wrote:
> On Sun, Aug 09, 2020 at 06:20:13PM +0300, Vitaliy Makkoveev wrote:
>> You propose to unlink pppx(4) related session which reached timeout. I'm
>> ok with this direction. But I see no reason to rework _get_closed()
>> routines.
>> 
>> in pppac(4) case it's assumed what if session is not yet destroyed by
>> garbage collector, it will be destroyed while we performing PIPEXGCLOSED
>> command. We can make pppx(4) behavior the same and I propose to
>> pppx_get_closed() be like below. 
>> 
>> Also, nothing requires to modify pipex_get_closed(). 
>> 
>>  cut begin 
> 
> Sorry, I mean
> 
> pppx_get_closed(struct pppx_dev *pxd, struct pipex_session_list_req *req)
> {
>   struct pppx_if  *pxi;
> 
>   memset(req, 0, sizeof(*req));
> 
>   while ((pxi = LIST_FIRST(>pxd_pxis))) {
>   if (pxi->pxi_session->state == session->state =
>   PIPEX_STATE_CLOSED) {
>   req->plr_ppp_id[req->plr_ppp_id_count++] =
>   pxi->pxi_session->ppp_id;
>   pppx_if_destroy(pxi);
>   }
>   }
> 
>   return 0;
> }

Yes, the diff doesn't seem to be complete, but this way will also work.

Usually there are few CLOSED sessions even if there are a lot of sessions.
Also there are no CLOSED sessions if idle-timeout is not configured.  I
avoided that way because I think checking all sessions' state to find
such few sessions is too expensive.

A way I am suggesting:

@@ -622,7 +625,7 @@ pipex_get_stat(struct pipex_session_stat
 
 Static int
 pipex_get_closed(struct pipex_session_list_req *req,
-struct pipex_iface_context *iface)
+int (*isowner)(void *, struct pipex_session *), void *ctx)
 {
struct pipex_session *session, *session_tmp;
 
@@ -630,7 +633,7 @@ pipex_get_closed(struct pipex_session_li
bzero(req, sizeof(*req));
LIST_FOREACH_SAFE(session, _close_wait_list, state_list,
session_tmp) {
-   if (session->pipex_iface != iface)
+   if (!isowner(ctx, session))
continue;
req->plr_ppp_id[req->plr_ppp_id_count++] = session->ppp_id;
LIST_REMOVE(session, state_list);

This uses pipex_close_wait_list, which contains only sessions which have
timed out.

>> Also I have one inlined comment within your diff. 

>> > @@ -430,6 +425,7 @@ pipex_link_session(struct pipex_session 
>> >struct pipex_iface_context *iface)
>> >  {
>> >struct pipex_hash_head *chain;
>> > +  struct ifnet *ifp;
>> >  
>> >NET_ASSERT_LOCKED();
>> >  
>> > @@ -442,6 +438,11 @@ pipex_link_session(struct pipex_session 
>> >session->pipex_iface = iface;
>> >session->ifindex = iface->ifindex;
>> >  
>> > +  ifp = if_get(iface->ifindex);
>> > +  if (ifp != NULL && ifp->if_flags & IFF_POINTOPOINT)
>> > +  session->is_p2p = 1;
>> > +  if_put(ifp);
>> > +
>> 
>> I guess NULL `ifp' here exposes us a bug. I like to have assertion here.

ok, I agree here.


The diff is updated.

Index: sys/net/if_pppx.c
===
RCS file: /cvs/src/sys/net/if_pppx.c,v
retrieving revision 1.98
diff -u -p -r1.98 if_pppx.c
--- sys/net/if_pppx.c   28 Jul 2020 09:53:36 -  1.98
+++ sys/net/if_pppx.c   10 Aug 2020 06:09:52 -
@@ -185,6 +185,7 @@ int pppx_config_session(struct pppx_dev
struct pipex_session_config_req *);
 intpppx_get_stat(struct pppx_dev *,
struct pipex_session_stat_req *);
+intpppx_is_owner(void *, struct pipex_session *);
 intpppx_get_closed(struct pppx_dev *,
struct pipex_session_list_req *);
 intpppx_set_session_descr(struct pppx_dev *,
@@ -645,14 +646,6 @@ pppx_add_session(struct pppx_dev *pxd, s
struct in_ifaddr *ia;
struct sockaddr_in ifaddr;
 
-   /*
-* XXX: As long as `session' is allocated as part of a `pxi'
-*  it isn't possible to free it separately.  So disallow
-*  the timeout feature until this is fixed.
-*/
-   if (req->pr_timeout_sec != 0)
-   return (EINVAL);
-
error = pipex_init_session(, req);
if (error)
return (error);
@@ -812,12 +805,22 @@ pppx_get_stat(struct pppx_dev *pxd, stru
 }
 
 int
-pppx_get_closed(struct pppx_dev *pxd, struct pipex_session_list_req *req)
+pppx_is_owner(void *ctx, struct pipex_session *session)
 {
-   /* XXX: Only opened sessions exist for pppx(4) */
-   memset(req, 0, sizeof(*req));
+   struct pppx_dev *pxd = ctx;
+   struct pppx_if *pxi;
 
-   return 0;
+   pxi = pppx_if_find(pxd, session->session_id, session->protocol);
+   if (pxi != NULL)
+   return (1);
+
+   return (0);
+}
+
+int
+pppx_get_closed(struct pppx_dev *pxd, struct pipex_session_list_req *req)
+{
+   

pipex "idle-timeout" work with pppx(4).

2020-08-09 Thread YASUOKA Masahiko
This diff makes pipex "idle-timeout" work with pppx(4).

ok?

Index: sys/net/if_pppx.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/if_pppx.c,v
retrieving revision 1.98
diff -u -p -r1.98 if_pppx.c
--- sys/net/if_pppx.c   28 Jul 2020 09:53:36 -  1.98
+++ sys/net/if_pppx.c   9 Aug 2020 08:05:16 -
@@ -185,6 +185,7 @@ int pppx_config_session(struct pppx_dev
struct pipex_session_config_req *);
 intpppx_get_stat(struct pppx_dev *,
struct pipex_session_stat_req *);
+intpppx_is_owner(void *, struct pipex_session *);
 intpppx_get_closed(struct pppx_dev *,
struct pipex_session_list_req *);
 intpppx_set_session_descr(struct pppx_dev *,
@@ -645,14 +646,6 @@ pppx_add_session(struct pppx_dev *pxd, s
struct in_ifaddr *ia;
struct sockaddr_in ifaddr;
 
-   /*
-* XXX: As long as `session' is allocated as part of a `pxi'
-*  it isn't possible to free it separately.  So disallow
-*  the timeout feature until this is fixed.
-*/
-   if (req->pr_timeout_sec != 0)
-   return (EINVAL);
-
error = pipex_init_session(, req);
if (error)
return (error);
@@ -812,12 +805,22 @@ pppx_get_stat(struct pppx_dev *pxd, stru
 }
 
 int
-pppx_get_closed(struct pppx_dev *pxd, struct pipex_session_list_req *req)
+pppx_is_owner(void *ctx, struct pipex_session *session)
 {
-   /* XXX: Only opened sessions exist for pppx(4) */
-   memset(req, 0, sizeof(*req));
+   struct pppx_dev *pxd = ctx;
+   struct pppx_if *pxi;
 
-   return 0;
+   pxi = pppx_if_find(pxd, session->session_id, session->protocol);
+   if (pxi != NULL)
+   return (1);
+
+   return (0);
+}
+
+int
+pppx_get_closed(struct pppx_dev *pxd, struct pipex_session_list_req *req)
+{
+   return (pipex_get_closed(req, pppx_is_owner, pxd));
 }
 
 int
@@ -1059,6 +1062,7 @@ static intpppac_ioctl(struct ifnet *, u
 static int pppac_output(struct ifnet *, struct mbuf *, struct sockaddr *,
struct rtentry *);
 static voidpppac_start(struct ifnet *);
+static int pppac_is_owner(void *, struct pipex_session *);
 
 static inline struct pppac_softc *
 pppac_lookup(dev_t dev)
@@ -1251,6 +1255,16 @@ pppacwrite(dev_t dev, struct uio *uio, i
 }
 
 int
+pppac_is_owner(void *ctx, struct pipex_session *session)
+{
+   struct pppac_softc *sc = ctx;
+
+   if (session->ifindex == sc->sc_if.if_index)
+   return (1);
+   return (0);
+}
+
+int
 pppacioctl(dev_t dev, u_long cmd, caddr_t data, int flags, struct proc *p)
 {
struct pppac_softc *sc = pppac_lookup(dev);
@@ -1264,6 +1278,13 @@ pppacioctl(dev_t dev, u_long cmd, caddr_
break;
case FIONREAD:
*(int *)data = mq_hdatalen(>sc_mq);
+   break;
+
+   case PIPEXGCLOSED:
+   NET_LOCK();
+   error = pipex_get_closed((struct pipex_session_list_req *)data,
+   pppac_is_owner, sc);
+   NET_UNLOCK();
break;
 
default:
Index: sys/net/pipex.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/pipex.c,v
retrieving revision 1.123
diff -u -p -r1.123 pipex.c
--- sys/net/pipex.c 4 Aug 2020 09:32:05 -   1.123
+++ sys/net/pipex.c 9 Aug 2020 08:05:16 -
@@ -240,11 +240,6 @@ pipex_ioctl(struct pipex_iface_context *
pipex_iface);
break;
 
-   case PIPEXGCLOSED:
-   ret = pipex_get_closed((struct pipex_session_list_req *)data,
-   pipex_iface);
-   break;
-
default:
ret = ENOTTY;
break;
@@ -430,6 +425,7 @@ pipex_link_session(struct pipex_session 
struct pipex_iface_context *iface)
 {
struct pipex_hash_head *chain;
+   struct ifnet *ifp;
 
NET_ASSERT_LOCKED();
 
@@ -442,6 +438,11 @@ pipex_link_session(struct pipex_session 
session->pipex_iface = iface;
session->ifindex = iface->ifindex;
 
+   ifp = if_get(iface->ifindex);
+   if (ifp != NULL && ifp->if_flags & IFF_POINTOPOINT)
+   session->is_p2p = 1;
+   if_put(ifp);
+
LIST_INSERT_HEAD(_session_list, session, session_list);
chain = PIPEX_ID_HASHTABLE(session->session_id);
LIST_INSERT_HEAD(chain, session, id_chain);
@@ -469,6 +470,8 @@ pipex_unlink_session(struct pipex_sessio
session->ifindex = 0;
 
NET_ASSERT_LOCKED();
+   if (session->state == PIPEX_STATE_CLOSED)
+   return;
LIST_REMOVE(session, id_chain);
 #if defined(PIPEX_PPTP) || defined(PIPEX_L2TP)
switch (session->protocol) {
@@ -622,7 +625,7 @@ pipex_get_stat(struct pipex_session_stat
 
 Static int
 

Re: describe 'idle-timeout' exception in npppd.conf man page

2020-08-08 Thread YASUOKA Masahiko
On Sat, 8 Aug 2020 16:01:59 +0300
Vitaliy Makkoveev  wrote:
> On Sat, Aug 08, 2020 at 08:49:24PM +0900, YASUOKA Masahiko wrote:
>> On Fri, 7 Aug 2020 22:19:05 +0300
>> Vitaliy Makkoveev  wrote:
>> > Some times ago we disabled in-kernel timeout for pppx(4) related
>> > pipex(4) sessions. We did this for prevent use after free issue caused
>> > by pipex_timer [1]. By default "idle-timeout" is not set in
>> > npppd.conf(5) and I guess this is reason for we forgot to describe this
>> > exception in npppd.conf(5).
>> > 
>> > But looks like one user caught this [2]. So I propose to describe this
>> > in BUGS section of npppd.conf(5).
>> > 
>> > Also current "idle-timeout" description looks incorrect. If this option
>> > is missing, there is not in-kernel timeout for this session, but
>> > npppd(8) uses it's own timeout for. And we can't configure this value.
>> > 
>> > YASUOKA, what do you think? May be we can kill in-kernel timeout feature
>> > for pipex(4)?, and make npppd(8)'s idle timeout configurable by this
>> > option?
>> 
>> I think we should mention this to the man page until we fix it.
>> So I'd like you to update the man page first.
>> 
>> I'll try to review the problem.
>> 
> 
> Thanks. I updated my diff with changes proposed by jmc@. Are you agree
> with them?

Yes.  ok yasuoka

>> > 1. 
>> > https://cvsweb.openbsd.org/src/sys/net/if_pppx.c?rev=1.78=text/x-cvsweb-markup
>> > 2. https://marc.info/?l=openbsd-misc=159655468504864=2 
>> > 
>> > 
>> > Index: usr.sbin/npppd/npppd/npppd.conf.5
>> > ===
>> > RCS file: /cvs/src/usr.sbin/npppd/npppd/npppd.conf.5,v
>> > retrieving revision 1.27
>> > diff -u -p -r1.27 npppd.conf.5
>> > --- usr.sbin/npppd/npppd/npppd.conf.5  23 Apr 2020 21:10:54 -  
>> > 1.27
>> > +++ usr.sbin/npppd/npppd/npppd.conf.5  7 Aug 2020 19:17:00 -
>> > @@ -699,3 +699,9 @@ The current version of
>> >  .Xr npppd 8
>> >  does not support adding or removing tunnel settings or changing listener
>> >  settings (listen address, port and l2tp-ipsec-require).
>> > +.Pp
>> > +This time
>> > +.Xr pppx 4
>> > +does not allow to create sessions with non null
>> > +.Ic idle-timeout
>> > +option. 
>> 
> 



Re: describe 'idle-timeout' exception in npppd.conf man page

2020-08-08 Thread YASUOKA Masahiko
On Fri, 7 Aug 2020 22:19:05 +0300
Vitaliy Makkoveev  wrote:
> Some times ago we disabled in-kernel timeout for pppx(4) related
> pipex(4) sessions. We did this for prevent use after free issue caused
> by pipex_timer [1]. By default "idle-timeout" is not set in
> npppd.conf(5) and I guess this is reason for we forgot to describe this
> exception in npppd.conf(5).
> 
> But looks like one user caught this [2]. So I propose to describe this
> in BUGS section of npppd.conf(5).
> 
> Also current "idle-timeout" description looks incorrect. If this option
> is missing, there is not in-kernel timeout for this session, but
> npppd(8) uses it's own timeout for. And we can't configure this value.
> 
> YASUOKA, what do you think? May be we can kill in-kernel timeout feature
> for pipex(4)?, and make npppd(8)'s idle timeout configurable by this
> option?

I think we should mention this to the man page until we fix it.
So I'd like you to update the man page first.

I'll try to review the problem.

> 1. 
> https://cvsweb.openbsd.org/src/sys/net/if_pppx.c?rev=1.78=text/x-cvsweb-markup
> 2. https://marc.info/?l=openbsd-misc=159655468504864=2 
> 
> 
> Index: usr.sbin/npppd/npppd/npppd.conf.5
> ===
> RCS file: /cvs/src/usr.sbin/npppd/npppd/npppd.conf.5,v
> retrieving revision 1.27
> diff -u -p -r1.27 npppd.conf.5
> --- usr.sbin/npppd/npppd/npppd.conf.5 23 Apr 2020 21:10:54 -  1.27
> +++ usr.sbin/npppd/npppd/npppd.conf.5 7 Aug 2020 19:17:00 -
> @@ -699,3 +699,9 @@ The current version of
>  .Xr npppd 8
>  does not support adding or removing tunnel settings or changing listener
>  settings (listen address, port and l2tp-ipsec-require).
> +.Pp
> +This time
> +.Xr pppx 4
> +does not allow to create sessions with non null
> +.Ic idle-timeout
> +option. 



Re: [PATCH] pipex(4): rework PPP input

2020-08-04 Thread YASUOKA Masahiko
Sorry for delayed reply.

On Wed, 27 May 2020 01:29:36 +0300
Sergey Ryazanov  wrote:
> On Tue, May 26, 2020 at 12:07 PM Vitaliy Makkoveev
>  wrote:
>>> On 25 May 2020, at 22:04, Sergey Ryazanov  wrote:
>>> On Sat, May 23, 2020 at 3:07 PM Vitaliy Makkoveev
>>>  wrote:
 For example, each pipex session should have unique pair of `protocol’ and
 `session_id’. These values are passed from userland. While the only
 instance of npppd(8) uses pipex(4) this is not the problem. But you
 introduce the case while pipex(4) will be used by multiple independent
 userland programs. At least, I have interest how you handle this.
>>>
>>> This should not be a problem here. npppd(8) support server mode only.
>>> While my work is to implement acceleration for client side of L2TP
>>> connection.
>>
>> I guess they can coexist. Also you can have multiple connections to
>> ppp servers simultaneously.
> 
> With 16 bits long session id field, according to birthday problem to
> reach 0.9 collision probability I need 549 simultaneous sessions.
> Should I still be worried or I have a time to complete integration
> work and then update UDP  filter for love of the game?

usr.sbin/npppd/l2tp/l2tp_local.h

 79 #define L2TP_SESSION_ID_MASK0x7fff

npppd uses 0-32767


Re: pipex(4): kill pipexintr()

2020-08-03 Thread YASUOKA Masahiko
On Mon, 3 Aug 2020 23:36:09 +0300
Vitaliy Makkoveev  wrote:
> On Tue, Aug 04, 2020 at 01:26:14AM +0900, YASUOKA Masahiko wrote:
>> Comments?
> 
> You introduce `cookie' as 
> 
>   cookie = session->protocol << 16 | session->session_id;
> 
> also multicast sessions initialized as 
> 
>   session->protocol = PIPEX_PROTO_NONE;
>   session->session_id = ifindex;
> 
> `protocol' and `session_id' come from userland, so I like to have checks
> like below. It's allow us to avoid `cookie' be broken while
> `pr_session_id' exceeds 16 bit integer. Also userland should not pass
> PIPEX_PROTO_NONE as `pr_protocol' because we shouldn't have multicast
> and not multicast sessions with the same `cookie'.
> 
>  cut begin 
> 
> pipex_init_session(struct pipex_session **rsession,
> struct pipex_session_req *req)
> {
>   if (req->pr_protocol == PIPEX_PROTO_NONE)
>   return (EINVAL);

pipex_init_session() has the same check already.

 287 int
 288 pipex_init_session(struct pipex_session **rsession,
 289 struct pipex_session_req *req)
 290 {
 (snip)
 297 switch (req->pr_protocol) {
 298 #ifdef PIPEX_PPPOE
 299 case PIPEX_PROTO_PPPOE:
 (snip)
 333 default:
 334 return (EPROTONOSUPPORT);
 335 }

> 
>   if (req->pr_session_id > 0xffff)
>   return (EINVAL);
> 
>  cut end 

req->pr_session_id can't be > 0xffff since it's uint16_t.

> Also cookies introduce invalidation problem. Yes, it has low
> probability, but we can have operation order like below:
> 
> 1. enqueue session with `protocol' = 0xaa and `session_id' = 0xbb, and
>   `cookie' = 0xaabb
> 2. kill this session
> 3. create new session `protocol' = 0xaa and `session_id' = 0xbb
> 4. this newly created session will be used by pipexintr()
> 
> As I have seen while played with refcounters, session can be enqueued
> more than 10 times...

The diff makes the problem worse, but it could happen already if the
session-id is reused.

> Also It's not obvious that interface index will never exceed 16 bit
> counter. It's unsigned int and may be underlay counter's resolution
> will be expanded in future. So I like to have at least corresponding
> assertion in pipex_iface_init().

Right.  This is fixable with another unique number.

> So, may be my first solution is the best here. And, as mpi@ pointed,
> ipsec(4) should be reworked to allow parallelism.

Does first mean killing the pipexintr?

What I explained was wrong.  I'm sorry about this.

On Fri, 31 Jul 2020 09:36:32 +0900 (JST)
YASUOKA Masahiko  wrote:
> A packet of L2TP/IPsec (encapsulated IP/PPP/L2TP/UDP/ESP/UDP/IP) is
> processed like:
> 
>ipv4_input
>  ...
>udp_input
>  ipsec_common_input
>  esp_input
>crypto_dispatch
>  => crypto_taskq_mp_safe
> 
>kthread "crynlk"
>  crypto_invoke
>... (*1)
>  crypto_done
>  esp_input_cb
>ipsec_common_input_cb
>  ip_deliver
>udp_input
>  pipex_l2tp_input
>pipex_common_input
>  (*2)
>  pipex_ppp_input
>pipex_mppe_input (*3)
>  pipex_ppp_input
>pipex_ip_input
>  ipv4_input
>...

This should be

   kthread "crynlk"
 crypto_invoke
   ... (*1)
 crypto_done
   kthread "crypto" < another thread
 ipsec_input_cb < this is missed
   esp_input_cb
 ipsec_common_input_cb
   ip_deliver
 udp_input
   pipex_l2tp_input
 pipex_common_input
   (*2)
   pipex_ppp_input
 pipex_mppe_input (*3)
   pipex_ppp_input
 pipex_ip_input
   ipv4_input
 ...

> At *2 there was a queue.  "crynlk" is a busy thread, since it is doing
> decryption at *1.  I think it's better pipex input be done by
> another thread than crypto since it also has decryption at *3.

This is false.  *3 is done by another thread.
It is the same if crypto driver is not CRYPTOCAP_F_MPSAFE.
(crypto_invoke() is done by the caller's thread and the callback
 (ipsec_input_cb) is called by "crypto" thread.)

So I have no actual reason to keep the queues.

ok yasuoka for the diff which kills pipexintr.



Re: pipex(4): kill pipexintr()

2020-08-03 Thread YASUOKA Masahiko
On Sat, 1 Aug 2020 18:52:27 +0300
Vitaliy Makkoveev  wrote:
> On Sat, Aug 01, 2020 at 07:44:17PM +0900, YASUOKA Masahiko wrote:
>> I'm not sure when it is broken, in old versions, it was assumed the
>> pipex queues are empty when pipex_iface_stop() is called.  The problem
>> mvs@ found is the assumption is not true any more.
>> 
>> pipex has a mechanism that delete a session when the queues are empty.
>> 
>> 819 Static void
>> 820 pipex_timer(void *ignored_arg)
>> 821 {
>> (snip)
>> 854 case PIPEX_STATE_CLOSED:
>> 855 /*
>> 856  * mbuf queued in pipexinq or pipexoutq may 
>> have a
>> 857  * refererce to this session.
>> 858  */
>> 859 if (!mq_empty(&pipexinq) || 
>> !mq_empty(&pipexoutq))
>> 860 continue;
>> 861 
>> 862 pipex_destroy_session(session);
>> 863 break;
>> 
>> I think using this is better.
>> 
>> How about this?
> 
> Unfortunately your diff is incorrect. It introduces memory leaks and
> breaks pppx(4). Also it is incomplete.

Thank you for your feedback.

> We have multiple ways to kill pipex(sessions):
> 
> 1. pppx(4)
> 
> We have `struct pppx_if' which has pointer to corresponding session and
> this session is accessed directly within pppx(4) layer. Since we can't
> destroy `ppp_if' in pipex(4) layer we can't destroy these sessions by
> pipex_timer(). The only way to destroy them is pppx_if_destroy() which:
> 
> 1. unlink session by pipex_unlink_session()
> 2. detach corresponding `ifnet' by if_detach()
> 3. release session by pipex_rele_session() 
> 
> It's unsafe because mbuf queues can have references to this session.

Yes.

> 2. pppac(4)
> 
> We have no direct access to corresponding sessions within pppac(4)
> layer. Also there are multiple ways to do this:
> 
> 1. pipex_ioctl() with `PIPEXSMODE' command. Underlay pipex_iface_stop()
> walks through `pipex_session_list' and destroy sessions by
> pipex_destroy_session() call. It's unsafe because we don't check queues.
> 
> 2. pipex_ioctl() with `PIPEXDSESSION'. pipex_close_session() will change
> session's  state and pipex_timer() will kill this sessions later. This
> is the only safe way.
> 
> 3. pipex_iface_fini(). The same as `PIPEXSMODE', pipex_iface_stop()
> kills sessions, Which is also unsafe. Also we have another use after
> free issue:
> 
>  cut begin 
> 
> pipex_iface_fini(struct pipex_iface_context *pipex_iface)
> {
> pool_put(_session_pool, pipex_iface->multicast_session);
> NET_LOCK();
> pipex_iface_stop(pipex_iface);
> NET_UNLOCK();
> }
> 
>  cut end 
> 
> `multicast_session' should be protected too. It also can be pushed to
> `pipexoutq'.

Yes, I missed this point.

> Also since this time pipexintr() and pipex_iface_fini() are
> both serialized by KERNEL_LOCK() too we can't destroy `multicast_session'
> which is in use by pipexintr(). But when we will drop KERNEL_LOCK()
> around pipexintr() we can catch use after free issue here. I already did
> diff for move this pool_put() under NET_LOCK(), but it was rejected by
> mpi@ because:
> 
>  cut begin 
> pipex_iface_fini() should be called on the last reference of the  
>   
> descriptor.  So this shouldn't be necessary.  If there's an issue 
>   
> with the current order of the operations, we should certainly fix 
>   
> it differently.   
>  cut end 

Yes, I understand what mpi@ is saying.  But this is a separate story.

> So I repeat it again: npppd(8) can be killed in every moment by SIGKILL
> or by SIGSEGV and pppacclose() will be called and it will call
> pipex_iface_fini(). `multicast_session' can be used in this moment by
> pipexintr().
> 
> And no locks protect `multicast_session' itself.
> 
> The two diffs I proposed in this thread solve problems caused by
> pipexintr().

There are a lot of ways to solve the problems.

The diff I sent few days ago is to destruct the pipex sessions in the
pipex timer.  As you pointed out it has some problems.  Those problems
can be fixed, but I'd suggest another way.  I attached at last.

The problem exposed is "use-after-free".  Since I think this is not a
problem of parallel processing, having reference counter seems too
much for me.


The diff is not to refer the session by a pointer, but by the id.
The idea is come from IPsec tdb.

Comments?


diff --git a/sys/net/pipex.c b/sys/net/p

Re: pipex(4): kill pipexintr()

2020-08-01 Thread YASUOKA Masahiko
Hi,

I'm not sure when it is broken, in old versions, it was assumed the
pipex queues are empty when pipex_iface_stop() is called.  The problem
mvs@ found is the assumption is not true any more.

pipex has a mechanism that delete a session when the queues are empty.

819 Static void
820 pipex_timer(void *ignored_arg)
821 {
(snip)
854 case PIPEX_STATE_CLOSED:
855 /*
856  * mbuf queued in pipexinq or pipexoutq may 
have a
857  * refererce to this session.
858  */
859 if (!mq_empty(&pipexinq) || 
!mq_empty(&pipexoutq))
860 continue;
861 
862 pipex_destroy_session(session);
863 break;

I think using this is better.

How about this?

diff --git a/sys/net/pipex.c b/sys/net/pipex.c
index 2ad7757fee9..6fe14c400bf 100644
--- a/sys/net/pipex.c
+++ b/sys/net/pipex.c
@@ -190,7 +190,7 @@ pipex_iface_stop(struct pipex_iface_context *pipex_iface)
LIST_FOREACH_SAFE(session, _session_list, session_list,
session_tmp) {
if (session->pipex_iface == pipex_iface)
-   pipex_destroy_session(session);
+   pipex_unlink_session(session);
}
 }
 
@@ -470,9 +470,16 @@ pipex_link_session(struct pipex_session *session,
 void
 pipex_unlink_session(struct pipex_session *session)
 {
+   struct radix_node *rn;
+
session->ifindex = 0;
 
NET_ASSERT_LOCKED();
+   if (!in_nullhost(session->ip_address.sin_addr)) {
+   rn = rn_delete(>ip_address, >ip_netmask,
+   pipex_rd_head4, (struct radix_node *)session);
+   KASSERT(rn != NULL);
+   }
LIST_REMOVE(session, id_chain);
 #if defined(PIPEX_PPTP) || defined(PIPEX_L2TP)
switch (session->protocol) {
@@ -486,10 +493,6 @@ pipex_unlink_session(struct pipex_session *session)
LIST_REMOVE(session, state_list);
LIST_REMOVE(session, session_list);
session->state = PIPEX_STATE_CLOSED;
-
-   /* if final session is destroyed, stop timer */
-   if (LIST_EMPTY(_session_list))
-   pipex_timer_stop();
 }
 
 Static int
@@ -652,20 +655,16 @@ pipex_get_closed(struct pipex_session_list_req *req,
 Static int
 pipex_destroy_session(struct pipex_session *session)
 {
-   struct radix_node *rn;
-
/* remove from radix tree and hash chain */
NET_ASSERT_LOCKED();
 
-   if (!in_nullhost(session->ip_address.sin_addr)) {
-   rn = rn_delete(>ip_address, >ip_netmask,
-   pipex_rd_head4, (struct radix_node *)session);
-   KASSERT(rn != NULL);
-   }
-
pipex_unlink_session(session);
pipex_rele_session(session);
 
+   /* if final session is destroyed, stop timer */
+   if (LIST_EMPTY(_session_list))
+   pipex_timer_stop();
+
return (0);
 }
 
@@ -739,7 +738,8 @@ pipexintr(void)
mq_delist(, );
while ((m = ml_dequeue()) != NULL) {
pkt_session = m->m_pkthdr.ph_cookie;
-   if (pkt_session == NULL) {
+   if (pkt_session == NULL ||
+   pkt_session->state == PIPEX_STATE_CLOSED) {
m_freem(m);
continue;
}
@@ -776,7 +776,8 @@ pipexintr(void)
mq_delist(, );
while ((m = ml_dequeue()) != NULL) {
pkt_session = m->m_pkthdr.ph_cookie;
-   if (pkt_session == NULL) {
+   if (pkt_session == NULL ||
+   pkt_session->state == PIPEX_STATE_CLOSED) {
m_freem(m);
continue;
}



Re: pipex(4): kill pipexintr()

2020-07-30 Thread YASUOKA Masahiko
On Thu, 30 Jul 2020 22:43:10 +0300
Vitaliy Makkoveev  wrote:
> On Thu, Jul 30, 2020 at 10:05:13PM +0900, YASUOKA Masahiko wrote:
>> On Thu, 30 Jul 2020 15:34:09 +0300
>> Vitaliy Makkoveev  wrote:
>> > On Thu, Jul 30, 2020 at 09:13:46PM +0900, YASUOKA Masahiko wrote:
>> >> If the diff removes the queue, then the pipex input routine is
>> >> executed by the NIC's interrupt handler.
>> >> 
>> >> The queues had been made to avoid that kind of situations.
>> > 
>> > It's not enqueued in pppoe case. According pipex_pppoe_input() code we
>> > call pipex_common_input() with `useq' argument set to '0', so we don't
>> > enqueue mbuf(9) but pass it to pipex_ppp_input() which will pass it to
>> > ipv{4,6}_input().
>> 
>> You are right.  Sorry, I forgot about this which I did that by myself.
> 
> I'm interesting the reason why you did that.

I remembered, it was first step of MP steps for pipex.

At that time, I discussed with mpi, he suggested like below.

 1. stop enqueueing packets for PPPoE
 2. try not take a kernel lock before calling gre_input(), then we can
also stop enqueueing packets for PPTP(GRE)
 3. for L2TP, keep the queue and change the netisr to an unlocked task



Re: pipex(4): kill pipexintr()

2020-07-30 Thread YASUOKA Masahiko
On Thu, 30 Jul 2020 22:43:10 +0300
Vitaliy Makkoveev  wrote:
> On Thu, Jul 30, 2020 at 10:05:13PM +0900, YASUOKA Masahiko wrote:
>> On Thu, 30 Jul 2020 15:34:09 +0300
>> Vitaliy Makkoveev  wrote:
>> > On Thu, Jul 30, 2020 at 09:13:46PM +0900, YASUOKA Masahiko wrote:
>> >> Hi,
>> >> 
>> >> sys/net/if_ethersubr.c:
>> >> 372 void
>> >> 373 ether_input(struct ifnet *ifp, struct mbuf *m)
>> >> (snip)
>> >> 519 #if NPPPOE > 0 || defined(PIPEX)
>> >> 520 case ETHERTYPE_PPPOEDISC:
>> >> 521 case ETHERTYPE_PPPOE:
>> >> 522 if (m->m_flags & (M_MCAST | M_BCAST))
>> >> 523 goto dropanyway;
>> >> 524 #ifdef PIPEX
>> >> 525 if (pipex_enable) {
>> >> 526 struct pipex_session *session;
>> >> 527 
>> >> 528 if ((session = pipex_pppoe_lookup_session(m)) 
>> >> != NULL) {
>> >> 529 pipex_pppoe_input(m, session);
>> >> 530 return;
>> >> 531 }
>> >> 532 }
>> >> 533 #endif
>> >> 
>> >> previously a packet which branches to #529 is enqueued.
>> >> 
>> >> If the diff removes the queue, then the pipex input routine is
>> >> executed by the NIC's interrupt handler.
>> >> 
>> >> The queues had been made to avoid that kind of situations.
>> > 
>> > It's not enqueued in pppoe case. According pipex_pppoe_input() code we
>> > call pipex_common_input() with `useq' argument set to '0', so we don't
>> > enqueue mbuf(9) but pass it to pipex_ppp_input() which will pass it to
>> > ipv{4,6}_input().
>> 
>> You are right.  Sorry, I forgot about this which I did that by myself.
>> 
> 
> I'm interesting the reason why you did that.
> 
>> >> Also I don't see a relation of the use-after-free problem and killing
>> >> queues.  Can't we fix the problem unless we kill the queues?
>> > 
>> > Yes we can. Reference counters allow us to keep orphan sessions in these
>> > queues without use after free issue.
>> > 
>> > I will wait your commentaries current enqueuing before to do something.
>> 
>> I have another concern.
>> 
>> You might know, when L2TP/IPsec is used heavily, the crypto thread
>> uses 100% of 1 CPU core.  In that case, that thread becomes like
>> below:
>> 
>>   crypto thread -> udp_userreq -> pipex_l2tp_input
>> 
>> some clients are using MPPE(RC4 encryption) on CCP.  It's not so
>> light.
>> 
>> How do we offload this for CPUs?  I am thinking that "pipex" can have
>> a dedicated thread.  Do we have another scenario?
>>
> 
> I suppose you mean udp_input(). What is you call "crypto thread"? I did
> a little backtrace but I didn't find this thread.
> 
> ether_resolve
>   if_input_local
> ipv4_input
>   ip_input_if
> ip_ours
>   ip_deliver
> udp_input (through pr_input)
>   pipex_l2tp_input
> 
> ipi{,6}_mloopback
>   if_input_local
> ipv4_input
>   ...
> udp_input (through pr_input)
>   pipex_l2tp_input
> 
> loinput
>   if_input_local
> ipv4_input
>   ...
> udp_input (through pr_input)
>   pipex_l2tp_input
> 
> Also various pseudo drivers call ipv{4,6}_input() and underlay
> udp_input() too.
> 
> Except nfs, we call udp_usrreq() through socket layer only. Do you mean
> userland as "crypto thread"?

Sorry, udp_usrreq() should be udp_input() and crypto thread meant a
kthread for crypto_taskq_mp_safe, whose name is "crynlk" (see
crypto_init()).

A packet of L2TP/IPsec (encapsulated IP/PPP/L2TP/UDP/ESP/UDP/IP) is
processed like:

   ipv4_input
 ...
   udp_input
 ipsec_common_input
   esp_input
 crypto_dispatch
   => crypto_taskq_mp_safe

   kthread "crynlk"
 crypto_invoke
   ... (*1)
 crypto_done
   esp_input_cb
 ipsec_common_input_cb
   ip_deliver
 udp_input
   pipex_l2tp_input
 pipex_common_input
   (*2)
   pipex_ppp_input
 pipex_mppe_input (*3)
   pipex_ppp_input
 pipex_ip_input
 

Re: pipex(4): kill pipexintr()

2020-07-30 Thread YASUOKA Masahiko
On Thu, 30 Jul 2020 15:34:09 +0300
Vitaliy Makkoveev  wrote:
> On Thu, Jul 30, 2020 at 09:13:46PM +0900, YASUOKA Masahiko wrote:
>> Hi,
>> 
>> sys/net/if_ethersubr.c:
>> 372 void
>> 373 ether_input(struct ifnet *ifp, struct mbuf *m)
>> (snip)
>> 519 #if NPPPOE > 0 || defined(PIPEX)
>> 520 case ETHERTYPE_PPPOEDISC:
>> 521 case ETHERTYPE_PPPOE:
>> 522 if (m->m_flags & (M_MCAST | M_BCAST))
>> 523 goto dropanyway;
>> 524 #ifdef PIPEX
>> 525 if (pipex_enable) {
>> 526 struct pipex_session *session;
>> 527 
>> 528 if ((session = pipex_pppoe_lookup_session(m)) != 
>> NULL) {
>> 529 pipex_pppoe_input(m, session);
>> 530 return;
>> 531 }
>> 532 }
>> 533 #endif
>> 
>> previously a packet which branches to #529 is enqueued.
>> 
>> If the diff removes the queue, then the pipex input routine is
>> executed by the NIC's interrupt handler.
>> 
>> The queues had been made to avoid that kind of situations.
> 
> It's not enqueued in pppoe case. According pipex_pppoe_input() code we
> call pipex_common_input() with `useq' argument set to '0', so we don't
> enqueue mbuf(9) but pass it to pipex_ppp_input() which will pass it to
> ipv{4,6}_input().

You are right.  Sorry, I forgot about this which I did that by myself.

>> Also I don't see a relation of the use-after-free problem and killing
>> queues.  Can't we fix the problem unless we kill the queues?
> 
> Yes we can. Reference counters allow us to keep orphan sessions in these
> queues without use after free issue.
> 
> I will wait your commentaries current enqueuing before to do something.

I have another concern.

You might know, when L2TP/IPsec is used heavily, the crypto thread
uses 100% of 1 CPU core.  In that case, that thread becomes like
below:

  crypto thread -> udp_userreq -> pipex_l2tp_input

some clients are using MPPE(RC4 encryption) on CCP.  It's not so
light.

How do we offload this for CPUs?  I am thinking that "pipex" can have
a dedicated thread.  Do we have another scenario?

--yasuoka



Re: pipex(4): kill pipexintr()

2020-07-30 Thread YASUOKA Masahiko
Hi,

sys/net/if_ethersubr.c:
372 void
373 ether_input(struct ifnet *ifp, struct mbuf *m)
(snip)
519 #if NPPPOE > 0 || defined(PIPEX)
520 case ETHERTYPE_PPPOEDISC:
521 case ETHERTYPE_PPPOE:
522 if (m->m_flags & (M_MCAST | M_BCAST))
523 goto dropanyway;
524 #ifdef PIPEX
525 if (pipex_enable) {
526 struct pipex_session *session;
527 
528 if ((session = pipex_pppoe_lookup_session(m)) != 
NULL) {
529 pipex_pppoe_input(m, session);
530 return;
531 }
532 }
533 #endif

previously a packet which branches to #529 is enqueued.

If the diff removes the queue, then the pipex input routine is
executed by the NIC's interrupt handler.

The queues had been made to avoid that kind of situations.

Also I don't see a relation of the use-after-free problem and killing
queues.  Can't we fix the problem unless we kill the queues?

On Wed, 29 Jul 2020 23:04:36 +0300
Vitaliy Makkoveev  wrote:
> Now pipex(4) is fully covered by NET_LOCK() and this is documented. But
> we still have an issue with pipex(4) session itself and I guess it's
> time to fix it.
> 
> We have `pipexinq' and `pipexoutq' mbuf(9) queues to store mbufs. Each
> mbuf(9) passed to these queues stores the pointer to corresponding
> session referenced as `m_pkthdr.ph_cookie'. We enqueue incoming mbufs for
> pppx(4) and incoming and outgoing mbufs for pppac(4). But we don't
> enqueue pppoe related mbufs. After packet was enqueued to corresponding
> queue we call schednetisr() which just schedules netisr() to run:
> 
>  cut begin 
> 
> 780 pipex_ppp_enqueue(struct mbuf *m0, struct pipex_session *session,
> 781 struct mbuf_queue *mq)
> 782 {
> 783 m0->m_pkthdr.ph_cookie = session;
> 784 /* XXX need to support other protocols */
> 785 m0->m_pkthdr.ph_ppp_proto = PPP_IP;
> 786 
> 787 if (mq_enqueue(mq, m0) != 0)
> 788 return (1);
> 789 
> 790 schednetisr(NETISR_PIPEX);
> 791 
> 792 return (0);
> 793 }
> 
>  cut end 
> 
> Also we have pipex_timer() which should destroy session in safe way, but
> it does this only for pppac(4) and only for sessions closed by
> `PIPEXDSESSION' command:
> 
>  cut begin 
> 
> 812 pipex_timer(void *ignored_arg)
> 813 {
>   /* skip */
> 846 case PIPEX_STATE_CLOSED:
> 847 /*
> 848  * mbuf queued in pipexinq or pipexoutq may have a
> 849* refererce to this session.
> 850  */
> 851 if (!mq_empty(&pipexinq) || !mq_empty(&pipexoutq))
> 852 continue;
> 853 
> 854 pipex_destroy_session(session);
> 855 break;
> 
>  cut end 
> 
> While we destroy sessions through pipex_rele_session() or through
> pipex_iface_fini() or through `PIPEXSMODE' command we don't check
> `pipexinq' and `pipexoutq' state. This means we can break them.
> 
> It's not guaranteed that netisr() will start just after schednetisr()
> call. This means we can destroy session, but corresponding mbuf(9) is
> stored within `pipexinq' or `pipexoutq'. It's `m_pkthdr.ph_cookie' still
> stores pointer to destroyed session and we have use after free issue. I
> wonder why we didn't caught panic yet.
> 
> I propose to kill `pipexinq', `pipexoutq' and pipexintr(). There is
> absolutely no reason them to exist. This should not only fix issue
> described above but simplifies code too.
> 
> Other ways are to implement reference counters for session or walk
> through mbuf(9) queues and kill corresponding mbufs. It doesn't make
> sense to go these ways.
> 
> Index: lib/libc/sys/sysctl.2
> ===
> RCS file: /cvs/src/lib/libc/sys/sysctl.2,v
> retrieving revision 1.40
> diff -u -p -r1.40 sysctl.2
> --- lib/libc/sys/sysctl.2 17 May 2020 05:48:39 -  1.40
> +++ lib/libc/sys/sysctl.2 29 Jul 2020 13:47:40 -
> @@ -2033,35 +2033,11 @@ The currently defined variable names are
>  .Bl -column "Third level name" "integer" "Changeable" -offset indent
>  .It Sy "Third level name" Ta Sy "Type" Ta Sy "Changeable"
>  .It Dv PIPEXCTL_ENABLE Ta integer Ta yes
> -.It Dv PIPEXCTL_INQ Ta node Ta not applicable
> -.It Dv PIPEXCTL_OUTQ Ta node Ta not applicable
>  .El
>  .Bl -tag -width "123456"
>  .It Dv PIPEXCTL_ENABLE
>  If set to 1, enable PIPEX processing.
>  The default is 0.
> -.It Dv PIPEXCTL_INQ Pq Va net.pipex.inq
> -Fourth level comprises an array of
> -.Vt struct ifqueue
> -structures containing information about the PIPEX packet input queue.
> -The forth level names for the elements of
> -.Vt struct ifqueue
> -are the same as described in
> -.Li ip.arpq
> -in the
> -.Dv PF_INET
> -section.
> -.It Dv PIPEXCTL_OUTQ Pq Va 

Re: pf: route-to least-states

2020-07-28 Thread YASUOKA Masahiko
Hi,

On Tue, 28 Jul 2020 18:54:48 +0200
Alexandr Nedvedicky  wrote:
> On Wed, Jul 29, 2020 at 01:22:48AM +0900, YASUOKA Masahiko wrote:
>> Previous commit has a wrong part..
>> 
>> ok?
>> 
>> Fix previous commit which referred wrong address.
> 
> would it make sense to move the block, you've introduced earler
> under the !PF_AZERO() branch just couple lines below. something
> like this:
> 
> 8<---8<---8<--8<
> diff --git a/sys/net/pf_lb.c b/sys/net/pf_lb.c
> index 510795a4d0b..f77d96a99ec 100644
> --- a/sys/net/pf_lb.c
> +++ b/sys/net/pf_lb.c
> @@ -322,13 +322,13 @@ pf_map_addr_sticky(sa_family_t af, struct pf_rule *r, 
> struct pf_addr *saddr,
> return (-1);
> }
>  
> -   if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_LEASTSTATES) {
> -   if (pf_map_addr_states_increase(af, rpool, naddr) == -1)
> +   if (!PF_AZERO(cached, af)) {
> +   pf_addrcpy(naddr, cached, af);
> +   if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_LEASTSTATES) 
> &&
> +   ((pf_map_addr_states_increase(af, rpool, cached) == -1))
> return (-1);
> }
>  
> -   if (!PF_AZERO(cached, af))
> -   pf_addrcpy(naddr, cached, af);
> if (pf_status.debug >= LOG_DEBUG) {
> log(LOG_DEBUG, "pf: pf_map_addr: "
> "src tracking (%u) maps ", type);
> 
> 8<---8<---8<--8<
> 
> It seems to me it would be better to bump number of states if and only if we
> actually find some address in pool.

Yes, I agree.

ok?

Fix previous commit which referred wrong address and returned wrong
value.


Index: sys/net/pf_lb.c
===
RCS file: /cvs/src/sys/net/pf_lb.c,v
retrieving revision 1.66
diff -u -p -r1.66 pf_lb.c
--- sys/net/pf_lb.c 28 Jul 2020 16:47:41 -  1.66
+++ sys/net/pf_lb.c 28 Jul 2020 17:01:34 -
@@ -322,13 +322,13 @@ pf_map_addr_sticky(sa_family_t af, struc
return (-1);
}
 
-   if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_LEASTSTATES) {
-   if (pf_map_addr_states_increase(af, rpool, naddr) == -1)
-   return (-1);
-   }
 
-   if (!PF_AZERO(cached, af))
+   if (!PF_AZERO(cached, af)) {
pf_addrcpy(naddr, cached, af);
+   if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_LEASTSTATES &&
+   pf_map_addr_states_increase(af, rpool, cached) == -1)
+   return (-1);
+   }
if (pf_status.debug >= LOG_DEBUG) {
log(LOG_DEBUG, "pf: pf_map_addr: "
"src tracking (%u) maps ", type);
@@ -651,7 +651,7 @@ pf_map_addr_states_increase(sa_family_t 
pf_print_host(naddr, 0, af);
addlog(". Failed to increase count!\n");
}
-   return (1);
+   return (-1);
}
} else if (rpool->addr.type == PF_ADDR_DYNIFTL) {
if (pfr_states_increase(rpool->addr.p.dyn->pfid_kt,
@@ -663,7 +663,7 @@ pf_map_addr_states_increase(sa_family_t 
pf_print_host(naddr, 0, af);
addlog(". Failed to increase count!\n");
}
-   return (1);
+   return (-1);
}
}
return (0);



Re: pf: route-to least-states

2020-07-28 Thread YASUOKA Masahiko
Hi,

Let me add another fix of previous.

ok?

Fix previous commit which referred wrong address and returned wrong
value.

Index: sys/net/pf_lb.c
===
RCS file: /cvs/src/sys/net/pf_lb.c,v
retrieving revision 1.66
diff -u -p -r1.66 pf_lb.c
--- sys/net/pf_lb.c 28 Jul 2020 16:47:41 -  1.66
+++ sys/net/pf_lb.c 28 Jul 2020 16:52:24 -
@@ -323,7 +323,7 @@ pf_map_addr_sticky(sa_family_t af, struc
}
 
if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_LEASTSTATES) {
-   if (pf_map_addr_states_increase(af, rpool, naddr) == -1)
+   if (pf_map_addr_states_increase(af, rpool, cached) == -1)
return (-1);
}
 
@@ -651,7 +651,7 @@ pf_map_addr_states_increase(sa_family_t 
pf_print_host(naddr, 0, af);
addlog(". Failed to increase count!\n");
}
-   return (1);
+   return (-1);
}
} else if (rpool->addr.type == PF_ADDR_DYNIFTL) {
if (pfr_states_increase(rpool->addr.p.dyn->pfid_kt,
@@ -663,7 +663,7 @@ pf_map_addr_states_increase(sa_family_t 
pf_print_host(naddr, 0, af);
addlog(". Failed to increase count!\n");
}
-   return (1);
+   return (-1);
}
}
return (0);



Re: pf: route-to least-states

2020-07-28 Thread YASUOKA Masahiko
Hi,

Previous commit has a wrong part..

ok?

Fix the previous commit, which referred to the wrong address.

Index: sys/net/pf_lb.c
===
RCS file: /cvs/src/sys/net/pf_lb.c,v
retrieving revision 1.65
diff -u -p -r1.65 pf_lb.c
--- sys/net/pf_lb.c 24 Jul 2020 14:06:33 -  1.65
+++ sys/net/pf_lb.c 28 Jul 2020 16:15:50 -
@@ -323,7 +323,7 @@ pf_map_addr_sticky(sa_family_t af, struc
}
 
if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_LEASTSTATES) {
-   if (pf_map_addr_states_increase(af, rpool, naddr) == -1)
+   if (pf_map_addr_states_increase(af, rpool, cached) == -1)
return (-1);
}
 



relayd: set group and divert-reply

2020-07-26 Thread YASUOKA Masahiko
Hi,

I'd like to run relayd as _relayd group always so that we can use
"group _relayd" in a pf rule.  This makes it possible to write a pf
rule easily which is to match only connections from relayd(8).

Also as for relayd.conf(5), I'd like to mention that "divert-reply" is
required for "transparent forward" and add an example pf rule which
uses "group _relayd".

ok?

Run relayd(8) as _relayd group user.

Index: usr.sbin/relayd/relayd.c
===
RCS file: /cvs/src/usr.sbin/relayd/relayd.c,v
retrieving revision 1.182
diff -u -p -r1.182 relayd.c
--- usr.sbin/relayd/relayd.c15 Sep 2019 19:23:29 -  1.182
+++ usr.sbin/relayd/relayd.c26 Jul 2020 08:39:27 -
@@ -201,6 +201,11 @@ main(int argc, char *argv[])
if ((ps->ps_pw =  getpwnam(RELAYD_USER)) == NULL)
errx(1, "unknown user %s", RELAYD_USER);
 
+   if (setgroups(1, >ps_pw->pw_gid) == -1 ||
+   setresgid(ps->ps_pw->pw_gid, ps->ps_pw->pw_gid, ps->ps_pw->pw_gid)
+   == -1)
+   err(1, "unable to set group ids");
+
log_init(debug, LOG_DAEMON);
log_setverbose(verbose);
 

Add a mention that "divert-reply" rule is required for "transparent
forward" and add an example which uses "group _relayd" to match the
outgoing connections.

Index: usr.sbin/relayd/relayd.conf.5
===
RCS file: /cvs/src/usr.sbin/relayd/relayd.conf.5,v
retrieving revision 1.198
diff -u -p -r1.198 relayd.conf.5
--- usr.sbin/relayd/relayd.conf.5   1 Jul 2020 06:47:18 -   1.198
+++ usr.sbin/relayd/relayd.conf.5   26 Jul 2020 08:39:27 -
@@ -622,6 +622,10 @@ Use the
 .Ic transparent
 keyword to enable fully-transparent mode; the source address of the
 client will be retained in this case.
+For this case,
+additional
+.Xr pf 4
+rule with divert-reply option is required for the outgoing connection.
 .Pp
 The
 .Ic with tls
@@ -1627,6 +1631,31 @@ relay tlsinspect {
protocol httpfilter
forward with tls to destination
 }
+.Ed
+.Pp
+If you want to use fully-transparent mode,
+you can add the
+.Ic transparent
+keyword to
+.Ic forward
+option:
+.Bd -literal -offset indent
+relay tlsinspect {
+   listen on 127.0.0.1 port 8443 tls
+   protocol httpfilter
+   transparent forward with tls to destination
+}
+.Ed
+.Pp
+And add a matching divert-reply rule in
+.Xr pf.conf 5 .
+You can use
+.Dq group _relayd
+to match only connections from
+.Xr relayd 8
+precisely:
+.Bd -literal -offset indent
+pass out proto tcp to port 443 group _relayd divert-reply
 .Ed
 .Pp
 The next simple router configuration example can be used to run



Re: pf_remove_divert_state

2020-07-26 Thread YASUOKA Masahiko
Thanks,

On Sat, 25 Jul 2020 15:00:07 +0200
Alexander Bluhm  wrote:
> On Sat, Jul 25, 2020 at 09:37:37PM +0900, YASUOKA Masahiko wrote:
>> Is this part a reason why we have "divert-reply"?
> 
> Yes.
> 
> Divert rules pass packets to the local network stack.  With divert-to
> you specify the socket address.  This works for incoming connections.
> The divert-to address can be 127.0.0.1 or anything else with
> SO_BINDANY.
> 
> When you use SO_BINDANY for outgoing connections and you don't know
> the addresses when writing pf.conf, use divert-reply.
> 
> As dangling states interfere with new connections, I added the
> divert state cleanup.  This is especially necessary for DGRAM or
> RAW sockets.

Yes.  My first message shows it is necessary for TCP.

Also, my diff was totally wrong: it deletes the states regardless of
whether they are for divert or not.

>> > Is that not possible for you?
>> 
>> It's possible.
> 
> Fine, then use divert-reply instead of changing the semantics.

I have thought it's hard to create a divert-reply rule for relayd's
"transparent forward to destination" case.  But I noticed tftp-proxy
is using "group _tftp_proxy" to match connections only from the
program precisely.

I'll send diffs to do the same thing for relayd in a separated mail.



Re: pf_remove_divert_state

2020-07-25 Thread YASUOKA Masahiko
On Sat, 25 Jul 2020 13:29:57 +0200
Alexander Bluhm  wrote:
> On Sat, Jul 25, 2020 at 08:20:21PM +0900, YASUOKA Masahiko wrote:
>> Currently SO_BINDANY is usable without any divert or divert-reply
>> rule.
> 
> This is why we have the divert-reply feature.  Just mark the states
> with that keyword when you want to use them with SO_BINDANY.

Thanks,

Let me clarify whether I understand correctly.

| @@ -1410,9 +1410,7 @@ pf_remove_divert_state(struct pf_state_k
|   struct pf_state_item*si;
|  
|   TAILQ_FOREACH(si, >states, entry) {
| - if (sk == si->s->key[PF_SK_STACK] && si->s->rule.ptr &&
| - (si->s->rule.ptr->divert.type == PF_DIVERT_TO ||
| - si->s->rule.ptr->divert.type == PF_DIVERT_REPLY)) {
| + if (sk == si->s->key[PF_SK_STACK]) {
|   pf_remove_state(si->s);
|   break;
|   }
| 

Is this part a reason why we have "divert-reply"?

> See man setsockopt

Yes, I have checked the API already.

> Is that not possible for you?

It's possible.

--yasuoka



pf_remove_divert_state

2020-07-25 Thread YASUOKA Masahiko
Hi,

# let me correct the previous mail, it has some typos.

Currently SO_BINDANY is usable without any divert or divert-reply
rule.

pf reserves its associated PCB to its state when the packet is going
out.  At this time, the pf rule is not required to have a "divert" or
"divert-reply" option.  When receiving reverse direction packets,
those packets go to "ours" since they have the associated PCB.

But when dropping the connection, the PCB is deleted but the state
will not be removed.  Currently pf removes the state only if it is
created by a rule with a "divert-reply" or "divert" option.  Otherwise
the state is kept.

As the result, following incoming packets for the connection will be
forwarded by the state.  They should not be forwarded since they were
going to "ours".

I think the state should be deleted even if it's created by a rule
without "divert" or "divert-reply" option. The following diff will
change this behavior.  Also I attached a test procedure after the
diff.


ok? comments?

Don't keep a state when its associated PCB is deleted, even if the
state was created without a "divert-to" or "divert-reply" rule.  It
might have been created by SO_BINDANY.

Index: sys/net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.1094
diff -u -p -r1.1094 pf.c
--- sys/net/pf.c24 Jul 2020 18:17:15 -  1.1094
+++ sys/net/pf.c25 Jul 2020 07:39:19 -
@@ -1410,9 +1410,7 @@ pf_remove_divert_state(struct pf_state_k
struct pf_state_item*si;
 
TAILQ_FOREACH(si, >states, entry) {
-   if (sk == si->s->key[PF_SK_STACK] && si->s->rule.ptr &&
-   (si->s->rule.ptr->divert.type == PF_DIVERT_TO ||
-   si->s->rule.ptr->divert.type == PF_DIVERT_REPLY)) {
+   if (sk == si->s->key[PF_SK_STACK]) {
pf_remove_state(si->s);
break;
}



network configuration:

  192.168.0.101 -- 192.168.0.1 [OBJ] 10.0.0.1 --> 10.0.0.10

setup:

  ifconfig pair100 rdomain 10
  ifconfig pair100 inet 192.168.0.1
  ifconfig pair101 rdomain 11 patch pair100
  ifconfig pair101 inet 192.168.0.101
  ifconfig pair102 rdomain 10
  ifconfig pair102 inet 10.0.0.1/24
  ifconfig pair103 rdomain 12 patch pair102
  ifconfig pair103 inet 10.0.0.101/24
  route -T11 add default 192.168.0.1

/etc/pf.conf:

  pass on {pair100 pair101 pair102 pair103}
  match out on pair102 nat-to (pair102:0)
  block in on pair103 proto tcp to port 443

procedure:

1. run a server by scapy on 443/tcp on rdomain 12

   $ doas route -T12 exec python test.py

2. connect to the server from OBJ (rdomain 10)

   $ doas route -T10 exec nc -vs 192.168.0.101 10.0.0.101 443
   Connection to 10.0.0.101 443 port [tcp/https] succeeded!
   Ctrl-D
   $

   close the connection by Ctrl-D immediately

3. see the packet capture on pair103

   - You can see packets like below
 19:28:51.822879 10.0.0.101.443 > 10.0.0.1.60956: . ack 1 win 8192
 19:28:51.823559 192.168.0.101.22083 > 10.0.0.101.443: R 0:0(0)
   ack 1 win 0 (DF) [tos 0x10]
 
   - Since the pf state is kept, the packet "10.0.0.101.443 >
 10.0.0.1.60956" is converted into "10.0.0.101.443 >
 192.168.0.101.22083" by the state's NAT
   - but since the PCB doesn't exist, the packet is forwarded.
   - but the packet is blocked by default "block return" rule
   - "192.168.0.101.22083 > 10.0.0.101.443" is the result of "block
 return"
   
   -> 192.168.0.101 is NATed address.  It should not appear on
  10.0.0.0/24 network.

teardown:

  ifconfig pair100 destroy
  ifconfig pair101 destroy
  ifconfig pair102 destroy
  ifconfig pair103 destroy

test.py
***
import time
from scapy.all import *

a=sniff(iface="pair102", count=1, filter="tcp and port 443")

ip_src = a[0][IP].src
ip_dst = a[0][IP].dst
sport =  a[0][TCP].sport
dport =  a[0][TCP].dport
seq_nr = 5
ack_nr = a[0][TCP].seq + 1

a=sr1(IP(src=ip_dst, dst=ip_src)/
  TCP(sport=dport, dport=sport, flags="SA", seq=seq_nr, ack=ack_nr,
  options=[('MSS', 1460)]))
#ack_nr = a[0][TCP].seq + 1

# Send FIN and receive FIN+ACK
seq_nr = seq_nr + 1
a=sr1(IP(src=ip_dst, dst=ip_src)/
  TCP(sport=dport, dport=sport, flags="FA", seq=seq_nr, ack=ack_nr))
ack_nr = a[0][TCP].seq + 1

time.sleep(2)

# Send ACK of FIN
lastack = (IP(src=ip_dst, dst=ip_src)/
  TCP(sport=dport, dport=sport, flags="A", seq=seq_nr, ack=ack_nr))
send(lastack)

# Resend in 100 times
for _ in range(100):
time.sleep(2)
send(lastack)
***



pf_remove_divert_state

2020-07-25 Thread YASUOKA Masahiko
Hi,

Currently SO_BINDANY is usable without any divert or divert-reply
rule.

pf reserves its associated PCB to its state when the packet is going
out.  At this time, the pf rule is not required to have a "divert" or
"divert-reply" option.  When receiving reverse direction packets,
those packets go to "ours" since they have the associated PCB.

But when dropping the connection, the PCB is deleted but the state
will not be removed.  Currently pf removes the state only if it is
created by a rule with a "divert-reply" or "divert" option.  Otherwise
the state is kept.

As the result, following incoming packets for the connection will be
forwarded by the state.  They should not be forwarded since they were
going to "ours".

I think the state should be deleted even if it's created by a rule
without a "divert" or "divert-reply" option.  The following diff
will change this behavior.  Also, I attached a test procedure after the
diff.


ok? comments?

Don't keep a state when its associated PCB is deleted, even if the
state was created without a "divert-to" or "divert-reply" rule.  It
might have been created by SO_BINDANY.

Index: sys/net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.1094
diff -u -p -r1.1094 pf.c
--- sys/net/pf.c24 Jul 2020 18:17:15 -  1.1094
+++ sys/net/pf.c25 Jul 2020 07:39:19 -
@@ -1410,9 +1410,7 @@ pf_remove_divert_state(struct pf_state_k
struct pf_state_item*si;
 
TAILQ_FOREACH(si, >states, entry) {
-   if (sk == si->s->key[PF_SK_STACK] && si->s->rule.ptr &&
-   (si->s->rule.ptr->divert.type == PF_DIVERT_TO ||
-   si->s->rule.ptr->divert.type == PF_DIVERT_REPLY)) {
+   if (sk == si->s->key[PF_SK_STACK]) {
pf_remove_state(si->s);
break;
}



network configuration:

  192.168.0.101 -- 192.168.0.1 [OBJ] 10.0.0.1 --> 10.0.0.10

setup:

  ifconfig pair100 rdomain 10
  ifconfig pair100 inet 192.168.0.1
  ifconfig pair101 rdomain 11 patch pair100
  ifconfig pair101 inet 192.168.0.101
  ifconfig pair102 rdomain 10
  ifconfig pair102 inet 10.0.0.1/24
  ifconfig pair103 rdomain 12 patch pair102
  ifconfig pair103 inet 10.0.0.101/24
  route -T11 add default 192.168.0.1

/etc/pf.conf:

  pass on {pair100 pair101 pair102 pair103}
  match out on pair102 nat-to (pair102:0)
  block in on pair103 proto tcp to port 443

procedure:

1. run a server by scapy on 443/tcp on rdomain 12

   $ doas route -T12 exec python test.py

2. connect to the server from OBJ (rdomain 10)

   $ doas route -T10 exec nc -vs 192.168.0.101 10.0.0.101 443
   Connection to 10.0.0.101 443 port [tcp/https] succeeded!
   Ctrl-D
   $

   close the connection by Ctrl-D immediately

3. see the packet capture on pair103

   - You can see packets like below
 19:28:51.822879 10.0.0.101.443 > 10.0.0.1.60956: . ack 1 win 8192
 19:28:51.823559 192.168.0.101.22083 > 10.0.0.101.443: R 0:0(0)
   ack 1 win 0 (DF) [tos 0x10]
 
   - Since the pf state is kept, the packet "10.0.0.101.443 >
 10.0.0.1.60956" is converted into "10.0.0.101.443 >
 192.168.0.101.22083" by the state's NAT
   - but since the PCB doesn't exist, the packet is forwarded.
   - but the packet is blocked by default "block return" rule
   - "192.168.0.101.22083 > 10.0.0.101.443" is the result of "block
 return"
   
   -> 192.168.0.101 is NATed address.  It should not appear on
  10.0.0.0/24 network.

teardown:

  ifconfig pair100 destroy
  ifconfig pair101 destroy
  ifconfig pair102 destroy
  ifconfig pair103 destroy

test.py
***
import time
from scapy.all import *

a=sniff(iface="pair102", count=1, filter="tcp and port 443")

ip_src = a[0][IP].src
ip_dst = a[0][IP].dst
sport =  a[0][TCP].sport
dport =  a[0][TCP].dport
seq_nr = 5
ack_nr = a[0][TCP].seq + 1

a=sr1(IP(src=ip_dst, dst=ip_src)/
  TCP(sport=dport, dport=sport, flags="SA", seq=seq_nr, ack=ack_nr,
  options=[('MSS', 1460)]))
#ack_nr = a[0][TCP].seq + 1

# Send FIN and receive FIN+ACK
seq_nr = seq_nr + 1
a=sr1(IP(src=ip_dst, dst=ip_src)/
  TCP(sport=dport, dport=sport, flags="FA", seq=seq_nr, ack=ack_nr))
ack_nr = a[0][TCP].seq + 1

time.sleep(2)

# Send ACK of FIN
lastack = (IP(src=ip_dst, dst=ip_src)/
  TCP(sport=dport, dport=sport, flags="A", seq=seq_nr, ack=ack_nr))
send(lastack)

# Resend in 100 times
for _ in range(100):
time.sleep(2)
send(lastack)
***



carp: unicast carppeer and peer down

2020-07-25 Thread YASUOKA Masahiko
Hi,

When a unicast address is specified for carppeer and the peer is
down, sending out advertisement packets will fail.  This failure is
treated as an error of the sending host, so the error counter is
incremented and carpdemote is incremented.  I think this is not
correct because the failure is not the fault of the sending host.

ok?

Don't treat it as an error if carppeer is a unicast address and the peer is down.

Index: sys/netinet/ip_carp.c
===
RCS file: /cvs/src/sys/netinet/ip_carp.c,v
retrieving revision 1.347
diff -u -p -r1.347 ip_carp.c
--- sys/netinet/ip_carp.c   24 Jul 2020 18:17:15 -  1.347
+++ sys/netinet/ip_carp.c   25 Jul 2020 07:16:42 -
@@ -1140,7 +1140,9 @@ carp_send_ad(struct carp_vhost_entry *vh
 
error = ip_output(m, NULL, NULL, IP_RAWOUTPUT, >sc_imo,
NULL, 0);
-   if (error) {
+   if (error &&
+   /* when unicast, the peer's down is not our fault */
+   !(!IN_MULTICAST(sc->sc_peer.s_addr) && error == EHOSTDOWN)){
if (error == ENOBUFS)
carpstat_inc(carps_onomem);
else



pfsync: comparing duration when "bulk-end"

2020-07-24 Thread YASUOKA Masahiko
Hi,

pfsync does a "bulk update" just after boot, and I noticed it sometimes
fails.  When finishing the "bulk update", the duration in the "bulk-end"
packet and our duration based on uptime are compared, but that
comparison should be fixed.  It must take into account that the values
are rounded to whole seconds.

ok?

Take rounding to whole seconds into account when comparing the duration
in the "bulk-end" packet and the duration based on our uptime.  This
fixes the problem that the carp demote count sometimes becomes 33 after
reboot.

Index: sys/net/if_pfsync.c
===
RCS file: /cvs/src/sys/net/if_pfsync.c,v
retrieving revision 1.274
diff -u -p -r1.274 if_pfsync.c
--- sys/net/if_pfsync.c 10 Jul 2020 13:26:42 -  1.274
+++ sys/net/if_pfsync.c 25 Jul 2020 05:09:47 -
@@ -1169,8 +1169,7 @@ pfsync_in_bus(caddr_t buf, int len, int 
break;
 
case PFSYNC_BUS_END:
-   if (getuptime() - ntohl(bus->endtime) >=
-   sc->sc_ureq_sent) {
+   if (ntohl(bus->endtime) <= getuptime() + 1 - sc->sc_ureq_sent) {
/* that's it, we're happy */
sc->sc_ureq_sent = 0;
sc->sc_bulk_tries = 0;



Re: pf: route-to {random,srchash} in an anchor

2020-07-24 Thread YASUOKA Masahiko
Hi,

On Thu, 23 Jul 2020 18:44:43 +0200
Alexandr Nedvedicky  wrote:
> On Thu, Jul 23, 2020 at 08:01:18PM +0900, YASUOKA Masahiko wrote:
>> Hi,
>> 
>> Last month, I fixed the problem "route-to least-state" in an anchor
>> didn't work.
>> 
>> https://marc.info/?t=15911745782=1=2
>> 
>> I noticed the same problem happens on "random" and "srchash" as well.
>> 
>> ok?
> 
> change looks good. I have just one nit-pick comment. I leave decision
> whether it's worth to adjust your diff or commit as-is up to you.
> 
> see in-line further below.

I can't remember why I used "null == false" logic, since I usually
avoid using that.

I'll commit the adjusted diff below.

Index: sys/net/pf_lb.c
===
RCS file: /cvs/src/sys/net/pf_lb.c,v
retrieving revision 1.65
diff -u -p -r1.65 pf_lb.c
--- sys/net/pf_lb.c 24 Jul 2020 14:06:33 -  1.65
+++ sys/net/pf_lb.c 24 Jul 2020 14:13:42 -
@@ -353,6 +353,7 @@ pf_map_addr(sa_family_t af, struct pf_ru
struct pf_addr   faddr;
struct pf_addr  *raddr = >addr.v.a.addr;
struct pf_addr  *rmask = >addr.v.a.mask;
+   struct pfr_ktable   *kt;
struct pfi_kif  *kif;
u_int64_tstates;
u_int16_tweight;
@@ -405,18 +406,17 @@ pf_map_addr(sa_family_t af, struct pf_ru
pf_poolmask(naddr, raddr, rmask, saddr, af);
break;
case PF_POOL_RANDOM:
-   if (rpool->addr.type == PF_ADDR_TABLE) {
-   cnt = rpool->addr.p.tbl->pfrkt_cnt;
-   if (cnt == 0)
-   rpool->tblidx = 0;
+   if (rpool->addr.type == PF_ADDR_TABLE ||
+   rpool->addr.type == PF_ADDR_DYNIFTL) {
+   if (rpool->addr.type == PF_ADDR_TABLE)
+   kt = rpool->addr.p.tbl;
else
-   rpool->tblidx = (int)arc4random_uniform(cnt);
-   memset(>counter, 0, sizeof(rpool->counter));
-   if (pfr_pool_get(rpool, , , af))
+   kt = rpool->addr.p.dyn->pfid_kt;
+   kt = pfr_ktable_select_active(kt);
+   if (kt == NULL)
return (1);
-   pf_addrcpy(naddr, >counter, af);
-   } else if (rpool->addr.type == PF_ADDR_DYNIFTL) {
-   cnt = rpool->addr.p.dyn->pfid_kt->pfrkt_cnt;
+
+   cnt = kt->pfrkt_cnt;
if (cnt == 0)
rpool->tblidx = 0;
else
@@ -462,18 +462,18 @@ pf_map_addr(sa_family_t af, struct pf_ru
case PF_POOL_SRCHASH:
hashidx =
pf_hash(saddr, (struct pf_addr *), >key, af);
-   if (rpool->addr.type == PF_ADDR_TABLE) {
-   cnt = rpool->addr.p.tbl->pfrkt_cnt;
-   if (cnt == 0)
-   rpool->tblidx = 0;
+
+   if (rpool->addr.type == PF_ADDR_TABLE ||
+   rpool->addr.type == PF_ADDR_DYNIFTL) {
+   if (rpool->addr.type == PF_ADDR_TABLE)
+   kt = rpool->addr.p.tbl;
else
-   rpool->tblidx = (int)(hashidx % cnt);
-   memset(>counter, 0, sizeof(rpool->counter));
-   if (pfr_pool_get(rpool, , , af))
+   kt = rpool->addr.p.dyn->pfid_kt;
+   kt = pfr_ktable_select_active(kt);
+   if (kt == NULL)
return (1);
-   pf_addrcpy(naddr, >counter, af);
-   } else if (rpool->addr.type == PF_ADDR_DYNIFTL) {
-   cnt = rpool->addr.p.dyn->pfid_kt->pfrkt_cnt;
+
+   cnt = kt->pfrkt_cnt;
if (cnt == 0)
rpool->tblidx = 0;
else
Index: sys/net/pf_table.c
===
RCS file: /cvs/src/sys/net/pf_table.c,v
retrieving revision 1.133
diff -u -p -r1.133 pf_table.c
--- sys/net/pf_table.c  24 Jun 2020 22:03:43 -  1.133
+++ sys/net/pf_table.c  24 Jul 2020 14:13:42 -
@@ -2108,9 +2108,8 @@ pfr_kentry_byaddr(struct pfr_ktable *kt,
struct sockaddr_in6  tmp6;
 #endif /* INET6 */
 
-   if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
-   kt = kt->pfrkt_root;
-   if 

Re: pf: route-to least-states

2020-07-24 Thread YASUOKA Masahiko
Hi,

Thank you for your review.

On Fri, 24 Jul 2020 01:25:42 +0200
Alexandr Nedvedicky  wrote:
>> - interface is not selected properly if selected table entry specifies
>>   an interface.
> 
> to be honest I don't quite understand what's going on here.
> can you share some details of configuration/scenario, which
> triggers the bug your diff is fixing?

You seem to have understood the scenario correctly.

> the part of your change, which I'm not able to figure out is
> this single line:
> 
>> +if (pf_map_addr_states_increase(af, rpool, naddr) == -1)
>> +return (1);
>> +/* revert the kif which was set by pfr_pool_get() */
>> +rpool->kif = kif;
>>  break;
>>  }
> 
> your fix changes behavior, which is there since least-state
> option has been introduced. I believe it does not matter
> for case when route-to specifies single interface such as:
> 
>   route-to 192.168.1.10@em0 least-states
> 
> I'm not sure what will happen in situation, when there are more interfaces
> specified in combination with sticky-address:
>   
>   route-to {192.168.1.10@em0, 192.168.1.20@em1} least-states sticky-address

Yes.  This is the scenario.

> the resulting code does not look quite right with your diff applied:
> 
> 602 } while (pf_match_addr(1, , rmask, >counter, 
> af) &&
> 603 (states > 0));
> 604 
> 605 if (pf_map_addr_states_increase(af, rpool, naddr) == -1)
> 606 return (1);
> 607 /* revert the kif which was set by pfr_pool_get() */
> 608 rpool->kif = kif;
> 609 break;
> 610 }
> 611 
> 612 if (rpool->opts & PF_POOL_STICKYADDR) {
> 613 if (sns[type] != NULL) {
> 614 pf_remove_src_node(sns[type]);
> 615 sns[type] = NULL;
> 616 }
> 617 if (pf_insert_src_node([type], r, type, af, saddr, 
> naddr,
> 618 rpool->kif))
> 619 return (1);
> 620 }
> 
> 
> at line 608 new code reverts kif set by pfr_pool_get(). At line 617
> (executed when sticky-address is set) the original code passes kif chosen
> by pfr_pool_get(). Your diff changes that behavior. So my question is simple:
>   is that intentional change?

Yes.

Let me simplify the block for "least-states".

535   case PF_POOL_LEASTSTATES:
539   pfr_pool_get(rpool);  // first entry
 :
561   faddr = rpool->counter;   //record as final
 :
557   load = rpool->states / rpool->weight;
563   naddr = rpool->counter;
 :
571  do {
572  rpool->counter++;
575  pfr_pool_get(rpool);   /* next entry */
 :
585  cload = rpool->states / rpool->weight;
 :
 :   /* find lc minimum */
591  if (cload < load) {
595 load = cload;
597 naddr = rpool->counter;
601  }
603   } while (raddr->counter != faddr); // loop until final

the loop #571:606 is to find the minimum (least-states) entry.  If the
last entry is not the minimum, after the loop,

   rpool <= the last entry
   naddr <= the minimum entry

Also see the pfr_pool_get():

2272 int
2273 pfr_pool_get(struct pf_pool *rpool, struct pf_addr **raddr,
2274 struct pf_addr **rmask, sa_family_t af)
2275 {
(snip)
2417 rpool->states = 0;
2418 if (ke->pfrke_counters != NULL)
2419 rpool->states = ke->pfrke_counters->states;
2420 switch (ke->pfrke_type) {
2421 case PFRKE_COST:
2422 rpool->weight =
2423 ((struct pfr_kentry_cost *)ke)->weight;
2424 /* FALLTHROUGH */
2425 case PFRKE_ROUTE:
2426 rpool->kif = ((struct pfr_kentry_route 
*)ke)->kif;
2427 break;
2428 default:
2429 rpool->weight = 1;
2430 break;
2431 }

some fields of rpool (states, weight, kif) are set by the values of
the selected table entry.

So,

|  rpool <= the last entry
|  naddr <= the minimum entry

rpool->kif is the interface of the last entry.  It might be different
from the interface of the minimum entry.

The diff is to keep kif of the minimum entry,

+   kif = rpool->kif;

revert rpool->kif by it after the loop.

+   /* revert the kif which was set by pfr_pool_get() */
+   rpool->kif = kif;




pf: route-to least-states

2020-07-23 Thread YASUOKA Masahiko
Hi,

The diff fixes 2 problems of "least-states":

- states whose address is selected by sticky-address are not counted
  in the number of states.
- interface is not selected properly if selected table entry specifies
  an interface.

ok?

Increase state counter for least-states when the address is selected
by sticky-address.  Also fix the problem that the interface which is
specified by the selected table entry is not used properly.

Index: sys/net/pf_lb.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/pf_lb.c,v
retrieving revision 1.64
diff -u -p -r1.64 pf_lb.c
--- sys/net/pf_lb.c 2 Jul 2019 09:04:53 -   1.64
+++ sys/net/pf_lb.c 23 Jul 2020 11:06:05 -
@@ -97,6 +97,8 @@ u_int64_t  pf_hash(struct pf_addr *, st
 int pf_get_sport(struct pf_pdesc *, struct pf_rule *,
struct pf_addr *, u_int16_t *, u_int16_t,
u_int16_t, struct pf_src_node **);
+int pf_map_addr_states_increase(sa_family_t,
+   struct pf_pool *, struct pf_addr *);
 int pf_get_transaddr_af(struct pf_rule *,
struct pf_pdesc *, struct pf_src_node **);
 int pf_map_addr_sticky(sa_family_t, struct pf_rule *,
@@ -319,6 +321,12 @@ pf_map_addr_sticky(sa_family_t af, struc
sns[type] = NULL;
return (-1);
}
+
+   if ((rpool->opts & PF_POOL_TYPEMASK) == PF_POOL_LEASTSTATES) {
+   if (pf_map_addr_states_increase(af, rpool, naddr) == -1)
+   return (-1);
+   }
+
if (!PF_AZERO(cached, af))
pf_addrcpy(naddr, cached, af);
if (pf_status.debug >= LOG_DEBUG) {
@@ -345,6 +353,7 @@ pf_map_addr(sa_family_t af, struct pf_ru
struct pf_addr   faddr;
struct pf_addr  *raddr = >addr.v.a.addr;
struct pf_addr  *rmask = >addr.v.a.mask;
+   struct pfi_kif  *kif;
u_int64_tstates;
u_int16_tweight;
u_int64_tload;
@@ -539,6 +548,7 @@ pf_map_addr(sa_family_t af, struct pf_ru
 
states = rpool->states;
weight = rpool->weight;
+   kif = rpool->kif;
 
if ((rpool->addr.type == PF_ADDR_TABLE &&
rpool->addr.p.tbl->pfrkt_refcntcost > 0) ||
@@ -581,6 +591,7 @@ pf_map_addr(sa_family_t af, struct pf_ru
if (cload < load) {
states = rpool->states;
weight = rpool->weight;
+   kif = rpool->kif;
load = cload;
 
pf_addrcpy(naddr, >counter, af);
@@ -591,29 +602,10 @@ pf_map_addr(sa_family_t af, struct pf_ru
} while (pf_match_addr(1, , rmask, >counter, af) &&
(states > 0));
 
-   if (rpool->addr.type == PF_ADDR_TABLE) {
-   if (pfr_states_increase(rpool->addr.p.tbl,
-   naddr, af) == -1) {
-   if (pf_status.debug >= LOG_DEBUG) {
-   log(LOG_DEBUG,"pf: pf_map_addr: "
-   "selected address ");
-   pf_print_host(naddr, 0, af);
-   addlog(". Failed to increase count!\n");
-   }
-   return (1);
-   }
-   } else if (rpool->addr.type == PF_ADDR_DYNIFTL) {
-   if (pfr_states_increase(rpool->addr.p.dyn->pfid_kt,
-   naddr, af) == -1) {
-   if (pf_status.debug >= LOG_DEBUG) {
-   log(LOG_DEBUG, "pf: pf_map_addr: "
-   "selected address ");
-   pf_print_host(naddr, 0, af);
-   addlog(". Failed to increase count!\n");
-   }
-   return (1);
-   }
-   }
+   if (pf_map_addr_states_increase(af, rpool, naddr) == -1)
+   return (1);
+   /* revert the kif which was set by pfr_pool_get() */
+   rpool->kif = kif;
break;
}
 
@@ -642,6 +634,38 @@ pf_map_addr(sa_family_t af, struct pf_ru
addlog("\n");
}
 
+   return (0);
+}
+
+int
+pf_map_addr_states_increase(sa_family_t af, struct pf_pool *rpool,
+struct pf_addr *naddr)
+{
+   if (rpool->addr.type == PF_ADDR_TABLE) {
+   if (pfr_states_increase(rpool->addr.p.tbl,
+   naddr, af) == -1) {
+ 

pf: route-to {random,srchash} in an anchor

2020-07-23 Thread YASUOKA Masahiko
Hi,

Last month, I fixed the problem "route-to least-state" in an anchor
didn't work.

https://marc.info/?t=15911745782=1=2

I noticed the same problem happens on "random" and "srchash" as well.

ok?

Use the table on root always if current table is not active.

Index: sys/net/pf_lb.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/pf_lb.c,v
retrieving revision 1.64
diff -u -p -r1.64 pf_lb.c
--- sys/net/pf_lb.c 2 Jul 2019 09:04:53 -   1.64
+++ sys/net/pf_lb.c 23 Jul 2020 10:45:48 -
@@ -345,6 +345,7 @@ pf_map_addr(sa_family_t af, struct pf_ru
struct pf_addr   faddr;
struct pf_addr  *raddr = >addr.v.a.addr;
struct pf_addr  *rmask = >addr.v.a.mask;
+   struct pfr_ktable   *kt;
u_int64_tstates;
u_int16_tweight;
u_int64_tload;
@@ -396,18 +397,17 @@ pf_map_addr(sa_family_t af, struct pf_ru
pf_poolmask(naddr, raddr, rmask, saddr, af);
break;
case PF_POOL_RANDOM:
-   if (rpool->addr.type == PF_ADDR_TABLE) {
-   cnt = rpool->addr.p.tbl->pfrkt_cnt;
-   if (cnt == 0)
-   rpool->tblidx = 0;
+   if (rpool->addr.type == PF_ADDR_TABLE ||
+   rpool->addr.type == PF_ADDR_DYNIFTL) {
+   if (rpool->addr.type == PF_ADDR_TABLE)
+   kt = rpool->addr.p.tbl;
else
-   rpool->tblidx = (int)arc4random_uniform(cnt);
-   memset(>counter, 0, sizeof(rpool->counter));
-   if (pfr_pool_get(rpool, , , af))
+   kt = rpool->addr.p.dyn->pfid_kt;
+   kt = pfr_ktable_select_active(kt);
+   if (!kt)
return (1);
-   pf_addrcpy(naddr, >counter, af);
-   } else if (rpool->addr.type == PF_ADDR_DYNIFTL) {
-   cnt = rpool->addr.p.dyn->pfid_kt->pfrkt_cnt;
+
+   cnt = kt->pfrkt_cnt;
if (cnt == 0)
rpool->tblidx = 0;
else
@@ -453,18 +453,18 @@ pf_map_addr(sa_family_t af, struct pf_ru
case PF_POOL_SRCHASH:
hashidx =
pf_hash(saddr, (struct pf_addr *), >key, af);
-   if (rpool->addr.type == PF_ADDR_TABLE) {
-   cnt = rpool->addr.p.tbl->pfrkt_cnt;
-   if (cnt == 0)
-   rpool->tblidx = 0;
+
+   if (rpool->addr.type == PF_ADDR_TABLE ||
+   rpool->addr.type == PF_ADDR_DYNIFTL) {
+   if (rpool->addr.type == PF_ADDR_TABLE)
+   kt = rpool->addr.p.tbl;
else
-   rpool->tblidx = (int)(hashidx % cnt);
-   memset(>counter, 0, sizeof(rpool->counter));
-   if (pfr_pool_get(rpool, , , af))
+   kt = rpool->addr.p.dyn->pfid_kt;
+   kt = pfr_ktable_select_active(kt);
+   if (!kt)
return (1);
-   pf_addrcpy(naddr, >counter, af);
-   } else if (rpool->addr.type == PF_ADDR_DYNIFTL) {
-   cnt = rpool->addr.p.dyn->pfid_kt->pfrkt_cnt;
+
+   cnt = kt->pfrkt_cnt;
if (cnt == 0)
rpool->tblidx = 0;
else
Index: sys/net/pf_table.c
===
RCS file: /disk/cvs/openbsd/src/sys/net/pf_table.c,v
retrieving revision 1.133
diff -u -p -r1.133 pf_table.c
--- sys/net/pf_table.c  24 Jun 2020 22:03:43 -  1.133
+++ sys/net/pf_table.c  23 Jul 2020 10:45:48 -
@@ -2108,9 +2108,8 @@ pfr_kentry_byaddr(struct pfr_ktable *kt,
struct sockaddr_in6  tmp6;
 #endif /* INET6 */
 
-   if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
-   kt = kt->pfrkt_root;
-   if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+   kt = pfr_ktable_select_active(kt);
+   if (!kt)
return (0);
 
switch (af) {
@@ -2153,9 +2152,8 @@ pfr_update_stats(struct pfr_ktable *kt, 
int  dir_idx = (pd->dir == PF_OUT);
int  op_idx;
 
-   if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
-   kt = kt->pfrkt_root;
-   if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+   kt = pfr_ktable_select_active(kt);
+   if (!kt)
return;
 
switch (af) {
@@ -2308,9 +2306,8 @@ pfr_pool_get(struct pf_pool *rpool, stru
 

Re: receive interfacez for carp when real mac is used

2020-07-22 Thread YASUOKA Masahiko
The problem I was to fix had been fixed by dlg@'s commit today.

  https://marc.info/?l=openbsd-cvs=159538265604770=2

So the diff is not needed any more.  Pointed out by dlg@.

Thanks,

On Wed, 22 Jul 2020 19:24:32 +0900 (JST)
YASUOKA Masahiko  wrote:
> Hi,
> 
> Currently when using the real mac address for carp(4) interface, all
> packets are treated as their receive interface is carp.  This causes
> some problems.
> 
> For example, IPv6 ndp doesn't work on an interface which is used for
> carpdev.  Because it is assumed that reply packets are received with
> the same interface which is used to send out the request.
> 
> ok?
> 
> When realmac is used for carp(4), don't pass the packets through the
> interface since they are for the real interface.
> 
> Index: sys/netinet/ip_carp.c
> ===
> RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_carp.c,v
> retrieving revision 1.345
> diff -u -p -r1.345 ip_carp.c
> --- sys/netinet/ip_carp.c 21 May 2020 05:24:59 -  1.345
> +++ sys/netinet/ip_carp.c 22 Jul 2020 09:52:20 -
> @@ -1418,6 +1418,14 @@ carp_input(struct ifnet *ifp0, struct mb
>   }
>   m_tag_prepend(m, mtag);
>   }
> +
> + /*
> +  * When carp is using realmac, since the matched MAC
> +  * address is for the real interface, the packets are
> +  * not for the carp interface.
> +  */
> + if (sc->sc_realmac)
> + sc = NULL;
>   break;
>   }
>   }
> 



receive interfacez for carp when real mac is used

2020-07-22 Thread YASUOKA Masahiko
Hi,

Currently, when using the real MAC address for a carp(4) interface, all
packets are treated as if their receive interface is carp.  This causes
some problems.

For example, IPv6 ndp doesn't work on an interface which is used for
carpdev.  Because it is assumed that reply packets are received with
the same interface which is used to send out the request.

ok?

When realmac is used for carp(4), don't pass the packets through the
interface since they are for the real interface.

Index: sys/netinet/ip_carp.c
===
RCS file: /disk/cvs/openbsd/src/sys/netinet/ip_carp.c,v
retrieving revision 1.345
diff -u -p -r1.345 ip_carp.c
--- sys/netinet/ip_carp.c   21 May 2020 05:24:59 -  1.345
+++ sys/netinet/ip_carp.c   22 Jul 2020 09:52:20 -
@@ -1418,6 +1418,14 @@ carp_input(struct ifnet *ifp0, struct mb
}
m_tag_prepend(m, mtag);
}
+
+   /*
+* When carp is using realmac, since the matched MAC
+* address is for the real interface, the packets are
+* not for the carp interface.
+*/
+   if (sc->sc_realmac)
+   sc = NULL;
break;
}
}



Re: route add ::/0 ...

2020-07-06 Thread YASUOKA Masahiko
Let me update the diff.

On Mon, 06 Jul 2020 17:54:30 +0900 (JST)
YASUOKA Masahiko  wrote:
> On Tue, 30 Jun 2020 02:42:02 +0200
> Klemens Nanni  wrote:
>> On Tue, Jun 30, 2020 at 09:00:30AM +0900, YASUOKA Masahiko wrote:
>>> inet_makenetandmask() had required another treatment.
>>> 
>>> Also -prefixlen 0 for -inet has a bug
>>> 
>>>  % doas ./obj/route -T100 add -inet 0.0.0.0 -prefixlen 0 127.0.0.1
>>>  add net 0.0.0.0: gateway 127.0.0.1
>>>  % netstat -nrf inet -T 100
>>>  Routing tables
>>> 
>>>  Internet:
>>>  DestinationGatewayFlags   Refs  Use   Mtu  Prio 
>>> Iface
>>>  0.0.0.0/32 127.0.0.1  UGS00 32768 8 
>>> lo100
>>> 
>>> /0 becomes /32.  The diff following also fixes the problem.
>> Yes, this looks correct to me;  regress is also happy (again).
>> 
>> OK kn
> 
> Thanks,
> 
> I'm  going to commit the diff.  ok or comments, are still welcome.
> 
> 
> Stop using make_addr() which trims trailing zeros of the netmask, set
> family and length field.  This fixes route(8) to handle "::/0"
> properly.  Also fix "route add -inet 0.0.0.0 -prefixlen 0 (gateway)"
> to work properly.
> 
> Index: sbin/route/route.c
> ===
> RCS file: /cvs/src/sbin/route/route.c,v
> retrieving revision 1.247
> diff -u -p -r1.247 route.c
> --- sbin/route/route.c15 Jan 2020 10:26:25 -  1.247
> +++ sbin/route/route.c6 Jul 2020 08:45:06 -
(snip)
> @@ -781,12 +780,9 @@ inet_makenetandmask(u_int32_t net, struc
>   sin->sin_addr.s_addr = htonl(net);
>   sin = _mask.sin;
>   sin->sin_addr.s_addr = htonl(mask);
> - sin->sin_len = 0;
> - sin->sin_family = 0;
> + sin->sin_family = AF_INET;
>   cp = (char *)(>sin_addr + 1);
> - while (*--cp == '\0' && cp > (char *)sin)
> - continue;
> - sin->sin_len = 1 + cp - (char *)sin;
> + sin->sin_len = sizeof(struct sockaddr_in);
>  }
>  
>  /*

"cp" becomes unused.  The updated diff removes "cp" as well.

Index: sbin/route/route.c
===
RCS file: /cvs/src/sbin/route/route.c,v
retrieving revision 1.247
diff -u -p -r1.247 route.c
--- sbin/route/route.c  15 Jan 2020 10:26:25 -  1.247
+++ sbin/route/route.c  6 Jul 2020 08:57:25 -
@@ -107,7 +107,6 @@ void print_rtmsg(struct rt_msghdr *, in
 voidpmsg_common(struct rt_msghdr *);
 voidpmsg_addrs(char *, int);
 voidbprintf(FILE *, int, char *);
-voidmask_addr(union sockunion *, union sockunion *, int);
 int getaddr(int, int, char *, struct hostent **);
 voidgetmplslabel(char *, int);
 int rtmsg(int, int, int, uint8_t);
@@ -767,7 +766,6 @@ void
 inet_makenetandmask(u_int32_t net, struct sockaddr_in *sin, int bits)
 {
u_int32_t mask;
-   char *cp;
 
rtm_addrs |= RTA_NETMASK;
if (bits == 0 && net == 0)
@@ -781,12 +779,8 @@ inet_makenetandmask(u_int32_t net, struc
sin->sin_addr.s_addr = htonl(net);
sin = _mask.sin;
sin->sin_addr.s_addr = htonl(mask);
-   sin->sin_len = 0;
-   sin->sin_family = 0;
-   cp = (char *)(>sin_addr + 1);
-   while (*--cp == '\0' && cp > (char *)sin)
-   continue;
-   sin->sin_len = 1 + cp - (char *)sin;
+   sin->sin_family = AF_INET;
+   sin->sin_len = sizeof(struct sockaddr_in);
 }
 
 /*
@@ -1001,7 +995,8 @@ prefixlen(int af, char *s)
memset(_mask, 0, sizeof(so_mask));
so_mask.sin.sin_family = AF_INET;
so_mask.sin.sin_len = sizeof(struct sockaddr_in);
-   so_mask.sin.sin_addr.s_addr = htonl(0x << (32 - len));
+   if (len != 0)
+   so_mask.sin.sin_addr.s_addr = htonl(0x << (32 - 
len));
break;
case AF_INET6:
so_mask.sin6.sin6_family = AF_INET6;
@@ -1088,8 +1083,6 @@ rtmsg(int cmd, int flags, int fmask, uin
rtm.rtm_mpls = mpls_flags;
rtm.rtm_hdrlen = sizeof(rtm);
 
-   if (rtm_addrs & RTA_NETMASK)
-   mask_addr(_dst, _mask, RTA_DST);
/* store addresses in ascending order of RTA values */
NEXTADDR(RTA_DST, so_dst);
NEXTADDR(RTA_GATEWAY, so_gate);
@@ -1118,34 +,6 @@ rtmsg(int cmd, int flags, int fmask, uin
}
 #undef rtm
return (0);
-}
-
-void
-mask_addr(union sockunion *addr, union sockunion *mask, int which)
-{
-   int olen = mask->sa.sa_len;
-   char *cp1 = olen + (char *)mask, *cp2;
-
-   for (mask-&g

Re: route add ::/0 ...

2020-07-06 Thread YASUOKA Masahiko


On Tue, 30 Jun 2020 02:42:02 +0200
Klemens Nanni  wrote:
> On Tue, Jun 30, 2020 at 09:00:30AM +0900, YASUOKA Masahiko wrote:
>> inet_makenetandmask() had required another treatment.
>> 
>> Also -prefixlen 0 for -inet has a bug
>> 
>>  % doas ./obj/route -T100 add -inet 0.0.0.0 -prefixlen 0 127.0.0.1
>>  add net 0.0.0.0: gateway 127.0.0.1
>>  % netstat -nrf inet -T 100
>>  Routing tables
>> 
>>  Internet:
>>  DestinationGatewayFlags   Refs  Use   Mtu  Prio 
>> Iface
>>  0.0.0.0/32 127.0.0.1  UGS00 32768 8 
>> lo100
>> 
>> /0 becomes /32.  The diff following also fixes the problem.
> Yes, this looks correct to me;  regress is also happy (again).
> 
> OK kn

Thanks,

I'm going to commit the diff.  OKs or comments are still welcome.


Stop using mask_addr(), which trims trailing zeros of the netmask, and set
the family and length fields.  This fixes route(8) to handle "::/0"
properly.  Also fix "route add -inet 0.0.0.0 -prefixlen 0 (gateway)"
to work properly.

Index: sbin/route/route.c
===
RCS file: /cvs/src/sbin/route/route.c,v
retrieving revision 1.247
diff -u -p -r1.247 route.c
--- sbin/route/route.c  15 Jan 2020 10:26:25 -  1.247
+++ sbin/route/route.c  6 Jul 2020 08:45:06 -
@@ -107,7 +107,6 @@ void print_rtmsg(struct rt_msghdr *, in
 voidpmsg_common(struct rt_msghdr *);
 voidpmsg_addrs(char *, int);
 voidbprintf(FILE *, int, char *);
-voidmask_addr(union sockunion *, union sockunion *, int);
 int getaddr(int, int, char *, struct hostent **);
 voidgetmplslabel(char *, int);
 int rtmsg(int, int, int, uint8_t);
@@ -781,12 +780,9 @@ inet_makenetandmask(u_int32_t net, struc
sin->sin_addr.s_addr = htonl(net);
sin = _mask.sin;
sin->sin_addr.s_addr = htonl(mask);
-   sin->sin_len = 0;
-   sin->sin_family = 0;
+   sin->sin_family = AF_INET;
cp = (char *)(>sin_addr + 1);
-   while (*--cp == '\0' && cp > (char *)sin)
-   continue;
-   sin->sin_len = 1 + cp - (char *)sin;
+   sin->sin_len = sizeof(struct sockaddr_in);
 }
 
 /*
@@ -1001,7 +997,8 @@ prefixlen(int af, char *s)
memset(_mask, 0, sizeof(so_mask));
so_mask.sin.sin_family = AF_INET;
so_mask.sin.sin_len = sizeof(struct sockaddr_in);
-   so_mask.sin.sin_addr.s_addr = htonl(0x << (32 - len));
+   if (len != 0)
+   so_mask.sin.sin_addr.s_addr = htonl(0x << (32 - 
len));
break;
case AF_INET6:
so_mask.sin6.sin6_family = AF_INET6;
@@ -1088,8 +1085,6 @@ rtmsg(int cmd, int flags, int fmask, uin
rtm.rtm_mpls = mpls_flags;
rtm.rtm_hdrlen = sizeof(rtm);
 
-   if (rtm_addrs & RTA_NETMASK)
-   mask_addr(_dst, _mask, RTA_DST);
/* store addresses in ascending order of RTA values */
NEXTADDR(RTA_DST, so_dst);
NEXTADDR(RTA_GATEWAY, so_gate);
@@ -1118,34 +1113,6 @@ rtmsg(int cmd, int flags, int fmask, uin
}
 #undef rtm
return (0);
-}
-
-void
-mask_addr(union sockunion *addr, union sockunion *mask, int which)
-{
-   int olen = mask->sa.sa_len;
-   char *cp1 = olen + (char *)mask, *cp2;
-
-   for (mask->sa.sa_len = 0; cp1 > (char *)mask; )
-   if (*--cp1 != '\0') {
-   mask->sa.sa_len = 1 + cp1 - (char *)mask;
-   break;
-   }
-   if ((rtm_addrs & which) == 0)
-   return;
-   switch (addr->sa.sa_family) {
-   case AF_INET:
-   case AF_INET6:
-   case AF_UNSPEC:
-   return;
-   }
-   cp1 = mask->sa.sa_len + 1 + (char *)addr;
-   cp2 = addr->sa.sa_len + 1 + (char *)addr;
-   while (cp2 > cp1)
-   *--cp2 = '\0';
-   cp2 = mask->sa.sa_len + 1 + (char *)mask;
-   while (cp1 > addr->sa.sa_data)
-   *--cp1 &= *--cp2;
 }
 
 char *msgtypes[] = {



Re: route add ::/0 ...

2020-06-29 Thread YASUOKA Masahiko
On Mon, 29 Jun 2020 19:18:17 +0200
Klemens Nanni  wrote:
> On Mon, Jun 29, 2020 at 11:55:10PM +0900, YASUOKA Masahiko wrote:
>> The function mask_addr() doesn't mask address for IPv4 and IPv6.  Does
>> any address family other than IPv4 or IPv6 require #1142:1148?  The
>> function seems to just trim the trailing zero.  Is it neccesaary?  And
>> it causes the confusion on the kernel.  How about deleting
>> mask_addr()?
>> 
>> The diff following also fixes the problem.
> Removing it breaks IPv4 default routes:
> 
>   # ifconfig lo1 rdomain 1 127.1.1.1
>   # ./obj/route -nT1 add 0.0.0.0/0 127.1.1.1
>   add net 0.0.0.0/0: gateway 127.1.1.1: Invalid argument
>   # route -nT1 add 0.0.0.0/0 127.1.1.1  
>   add net 0.0.0.0/0: gateway 127.1.1.1

Thanks,

inet_makenetandmask() required additional treatment.

Also -prefixlen 0 for -inet has a bug

 % doas ./obj/route -T100 add -inet 0.0.0.0 -prefixlen 0 127.0.0.1
 add net 0.0.0.0: gateway 127.0.0.1
 % netstat -nrf inet -T 100
 Routing tables

 Internet:
 DestinationGatewayFlags   Refs  Use   Mtu  Prio Iface
 0.0.0.0/32 127.0.0.1  UGS00 32768 8 lo100

/0 becomes /32.  The diff following also fixes the problem.


diff --git a/sbin/route/route.c b/sbin/route/route.c
index 9e43d8e89b6..532a918148d 100644
--- a/sbin/route/route.c
+++ b/sbin/route/route.c
@@ -107,7 +107,6 @@ void print_rtmsg(struct rt_msghdr *, int);
 voidpmsg_common(struct rt_msghdr *);
 voidpmsg_addrs(char *, int);
 voidbprintf(FILE *, int, char *);
-voidmask_addr(union sockunion *, union sockunion *, int);
 int getaddr(int, int, char *, struct hostent **);
 voidgetmplslabel(char *, int);
 int rtmsg(int, int, int, uint8_t);
@@ -781,12 +780,9 @@ inet_makenetandmask(u_int32_t net, struct sockaddr_in 
*sin, int bits)
sin->sin_addr.s_addr = htonl(net);
sin = _mask.sin;
sin->sin_addr.s_addr = htonl(mask);
-   sin->sin_len = 0;
-   sin->sin_family = 0;
+   sin->sin_family = AF_INET;
cp = (char *)(>sin_addr + 1);
-   while (*--cp == '\0' && cp > (char *)sin)
-   continue;
-   sin->sin_len = 1 + cp - (char *)sin;
+   sin->sin_len = sizeof(struct sockaddr_in);
 }
 
 /*
@@ -1001,7 +997,8 @@ prefixlen(int af, char *s)
memset(_mask, 0, sizeof(so_mask));
so_mask.sin.sin_family = AF_INET;
so_mask.sin.sin_len = sizeof(struct sockaddr_in);
-   so_mask.sin.sin_addr.s_addr = htonl(0x << (32 - len));
+   if (len != 0)
+   so_mask.sin.sin_addr.s_addr = htonl(0x << (32 - 
len));
break;
case AF_INET6:
so_mask.sin6.sin6_family = AF_INET6;
@@ -1088,8 +1085,6 @@ rtmsg(int cmd, int flags, int fmask, uint8_t prio)
rtm.rtm_mpls = mpls_flags;
rtm.rtm_hdrlen = sizeof(rtm);
 
-   if (rtm_addrs & RTA_NETMASK)
-   mask_addr(_dst, _mask, RTA_DST);
/* store addresses in ascending order of RTA values */
NEXTADDR(RTA_DST, so_dst);
NEXTADDR(RTA_GATEWAY, so_gate);
@@ -1120,34 +1115,6 @@ rtmsg(int cmd, int flags, int fmask, uint8_t prio)
return (0);
 }
 
-void
-mask_addr(union sockunion *addr, union sockunion *mask, int which)
-{
-   int olen = mask->sa.sa_len;
-   char *cp1 = olen + (char *)mask, *cp2;
-
-   for (mask->sa.sa_len = 0; cp1 > (char *)mask; )
-   if (*--cp1 != '\0') {
-   mask->sa.sa_len = 1 + cp1 - (char *)mask;
-   break;
-   }
-   if ((rtm_addrs & which) == 0)
-   return;
-   switch (addr->sa.sa_family) {
-   case AF_INET:
-   case AF_INET6:
-   case AF_UNSPEC:
-   return;
-   }
-   cp1 = mask->sa.sa_len + 1 + (char *)addr;
-   cp2 = addr->sa.sa_len + 1 + (char *)addr;
-   while (cp2 > cp1)
-   *--cp2 = '\0';
-   cp2 = mask->sa.sa_len + 1 + (char *)mask;
-   while (cp1 > addr->sa.sa_data)
-   *--cp1 &= *--cp2;
-}
-
 char *msgtypes[] = {
"",
"RTM_ADD: Add Route",



Re: route add ::/0 ...

2020-06-29 Thread YASUOKA Masahiko
On Mon, 29 Jun 2020 18:45:07 +0900 (JST)
YASUOKA Masahiko  wrote:
> On Mon, 29 Jun 2020 10:12:23 +0200
> Martin Pieuchot  wrote:
>> On 28/06/20(Sun) 20:41, YASUOKA Masahiko wrote:
>>> Hi,
>>> 
>>> When "::/0" is used as "default",
>>> 
>>>   # route add ::/0 fe80::1%em0
>>>   add net ::/0: gateway fe80::1%em0: Invalid argument
>>> 
>>> route command trims the sockaddr to { .len = 2, .family = AF_INET6 }
>>> for "::/0", but rtable_satoplen() refuses it.  I think it should be
>>> accepted.
>> 
>> rtable_satoplen() is used in many places, not just in the socket parsing
>> code used by route(8).  I don't know what side effects can be introduced
>> by this change.
>> 
>> Why is IPv6 different from IPv4 when it comes to the default route?
> 
> Diferent functions are used.  route(8) uses inet_makenetandmask() to
> create a sockaddr for IPv4 prefix length and uses prefixlen() for IPv6
> prefix length.  "/0" results:
> 
> IPv4
>   { .len = 1, .family = 0, ... }
> IPv6 
>   { .len = 2, .family = AF_INET6, ... }

I'm sorry this is not correct.  It is actually

IPv6 
  { .len = 28, .family = AF_INET6, ... }

> Next, route(8) uses mask_addr() to trim the tailing zeros.
> 
> 1129 void
> 1130 mask_addr(union sockunion *addr, union sockunion *mask, int which)
> 1131 {
> 1132 int olen = mask->sa.sa_len;
> 1133 char *cp1 = olen + (char *)mask, *cp2;
> 1134 
> 1135 for (mask->sa.sa_len = 0; cp1 > (char *)mask; )
> 1136 if (*--cp1 != '\0') {
> 1137 mask->sa.sa_len = 1 + cp1 - (char *)mask;
> 1138 break;
> 1139 }
> 
> See #1135 carefully.  As the results, the sockaddrs become:
> 
> IPv4
>   { .len = 0, .family = 0, ... }
> IPv6
>   { .len = 2, .family = AF_INET6, ... }

I'm starting to wonder what mask_addr() is for.

   1123 void
   1124 mask_addr(union sockunion *addr, union sockunion *mask, int which)
   1125 {
   1126 int olen = mask->sa.sa_len;
   1127 char *cp1 = olen + (char *)mask, *cp2;
   1128 
   1129 for (mask->sa.sa_len = 0; cp1 > (char *)mask; )
   1130 if (*--cp1 != '\0') {
   1131 mask->sa.sa_len = 1 + cp1 - (char *)mask;
   1132 break;
   1133 }
   1134 if ((rtm_addrs & which) == 0)
   1135 return;
   1136 switch (addr->sa.sa_family) {
   1137 case AF_INET:
   1138 case AF_INET6:
   1139 case AF_UNSPEC:
   1140 return;
   1141 }
   1142 cp1 = mask->sa.sa_len + 1 + (char *)addr;
   1143 cp2 = addr->sa.sa_len + 1 + (char *)addr;
   1144 while (cp2 > cp1)
   1145 *--cp2 = '\0';
   1146 cp2 = mask->sa.sa_len + 1 + (char *)mask;
   1147 while (cp1 > addr->sa.sa_data)
   1148 *--cp1 &= *--cp2;
   1149 }

The function mask_addr() doesn't mask the address for IPv4 and IPv6.  Does
any address family other than IPv4 or IPv6 require #1142:1148?  The
function seems to just trim the trailing zeros.  Is it necessary?  And
it causes confusion in the kernel.  How about deleting
mask_addr()?

The diff following also fixes the problem.

diff --git a/sbin/route/route.c b/sbin/route/route.c
index 9e43d8e89b6..87f26e5c1e7 100644
--- a/sbin/route/route.c
+++ b/sbin/route/route.c
@@ -107,7 +107,6 @@ void print_rtmsg(struct rt_msghdr *, int);
 voidpmsg_common(struct rt_msghdr *);
 voidpmsg_addrs(char *, int);
 voidbprintf(FILE *, int, char *);
-voidmask_addr(union sockunion *, union sockunion *, int);
 int getaddr(int, int, char *, struct hostent **);
 voidgetmplslabel(char *, int);
 int rtmsg(int, int, int, uint8_t);
@@ -1088,8 +1087,6 @@ rtmsg(int cmd, int flags, int fmask, uint8_t prio)
rtm.rtm_mpls = mpls_flags;
rtm.rtm_hdrlen = sizeof(rtm);
 
-   if (rtm_addrs & RTA_NETMASK)
-   mask_addr(_dst, _mask, RTA_DST);
/* store addresses in ascending order of RTA values */
NEXTADDR(RTA_DST, so_dst);
NEXTADDR(RTA_GATEWAY, so_gate);
@@ -1120,34 +1117,6 @@ rtmsg(int cmd, int flags, int fmask, uint8_t prio)
return (0);
 }
 
-void
-mask_addr(union sockunion *addr, union sockunion *mask, int which)
-{
-   int olen = mask->sa.sa_len;
-   char *cp1 = olen + (char *)mask, *cp2;
-
-   for (mask->sa.sa_len = 0; cp1 > (char *)mask; )
-   if (*--cp1 != '\0') {
-   mask->sa.sa_len = 1 + cp1 - (char *)mask;
-   break;
-   }
-   if ((rtm_addrs & which) == 0)
-   return;
-

Re: route add ::/0 ...

2020-06-29 Thread YASUOKA Masahiko
Hi,

On Mon, 29 Jun 2020 10:12:23 +0200
Martin Pieuchot  wrote:
> On 28/06/20(Sun) 20:41, YASUOKA Masahiko wrote:
>> Hi,
>> 
>> When "::/0" is used as "default",
>> 
>>   # route add ::/0 fe80::1%em0
>>   add net ::/0: gateway fe80::1%em0: Invalid argument
>> 
>> route command trims the sockaddr to { .len = 2, .family = AF_INET6 }
>> for "::/0", but rtable_satoplen() refuses it.  I think it should be
>> accepted.
> 
> rtable_satoplen() is used in many places, not just in the socket parsing
> code used by route(8).  I don't know what side effects can be introduced
> by this change.
> 
> Why is IPv6 different from IPv4 when it comes to the default route?

Different functions are used.  route(8) uses inet_makenetandmask() to
create a sockaddr for an IPv4 prefix length and uses prefixlen() for an IPv6
prefix length.  "/0" results in:

IPv4
  { .len = 1, .family = 0, ... }
IPv6 
  { .len = 2, .family = AF_INET6, ... }

Next, route(8) uses mask_addr() to trim the trailing zeros.

1129 void
1130 mask_addr(union sockunion *addr, union sockunion *mask, int which)
1131 {
1132 int olen = mask->sa.sa_len;
1133 char *cp1 = olen + (char *)mask, *cp2;
1134 
1135 for (mask->sa.sa_len = 0; cp1 > (char *)mask; )
1136 if (*--cp1 != '\0') {
1137 mask->sa.sa_len = 1 + cp1 - (char *)mask;
1138 break;
1139 }

See #1135 carefully.  As a result, the sockaddrs become:

IPv4
  { .len = 0, .family = 0, ... }
IPv6
  { .len = 2, .family = AF_INET6, ... }

Yes, we can fix the IPv6 case to have .len = 0 as well.

But I thought the kernel should accept both cases, since the
representation for IPv6 didn't seem so bad to me.

> Shouldn't we change route(8) to have a `sa_len' of 0?
> 
> That would make the following true:
> 
> mlen = mask->sa_len;
> 
>   /* Default route */
>   if (mlen == 0)
>   return (0)
> 
>> Allow sockaddr for prefix length be trimmed before the key(address)
>> field.  Actually "route" command trims at the address family field for
>> "::/0"
>> 
>> Index: sys/net/rtable.c
>> ===
>> RCS file: /cvs/src/sys/net/rtable.c,v
>> retrieving revision 1.69
>> diff -u -p -r1.69 rtable.c
>> --- sys/net/rtable.c 21 Jun 2019 17:11:42 -  1.69
>> +++ sys/net/rtable.c 28 Jun 2020 11:30:54 -
>> @@ -887,8 +887,8 @@ rtable_satoplen(sa_family_t af, struct s
>>  
>>  ap = (uint8_t *)((uint8_t *)mask) + dp->dom_rtoffset;
>>  ep = (uint8_t *)((uint8_t *)mask) + mlen;
>> -if (ap > ep)
>> -return (-1);
>> +if (ap >= ep)
>> +return (0);
> 
> That means the kernel now silently ignore sockaddr short `sa_len'. Are
> they supposed to be supported or are they symptoms of bugs?

I missed that rtable_satoplen() is used by other functions.

> 
>>  /* Trim trailing zeroes. */
>>  while (ap < ep && ep[-1] == 0)
> 



route add ::/0 ...

2020-06-28 Thread YASUOKA Masahiko
Hi,

When "::/0" is used as "default",

  # route add ::/0 fe80::1%em0
  add net ::/0: gateway fe80::1%em0: Invalid argument

route command trims the sockaddr to { .len = 2, .family = AF_INET6 }
for "::/0", but rtable_satoplen() refuses it.  I think it should be
accepted.

ok?

Allow the sockaddr for the prefix length to be trimmed before the key
(address) field.  Actually, the "route" command trims at the address family
field for "::/0".

Index: sys/net/rtable.c
===
RCS file: /cvs/src/sys/net/rtable.c,v
retrieving revision 1.69
diff -u -p -r1.69 rtable.c
--- sys/net/rtable.c21 Jun 2019 17:11:42 -  1.69
+++ sys/net/rtable.c28 Jun 2020 11:30:54 -
@@ -887,8 +887,8 @@ rtable_satoplen(sa_family_t af, struct s
 
ap = (uint8_t *)((uint8_t *)mask) + dp->dom_rtoffset;
ep = (uint8_t *)((uint8_t *)mask) + mlen;
-   if (ap > ep)
-   return (-1);
+   if (ap >= ep)
+   return (0);
 
/* Trim trailing zeroes. */
while (ap < ep && ep[-1] == 0)



Re: pipex(4): prevent `state_list' corruption

2020-06-22 Thread YASUOKA Masahiko
Yes, this seems right.

ok yasuoka

On Thu, 18 Jun 2020 23:53:25 +0300
Vitaliy Makkoveev  wrote:
> While pppac(4) destroy sessions by pipex_iface_fini() or by
> pipex_ioctl() with PIPEXSMODE command, some sessions can be linked to
> `state_list'. This case is not checked and sessions will never be
> unlinked and `state_list' will be broken after session's memory freeing.
> 
> Diff below adds session removal from `state_list' in
> pipex_unlink_session(). Also unlinked session `state' sets to
> PIPEX_STATE_CLOSED like pipex_close_session() does.
> 
> Index: sys/net/pipex.c
> ===
> RCS file: /cvs/src/sys/net/pipex.c,v
> retrieving revision 1.115
> diff -u -p -r1.115 pipex.c
> --- sys/net/pipex.c   18 Jun 2020 14:20:12 -  1.115
> +++ sys/net/pipex.c   18 Jun 2020 16:37:44 -
> @@ -473,8 +473,10 @@ pipex_unlink_session(struct pipex_sessio
>   break;
>   }
>  #endif
> -
> + if (session->state == PIPEX_STATE_CLOSE_WAIT)
> + LIST_REMOVE(session, state_list);
>   LIST_REMOVE(session, session_list);
> + session->state = PIPEX_STATE_CLOSED;
>  
>   /* if final session is destroyed, stop timer */
>   if (LIST_EMPTY(_session_list))
> 



Re: install npppd.conf with mode 0600

2020-06-21 Thread YASUOKA Masahiko
The line in etc/mtree/special should be updated as well.

  npppd.conf  type=file mode=0640 uname=root gname=wheel

other than that, ok yasuoka

On Sun, 21 Jun 2020 16:48:44 +0300
Vitaliy Makkoveev  wrote:
> We installing `npppd-users' with uid:gid root:wheel and mode 0600
> because it consists sensitive data but mode for npppd.conf is 0640.
> npppd.conf can also have radius passwords and nothing requires to allow
> it be readable by group. So set it's permissions to 0600.
> 
> Index: usr.sbin/npppd/Makefile
> ===
> RCS file: /cvs/src/usr.sbin/npppd/Makefile,v
> retrieving revision 1.6
> diff -u -p -r1.6 Makefile
> --- usr.sbin/npppd/Makefile   14 Mar 2013 16:20:46 -  1.6
> +++ usr.sbin/npppd/Makefile   21 Jun 2020 13:37:50 -
> @@ -6,7 +6,7 @@
>  SUBDIR+= npppd
>  
>  distribution:
> - ${INSTALL} -C -o root -g wheel -m 0640 ${.CURDIR}/npppd/npppd.conf \
> + ${INSTALL} -C -o root -g wheel -m 0600 ${.CURDIR}/npppd/npppd.conf \
>   ${DESTDIR}/etc/npppd/npppd.conf
>   ${INSTALL} -C -o root -g wheel -m 0600 ${.CURDIR}/npppd/npppd-users \
>   ${DESTDIR}/etc/npppd/npppd-users



Re: pf "route-to least-state" in an anchor doesn't work

2020-06-03 Thread YASUOKA Masahiko
Hello,

On Wed, 3 Jun 2020 23:30:56 +0200
Alexandr Nedvedicky  wrote:
> I'm OK with your change.

Thank you for your review and comment.

> However I would like to ask you to do yet another test.  I wonder if things
> will eventually work on unfixed PF if rules will be constructed as follows:
> 
> pfctl -a test -t LB -T add 10.0.0.11@pair102
> 
> echo 'pass in on rdomain 102 quick proto tcp to 10.0.0.101 port 8080 \
> keep state ( sloppy ) route-to  \
> least-states sticky-address' |pfctl -a test -f -
> 
> echo 'anchor test' | pfctl -f -
> 
> pfctl -e
> 
> I suspect the bug you've found and fixed happens when pfctl loads rules
> from pf.conf. I think the steps above will take a different route
> through the code, which avoids pfr_ina_define() (a.k.a. transactions).

I've tested it before the diff and after.  Both tests were OK.

  # pfctl -sr -a test   
 
  pass in quick on rdomain 102 inet proto tcp from any to 10.0.0.101 port = 
8080 flags S/SA keep state (sloppy) route-to  least-states sticky-address
  # pfctl -a test -tLB -Tshow
 10.0.0.11@pair102
  # 

  $ doas route -T 101 exec telnet 10.0.0.101 8080
  Trying 10.0.0.101...
  Connected to 10.0.0.101.
  Escape character is '^]'.
  ^]
  
  telnet> close
  Connection closed.
  $ 

> I don't have a test system readily available and I'm just curious
> if anything changes or not. Thanks for finding that for me.
> 
> As I've said I think your change should go in.
> 
> OK sashan

Thanks,



pf "route-to least-state" in an anchor doesn't work

2020-06-03 Thread YASUOKA Masahiko
Hi,

pf.conf:

  anchor {
    pass in on rdomain 102 quick proto tcp to 10.0.0.101 port 8080 \
      keep state ( sloppy ) route-to <LB> \
      least-states sticky-address
  }
  table <LB> {
    10.0.0.11@pair102
  }

this doesn't work.  All packets going to 10.0.0.101 are dropped with
'no-route'.  The problem doesn't happen if the pass rule is moved to
outside of the anchor or uses "round-robin" instead of "least-states".

In sys/net/pf_lb.c:
594 if (rpool->addr.type == PF_ADDR_TABLE) {
595 if (pfr_states_increase(rpool->addr.p.tbl,
596 naddr, af) == -1) {
597 if (pf_status.debug >= LOG_DEBUG) {
598 log(LOG_DEBUG,"pf: pf_map_addr: "
599 "selected address ");
600 pf_print_host(naddr, 0, af);
601 addlog(". Failed to increase count!\n");
602 }
603 return (1);
604 }

This chunk increases the counter for "least-states".  The packets are
dropped here because pfr_states_increase() returns -1.
pfr_states_increase() uses pfr_kentry_byaddr(), and
pfr_kentry_byaddr() uses pfr_lookup_addr() to look up a kentry in the
table.

pfr_lookup_addr() never succeeds in the above case, because it doesn't
take into account the use of global (root) tables from rules in an anchor.
All other functions which look up a kentry from the table, other than
pfr_lookup_addr(), seem to take care of that.

I thought that pfr_lookup_addr() is a local function used by ioctl to
create tables and manage their members, so the diff keeps it
untouched.  Instead, the diff replaces the body of pfr_kentry_byaddr()
with the logic of pfr_match_addr().

* * *
Test

1. prepare network

  ifconfig pair101 rdomain 101 10.0.0.1/24
  ifconfig pair102 rdomain 102 10.0.0.10/24
  ifconfig pair102 alias 10.0.0.101/24
  ifconfig pair103 rdomain 103 10.0.0.11/24
  ifconfig pair104 rdomain 100 patch pair101 up
  ifconfig pair105 rdomain 100 patch pair102 up
  ifconfig pair106 rdomain 100 patch pair103 up
  ifconfig lo103 127.0.0.1/8
  ifconfig lo103 alias 10.0.0.101/24

  ifconfig bridge100 add pair104
  ifconfig bridge100 add pair105
  ifconfig bridge100 add pair106 up

2. setup pf.conf

  anchor {
    pass in on rdomain 102 quick proto tcp to 10.0.0.101 port 8080 \
      keep state ( sloppy ) route-to <LB> \
      least-states sticky-address
  }
  table <LB> {
    10.0.0.11@pair102
  }

3. start a daemon on 8080/tcp on #103

   doas route -T 103 exec nc -l 8080

4. try to connect to it from #101

   doas route -T 101 exec telnet 10.0.0.101 8080

   - test OK if the connection is established

5. teardown

  ifconfig pair106 destroy
  ifconfig pair105 destroy
  ifconfig pair104 destroy
  ifconfig pair103 destroy
  ifconfig pair102 destroy
  ifconfig pair101 destroy
  ifconfig bridge100 destroy

* * *

ok?

Fix pfr_kentry_byaddr() so that it can be used for a rule in an anchor.  It
couldn't find an entry if its table is attached to a table on the root.
This fixes the problem that "route-to <TABLE> least-states" doesn't work.
The problem was found by IIJ.

Index: sys/net/pf_table.c
===
RCS file: /cvs/src/sys/net/pf_table.c,v
retrieving revision 1.131
diff -u -p -r1.131 pf_table.c
--- sys/net/pf_table.c  8 Jul 2019 17:49:57 -   1.131
+++ sys/net/pf_table.c  3 Jun 2020 07:21:27 -
@@ -2085,11 +2085,28 @@ int
 pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af)
 {
struct pfr_kentry   *ke = NULL;
+   int  match;
+
+   ke = pfr_kentry_byaddr(kt, a, af, 0);
+
+   match = (ke && !(ke->pfrke_flags & PFRKE_FLAG_NOT));
+   if (match)
+   kt->pfrkt_match++;
+   else
+   kt->pfrkt_nomatch++;
+
+   return (match);
+}
+
+struct pfr_kentry *
+pfr_kentry_byaddr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af,
+int exact)
+{
+   struct pfr_kentry   *ke = NULL;
struct sockaddr_in   tmp4;
 #ifdef INET6
struct sockaddr_in6  tmp6;
 #endif /* INET6 */
-   int  match;
 
if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
kt = kt->pfrkt_root;
@@ -2116,12 +2133,10 @@ pfr_match_addr(struct pfr_ktable *kt, st
default:
unhandled_af(af);
}
-   match = (ke && !(ke->pfrke_flags & PFRKE_FLAG_NOT));
-   if (match)
-   kt->pfrkt_match++;
-   else
-   kt->pfrkt_nomatch++;
-   return (match);
+   if (exact && ke && KENTRY_NETWORK(ke))
+   ke = NULL;
+
+   return (ke);
 }
 
 void
@@ -2497,39 +2512,6 @@ pfr_states_decrease(struct pfr_ktable *k
"pfr_states_decrease: states-- when states <= 0");
 

Re: diff: init efifb even if VGA is probed.

2020-05-28 Thread YASUOKA Masahiko
On Thu, 28 May 2020 12:31:31 +0200 (CEST)
Mark Kettenis  wrote:
>> Date: Thu, 28 May 2020 17:01:48 +0900 (JST)
>> From: YASUOKA Masahiko 
>> 
>> Hi,
>> 
>> I'd like to conclude this issue.
>> 
>> On Fri, 21 Feb 2020 14:09:07 +0900 (JST)
>> YASUOKA Masahiko  wrote:
>> > I am testing a new hardware, HPE DL20 Gen10.
>> > 
>> > When efiboot starts the kernel, the video display becomes distorted
>> > and never recovered until CPU reset.
>> > 
>> > Our kernel tries to initialized console twice, first trial is done
>> > before getting boot info and second trial is done after getting boot
>> > info.  Since EFI framebuffer needs "boot info", it is initialized on
>> > second trial.
>> > 
>> > On HPE DL20 Gen10, probing vga is succeeded on first trial, the kernel
>> > selects vga for the console, but actually it is broken.  On usual
>> > machines which boot with EFI, the problem doesn't happen since they
>> > have no vga.
>> 
>> If we have a way to detect whether the machine has VGA.  ACPI
>> FADT_NO_VGA was a candidate.  But that bit is cleard both on my "HPE
>> DL20 Gen10" and Andrew Daugherity's Dell PowerEdge R230.  Also the
>> problem newly posted at misc@ (*) might be the same problem.
>> 
>>  (*) https://marc.info/?l=openbsd-misc=159064773219779=2
>> 
>> I think having the diff folowing is the best for this momemnt.
>> The diff does:
>> 
>>   - move cninit() after parsing bootinfo
>>   - give up the debug message which can be enabled if BOOTINFO_DEBUG is 
>> defined
>> 
>> ok?
> 
> I suspect we have to accept that there is too much broken hardware out
> there.

Finally we might have no way other than having a configuration in
boot.conf...

> There is no real reason to drop the debug messages.

OK, the debug messages are reverted.

> I'd prefer to call cninit() directly from init_x86_64, so I'd just
> move the call immediately after the block that calls getbootinfo().
> And emove the call from getbootinfo() of course.

I think the last diff already satisfied these things.

>> @@ -1395,11 +1395,6 @@ init_x86_64(paddr_t first_avail)
>>  i8254_startclock();
>>  
>>  /*
>> - * Attach the glass console early in case we need to display a panic.
>> - */
>> -cninit();
>> -
>> -/*
>>   * Initialize PAGE_SIZE-dependent variables.
>>   */
>>  uvm_setpagesize();
>> @@ -1421,6 +1416,8 @@ init_x86_64(paddr_t first_avail)
>>  } else
>>  panic("invalid /boot");
>>  
>> +cninit();
>> +
>>  /*
>>   * Memory on the AMD64 port is described by three different things.
>>   *

A hidden line which calls getbootinfo() is just before the second
chunk.  The updated diff was created with "-U 4" to clarify this.

Alternatively, are you suggesting

getbootinfo(bootinfo, bootinfo_size);
+   cninit();
} else
panic("invalid /boot");

?

Either is OK for me.

How about this?

Index: sys/arch/amd64/amd64/machdep.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/machdep.c,v
retrieving revision 1.264
diff -u -p -U4 -r1.264 machdep.c
--- sys/arch/amd64/amd64/machdep.c  16 May 2020 14:44:44 -  1.264
+++ sys/arch/amd64/amd64/machdep.c  28 May 2020 11:34:39 -
@@ -1394,13 +1394,8 @@ init_x86_64(paddr_t first_avail)
 
i8254_startclock();
 
/*
-* Attach the glass console early in case we need to display a panic.
-*/
-   cninit();
-
-   /*
 * Initialize PAGE_SIZE-dependent variables.
 */
uvm_setpagesize();
 
@@ -1420,8 +1415,10 @@ init_x86_64(paddr_t first_avail)
getbootinfo(bootinfo, bootinfo_size);
} else
panic("invalid /boot");
 
+   cninit();
+
 /*
  * Memory on the AMD64 port is described by three different things.
  *
  * 1. biosbasemem - This is outdated, and should really only be used to
@@ -1926,10 +1923,8 @@ getbootinfo(char *bootinfo, int bootinfo
bootarg32_t *q;
bios_ddb_t *bios_ddb;
bios_bootduid_t *bios_bootduid;
bios_bootsr_t *bios_bootsr;
-   int docninit = 0;
-
 #undef BOOTINFO_DEBUG
 #ifdef BOOTINFO_DEBUG
printf("bootargv:");
 #endif
@@ -1982,11 +1977,8 @@ getbootinfo(char *bootinfo, int bootinfo
comconsunit = unit;
comconsaddr = consaddr;
comconsrate = cdp->conspeed;
 

Re: diff: init efifb even if VGA is probed.

2020-05-28 Thread YASUOKA Masahiko
On Thu, 28 May 2020 17:01:48 +0900 (JST)
YASUOKA Masahiko  wrote:
> Hi,
> 
> I'd like to conclude this issue.
> 
> On Fri, 21 Feb 2020 14:09:07 +0900 (JST)
> YASUOKA Masahiko  wrote:
>> I am testing a new hardware, HPE DL20 Gen10.
>> 
>> When efiboot starts the kernel, the video display becomes distorted
>> and never recovered until CPU reset.
>> 
>> Our kernel tries to initialize the console twice, first trial is done
>> before getting boot info and second trial is done after getting boot
>> info.  Since EFI framebuffer needs "boot info", it is initialized on
>> second trial.
>> 
>> On HPE DL20 Gen10, probing vga succeeds on the first trial, the kernel
>> selects vga for the console, but actually it is broken.  On usual
>> machines which boot with EFI, the problem doesn't happen since they
>> have no vga.
> 
> If we have a way to detect whether the machine has VGA.  ACPI
> FADT_NO_VGA was a candidate.  But that bit is cleared both on my "HPE
> DL20 Gen10" and Andrew Daugherity's Dell PowerEdge R230.  Also the
> problem newly posted at misc@ (*) might be the same problem.

Above paragraph may be unclear.  Let me update it by the following
paragraph.

If we have a way to detect whether the machine has VGA, we thought we
can select VGA or EFI framebuffer safely.  ACPI FADT_NO_VGA was a
candidate.  But the bit is cleared both on my "HPE DL20 Gen10" and
Andrew Daugherity's Dell PowerEdge R230.  Also the problem newly
posted at misc@ (*) might be the same problem.

>  (*) https://marc.info/?l=openbsd-misc=159064773219779=2
> 
> I think having the following diff is the best for this moment.
> The diff does:
> 
>   - move cninit() after parsing bootinfo
>   - give up the debug message which can be enabled if BOOTINFO_DEBUG is 
> defined
> 
> ok?
> 
> Index: sys/arch/amd64/amd64/machdep.c
> ===
> RCS file: /disk/cvs/openbsd/src/sys/arch/amd64/amd64/machdep.c,v
> retrieving revision 1.264
> diff -u -p -r1.264 machdep.c
> --- sys/arch/amd64/amd64/machdep.c16 May 2020 14:44:44 -  1.264
> +++ sys/arch/amd64/amd64/machdep.c28 May 2020 07:40:14 -
> @@ -1395,11 +1395,6 @@ init_x86_64(paddr_t first_avail)
>   i8254_startclock();
>  
>   /*
> -  * Attach the glass console early in case we need to display a panic.
> -  */
> - cninit();
> -
> - /*
>* Initialize PAGE_SIZE-dependent variables.
>*/
>   uvm_setpagesize();
> @@ -1421,6 +1416,8 @@ init_x86_64(paddr_t first_avail)
>   } else
>   panic("invalid /boot");
>  
> + cninit();
> +
>  /*
>   * Memory on the AMD64 port is described by three different things.
>   *
> @@ -1927,12 +1924,6 @@ getbootinfo(char *bootinfo, int bootinfo
>   bios_ddb_t *bios_ddb;
>   bios_bootduid_t *bios_bootduid;
>   bios_bootsr_t *bios_bootsr;
> - int docninit = 0;
> -
> -#undef BOOTINFO_DEBUG
> -#ifdef BOOTINFO_DEBUG
> - printf("bootargv:");
> -#endif
>  
>   for (q = (bootarg32_t *)bootinfo;
>   (q->ba_type != BOOTARG_END) &&
> @@ -1942,24 +1933,15 @@ getbootinfo(char *bootinfo, int bootinfo
>   switch (q->ba_type) {
>   case BOOTARG_MEMMAP:
>   bios_memmap = (bios_memmap_t *)q->ba_arg;
> -#ifdef BOOTINFO_DEBUG
> - printf(" memmap %p", bios_memmap);
> -#endif
>   break;
>   case BOOTARG_DISKINFO:
>   bios_diskinfo = (bios_diskinfo_t *)q->ba_arg;
> -#ifdef BOOTINFO_DEBUG
> - printf(" diskinfo %p", bios_diskinfo);
> -#endif
>   break;
>   case BOOTARG_APMINFO:
>   /* generated by i386 boot loader */
>   break;
>   case BOOTARG_CKSUMLEN:
>   bios_cksumlen = *(u_int32_t *)q->ba_arg;
> -#ifdef BOOTINFO_DEBUG
> - printf(" cksumlen %d", bios_cksumlen);
> -#endif
>   break;
>   case BOOTARG_PCIINFO:
>   /* generated by i386 boot loader */
> @@ -1983,15 +1965,8 @@ getbootinfo(char *bootinfo, int bootinfo
>   comconsaddr = consaddr;
>   comconsrate = cdp->conspeed;
>   comconsiot = X86_BUS_SPACE_IO;
> -
> - /* Probe the serial port this time. */
> - docninit+

Re: diff: init efifb even if VGA is probed.

2020-05-28 Thread YASUOKA Masahiko
Hi,

I'd like to conclude this issue.

On Fri, 21 Feb 2020 14:09:07 +0900 (JST)
YASUOKA Masahiko  wrote:
> I am testing a new hardware, HPE DL20 Gen10.
> 
> When efiboot starts the kernel, the video display becomes distorted
> and never recovered until CPU reset.
> 
> Our kernel tries to initialize the console twice, first trial is done
> before getting boot info and second trial is done after getting boot
> info.  Since EFI framebuffer needs "boot info", it is initialized on
> second trial.
> 
> On HPE DL20 Gen10, probing vga succeeds on the first trial, the kernel
> selects vga for the console, but actually it is broken.  On usual
> machines which boot with EFI, the problem doesn't happen since they
> have no vga.

If we have a way to detect whether the machine has VGA.  ACPI
FADT_NO_VGA was a candidate.  But that bit is cleared both on my "HPE
DL20 Gen10" and Andrew Daugherity's Dell PowerEdge R230.  Also the
problem newly posted at misc@ (*) might be the same problem.

 (*) https://marc.info/?l=openbsd-misc=159064773219779=2

I think having the following diff is the best for this moment.
The diff does:

  - move cninit() after parsing bootinfo
  - give up the debug message which can be enabled if BOOTINFO_DEBUG is defined

ok?

Index: sys/arch/amd64/amd64/machdep.c
===
RCS file: /disk/cvs/openbsd/src/sys/arch/amd64/amd64/machdep.c,v
retrieving revision 1.264
diff -u -p -r1.264 machdep.c
--- sys/arch/amd64/amd64/machdep.c  16 May 2020 14:44:44 -  1.264
+++ sys/arch/amd64/amd64/machdep.c  28 May 2020 07:40:14 -
@@ -1395,11 +1395,6 @@ init_x86_64(paddr_t first_avail)
i8254_startclock();
 
/*
-* Attach the glass console early in case we need to display a panic.
-*/
-   cninit();
-
-   /*
 * Initialize PAGE_SIZE-dependent variables.
 */
uvm_setpagesize();
@@ -1421,6 +1416,8 @@ init_x86_64(paddr_t first_avail)
} else
panic("invalid /boot");
 
+   cninit();
+
 /*
  * Memory on the AMD64 port is described by three different things.
  *
@@ -1927,12 +1924,6 @@ getbootinfo(char *bootinfo, int bootinfo
bios_ddb_t *bios_ddb;
bios_bootduid_t *bios_bootduid;
bios_bootsr_t *bios_bootsr;
-   int docninit = 0;
-
-#undef BOOTINFO_DEBUG
-#ifdef BOOTINFO_DEBUG
-   printf("bootargv:");
-#endif
 
for (q = (bootarg32_t *)bootinfo;
(q->ba_type != BOOTARG_END) &&
@@ -1942,24 +1933,15 @@ getbootinfo(char *bootinfo, int bootinfo
switch (q->ba_type) {
case BOOTARG_MEMMAP:
bios_memmap = (bios_memmap_t *)q->ba_arg;
-#ifdef BOOTINFO_DEBUG
-   printf(" memmap %p", bios_memmap);
-#endif
break;
case BOOTARG_DISKINFO:
bios_diskinfo = (bios_diskinfo_t *)q->ba_arg;
-#ifdef BOOTINFO_DEBUG
-   printf(" diskinfo %p", bios_diskinfo);
-#endif
break;
case BOOTARG_APMINFO:
/* generated by i386 boot loader */
break;
case BOOTARG_CKSUMLEN:
bios_cksumlen = *(u_int32_t *)q->ba_arg;
-#ifdef BOOTINFO_DEBUG
-   printf(" cksumlen %d", bios_cksumlen);
-#endif
break;
case BOOTARG_PCIINFO:
/* generated by i386 boot loader */
@@ -1983,15 +1965,8 @@ getbootinfo(char *bootinfo, int bootinfo
comconsaddr = consaddr;
comconsrate = cdp->conspeed;
comconsiot = X86_BUS_SPACE_IO;
-
-   /* Probe the serial port this time. */
-   docninit++;
}
 #endif
-#ifdef BOOTINFO_DEBUG
-   printf(" console 0x%x:%d",
-   cdp->consdev, cdp->conspeed);
-#endif
}
break;
case BOOTARG_BOOTMAC:
@@ -2023,8 +1998,6 @@ getbootinfo(char *bootinfo, int bootinfo
 
case BOOTARG_EFIINFO:
bios_efiinfo = (bios_efiinfo_t *)q->ba_arg;
-   if (bios_efiinfo->fb_addr != 0)
-   docninit++;
break;
 
case BOOTARG_UCODE:
@@ -2032,18 +2005,9 @@ getbootinfo(char *bootinfo, int bootinfo
break;
 
default:
-#ifdef BOOTINFO_DEBUG
-   printf(" unsupported arg (%d) %p", q->ba_type,
-   q->ba_arg);
-#endif
break;
}
}
-   if (docninit > 0)
-   cninit();
-#ifdef BOOTINFO_DEBUG
-   printf("\n");
-#endif
 }
 
 int



fix pppac(4) without pipex

2020-04-12 Thread YASUOKA Masahiko
Hi,

The following diff fixes panics when using pppac(4) with "pipex no".

Index: sys/net/if_pppx.c
===
RCS file: /cvs/src/sys/net/if_pppx.c,v
retrieving revision 1.83
diff -u -p -r1.83 if_pppx.c
--- sys/net/if_pppx.c   10 Apr 2020 07:36:52 -  1.83
+++ sys/net/if_pppx.c   12 Apr 2020 06:12:35 -
@@ -344,7 +344,7 @@ pppxwrite(dev_t dev, struct uio *uio, in
if (m == NULL)
return (ENOBUFS);
mlen = MHLEN;
-   if (uio->uio_resid >= MINCLSIZE) {
+   if (uio->uio_resid > MHLEN) {
MCLGET(m, M_DONTWAIT);
if (!(m->m_flags & M_EXT)) {
m_free(m);
@@ -1368,7 +1368,7 @@ pppacwrite(dev_t dev, struct uio *uio, i
if (m == NULL)
return (ENOMEM);
 
-   if (uio->uio_resid > MINCLSIZE) {
+   if (uio->uio_resid > MHLEN) {
m_clget(m, M_WAITOK, uio->uio_resid);
if (!ISSET(m->m_flags, M_EXT)) {
m_free(m);



Re: pipex(4) fix: check session existence before creation

2020-04-07 Thread YASUOKA Masahiko
ok yasuoka

On Mon, 6 Apr 2020 19:54:20 +0300
Vitaliy Makkoveev  wrote:
> Refuse to create a pipex_session which already exists. Newly created
> session will be placed to list head so the caller of
> pipex_*_lookup_session() will receive wrong session.
> 
> Index: sys/net/if_pppx.c
> ===
> RCS file: /cvs/src/sys/net/if_pppx.c,v
> retrieving revision 1.79
> diff -u -p -r1.79 if_pppx.c
> --- sys/net/if_pppx.c 6 Apr 2020 12:31:30 -   1.79
> +++ sys/net/if_pppx.c 6 Apr 2020 13:47:26 -
> @@ -719,6 +719,11 @@ pppx_add_session(struct pppx_dev *pxd, s
>   return (EPROTONOSUPPORT);
>   }
>  
> + session = pipex_lookup_by_session_id(req->pr_protocol,
> + req->pr_session_id);
> + if (session)
> + return (EEXIST);
> +
>   pxi = pool_get(pppx_if_pl, PR_WAITOK | PR_ZERO);
>   if (pxi == NULL)
>   return (ENOMEM);
> Index: sys/net/pipex.c
> ===
> RCS file: /cvs/src/sys/net/pipex.c,v
> retrieving revision 1.112
> diff -u -p -r1.112 pipex.c
> --- sys/net/pipex.c   6 Apr 2020 13:14:04 -   1.112
> +++ sys/net/pipex.c   6 Apr 2020 13:47:33 -
> @@ -312,6 +312,11 @@ pipex_add_session(struct pipex_session_r
>   return (EPROTONOSUPPORT);
>   }
>  
> + session = pipex_lookup_by_session_id(req->pr_protocol,
> + req->pr_session_id);
> + if (session)
> + return (EEXIST);
> +
>   /* prepare a new session */
>   session = pool_get(_session_pool, PR_WAITOK | PR_ZERO);
>   session->state = PIPEX_STATE_OPENED;
> 



Re: Prevent memory corruption by pipex_timer()

2020-04-01 Thread YASUOKA Masahiko
Hi,

Sorry for my silence.

ok yasuoka for the daemon part.

On Wed, 1 Apr 2020 09:27:10 +0200
Martin Pieuchot  wrote:
> On 31/03/20(Tue) 23:16, Vitaliy Makkoveev wrote:
>> On Tue, Mar 31, 2020 at 06:15:46PM +0200, Martin Pieuchot wrote:
>> > [...] 
>> > Well better fix npppd(8), no?  Not crashing the kernel is priority 1.
>> I made patch for npppd(8) too. I include it to this email below, without
>> starting new thread, ok? If ioctl(PIPEXASESSION) failed and error !=
>> ENXIO it means that pipex is enabled and session creation failed so down
>> this connection.
> 
> Thanks, I committed the kernel part.  I'm waiting to see if other devs
> want to comment on the daemon part.
> 
>> > Then if the daemon has a bug, should the kernel work around it? 
>> In most common cases should :(
> 
> That's an opinion.  There's no true or false answer.  Working around it
> has obvious advantages but it doesn't make us fix existing bug and instead
> force us to maintain the work around. 
> 
> It is argued that the "failing hard" model, as it is practised in OpenBSD
> software development, has the advantage of resulting in simpler code because
> every layer is responsible for handling errors and doesn't pile workaround.
> 
> This bug is a nice example.  Thanks for the report!  If you could submit
> your refactoring in a new thread, I'd love to look at it.
> 



Re: diff: init efifb even if VGA is probed.

2020-03-09 Thread YASUOKA Masahiko
Hi,

Thank you for your test and feedback.

On Fri, 6 Mar 2020 16:38:24 -0600
Andrew Daugherity  wrote:
> On Sun, Mar 1, 2020 at 10:41 PM YASUOKA Masahiko  wrote:
>>
>> Hi,
>>
>> The problems you are pointing seem to be the same problem.
>>
>> > I'll try to test this diff next week if I can schedule some downtime.
>>
>> Test is appreciated.
>>
>> Also I'd like to know the result of
>>
>>   hexdump -C /var/db/acpi/FACP.1
>>
>> when "Load Legacy Video Option ROM" setting is disabled.
> 
> I just tested a -current kernel built yesterday with that diff (your
> post on Feb. 20), but unfortunately it does not fix the issue on my
> hardware.  As before, if "Load Legacy Video Option ROM" is disabled,
> output is squished to a purple line and when devices are initialized,
> vga1 is the wsdisplay0 device:

I see, first diff didn't fix the problem on your machine.

> vga1 at pci7 dev 0 function 0 "Matrox MGA G200eR" rev 0x01
> wsdisplay0 at vga1 mux 1: console (80x25, vt100 emulation)
> wsdisplay0: screen 1-5 added (80x25, vt100 emulation)
> efifb0 at mainbus0: 1280x1024, 32bpp
> wsdisplay at efifb0 not configured
> 
> vs. with the legacy video ROM setting:
> 
> "Matrox MGA G200eR" rev 0x01 at pci7 dev 0 function 0 not configured
> efifb0 at mainbus0: 1024x768, 32bpp
> wsdisplay0 at efifb0 mux 1
> wsdisplay0: screen 0-5 added (std, vt100 emulation)
> 
> I'm using a serial console, if it matters.  Hmm... I just noticed that
> with the legacy ROM setting disabled, both wsdisplay0 at vga1 mux
> 1/wskbd0 at ukbd0 *and* com1 claim to be the console.  With the
> setting enabled (and efifb working), only com1 is listed as console.
> 
> I haven't tried any of the later diffs as I'm not sure which are still
> recommended.

The last diff should fix the problem since it will initialize efifb
before initializing VGA without condition.

https://marc.info/?l=openbsd-tech=158280719421562=2

> The FACP.1 table does not change when the "Load Legacy Video Option
> ROM" setting is changed.  Here is its hexdump:
> andrew@gsc-lb1:~/acpidump$ hexdump -C legacy-2.8.1/FACP.1
>   46 41 43 50 0c 01 00 00  05 62 44 45 4c 4c 20 20  |FACP.bDELL  |
> 0010  50 45 5f 53 43 33 20 20  00 00 00 00 44 45 4c 4c  |PE_SC3  DELL|
> 0020  01 00 00 00 00 30 f8 8e  00 b0 fc 8e 00 04 09 00  |.0..|
> 0030  b2 00 00 00 f0 f1 f2 00  00 18 00 00 00 00 00 00  ||
> 0040  04 18 00 00 00 00 00 00  50 18 00 00 08 18 00 00  |P...|
> 0050  80 18 00 00 00 00 00 00  04 02 01 04 20 00 10 00  | ...|
> 0060  65 00 e9 03 00 00 00 00  01 03 0d 00 32 11 00 00  |e...2...|
> 0070  a5 86 00 00 01 08 00 01  f9 0c 00 00 00 00 00 00  ||
> 0080  06 00 00 00 00 00 00 00  00 00 00 00 00 b0 fc 8e  ||
> 0090  00 00 00 00 01 20 00 02  00 18 00 00 00 00 00 00  |. ..|
> 00a0  01 00 00 02 00 00 00 00  00 00 00 00 01 10 00 02  ||
> 00b0  04 18 00 00 00 00 00 00  01 00 00 02 00 00 00 00  ||
> 00c0  00 00 00 00 01 08 00 01  50 18 00 00 00 00 00 00  |P...|
> 00d0  01 20 00 03 08 18 00 00  00 00 00 00 01 00 00 01  |. ..|
> 00e0  80 18 00 00 00 00 00 00  01 00 00 01 00 00 00 00  ||
> 00f0  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  ||
> 0100  00 00 00 00 00 00 00 00  00 00 00 00  ||
> 010c

This was to check whether using "VGA Not Present" bit is useful on
your machine.  "Boot IA-PC Boot Architecture Flags" is 0x6D:6E =
0x0011, LEGACY_DEVICES bit is set, "VGA Not Present" is cleared.  This
means the bit isn't set as I expected, it isn't useful to know
existence of VGA.

> The only ACPI change made by toggling that option is the DMAR.25
> table.  Here are both versions:
> Legacy Video ROM enabled:
>   44 4d 41 52 90 00 00 00  01 83 49 4e 54 45 4c 20  |DMAR..INTEL |
> 0010  47 4e 4c 52 00 00 00 00  01 00 00 00 49 4e 54 4c  |GNLRINTL|
> 0020  01 00 00 00 26 01 00 00  00 00 00 00 00 00 00 00  |&...|
> 0030  00 00 20 00 01 00 00 00  00 00 d9 fe 00 00 00 00  |.. .|
> 0040  03 08 00 00 02 f0 1f 00  04 08 00 00 00 00 1f 00  ||
> 0050  01 00 20 00 00 00 00 00  00 b0 ba 7c 00 00 00 00  |.. ||
> 0060  ff 2f bb 84 00 00 00 00  01 08 00 00 00 01 00 00  |./..|
> 0070  01 00 20 00 00 00 00 00  00 10 31 8e 00 00 00 00  |.. ...1.|
> 0080  ff 0f 33 8e 00 00 00 00  01 08 00 00 00 00 14 00  |..3.|
> 0090
> and disabled:
> 

  1   2   3   >