Re: ACPI aml_rwgsb() fix

2021-05-20 Thread Theo Buehler
On Thu, May 20, 2021 at 12:19:37AM +0200, Mark Kettenis wrote:
> My last change to dsdt.c broke one or two of my cheap little Intel
> "Atom" laptops.  Seems my interpretation of the ACPI standard wasn't
> quite right.  I went back to the original bug report and I think I
> understand a bit better what the AML in that report is trying to do.
> So here is a diff that fixes things.
> 
> Theo, can you try this on that Dell Precision 3640?
> 
> Tests on other hardware are welcome, especially on laptops.

That Dell Precision boots happily with this, as does my x280.



Re: macppc bsd.mp pmap's hash lock

2021-05-20 Thread George Koehler
On Wed, 19 May 2021 00:27:51 -0400
George Koehler  wrote:

> Here's my new diff.  It is a copy of what I sent to ppc@ a moment ago.
> I would ask, "Is it ok to commit?", but while writing this mail, I
> found a 2nd potential problem with memory barriers, so I will need to
> edit this diff yet again.

I edited the diff to add a 2nd membar_exit() and a comment.
I want to commit this version, ok?

Index: arch/powerpc/include/mplock.h
===
RCS file: /cvs/src/sys/arch/powerpc/include/mplock.h,v
retrieving revision 1.4
diff -u -p -r1.4 mplock.h
--- arch/powerpc/include/mplock.h   15 Apr 2020 08:09:00 -  1.4
+++ arch/powerpc/include/mplock.h   20 May 2021 19:47:52 -
@@ -30,13 +30,14 @@
 #define __USE_MI_MPLOCK
 
 /*
+ * __ppc_lock exists because pte_spill_r() can't use __mp_lock.
  * Really simple spinlock implementation with recursive capabilities.
  * Correctness is paramount, no fancyness allowed.
  */
 
 struct __ppc_lock {
-   volatile struct cpu_info *mpl_cpu;
-   volatile long   mpl_count;
+   struct cpu_info *volatile   mpl_cpu;
+   long    mpl_count;
 };
 
 #ifndef _LOCORE
@@ -44,10 +45,6 @@ struct __ppc_lock {
 void __ppc_lock_init(struct __ppc_lock *);
 void __ppc_lock(struct __ppc_lock *);
 void __ppc_unlock(struct __ppc_lock *);
-int __ppc_release_all(struct __ppc_lock *);
-int __ppc_release_all_but_one(struct __ppc_lock *);
-void __ppc_acquire_count(struct __ppc_lock *, int);
-int __ppc_lock_held(struct __ppc_lock *, struct cpu_info *);
 
 #endif
 
Index: arch/powerpc/powerpc/lock_machdep.c
===
RCS file: /cvs/src/sys/arch/powerpc/powerpc/lock_machdep.c,v
retrieving revision 1.9
diff -u -p -r1.9 lock_machdep.c
--- arch/powerpc/powerpc/lock_machdep.c 15 Apr 2020 08:09:00 -  1.9
+++ arch/powerpc/powerpc/lock_machdep.c 20 May 2021 19:47:52 -
@@ -1,6 +1,7 @@
 /* $OpenBSD: lock_machdep.c,v 1.9 2020/04/15 08:09:00 mpi Exp $*/
 
 /*
+ * Copyright (c) 2021 George Koehler 
  * Copyright (c) 2007 Artur Grabowski 
  *
  * Permission to use, copy, modify, and distribute this software for any
@@ -22,10 +23,21 @@
 #include 
 
 #include 
-#include 
 
 #include 
 
+/*
+ * If __ppc_lock() crosses a page boundary in the kernel text, then it
+ * may cause a page fault (on G5 with ppc_nobat), and pte_spill_r()
+ * would recursively call __ppc_lock().  The lock must be in a valid
+ * state when the page fault happens.  We acquire or release the lock
+ * with a 32-bit atomic write to mpl_cpu, so the lock is always in a
+ * valid state, before or after the write.
+ *
+ * Acquired the lock:  mpl->mpl_cpu == curcpu()
+ * Released the lock:  mpl->mpl_cpu == NULL
+ */
+
 void
 __ppc_lock_init(struct __ppc_lock *lock)
 {
@@ -46,12 +58,12 @@ static __inline void
 __ppc_lock_spin(struct __ppc_lock *mpl)
 {
 #ifndef MP_LOCKDEBUG
-   while (mpl->mpl_count != 0)
+   while (mpl->mpl_cpu != NULL)
CPU_BUSY_CYCLE();
 #else
int nticks = __mp_lock_spinout;
 
-   while (mpl->mpl_count != 0 && --nticks > 0)
+   while (mpl->mpl_cpu != NULL && --nticks > 0)
CPU_BUSY_CYCLE();
 
if (nticks == 0) {
@@ -65,32 +77,33 @@ void
 __ppc_lock(struct __ppc_lock *mpl)
 {
/*
-* Please notice that mpl_count gets incremented twice for the
-* first lock. This is on purpose. The way we release the lock
-* in mp_unlock is to decrement the mpl_count and then check if
-* the lock should be released. Since mpl_count is what we're
-* spinning on, decrementing it in mpl_unlock to 0 means that
-* we can't clear mpl_cpu, because we're no longer holding the
-* lock. In theory mpl_cpu doesn't need to be cleared, but it's
-* safer to clear it and besides, setting mpl_count to 2 on the
-* first lock makes most of this code much simpler.
+* Please notice that mpl_count stays at 0 for the first lock.
+* A page fault might recursively call __ppc_lock() after we
+* set mpl_cpu, but before we can increase mpl_count.
+*
+* After we acquire the lock, we need a "bc; isync" memory
+* barrier, but we might not reach the barrier before the next
+* page fault.  Then the fault's recursive __ppc_lock() must
+* have a barrier.  membar_enter() is just "isync" and must
+* come after a conditional branch for holding the lock.
 */
 
while (1) {
-   int s;
+   struct cpu_info *owner = mpl->mpl_cpu;
+   struct cpu_info *ci = curcpu();
 
-   s = ppc_intr_disable();
-   if (atomic_cas_ulong(&mpl->mpl_count, 0, 1) == 0) {
+   if (owner == NULL) {
+   /* Try to acquire the lock. */
+   if (atomic_cas_ptr(&mpl->mpl_cpu, NULL, ci) == NULL) {
+ 
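
For orientation, a minimal sketch of the acquire/release pattern the new
comments describe, assuming the membar_enter()/membar_exit() API from
<sys/atomic.h>; an illustration, not the committed code:

	/* acquire: swing mpl_cpu over with an atomic CAS, then a barrier */
	while (atomic_cas_ptr(&mpl->mpl_cpu, NULL, ci) != NULL)
		CPU_BUSY_CYCLE();
	membar_enter();		/* "isync": later accesses stay after the CAS */

	/* release: a barrier first, then a plain store of NULL */
	membar_exit();		/* prior stores become visible before unlock */
	mpl->mpl_cpu = NULL;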

Re: httpd(8): fastcgi & Content-Length: 0

2021-05-20 Thread mpfr
Ah, good. I was just about to provide a diff bringing Steve's 6.9 httpd
to HEAD but I guess that's obsolete then. Happy to see everything's fine.


On 2021-05-20 17:17, Florian Obser wrote:
> I suspect there was one diff too many in Steve's procedure. I provided a
> clean diff for 6.9 in private (don't want to pollute the mailing list
> with such a monstrosity ;)
> 
> Thanks for your help, I committed your fix for the fix.
> Hopefully that was the last iteration ;)
> 



Re: vmm(4): Mask TSC_ADJUST cpu feature

2021-05-20 Thread Mike Larkin
On Thu, May 20, 2021 at 07:36:23AM -0400, Dave Voutila wrote:
> We don't emulate all TSC-related features yet. While hacking
> on other issues, I've found some more obnoxious guests (*cough* debian
> *cough*) constantly try to read the IA32_TSC_ADJUST msr every second,
> not getting the hint when we inject #GP. This floods the kernel message
> buffer with things like:
>
>   vmx_handle_rdmsr: unsupported rdmsr (msr=0x3b), injecting #GP
>
> (The above debug logging exists to help find msr's we're not supporting
> that guests are poking, so I guess you can say it's working as intended
> [1].)
>
> If and when we add more TSC capabilities to vmm we can always unmask.
>
> Ok?
>
> [1] https://marc.info/?l=openbsd-tech&m=161739346822128&w=2
>
> Index: sys/arch/amd64/include/vmmvar.h
> ===
> RCS file: /cvs/src/sys/arch/amd64/include/vmmvar.h,v
> retrieving revision 1.71
> diff -u -p -r1.71 vmmvar.h
> --- sys/arch/amd64/include/vmmvar.h   5 Apr 2021 18:26:46 -   1.71
> +++ sys/arch/amd64/include/vmmvar.h   16 May 2021 16:55:06 -
> @@ -637,6 +637,7 @@ struct vm_mprotect_ept_params {
>
>  /*
>   * SEFF flags - copy from host minus:
> + *  TSC_ADJUST (SEFF0EBX_TSC_ADJUST)
>   *  SGX (SEFF0EBX_SGX)
>   *  HLE (SEFF0EBX_HLE)
>   *  INVPCID (SEFF0EBX_INVPCID)
> @@ -655,7 +656,8 @@ struct vm_mprotect_ept_params {
>   *  PT (SEFF0EBX_PT)
>   *  AVX512VBMI (SEFF0ECX_AVX512VBMI)
>   */
> -#define VMM_SEFF0EBX_MASK ~(SEFF0EBX_SGX | SEFF0EBX_HLE | SEFF0EBX_INVPCID | \
> +#define VMM_SEFF0EBX_MASK ~(SEFF0EBX_TSC_ADJUST | SEFF0EBX_SGX | \
> +SEFF0EBX_HLE | SEFF0EBX_INVPCID | \
>  SEFF0EBX_RTM | SEFF0EBX_PQM | SEFF0EBX_MPX | \
>  SEFF0EBX_PCOMMIT | SEFF0EBX_PT | \
>  SEFF0EBX_AVX512F | SEFF0EBX_AVX512DQ | \

Yep, if we don't implement it we should not be advertising support for it.

ok mlarkin.



Re: httpd(8): fastcgi & Content-Length: 0

2021-05-20 Thread Florian Obser
On 2021-05-20 16:31 +02, Matthias Pressfreund  wrote:
> I just tried WordPress again on Firefox and Chrome. No problems.
> Is there an obj folder? If so, maybe try to do 'make clean'
> after step 5.
>

I suspect there was one diff too many in Steve's procedure. I provided a
clean diff for 6.9 in private (don't want to pollute the mailing list
with such a monstrosity ;)

Thanks for your help, I committed your fix for the fix.
Hopefully that was the last iteration ;)

-- 
I'm not entirely sure you are real.



Re: httpd(8): fastcgi & Content-Length: 0

2021-05-20 Thread Matthias Pressfreund
I just tried WordPress again on Firefox and Chrome. No problems.
Is there an obj folder? If so, maybe try to do 'make clean'
after step 5.

On 2021-05-20 16:02, Steve Williams wrote:
> Hi,
> 
> 
> I still get a connection error from my Android Nextcloud client and Wordpress 
> does not work correctly on Firefox.
> 
> Please see my steps below to ensure I have tested the correct thing.  
> (patches are attached as well).
> 
> Nextcloud and roundcubemail on Firefox work fine.
> 
> On Chrome, everything works as expected.
> 
> Just to make sure I did this correctly:
> 
>    1.  Extract clean 6.9 httpd source from 6.9/src.tar.gz
>    2.  Apply patch (p1) from May 15 email from Florian with subject
>    (Re: httpd(8): don't try to chunk-encode an empty body)
>    3.  Apply patch (p2) from start of this email thread (httpd(8):
>    fastcgi & Content-Length: 0)
>    4.  Apply patch (p3) from this email thread (Matthias)
>    5.  All patches applied cleanly
>    6.  make
>    7. make install
>    restart and test
> 
> 
> 
> 
> pcengine# patch < ../p1
> Hmm...  Looks like a unified diff to me...
> The text leading up to this was:
> --
> |k--- httpd.h
> |+++ httpd.h
> --
> Patching file httpd.h using Plan A...
> Hunk #1 succeeded at 300.
> Hmm...  The next patch looks like a unified diff to me...
> The text leading up to this was:
> --
> |diff --git server_fcgi.c server_fcgi.c
> |index 64a0e9d2abb..e1e9704c920 100644
> |--- server_fcgi.c
> |+++ server_fcgi.c
> --
> Patching file server_fcgi.c using Plan A...
> Hunk #1 succeeded at 114.
> Hunk #2 succeeded at 545.
> Hunk #3 succeeded at 599.
> Hunk #4 succeeded at 616.
> done
> pcengine# patch < ../p2
> Hmm...  Looks like a unified diff to me...
> The text leading up to this was:
> --
> |--- server_fcgi.c
> |+++ server_fcgi.c
> --
> Patching file server_fcgi.c using Plan A...
> Hunk #1 succeeded at 636.
> done
> pcengine# patch < ../p3
> Hmm...  Looks like a unified diff to me...
> The text leading up to this was:
> --
> |--- usr.sbin/httpd/server_fcgi.c   Thu May 20 05:57:23 2021
> |+++ usr.sbin/httpd/server_fcgi.c   Thu May 20 06:03:40 2021
> --
> Patching file server_fcgi.c using Plan A...
> Hunk #1 succeeded at 620.
> Hunk #2 succeeded at 642.
> done
> 
> On 19/05/2021 11:41 p.m., Florian Obser wrote:
>> Yes, ugh, this is much better, thanks!
>> I'll wait for Steve to confirm that it fixes nextclown for him, too and
>> then I'll put it in.
>>
>> On 2021-05-20 06:43 +02, Matthias Pressfreund  wrote:
>>> Fix works for me, too. Thanks.
>>>
>>> It now sets the "Content-Length: 0" header for ALL traffic that
>>> is not chunk-encoded. But chunk-encoding may be disabled already
>>> (e.g. for http/1.0). I'd therefore suggest to move the fix to where
>>> the handling of zero-length bodies actually takes place.
>>>
>>>
>>> --- usr.sbin/httpd/server_fcgi.c    Thu May 20 05:57:23 2021
>>> +++ usr.sbin/httpd/server_fcgi.c    Thu May 20 06:03:40 2021
>>> @@ -620,6 +620,12 @@
>>>   EVBUFFER_LENGTH(clt->clt_srvevb) == 0) {
>>>   /* Can't chunk encode an empty body. */
>>>   clt->clt_fcgi.chunked = 0;
>>> +    key.kv_key = "Content-Length";
>>> +    if ((kv = kv_find(&resp->http_headers, &key)) == NULL) {
>>> +    if (kv_add(&resp->http_headers,
>>> +    "Content-Length", "0") == NULL)
>>> +    return (-1);
>>> +    }
>>>   }
>>>     /* Set chunked encoding */
>>> @@ -636,13 +642,6 @@
>>>   if (kv_add(&resp->http_headers,
>>>   "Transfer-Encoding", "chunked") == NULL)
>>>   return (-1);
>>> -    } else {
>>> -    key.kv_key = "Content-Length";
>>> -    if ((kv = kv_find(&resp->http_headers, &key)) == NULL) {
>>> -    if (kv_add(&resp->http_headers,
>>> -    "Content-Length", "0") == NULL)
>>> -    return (-1);
>>> -    }
>>>   }
>>>     /* Is it a persistent connection? */
>>>
>>>
>>> On 2021-05-19 20:44, Florian Obser wrote:
 The whole point of using Transfer-Encoding: chunked for fastcgi was so
 that we do not need to provide a Content-Length header if upstream
 doesn't give us one. (We'd need to slurp in all the data ugh).

 Now turns out that if we disable chunked encoding for zero sized bodies
 some browsers are picky and want a Content-Length: 0 (Firefox, Safari)
 or they'll just sit there and wait for the connection to close.

 Problem reported by Matthias Pressfreund with wordpress.
 Debugged with the help of weerd@ who pointed out that the problem is
 actually browser dependent. From there it was pretty clear what the
 problem was.

 OK?

 diff --git server_fcgi.c server_fcgi.c
 index 31d7322e9f7..b9dc4f6fe04 100644
 --- server_fcgi.c
 +++ ser
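
Illustration, not from the thread: with chunked encoding disabled for an
empty body and no Content-Length header, a strict client has no way to
tell the response is complete and waits for the connection to close.  A
hypothetical zero-body response before the fix (header values invented):

	HTTP/1.1 302 Found
	Location: /wp-login.php

and after the fix, with the explicit length that lets Firefox/Safari
finish immediately:

	HTTP/1.1 302 Found
	Location: /wp-login.php
	Content-Length: 0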

Re: Add f_modify and f_process callbacks to socket filterops

2021-05-20 Thread Visa Hankala
On Thu, May 20, 2021 at 11:35:32AM +0200, Martin Pieuchot wrote:
> On 18/05/21(Tue) 14:22, Visa Hankala wrote:
> > This diff adds f_modify and f_process callbacks to socket event filters.
> > As a result, socket events are handled using the non-legacy paths in
> > filter_modify() and filter_process() of kern_event.c.  This is a step toward
> > MP-safety. However, everything still runs under the kernel lock.
> > 
> > The change has three intended effects:
> > 
> > * Socket events are handled without raising the system priority level.
> >   This makes the activity observable with btrace(8).
> > 
> > * kqueue itself no longer calls f_event of socket filterops, which
> >   allows replacing the conditional, NOTE_SUBMIT-based locking with
> >   a fixed call pattern.
> 
> I love this.
> 
> > * The state of a socket event is now always rechecked before delivery
> >   to user. Before, the recheck was skipped if the event was registered
> >   with EV_ONESHOT.
> 
> To me this sounds sane.  I can't think of a way to rely on the current
> behavior.  However if there's an easy way to split these changes in two
> commits, I'd prefer to stay on the safe side.

Below is an updated diff that preserves the current EV_ONESHOT
behaviour. I have just adapted a part of the compatibility logic
from function filter_process().

When f_process is given a non-NULL kev argument, it is known that
the callback is invoked from kqueue_scan(). If kev is NULL,
kqueue_register() is checking if the knote should be activated and
there is no intent to deliver the event right now.
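
As an illustration of that calling convention, a sketch of one such
callback (not the diff below; the solock()/sounlock() placement and the
EV_CLEAR handling here are my assumptions):

int
filt_soreadprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int s, rv;

	s = solock(so);
	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
		rv = 1;				/* deliver without recheck */
	else
		rv = filt_soread_common(kn, so);
	if (rv != 0 && kev != NULL) {
		*kev = kn->kn_kevent;		/* hand the event to user */
		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_fflags = 0;
			kn->kn_data = 0;
		}
	}
	sounlock(so, s);

	return (rv);
}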

Index: kern/uipc_socket.c
===
RCS file: src/sys/kern/uipc_socket.c,v
retrieving revision 1.261
diff -u -p -r1.261 uipc_socket.c
--- kern/uipc_socket.c  13 May 2021 19:43:11 -  1.261
+++ kern/uipc_socket.c  20 May 2021 14:01:18 -
@@ -70,15 +70,26 @@ void	sorflush(struct socket *);
 
 void	filt_sordetach(struct knote *kn);
 int	filt_soread(struct knote *kn, long hint);
+int	filt_soreadmodify(struct kevent *kev, struct knote *kn);
+int	filt_soreadprocess(struct knote *kn, struct kevent *kev);
+int	filt_soread_common(struct knote *kn, struct socket *so);
 void	filt_sowdetach(struct knote *kn);
 int	filt_sowrite(struct knote *kn, long hint);
+int	filt_sowritemodify(struct kevent *kev, struct knote *kn);
+int	filt_sowriteprocess(struct knote *kn, struct kevent *kev);
+int	filt_sowrite_common(struct knote *kn, struct socket *so);
 int	filt_solisten(struct knote *kn, long hint);
+int	filt_solistenmodify(struct kevent *kev, struct knote *kn);
+int	filt_solistenprocess(struct knote *kn, struct kevent *kev);
+int	filt_solisten_common(struct knote *kn, struct socket *so);
 
 const struct filterops solisten_filtops = {
.f_flags= FILTEROP_ISFD,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_solisten,
+   .f_modify   = filt_solistenmodify,
+   .f_process  = filt_solistenprocess,
 };
 
 const struct filterops soread_filtops = {
@@ -86,6 +97,8 @@ const struct filterops soread_filtops = 
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_soread,
+   .f_modify   = filt_soreadmodify,
+   .f_process  = filt_soreadprocess,
 };
 
 const struct filterops sowrite_filtops = {
@@ -93,6 +106,8 @@ const struct filterops sowrite_filtops =
.f_attach   = NULL,
.f_detach   = filt_sowdetach,
.f_event= filt_sowrite,
+   .f_modify   = filt_sowritemodify,
+   .f_process  = filt_sowriteprocess,
 };
 
 const struct filterops soexcept_filtops = {
@@ -100,6 +115,8 @@ const struct filterops soexcept_filtops =
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_soread,
+   .f_modify   = filt_soreadmodify,
+   .f_process  = filt_soreadprocess,
 };
 
 #ifndef SOMINCONN
@@ -2056,13 +2073,12 @@ filt_sordetach(struct knote *kn)
 }
 
 int
-filt_soread(struct knote *kn, long hint)
+filt_soread_common(struct knote *kn, struct socket *so)
 {
-   struct socket *so = kn->kn_fp->f_data;
-   int s, rv = 0;
+   int rv = 0;
+
+   soassertlocked(so);
 
-   if ((hint & NOTE_SUBMIT) == 0)
-   s = solock(so);
kn->kn_data = so->so_rcv.sb_cc;
 #ifdef SOCKET_SPLICE
if (isspliced(so)) {
@@ -2090,12 +2106,50 @@ filt_soread(struct knote *kn, long hint)
} else {
rv = (kn->kn_data >= so->so_rcv.sb_lowat);
}
-   if ((hint & NOTE_SUBMIT) == 0)
-   sounlock(so, s);
 
return rv;
 }
 
+int
+filt_soread(struct knote *kn, long hint)
+{
+   struct socket *so = kn->kn_fp->f_data;
+
+   return (filt_soread_common(kn, so));
+}
+
+int
+filt_soreadmodify(struct kevent *kev, struct knote *kn)
+{
+   struct socket *so = kn->kn_fp->f_data;
+   

Re: httpd(8): fastcgi & Content-Length: 0

2021-05-20 Thread Steve Williams

Hi,


I still get a connection error from my Android Nextcloud client and 
Wordpress does not work correctly on Firefox.


Please see my steps below to ensure I have tested the correct thing.  
(patches are attached as well).


Nextcloud and roundcubemail on Firefox work fine.

On Chrome, everything works as expected.

Just to make sure I did this correctly:

   1.  Extract clean 6.9 httpd source from 6.9/src.tar.gz
   2.  Apply patch (p1) from May 15 email from Florian with subject
   (Re: httpd(8): don't try to chunk-encode an empty body)
   3.  Apply patch (p2) from start of this email thread (httpd(8):
   fastcgi & Content-Length: 0)
   4.  Apply patch (p3) from this email thread (Matthias)
   5.  All patches applied cleanly
   6.  make
   7. make install
   restart and test




pcengine# patch < ../p1
Hmm...  Looks like a unified diff to me...
The text leading up to this was:
--
|k--- httpd.h
|+++ httpd.h
--
Patching file httpd.h using Plan A...
Hunk #1 succeeded at 300.
Hmm...  The next patch looks like a unified diff to me...
The text leading up to this was:
--
|diff --git server_fcgi.c server_fcgi.c
|index 64a0e9d2abb..e1e9704c920 100644
|--- server_fcgi.c
|+++ server_fcgi.c
--
Patching file server_fcgi.c using Plan A...
Hunk #1 succeeded at 114.
Hunk #2 succeeded at 545.
Hunk #3 succeeded at 599.
Hunk #4 succeeded at 616.
done
pcengine# patch < ../p2
Hmm...  Looks like a unified diff to me...
The text leading up to this was:
--
|--- server_fcgi.c
|+++ server_fcgi.c
--
Patching file server_fcgi.c using Plan A...
Hunk #1 succeeded at 636.
done
pcengine# patch < ../p3
Hmm...  Looks like a unified diff to me...
The text leading up to this was:
--
|--- usr.sbin/httpd/server_fcgi.c   Thu May 20 05:57:23 2021
|+++ usr.sbin/httpd/server_fcgi.c   Thu May 20 06:03:40 2021
--
Patching file server_fcgi.c using Plan A...
Hunk #1 succeeded at 620.
Hunk #2 succeeded at 642.
done

On 19/05/2021 11:41 p.m., Florian Obser wrote:

Yes, ugh, this is much better, thanks!
I'll wait for Steve to confirm that it fixes nextclown for him, too and
then I'll put it in.

On 2021-05-20 06:43 +02, Matthias Pressfreund  wrote:

Fix works for me, too. Thanks.

It now sets the "Content-Length: 0" header for ALL traffic that
is not chunk-encoded. But chunk-encoding may be disabled already
(e.g. for http/1.0). I'd therefore suggest to move the fix to where
the handling of zero-length bodies actually takes place.


--- usr.sbin/httpd/server_fcgi.cThu May 20 05:57:23 2021
+++ usr.sbin/httpd/server_fcgi.cThu May 20 06:03:40 2021
@@ -620,6 +620,12 @@
EVBUFFER_LENGTH(clt->clt_srvevb) == 0) {
/* Can't chunk encode an empty body. */
clt->clt_fcgi.chunked = 0;
+   key.kv_key = "Content-Length";
+   if ((kv = kv_find(&resp->http_headers, &key)) == NULL) {
+   if (kv_add(&resp->http_headers,
+   "Content-Length", "0") == NULL)
+   return (-1);
+   }
}
  
  	/* Set chunked encoding */

@@ -636,13 +642,6 @@
if (kv_add(&resp->http_headers,
"Transfer-Encoding", "chunked") == NULL)
return (-1);
-   } else {
-   key.kv_key = "Content-Length";
-   if ((kv = kv_find(&resp->http_headers, &key)) == NULL) {
-   if (kv_add(&resp->http_headers,
-   "Content-Length", "0") == NULL)
-   return (-1);
-   }
}
  
  	/* Is it a persistent connection? */



On 2021-05-19 20:44, Florian Obser wrote:

The whole point of using Transfer-Encoding: chunked for fastcgi was so
that we do not need to provide a Content-Length header if upstream
doesn't give us one. (We'd need to slurp in all the data ugh).

Now turns out that if we disable chunked encoding for zero sized bodies
some browsers are picky and want a Content-Length: 0 (Firefox, Safari)
or they'll just sit there and wait for the connection to close.

Problem reported by Matthias Pressfreund with wordpress.
Debugged with the help of weerd@ who pointed out that the problem is
actually browser dependent. From there it was pretty clear what the
problem was.

OK?

diff --git server_fcgi.c server_fcgi.c
index 31d7322e9f7..b9dc4f6fe04 100644
--- server_fcgi.c
+++ server_fcgi.c
@@ -636,6 +636,13 @@ server_fcgi_header(struct client *clt, unsigned int code)
if (kv_add(&resp->http_headers,
"Transfer-Encoding", "chunked") == NULL)
return (-1);
+   } else {
+   key.kv_key = "Content-Length";
+   if ((kv = kv_find(&resp->http_headers, &key)) == NULL) {
+   if (kv_add(&resp-

Re: smtp(1): protocols and ciphers

2021-05-20 Thread Eric Faurot
Here is an updated diff integrating different suggestions I received.

- use -T (for TLS) instead of -O
- use getsubopt(3) which I didn't know
- manpage tweaks

Eric.
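
For illustration, an invocation with the new flag could look like this
(host and recipient are placeholders; note the colon inside the protocols
value, since getsubopt(3) splits the -T argument on commas):

	$ smtp -T 'protocols=tlsv1.2:tlsv1.3,ciphers=compat' \
	    -s smtp://mail.example.net recipient@example.net < message.txt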

Index: smtp.1
===
RCS file: /cvs/src/usr.sbin/smtpd/smtp.1,v
retrieving revision 1.9
diff -u -p -r1.9 smtp.1
--- smtp.1  13 Feb 2021 08:07:48 -  1.9
+++ smtp.1  20 May 2021 09:06:20 -
@@ -27,6 +27,7 @@
 .Op Fl F Ar from
 .Op Fl H Ar helo
 .Op Fl s Ar server
+.Op Fl T Ar params
 .Op Ar recipient ...
 .Sh DESCRIPTION
 The
@@ -91,6 +92,26 @@ SMTP session with forced TLS on connecti
 .Pp
 Defaults to
 .Dq smtp://localhost:25 .
+.It Fl T Ar params
+Set specific parameters for TLS sessions.
+The
+.Ar params
+string is a comma or space separated list of options.
+The available options are:
+.Bl -tag -width Ds
+.It protocols Ns = Ns Ar value
+Specify the protocols to use.
+Refer to
+.Xr tls_config_parse_protocols 3
+for
+.Ar value .
+.It ciphers Ns = Ns Ar value
+Specify the allowed ciphers.
+Refer to
+.Xr tls_config_set_ciphers 3
+for
+.Ar value .
+.El
 .It Fl v
 Be more verbose.
 This option can be specified multiple times.
Index: smtpc.c
===
RCS file: /cvs/src/usr.sbin/smtpd/smtpc.c,v
retrieving revision 1.15
diff -u -p -r1.15 smtpc.c
--- smtpc.c 10 Apr 2021 10:19:19 -  1.15
+++ smtpc.c 20 May 2021 08:57:16 -
@@ -48,22 +48,57 @@ static struct smtp_mail mail;
 static const char *servname = NULL;
 static struct tls_config *tls_config;
 
+static const char *protocols = NULL;
+static const char *ciphers = NULL;
+
 static void
 usage(void)
 {
extern char *__progname;
 
fprintf(stderr, "usage: %s [-Chnv] [-a authfile] [-F from] [-H helo] "
-   "[-s server] [recipient ...]\n", __progname);
+   "[-s server] [-T params] [recipient ...]\n", __progname);
exit(1);
 }
 
+static void
+parse_tls_options(char *opt)
+{
+   static char * const tokens[] = {
+#define CIPHERS 0
+   "ciphers",
+#define PROTOCOLS 1
+   "protocols",
+   NULL };
+   char *value;
+
+   while (*opt) {
+   switch (getsubopt(&opt, tokens, &value)) {
+   case CIPHERS:
+   if (value == NULL)
+   fatalx("missing value for ciphers");
+   ciphers = value;
+   break;
+   case PROTOCOLS:
+   if (value == NULL)
+   fatalx("missing value for protocols");
+   protocols = value;
+   break;
+   case -1:
+   if (suboptarg)
+   fatalx("invalid TLS option \"%s\"", suboptarg);
+   fatalx("missing TLS option");
+   }
+   }
+}
+
 int
 main(int argc, char **argv)
 {
char hostname[256];
FILE *authfile;
int ch, i;
+   uint32_t protos;
char *server = "localhost";
char *authstr = NULL;
size_t alloc = 0;
@@ -91,7 +126,7 @@ main(int argc, char **argv)
memset(&mail, 0, sizeof(mail));
mail.from = pw->pw_name;
 
-   while ((ch = getopt(argc, argv, "CF:H:S:a:hns:v")) != -1) {
+   while ((ch = getopt(argc, argv, "CF:H:S:T:a:hns:v")) != -1) {
switch (ch) {
case 'C':
params.tls_verify = 0;
@@ -105,6 +140,9 @@ main(int argc, char **argv)
case 'S':
servname = optarg;
break;
+   case 'T':
+   parse_tls_options(optarg);
+   break;
case 'a':
if ((authfile = fopen(optarg, "r")) == NULL)
fatal("%s: open", optarg);
@@ -159,6 +197,17 @@ main(int argc, char **argv)
tls_config = tls_config_new();
if (tls_config == NULL)
fatal("tls_config_new");
+
+   if (protocols) {
+   if (tls_config_parse_protocols(&protos, protocols) == -1)
+   fatalx("failed to parse protocol '%s'", protocols);
+   if (tls_config_set_protocols(tls_config, protos) == -1)
+   fatalx("tls_config_set_protocols: %s",
+   tls_config_error(tls_config));
+   }
+   if (ciphers && tls_config_set_ciphers(tls_config, ciphers) == -1)
+   fatalx("tls_config_set_ciphers: %s",
+   tls_config_error(tls_config));
	if (tls_config_set_ca_file(tls_config, tls_default_ca_cert_file()) == -1)
fatal("tls_set_ca_file");
if (!params.tls_verify) {



Serialize kqueue's internals with a mutex

2021-05-20 Thread Visa Hankala
This patch adds a mutex that serializes access to a kqueue. As a result,
most of kqueue's internals should become safe to run without the kernel
lock. In principle, the patch should allow unlocking kevent(2).

Some notes:

* The existing uses of splhigh() outline where the mutex should be held.

* The code is a true entanglement of lock operations. There are many
  spots where lock usage is far from optimal. The patch does not attempt
  to fix them, so as to keep the changeset relatively small.

* As msleep() with PCATCH requires the kernel lock, kqueue_scan() locks
  the kernel for the section that might sleep. The lock is released
  before the actual scan of events. An opportunistic implementation
  could do a precheck to determine if the scan could be started right
  away, but this is not part of the diff.

* knote_acquire() has a gap where it might miss a wakeup. This is an
  unlikely situation that may arise with klist_invalidate(). It should
  not happen during normal operation, and the code should recover thanks
  to the one-second timeout. The loss of wakeup could be avoided with
  serial numbering for example.

* The timeout in knote_acquire() makes the function try-lock-like, which
  is essential in klist_invalidate(). The normal sequence of action is
  that knote_acquire() comes before klist_lock(). klist_invalidate() has
  to violate this, and the timeout, and retrying, prevent the system
  from deadlocking.

* At the moment, all event sources still require the kernel lock.
  kqueue will lock the kernel when it invokes the filterops callbacks
  if FILTEROP_MPSAFE is not set.
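
  As a sketch of that last point (illustration only; the helper name
  filter_event() is made up, and the exact call shape may differ from
  the diff below):

	static int
	filter_event(struct knote *kn, long hint)
	{
		int rv;

		if (kn->kn_fop->f_flags & FILTEROP_MPSAFE)
			return (kn->kn_fop->f_event(kn, hint));

		/* legacy filter: serialize with the kernel lock */
		KERNEL_LOCK();
		rv = kn->kn_fop->f_event(kn, hint);
		KERNEL_UNLOCK();
		return (rv);
	}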


Please test!


Index: kern/kern_event.c
===
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.163
diff -u -p -r1.163 kern_event.c
--- kern/kern_event.c   22 Apr 2021 15:30:12 -  1.163
+++ kern/kern_event.c   20 May 2021 13:45:32 -
@@ -124,7 +124,8 @@ void	knote_dequeue(struct knote *kn);
 int	knote_acquire(struct knote *kn, struct klist *, int);
 void   knote_release(struct knote *kn);
 void   knote_activate(struct knote *kn);
-void   knote_remove(struct proc *p, struct knlist *list, int purge);
+void   knote_remove(struct proc *p, struct kqueue *kq, struct knlist *list,
+   int purge);
 
 void   filt_kqdetach(struct knote *kn);
 int	filt_kqueue(struct knote *kn, long hint);
@@ -265,7 +266,7 @@ filt_kqueue(struct knote *kn, long hint)
 {
struct kqueue *kq = kn->kn_fp->f_data;
 
-   kn->kn_data = kq->kq_count;
+   kn->kn_data = kq->kq_count; /* unlocked read */
return (kn->kn_data > 0);
 }
 
@@ -739,28 +740,31 @@ kqpoll_dequeue(struct proc *p)
 {
struct knote *kn;
struct kqueue *kq = p->p_kq;
-   int s;
 
-   s = splhigh();
+   mtx_enter(&kq->kq_lock);
while ((kn = TAILQ_FIRST(&kq->kq_head)) != NULL) {
/* This kqueue should not be scanned by other threads. */
KASSERT(kn->kn_filter != EVFILT_MARKER);
 
-   if (!knote_acquire(kn, NULL, 0))
+   if (!knote_acquire(kn, NULL, 0)) {
+   /* knote_acquire() has released kq_lock. */
+   mtx_enter(&kq->kq_lock);
continue;
+   }
 
kqueue_check(kq);
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
kn->kn_status &= ~KN_QUEUED;
kq->kq_count--;
+   mtx_leave(&kq->kq_lock);
 
-   splx(s);
-   kn->kn_fop->f_detach(kn);
+   filter_detach(kn);
knote_drop(kn, p);
-   s = splhigh();
+
+   mtx_enter(&kq->kq_lock);
kqueue_check(kq);
}
-   splx(s);
+   mtx_leave(&kq->kq_lock);
 }
 
 struct kqueue *
@@ -772,6 +776,7 @@ kqueue_alloc(struct filedesc *fdp)
kq->kq_refs = 1;
kq->kq_fdp = fdp;
TAILQ_INIT(&kq->kq_head);
+   mtx_init(&kq->kq_lock, IPL_HIGH);
task_set(&kq->kq_task, kqueue_task, kq);
 
return (kq);
@@ -933,8 +938,7 @@ kqueue_do_check(struct kqueue *kq, const
struct knote *kn;
int count = 0, nmarker = 0;
 
-   KERNEL_ASSERT_LOCKED();
-   splassert(IPL_HIGH);
+   MUTEX_ASSERT_LOCKED(&kq->kq_lock);
 
TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
if (kn->kn_filter == EVFILT_MARKER) {
@@ -973,7 +977,7 @@ kqueue_register(struct kqueue *kq, struc
struct file *fp = NULL;
struct knote *kn = NULL, *newkn = NULL;
struct knlist *list = NULL;
-   int s, error = 0;
+   int error = 0;
 
if (kev->filter < 0) {
if (kev->filter + EVFILT_SYSCOUNT < 0)
@@ -1005,11 +1009,13 @@ again:
error = EBADF;
goto done;
}
+   mtx_enter(&kq->kq_lock);
if (kev->flags & EV_ADD)
kqueue_expand_list(k

Re: bgpd adjust graceful restart capability negotiation

2021-05-20 Thread Claudio Jeker
On Tue, May 18, 2021 at 02:06:15PM +0200, Claudio Jeker wrote:
> When I adjusted the capability negotiation to check both sides for
> presence I made the graceful restart capability lose all AFI/SAFI
> elements for the peer capabilities.
> 
> This can be viewed with bgpctl show nei: e.g
> 
>  Description: beznau-1
>   BGP version 4, remote router-id 192.168.0.252
>   BGP state = Established, up for 02:11:07
>   Last read 00:00:09, holdtime 90s, keepalive interval 30s
>   Last write 00:00:06
>   Neighbor capabilities:
> Multiprotocol extensions: IPv4 unicast
> 4-byte AS numbers
> Route Refresh
> Graceful Restart: Timeout: 90,
>   Negotiated capabilities:
> Multiprotocol extensions: IPv4 unicast
> 4-byte AS numbers
> Route Refresh
> 
>   Message statistics:
> 
> Instead of
> 
>  Description: beznau-1
>   BGP version 4, remote router-id 192.168.0.252
>   BGP state = Established, up for 00:02:31
>   Last read 00:00:00, holdtime 90s, keepalive interval 30s
>   Last write 00:00:00
>   Neighbor capabilities:
> Multiprotocol extensions: IPv4 unicast
> 4-byte AS numbers
> Route Refresh
> Graceful Restart: Timeout: 90, restarted, IPv4 unicast
>   Negotiated capabilities:
> Multiprotocol extensions: IPv4 unicast
> 4-byte AS numbers
> Route Refresh
> 
> This is just a visual issue. In both cases the flush happens and graceful
> refresh remains disabled.

Actually let's go one step further and change what we announce. bgpd only
supports the "Procedures for the Receiving Speaker". It never keeps the
forwarding state over a restart. Because of this there is no need to
include any AFI/SAFI in the graceful restart capability.

From the RFC:
   When a sender of this capability does not include any <AFI, SAFI> in
   the capability, it means that the sender is not capable of preserving
   its forwarding state during BGP restart, but supports procedures for
   the Receiving Speaker (as defined in Section 4.2 of this document).
   In that case, the value of the "Restart Time" field advertised by the
   sender is irrelevant.

I think not including any AFI/SAFI is the sensible thing to do.
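
For illustration (not part of the diff): with no AFI/SAFI tuples the
graceful restart capability shrinks to four bytes on the wire, i.e.
capability code 64, length 2, and the 16-bit Restart Flags/Time field.
A 90-second restart time with no flags set would encode as
0x40 0x02 0x00 0x5a.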
-- 
:wq Claudio

Index: session.c
===
RCS file: /cvs/src/usr.sbin/bgpd/session.c,v
retrieving revision 1.415
diff -u -p -r1.415 session.c
--- session.c   16 May 2021 09:09:11 -  1.415
+++ session.c   20 May 2021 11:22:28 -
@@ -1443,41 +1443,20 @@ session_open(struct peer *p)
/* graceful restart and End-of-RIB marker, RFC 4724 */
if (p->capa.ann.grestart.restart) {
int rst = 0;
-   u_int16_t   hdr;
-   u_int8_t    grlen;
+   u_int16_t   hdr = 0;
 
-   if (mpcapa) {
-   grlen = 2 + 4 * mpcapa;
-   for (i = 0; i < AID_MAX; i++) {
-   if (p->capa.neg.grestart.flags[i] &
-   CAPA_GR_RESTARTING)
-   rst++;
-   }
-   } else {/* AID_INET */
-   grlen = 2 + 4;
-   if (p->capa.neg.grestart.flags[AID_INET] &
-   CAPA_GR_RESTARTING)
+   for (i = 0; i < AID_MAX; i++) {
+   if (p->capa.neg.grestart.flags[i] & CAPA_GR_RESTARTING)
rst++;
}
 
-   hdr = conf->holdtime;   /* default timeout */
-   /* if client does graceful restart don't set R flag */
+   /* Only set the R-flag if no graceful restart is ongoing */
if (!rst)
hdr |= CAPA_GR_R_FLAG;
hdr = htons(hdr);
 
-   errs += session_capa_add(opb, CAPA_RESTART, grlen);
+   errs += session_capa_add(opb, CAPA_RESTART, sizeof(hdr));
errs += ibuf_add(opb, &hdr, sizeof(hdr));
-
-   if (mpcapa) {
-   for (i = 0; i < AID_MAX; i++) {
-   if (p->capa.ann.mp[i]) {
-   errs += session_capa_add_gr(p, opb, i);
-   }
-   }
-   } else {/* AID_INET */
-   errs += session_capa_add_gr(p, opb, AID_INET);
-   }
}
 
/* 4-bytes AS numbers, draft-ietf-idr-as4bytes-13 */
@@ -2585,24 +2564,24 @@ capa_neg_calc(struct peer *p)
int8_t  negflags;
 
/* disable GR if the AFI/SAFI is not present */
-   if (p->capa.ann.grestart.restart == 0 ||
-   (p->capa.peer.grestart.flags[i] & CAPA_GR_PRESENT &&
+   if ((p->capa.peer.grestart.flags[i] & CAPA_GR_PRESENT &&
p->capa.neg.mp[i] == 0))
p->capa.peer.grestart.flags[i] = 0; /* disable */
 

vmm(4): Mask TSC_ADJUST cpu feature

2021-05-20 Thread Dave Voutila
We don't emulate all TSC-related features yet. While hacking
on other issues, I've found some more obnoxious guests (*cough* debian
*cough*) constantly try to read the IA32_TSC_ADJUST msr every second,
not getting the hint when we inject #GP. This floods the kernel message
buffer with things like:

  vmx_handle_rdmsr: unsupported rdmsr (msr=0x3b), injecting #GP

(The above debug logging exists to help find msr's we're not supporting
that guests are poking, so I guess you can say it's working as intended
[1].)

If and when we add more TSC capabilities to vmm we can always unmask.

Ok?

[1] https://marc.info/?l=openbsd-tech&m=161739346822128&w=2
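
For context, the mask takes effect where vmm builds the guest's CPUID
leaf 0x7 from the host values, roughly like this (a sketch; the cpu_info
field names are assumptions, not part of this diff):

	/* CPUID leaf 0x7, subleaf 0: filter SEFF flags seen by the guest */
	*rbx = curcpu()->ci_feature_sefflags_ebx & VMM_SEFF0EBX_MASK;
	*rcx = curcpu()->ci_feature_sefflags_ecx & VMM_SEFF0ECX_MASK;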

Index: sys/arch/amd64/include/vmmvar.h
===
RCS file: /cvs/src/sys/arch/amd64/include/vmmvar.h,v
retrieving revision 1.71
diff -u -p -r1.71 vmmvar.h
--- sys/arch/amd64/include/vmmvar.h 5 Apr 2021 18:26:46 -   1.71
+++ sys/arch/amd64/include/vmmvar.h 16 May 2021 16:55:06 -
@@ -637,6 +637,7 @@ struct vm_mprotect_ept_params {

 /*
  * SEFF flags - copy from host minus:
+ *  TSC_ADJUST (SEFF0EBX_TSC_ADJUST)
  *  SGX (SEFF0EBX_SGX)
  *  HLE (SEFF0EBX_HLE)
  *  INVPCID (SEFF0EBX_INVPCID)
@@ -655,7 +656,8 @@ struct vm_mprotect_ept_params {
  *  PT (SEFF0EBX_PT)
  *  AVX512VBMI (SEFF0ECX_AVX512VBMI)
  */
-#define VMM_SEFF0EBX_MASK ~(SEFF0EBX_SGX | SEFF0EBX_HLE | SEFF0EBX_INVPCID | \
+#define VMM_SEFF0EBX_MASK ~(SEFF0EBX_TSC_ADJUST | SEFF0EBX_SGX | \
+SEFF0EBX_HLE | SEFF0EBX_INVPCID | \
 SEFF0EBX_RTM | SEFF0EBX_PQM | SEFF0EBX_MPX | \
 SEFF0EBX_PCOMMIT | SEFF0EBX_PT | \
 SEFF0EBX_AVX512F | SEFF0EBX_AVX512DQ | \



Re: [External] : Re: move copyout() in DIOCGETSTATES outside of NET_LOCK() and state_lcok

2021-05-20 Thread Alexandr Nedvedicky
Hello,

On Thu, May 20, 2021 at 11:28:19AM +0200, Claudio Jeker wrote:

> 
> One way to reduce the problems with copyout(9) is to use uvm_vslock() to
> lock the pages. This is what sysctl does but it comes with its own set of
> issues.
> 

using uvm_vslock() did not come to my mind, when I was thinking of
how to fix it.

> In general exporting large collections from the kernel needs to be
> rethought. The system should not grab a lock for a long time to
> serve a userland process.
>  
> > > Diff below moves copyout() at line 1784 outside of protection of both 
> > > locks.
> > > The approach I took is relatively straightforward:
> > > 
> > > let DIOCGETSTATES to allocate hold_states array, which will keep
> > > references to states.
> > > 
> > > grab locks and take references, keep those references in hold
> > > array.
> > > 
> > > drop locks, export states and do copyout, while walking
> > > array of references.
> > > 
> > > drop references, release hold_states array.
> > > 
> > > does it make sense? If we agree that this approach makes sense
> > 
> > In my opinion it does.  The other approach would be to (ab)use the
> > NET_LOCK() to serialize updates, like bluhm@'s diff does.  Both
> > approaches have pros and cons.
> > 
> 
> I really think adding more to the NET_LOCK() is a step in the wrong
> direction. It will just creap into everything and grow the size of the
> kernel lock. For me the cons outweight the pros.
> 

that's also my understanding. I would like to move all pf
configuration (a.k.a. pfioctl()) outside of the NET_LOCK() scope.
I mean it would still be fine for pf_test() if the caller holds NET_LOCK().
The pf_test() caller must accept the fact that pf_test() may need to grab
pf's internal locks to do its housekeeping properly.

thanks and
regards
sashan



Re: ssh(1) -v gives debug1: pledge: filesystem full

2021-05-20 Thread Hiltjo Posthuma
On Thu, May 20, 2021 at 12:15:54PM +0200, Marcus MERIGHI wrote:
> Hello Hiltjo, 
> 
> thanks for looking further than I did...
> 
> Moving to tech@...
> 
> hil...@codemadness.org (Hiltjo Posthuma), 2021.05.19 (Wed) 16:59 (CEST):
> > On Wed, May 19, 2021 at 01:20:34PM +0200, Marcus MERIGHI wrote:
> > > By accident I noticed that 
> > > 
> > > $ ssh -v $host
> > > 
> > > gives me, among many other lines, this
> > > 
> > > debug1: pledge: filesystem full
> > > 
> > > Is this expected? Something to worry about?
> > 
> > A grep shows it's in the file /usr/src/usr.bin/ssh/clientloop.c client_loop()
> > function.
> 
> to prevent the *very* similar wording of the infamous dmesg output
> 
> uid 539 on /var/db/clamav: file system full
> 
> I'd suggest a slightly different message, something like this?
> 
> Marcus
> 
> Index: usr.bin/ssh/clientloop.c
> ===
> RCS file: /cvs/src/usr.bin/ssh/clientloop.c,v
> retrieving revision 1.363
> diff -u -p -u -r1.363 clientloop.c
> --- usr.bin/ssh/clientloop.c  19 May 2021 01:24:05 -  1.363
> +++ usr.bin/ssh/clientloop.c  20 May 2021 10:11:06 -
> @@ -1232,7 +1232,7 @@ client_loop(struct ssh *ssh, int have_pt
>   fatal_f("pledge(): %s", strerror(errno));
>  
>   } else if (options.update_hostkeys) {
> - debug("pledge: filesystem full");
> + debug("pledge: filesystem full access");
>   if (pledge("stdio rpath wpath cpath unix inet dns proc tty",
>   NULL) == -1)
>   fatal_f("pledge(): %s", strerror(errno));

Hi,

You're welcome. I think the patch makes sense.

-- 
Kind regards,
Hiltjo



Re: move copyout() in DIOCGETSTATES outside of NET_LOCK() and state_lcok

2021-05-20 Thread Patrick Wildt
Am Thu, May 20, 2021 at 11:28:19AM +0200 schrieb Claudio Jeker:
> On Thu, May 20, 2021 at 09:37:38AM +0200, Martin Pieuchot wrote:
> > On 20/05/21(Thu) 03:23, Alexandr Nedvedicky wrote:
> > > Hrvoje gave a try to an experimental diff, which trades rw-locks in pf(4)
> > > for mutexes [1]. Hrvoje soon discovered machine panics when doing 'pfctl -ss'.
> > > The callstack looks as follows:
> > >
> > > [...]
> > > specific to the experimental diff [1]. However this got me thinking that
> > > it's not a good idea to do copyout() while holding NET_LOCK() and state_lock.
> > 
> > malloc(9) and copyout(9) are kind of ok while using the NET_LOCK() but
> > if a deadlock occurs while a global rwlock is held, debugging becomes
> > harder.
> > 
> > As long as the `state_lock' and PF_LOCK() are mutexes all allocations
> > and copyin/copyout(9) must be done without holding them.
> 
> One way to reduce the problems with copyout(9) is to use uvm_vslock() to
> lock the pages. This is what sysctl does but it comes with its own set of
> issues.
> 
> In general exporting large collections from the kernel needs to be
> rethought. The system should not grab a lock for a long time to
> serve a userland process.
>  
> > > Diff below moves copyout() at line 1784 outside of protection of both 
> > > locks.
> > > The approach I took is relatively straightforward:
> > > 
> > > let DIOCGETSTATES to allocate hold_states array, which will keep
> > > references to states.
> > > 
> > > grab locks and take references, keep those references in hold
> > > array.
> > > 
> > > drop locks, export states and do copyout, while walking
> > > array of references.
> > > 
> > > drop references, release hold_states array.
> > > 
> > > does it make sense? If we agree that this approach makes sense
> > 
> > In my opinion it does.  The other approach would be to (ab)use the
> > NET_LOCK() to serialize updates, like bluhm@'s diff does.  Both
> > approaches have pros and cons.
> > 
> 
> I really think adding more to the NET_LOCK() is a step in the wrong
> direction. It will just creap into everything and grow the size of the
> kernel lock. For me the cons outweight the pros.

What we do at genua probably isn't particularly relevant to the
OpenBSD approach to unlocking the network subsystem, but for what it's
worth: there is an rwlock that protects the network configuration, and for
ioctls like DIOCGETSTATES we do indeed call uvm_vslock(), then take the
lock either read or write, depending on which ioctl it is, and unlock it
prior to return.

Since this is only the network conf lock, taking it with a 'read' for
something like DIOCGETSTATES does not really influence the network
receive/transmit path, I think.
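
A rough sketch of that pattern (hypothetical; the conf lock name and the
protection flags are made up for illustration):

	error = uvm_vslock(p->p_vmspace, (caddr_t)ps->ps_states, len,
	    PROT_READ | PROT_WRITE);
	if (error)
		return (error);
	rw_enter_read(&netconf_lock);	/* hypothetical conf lock */
	error = copyout(pstore, p, sizeof(*p));	/* pages are wired, no fault */
	rw_exit_read(&netconf_lock);
	uvm_vsunlock(p->p_vmspace, (caddr_t)ps->ps_states, len);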



Re: ssh(1) -v gives debug1: pledge: filesystem full

2021-05-20 Thread Marcus MERIGHI
Hello Hiltjo, 

thanks for looking further than I did...

Moving to tech@...

hil...@codemadness.org (Hiltjo Posthuma), 2021.05.19 (Wed) 16:59 (CEST):
> On Wed, May 19, 2021 at 01:20:34PM +0200, Marcus MERIGHI wrote:
> > By accident I noticed that 
> > 
> > $ ssh -v $host
> > 
> > gives me, among many other lines, this
> > 
> > debug1: pledge: filesystem full
> > 
> > Is this expected? Something to worry about?
> 
> A grep shows it's in the file /usr/src/usr.bin/ssh/clientloop.c client_loop()
> function.

to prevent the *very* similar wording of the infamous dmesg output

uid 539 on /var/db/clamav: file system full

I'd suggest a slightly different message, something like this?

Marcus

Index: usr.bin/ssh/clientloop.c
===
RCS file: /cvs/src/usr.bin/ssh/clientloop.c,v
retrieving revision 1.363
diff -u -p -u -r1.363 clientloop.c
--- usr.bin/ssh/clientloop.c19 May 2021 01:24:05 -  1.363
+++ usr.bin/ssh/clientloop.c20 May 2021 10:11:06 -
@@ -1232,7 +1232,7 @@ client_loop(struct ssh *ssh, int have_pt
fatal_f("pledge(): %s", strerror(errno));
 
} else if (options.update_hostkeys) {
-   debug("pledge: filesystem full");
+   debug("pledge: filesystem full access");
if (pledge("stdio rpath wpath cpath unix inet dns proc tty",
NULL) == -1)
fatal_f("pledge(): %s", strerror(errno));



Re: Add f_modify and f_process callbacks to socket filterops

2021-05-20 Thread Martin Pieuchot
On 18/05/21(Tue) 14:22, Visa Hankala wrote:
> This diff adds f_modify and f_process callbacks to socket event filters.
> As a result, socket events are handled using the non-legacy paths in
> filter_modify() and filter_process() of kern_event.c.  This is a step toward
> MP-safety. However, everything still runs under the kernel lock.
> 
> The change has three intended effects:
> 
> * Socket events are handled without raising the system priority level.
>   This makes the activity observable with btrace(8).
> 
> * kqueue itself no longer calls f_event of socket filterops, which
>   allows replacing the conditional, NOTE_SUBMIT-based locking with
>   a fixed call pattern.

I love this.

> * The state of a socket event is now always rechecked before delivery
>   to user. Before, the recheck was skipped if the event was registered
>   with EV_ONESHOT.

To me this sounds sane.  I can't think of a way to rely on the current
behavior.  However if there's an easy way to split these changes in two
commits, I'd prefer to stay on the safe side.

> However, the change of behaviour with EV_ONESHOT is questionable.
> When an activated event is being processed, the code will acquire the
> socket lock anyway. Skipping the state check would only be a minor
> optimization. In addition, I think the behaviour becomes more
> consistent as now a delivered EV_ONESHOT event really was active at
> the time of delivery.
> 
> Consider the following program. It creates a socket pair, writes a byte
> to the socket, registers an EV_ONESHOT event, and reads the byte from
> the socket. Next it checks how kevent(2) behaves.
> 
> #include <sys/types.h>
> #include <sys/event.h>
> #include <sys/socket.h>
> #include <sys/time.h>
> #include <err.h>
> #include <fcntl.h>
> #include <stdio.h>
> #include <unistd.h>
> 
> int
> main(void)
> {
>   struct kevent kev[1];
>   struct timespec ts = {};
>   int fds[2], flags, kq, n;
>   char b;
> 
>   if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) == -1)
>   err(1, "socketpair");
>   flags = fcntl(fds[0], F_GETFL, 0);
>   fcntl(fds[0], F_SETFL, flags | O_NONBLOCK);
> 
>   printf("write 1\n");
>   write(fds[1], "x", 1);
> 
>   kq = kqueue();
>   if (kq == -1)
>   err(1, "kqueue");
>   EV_SET(&kev[0], fds[0], EVFILT_READ, EV_ADD | EV_ONESHOT, 0, 0, NULL);
>   if (kevent(kq, kev, 1, NULL, 0, NULL) == -1)
>   err(1, "kevent");
> 
>   n = read(fds[0], &b, 1);
>   printf("read %d\n", n);
>   n = read(fds[0], &b, 1);
>   printf("read %d\n", n);
> 
>   n = kevent(kq, NULL, 0, kev, 1, &ts);
>   printf("kevent %d\n", n);
>   n = read(fds[0], &b, 1);
>   printf("read %d\n", n);
> 
>   n = kevent(kq, NULL, 0, kev, 1, &ts);
>   printf("kevent %d\n", n);
> 
>   printf("write 1\n");
>   write(fds[1], "x", 1);
> 
>   n = kevent(kq, NULL, 0, kev, 1, &ts);
>   printf("kevent %d\n", n);
>   n = read(fds[0], &b, 1);
>   printf("read %d\n", n);
> 
>   return 0;
> }
> 
> With an unpatched kernel, the EV_ONESHOT event gets activated by the
> pending byte when the event is registered. The event remains active
> until delivery, and the delivery happens even though it is clear that
> reading from the socket will fail. The program prints:
> 
> write 1
> read 1
> read -1
> kevent 1
> read -1
> kevent 0
> write 1
> kevent 0
> read 1
> 
> With the patch applied, the event gets delivered only if the socket
> has bytes pending.
> 
> write 1
> read 1
> read -1
> kevent 0
> read -1
> kevent 0
> write 1
> kevent 1
> read 1
> 
> So, is this EV_ONESHOT change reasonable, or should the implementation
> stick with the old way? FreeBSD appears to follow the old way. MacOS
> might perform differently, though I am not sure about that.
> 
> It is not essential to change EV_ONESHOT, however.
> 
> Feedback and tests are welcome.
> 
> Index: kern/uipc_socket.c
> ===
> RCS file: src/sys/kern/uipc_socket.c,v
> retrieving revision 1.261
> diff -u -p -r1.261 uipc_socket.c
> --- kern/uipc_socket.c13 May 2021 19:43:11 -  1.261
> +++ kern/uipc_socket.c18 May 2021 12:56:24 -
> @@ -70,15 +70,26 @@ void  sorflush(struct socket *);
>  
>  void filt_sordetach(struct knote *kn);
>  int  filt_soread(struct knote *kn, long hint);
> +int  filt_soreadmodify(struct kevent *kev, struct knote *kn);
> +int  filt_soreadprocess(struct knote *kn, struct kevent *kev);
> +int  filt_soread_common(struct knote *kn, struct socket *so);
>  void filt_sowdetach(struct knote *kn);
>  int  filt_sowrite(struct knote *kn, long hint);
> +int  filt_sowritemodify(struct kevent *kev, struct knote *kn);
> +int  filt_sowriteprocess(struct knote *kn, struct kevent *kev);
> +int  filt_sowrite_common(struct knote *kn, struct socket *so);
>  int  filt_solisten(struct knote *kn, long hint);
> +int  filt_solistenmodify(struct kevent *kev, struct knote *kn);
> +int  filt_solistenprocess(struct knote *kn, struct kevent *kev);
> +int  filt_soliste

Re: move copyout() in DIOCGETSTATES outside of NET_LOCK() and state_lcok

2021-05-20 Thread Claudio Jeker
On Thu, May 20, 2021 at 09:37:38AM +0200, Martin Pieuchot wrote:
> On 20/05/21(Thu) 03:23, Alexandr Nedvedicky wrote:
> > Hrvoje gave a try to an experimental diff, which trades rw-locks in pf(4)
> > for mutexes [1]. Hrvoje soon discovered machine panics when doing 'pfctl -ss'.
> > The callstack looks as follows:
> >
> > [...]
> > specific to the experimental diff [1]. However this got me thinking that
> > it's not a good idea to do copyout() while holding NET_LOCK() and state_lock.
> 
> malloc(9) and copyout(9) are kind of ok while using the NET_LOCK() but
> if a deadlock occurs while a global rwlock is held, debugging becomes
> harder.
> 
> As long as the `state_lock' and PF_LOCK() are mutexes all allocations
> and copyin/copyout(9) must be done without holding them.

One way to reduce the problems with copyout(9) is to use uvm_vslock() to
lock the pages. This is what sysctl does but it comes with its own set of
issues.

In general exporting large collections from the kernel needs to be
rethought. The system should not grab a lock for a long time to
serve a userland process.
 
> > Diff below moves copyout() at line 1784 outside of protection of both locks.
> > The approach I took is relatively straightforward:
> > 
> > let DIOCGETSTATES to allocate hold_states array, which will keep
> > references to states.
> > 
> > grab locks and take references, keep those references in hold
> > array.
> > 
> > drop locks, export states and do copyout, while walking
> > array of references.
> > 
> > drop references, release hold_states array.
> > 
> > does it make sense? If we agree that this approach makes sense
> 
> In my opinion it does.  The other approach would be to (ab)use the
> NET_LOCK() to serialize updates, like bluhm@'s diff does.  Both
> approaches have pros and cons.
> 

I really think adding more to the NET_LOCK() is a step in the wrong
direction. It will just creap into everything and grow the size of the
kernel lock. For me the cons outweight the pros.


-- 
:wq Claudio



Re: Use atomic op for UVM map refcount

2021-05-20 Thread Mark Kettenis
> Date: Thu, 20 May 2021 10:28:29 +0200
> From: Martin Pieuchot 
> 
> On 19/05/21(Wed) 16:17, Mark Kettenis wrote:
> > > Date: Tue, 18 May 2021 13:24:42 +0200
> > > From: Martin Pieuchot 
> > > 
> > > On 18/05/21(Tue) 12:07, Mark Kettenis wrote:
> > > > > Date: Tue, 18 May 2021 12:02:19 +0200
> > > > > From: Martin Pieuchot 
> > > > > 
> > > > > This allows us to not rely on the KERNEL_LOCK() to check reference
> > > > > counts.
> > > > > 
> > > > > Also reduces differences with NetBSD and shrink my upcoming 
> > > > > `vmobjlock'
> > > > > diff.
> > > > > 
> > > > > ok?
> > > > 
> > > > Shouldn't we make ref_count volatile in that case?
> > > 
> > > I don't know,  I couldn't find any evidence about where to use "volatile"
> > > in the kernel.
> > > 
> > > My understanding is that using "volatile" tells the compiler to not
> > > "cache" the value of such field in a register because it can change at
> > > any time.  Is it so?
> > 
> > Right.  So if you want the access to be atomic, it needs to be
> > "uncached" and therefore you need to use volatile.  Now the atomic
> > APIs explicitly cast their pointer arguments to volatile, so if you
> > exclusively through those APIs you don't strictly need the variable
> > itself to be declared volatile.  But I think it still is a good idea
> > to declare them as such.
> 
> Thanks for the explanation.  Do you suggest we use the "volatile"
> keyword as a hint and/or to avoid surprises?  If we agree on this
> I'll look at similar uses of atomic operations to unify them.

Yes, I think we should do that.  The volatile keyword shouldn't hurt
and is a clear signal that there is something special about a
variable.

> > > There's only a couple of 'volatile' usages in sys/sys.  These annotations
> > > do not explicitly indicate which piece of code requires it.  Maybe it 
> > > would
> > > be clearer to use a cast or a macro where necessary.  This might help us
> > > understand why and where "volatile" is needed.
> > 
> > There are the READ_ONCE() and WRITE_ONCE() macros.  I'm not a big fan
> > of those (since they add clutter) but they do take care of dependency
> > ordering issues that exist in the alpha memory model.  Must admit that
> > I only vaguely understand that issue, but I think it involves ordered
> > access to two atomic variables which doesn't seem to be the case.
> 
> These macros are used in places where declaring the field as "volatile"
> could also work, no?  We can look at __mp_lock and SMR implementations.
> So could we agree one way to do things?

Not 100%; see the comment about the alpha memory model above.  So as
long as we support OpenBSD/alpha, READ_ONCE() and WRITE_ONCE() will be
necessary in certain cases.

> Visa, David, why did you pick READ_ONCE() in SMR and veb(4)?  Anything
> we overlooked regarding the use of "volatile"?



Re: Use atomic op for UVM map refcount

2021-05-20 Thread Martin Pieuchot
On 19/05/21(Wed) 16:17, Mark Kettenis wrote:
> > Date: Tue, 18 May 2021 13:24:42 +0200
> > From: Martin Pieuchot 
> > 
> > On 18/05/21(Tue) 12:07, Mark Kettenis wrote:
> > > > Date: Tue, 18 May 2021 12:02:19 +0200
> > > > From: Martin Pieuchot 
> > > > 
> > > > This allows us to not rely on the KERNEL_LOCK() to check reference
> > > > counts.
> > > > 
> > > > Also reduces differences with NetBSD and shrink my upcoming `vmobjlock'
> > > > diff.
> > > > 
> > > > ok?
> > > 
> > > Shouldn't we make ref_count volatile in that case?
> > 
> > I don't know,  I couldn't find any evidence about where to use "volatile"
> > in the kernel.
> > 
> > My understanding is that using "volatile" tells the compiler to not
> > "cache" the value of such field in a register because it can change at
> > any time.  Is it so?
> 
> Right.  So if you want the access to be atomic, it needs to be
> "uncached" and therefore you need to use volatile.  Now the atomic
> APIs explicitly cast their pointer arguments to volatile, so if you
> go exclusively through those APIs you don't strictly need the variable
> itself to be declared volatile.  But I think it still is a good idea
> to declare them as such.

Thanks for the explanation.  Do you suggest we use the "volatile"
keyword as a hint and/or to avoid surprises?  If we agree on this
I'll look at similar uses of atomic operations to unify them.

> > There's only a couple of 'volatile' usages in sys/sys.  These annotations
> > do not explicitly indicate which piece of code requires it.  Maybe it would
> > be clearer to use a cast or a macro where necessary.  This might help us
> > understand why and where "volatile" is needed.
> 
> There are the READ_ONCE() and WRITE_ONCE() macros.  I'm not a big fan
> of those (since they add clutter) but they do take care of dependency
> ordering issues that exist in the alpha memory model.  Must admit that
> I only vaguely understand that issue, but I think it involves ordered
> access to two atomic variables which doesn't seem to be the case.

These macros are used in places where declaring the field as "volatile"
could also work, no?  We can look at __mp_lock and SMR implementations.
So could we agree one way to do things?

Visa, David, why did you pick READ_ONCE() in SMR and veb(4)?  Anything
we overlooked regarding the use of "volatile"?
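
To make it concrete, the pattern under discussion looks roughly like
this (a sketch, not the committed diff; the teardown call is
hypothetical):

	struct vm_map {
		/* other fields omitted */
		unsigned int	ref_count;	/* volatile?  see above */
	};

	/* before: increment serialized by the KERNEL_LOCK() */
	map->ref_count++;

	/* after: safe without the kernel lock; the atomic_* API casts
	 * its argument to volatile internally */
	atomic_inc_int(&map->ref_count);

	if (atomic_dec_int_nv(&map->ref_count) == 0)
		uvm_map_teardown(map);	/* last reference dropped */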



Re: [External] : Re: move copyout() in DIOCGETSTATES outside of NET_LOCK() and state_lcok

2021-05-20 Thread Alexandr Nedvedicky
Hello,

> 
> 
> with this diff I can't reproduce the panic as before. I tried pf with
> routing, veb, tpmr and bridge.
> 
> Do you want me to test this diff with the parallel diff?
> 

I think it makes no sense to test the change around copyout() with the
parallel diff; as mpi@ points out, rwlocks don't mind (sort of) if copyout()
is being called inside a scope protected by rw-locks.

thanks and
regards
sashan



Re: move copyout() in DIOCGETSTATES outside of NET_LOCK() and state_lcok

2021-05-20 Thread Hrvoje Popovski
On 20.5.2021. 3:23, Alexandr Nedvedicky wrote:
> Hello,
> 
> Hrvoje gave a try to an experimental diff, which trades rw-locks in pf(4)
> for mutexes [1]. Hrvoje soon discovered machine panics when doing 'pfctl -ss'.
> The callstack looks as follows:
> 
> panic: acquiring blockable sleep lock with spinlock or critical section
> held (rwlock) vmmaplk
> Stopped at  db_enter+0x10:  popq%rbp
> TIDPIDUID PRFLAGS PFLAGS  CPU  COMMAND
> *512895  28841  0 0x3  03K pfctl
> db_enter() at db_enter+0x10
> panic(81e19411) at panic+0x12a
> witness_checkorder(fd83b09b4d18,1,0) at witness_checkorder+0xbce
> rw_enter_read(fd83b09b4d08) at rw_enter_read+0x38
> uvmfault_lookup(8000238e3418,0) at uvmfault_lookup+0x8a
> uvm_fault_check(8000238e3418,8000238e3450,8000238e3478) at
> uvm_fault_check+0x32
> uvm_fault(fd83b09b4d00,e36553c000,0,2) at uvm_fault+0xfc
> kpageflttrap(8000238e3590,e36553c000) at kpageflttrap+0x131
> kerntrap(8000238e3590) at kerntrap+0x91
> alltraps_kern_meltdown() at alltraps_kern_meltdown+0x7b
> copyout() at copyout+0x53
> VOP_IOCTL(fd83b24340e0,c0104419,8000238e3930,1,fd842f7d7120,800
> 0239597a8) at VOP_IOCTL+0x68
> vn_ioctl(fd83b294edc0,c0104419,8000238e3930,8000239597a8) at
> vn_ioctl+0x75
> sys_ioctl(8000239597a8,8000238e3a40,8000238e3aa0) at
> sys_ioctl+0x2c4
> end trace frame: 0x8000238e3b00, count: 0
> https://www.openbsd.org/ddb.html
> reports.  Insufficient info makes it difficult to find and fix bugs.
> ddb{3}>
> 
> The problem comes from DIOCGETSTATES in pfioctl() here:
> 1775 
> 1776 NET_LOCK();
> 1777 PF_STATE_ENTER_READ();
> 1778 state = TAILQ_FIRST(&state_list);
> 1779 while (state) {
> 1780 if (state->timeout != PFTM_UNLINKED) {
> 1781 if ((nr+1) * sizeof(*p) > ps->ps_len)
> 1782 break;
> 1783 pf_state_export(pstore, state);
> 1784 error = copyout(pstore, p, sizeof(*p));
> 1785 if (error) {
> 1786 free(pstore, M_TEMP, 
> sizeof(*pstore));
> 1787 PF_STATE_EXIT_READ();
> 1788 NET_UNLOCK();
> 1789 goto fail;
> 1790 }
> 1791 p++;
> 1792 nr++;
> 1793 }
> 1794 state = TAILQ_NEXT(state, entry_list);
> 1795 }
> 1796 PF_STATE_EXIT_READ();
> 1797 NET_UNLOCK();
> 1798 
> 
> at line 1784 we do copyout() while holding a mutex. As I've mentioned, the
> glitch is specific to the experimental diff [1]. However this got me thinking
> that it's not a good idea to do copyout() while holding NET_LOCK() and
> state_lock.
> 
> Diff below moves copyout() at line 1784 outside of protection of both locks.
> The approach I took is relatively straightforward:
> 
> let DIOCGETSTATES to allocate hold_states array, which will keep
> references to states.
> 
> grab locks and take references, keep those references in hold
> array.
> 
> drop locks, export states and do copyout, while walking
> array of references.
> 
> drop references, release hold_states array.
> 
> does it make sense? If we agree that this approach makes sense
> I'll commit this diff and revisit other such places we currently
> have in pfioctl().


Hi,

with this diff I can't reproduce the panic as before. I tried pf with
routing, veb, tpmr and bridge.

Do you want me to test this diff with the parallel diff?






Re: move copyout() in DIOCGETSTATES outside of NET_LOCK() and state_lcok

2021-05-20 Thread Martin Pieuchot
On 20/05/21(Thu) 03:23, Alexandr Nedvedicky wrote:
> Hrvoje gave a try to an experimental diff, which trades rw-locks in pf(4)
> for mutexes [1]. Hrvoje soon discovered machine panics when doing 'pfctl -ss'.
> The callstack looks as follows:
>
> [...]
> specific to the experimental diff [1]. However this got me thinking that
> it's not a good idea to do copyout() while holding NET_LOCK() and state_lock.

malloc(9) and copyout(9) are kind of ok while using the NET_LOCK() but
if a deadlock occurs while a global rwlock is held, debugging becomes
harder.

As long as the `state_lock' and PF_LOCK() are mutexes all allocations
and copyin/copyout(9) must be done without holding them.

> Diff below moves copyout() at line 1784 outside of protection of both locks.
> The approach I took is relatively straightforward:
> 
> let DIOCGETSTATES to allocate hold_states array, which will keep
> references to states.
> 
> grab locks and take references, keep those references in hold
> array.
> 
> drop locks, export states and do copyout, while walking
> array of references.
> 
> drop references, release hold_states array.
> 
> does it make sense? If we agree that this approach makes sense

In my opinion it does.  The other approach would be to (ab)use the
NET_LOCK() to serialize updates, like bluhm@'s diff does.  Both
approaches have pros and cons.

> I'll commit this diff and revisit other such places we currently
> have in pfioctl().
> 
> thanks and
> regards
> sashan
> 
> [1] https://marc.info/?l=openbsd-tech&m=162138181106887&w=2
> 
> 8<---8<---8<--8<
> diff --git a/sys/net/pf_ioctl.c b/sys/net/pf_ioctl.c
> index ae7bb008351..0d4ac97a92c 100644
> --- a/sys/net/pf_ioctl.c
> +++ b/sys/net/pf_ioctl.c
> @@ -1762,43 +1762,58 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
>   struct pf_state *state;
>   struct pfsync_state *p, *pstore;
>   u_int32_t    nr = 0;
> + struct pf_state **hold_states;
> + u_int32_t    hold_len, i;
>  
>   if (ps->ps_len == 0) {
>   nr = pf_status.states;
>   ps->ps_len = sizeof(struct pfsync_state) * nr;
>   break;
> + } else {
> + hold_len = ps->ps_len / sizeof(struct pfsync_state);
> + hold_len = MIN(hold_len, pf_status.states);
>   }
>  
>   pstore = malloc(sizeof(*pstore), M_TEMP, M_WAITOK);
> + hold_states = mallocarray(hold_len + 1,
> + sizeof(struct pf_state *), M_TEMP, M_WAITOK | M_ZERO);
>  
>   p = ps->ps_states;
>  
> + i = 0;
>   NET_LOCK();
>   PF_STATE_ENTER_READ();
> - state = TAILQ_FIRST(&state_list);
> - while (state) {
> + TAILQ_FOREACH(state, &state_list, entry_list) {
> + hold_states[i++] = pf_state_ref(state);
> + if (i >= hold_len)
> + break;
> + }
> + PF_STATE_EXIT_READ();
> + NET_UNLOCK();
> +
> + i = 0;
> + while ((state = hold_states[i++]) != NULL) {
>   if (state->timeout != PFTM_UNLINKED) {
> - if ((nr+1) * sizeof(*p) > ps->ps_len)
> - break;
>   pf_state_export(pstore, state);
>   error = copyout(pstore, p, sizeof(*p));
> - if (error) {
> - free(pstore, M_TEMP, sizeof(*pstore));
> - PF_STATE_EXIT_READ();
> - NET_UNLOCK();
> - goto fail;
> - }
> + if (error)
> + break;
>   p++;
>   nr++;
>   }
> - state = TAILQ_NEXT(state, entry_list);
> + pf_state_unref(state);
>   }
> - PF_STATE_EXIT_READ();
> - NET_UNLOCK();
>  
> - ps->ps_len = sizeof(struct pfsync_state) * nr;
> + if (error) {
> + pf_state_unref(state);
> + while ((state = hold_states[i++]) != NULL)
> + pf_state_unref(state);
> + } else
> + ps->ps_len = sizeof(struct pfsync_state) * nr;
>  
>   free(pstore, M_TEMP, sizeof(*pstore));
> + free(hold_states, M_TEMP,
> + sizeof(struct pf_state *) * (hold_len + 1));
>   break;
>   }
>  
> 



How to debug kernel core with lldb

2021-05-20 Thread Masato Asou
Hi tech,

Does anybody know how to debug a kernel core with lldb?

I am using OpenBSD-current.

I know how to debug a kernel core with gdb's target kvm command, as
described in man crash.  However, lldb does not have a target kvm
command.  Has a target kvm command been implemented yet?
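
For reference, the gdb procedure from crash(8) is roughly:

	# gdb bsd.gdb
	(gdb) target kvm /var/crash/bsd.0.core

so the question is whether lldb has grown an equivalent of gdb's kvm
target.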
--
ASOU Masato