Re: Patch: Sending credentials over Unix datagram sockets

2020-03-17 Thread David Mackay
GMail seems determined to ruin the formatting. Diff is now an
attachment instead.

-Original Message-
From: David Mackay  
Sent: 18 March 2020 03:24
To: 'tech@openbsd.org' 
Subject: RE: Patch: Sending credentials over Unix datagram sockets

Sorry, looks like my mail client mangled the previous message.
I've reproduced the message below and made the diff an attachment in
case it happens again.

Dear openbsd-tech,

On GNU HURD and FreeBSD, the control message SCM_CREDS may be allocated
by a client of a Unix datagram socket. When the kernel encounters this,
it fills out a struct cmsgcred containing PID, UID, GID, effective UID,
and effective GIDs of the sender.

This patch implements this for OpenBSD.

Kind regards,
David

-Original Message-
From: David Mackay  
Sent: 18 March 2020 02:53
To: 'tech@openbsd.org' 
Subject: Patch: Sending credentials over Unix datagram sockets

Dear openbsd-tech,

On GNU HURD and FreeBSD, the control message SCM_CREDS may be allocated by a
client of a Unix datagram socket. When the kernel encounters this, it fills out
a struct cmsgcred containing PID, UID, GID, effective UID, and effective GIDs of
the sender.

This patch implements this for OpenBSD.

Kind regards,
David


diff --git sys/kern/uipc_usrreq.c sys/kern/uipc_usrreq.c
index 8a735f3..7df3846 100644
--- sys/kern/uipc_usrreq.c
+++ sys/kern/uipc_usrreq.c
@@ -688,9 +688,10 @@ unp_externalize(struct mbuf *rights, socklen_t controllen,
int flags)
/*
 * This code only works because SCM_RIGHTS is the only supported
 * control message type on unix sockets. Enforce this here.
+* If it isn't, then return.
 */
if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET)
-   return EINVAL;
+   return (0);
 
nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
sizeof(struct fdpass);
@@ -831,10 +832,26 @@ unp_internalize(struct mbuf *control, struct proc *p)
 */
if (control->m_len < CMSG_LEN(0) || cm->cmsg_len < CMSG_LEN(0))
return (EINVAL);
-   if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
-   !(cm->cmsg_len == control->m_len ||
+   if (!(cm->cmsg_type == SCM_RIGHTS || cm->cmsg_type == SCM_CREDS) ||
+   cm->cmsg_level != SOL_SOCKET || !(cm->cmsg_len == control->m_len ||
control->m_len == CMSG_ALIGN(cm->cmsg_len)))
return (EINVAL);
+
+   if(cm->cmsg_type == SCM_CREDS) {
+   struct cmsgcred * cmcred = (struct cmsgcred *)CMSG_DATA(cm);
+
+   cmcred->cmcred_pid = p->p_p->ps_pid;
+   cmcred->cmcred_uid = p->p_ucred->cr_ruid;
+   cmcred->cmcred_euid = p->p_ucred->cr_uid;
+   cmcred->cmcred_gid = p->p_ucred->cr_rgid;
+   cmcred->cmcred_ngroups = MIN(p->p_ucred->cr_ngroups,
+
CMGROUP_MAX);
+   for (i = 0; i < cmcred->cmcred_ngroups; i++)
+   cmcred->cmcred_groups[i] = p->p_ucred->cr_groups[i];
+
+   return (0);
+   }
+
nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);
 
if (unp_rights + nfds > maxfiles / 10)
diff --git sys/sys/socket.h sys/sys/socket.h
index 927fd26..75c2319 100644
--- sys/sys/socket.h
+++ sys/sys/socket.h
@@ -486,6 +486,26 @@ struct cmsghdr {
 /* followed by u_char  cmsg_data[]; */
 };
 
+/*
+ * Only 16 groups are sent because struct mbuf is a fixed size; these must fit.
+ */
+#define CMGROUP_MAX 16
+
+/*
+ * Credentials as attached to control messages of type SCM_CREDS.
+ * The sender must specify this control message, but the kernel
+ * will overwrite anything the sender put in the actual structure
+ * with the real values.
+ */
+struct cmsgcred {
+   pid_t   cmcred_pid; /* sender PID */
+   uid_t   cmcred_uid; /* sender real UID */
+   uid_t   cmcred_euid;/* sender effective UID */
+   gid_t   cmcred_gid; /* sender real GID*/
+   short   cmcred_ngroups; /* number of GIDs */
+   gid_t   cmcred_groups[CMGROUP_MAX]; /* GIDs */
+}
+
 /* given pointer to struct cmsghdr, return pointer to data */
 #defineCMSG_DATA(cmsg) \
((unsigned char *)(cmsg) + _ALIGN(sizeof(struct cmsghdr)))
@@ -520,6 +540,7 @@ struct cmsghdr {
 
 /* "Socket"-level control message types: */
 #defineSCM_RIGHTS  0x01/* access rights (array of int)
*/
+#defineSCM_CREDS   0x02/* sender credentials (struct
cmsgcred) */
 #defineSCM_TIMESTAMP   0x04/* timestamp (struct timeval) */
 
 #ifndef _KERNEL


creds.patch
Description: Binary data


Re: Patch: Sending credentials over Unix datagram sockets

2020-03-17 Thread David Mackay
Sorry, looks like my mail client mangled the previous message.
I've reproduced the message below and made the diff an attachment in
case it happens again.

Dear openbsd-tech,

On GNU HURD and FreeBSD, the control message SCM_CREDS may be allocated
by a client of a Unix datagram socket. When the kernel encounters this,
it fills out a struct cmsgcred containing PID, UID, GID, effective UID,
and effective GIDs of the sender.

This patch implements this for OpenBSD.

Kind regards,
David

-Original Message-
From: David Mackay  
Sent: 18 March 2020 02:53
To: 'tech@openbsd.org' 
Subject: Patch: Sending credentials over Unix datagram sockets

Dear openbsd-tech,

On GNU HURD and FreeBSD, the control message SCM_CREDS may be allocated by a
client of a Unix datagram socket. When the kernel encounters this, it fills out
a struct cmsgcred containing PID, UID, GID, effective UID, and effective GIDs of
the sender.

This patch implements this for OpenBSD.

Kind regards,
David


diff --git sys/kern/uipc_usrreq.c sys/kern/uipc_usrreq.c
index 8a735f3..7df3846 100644
--- sys/kern/uipc_usrreq.c
+++ sys/kern/uipc_usrreq.c
@@ -688,9 +688,10 @@ unp_externalize(struct mbuf *rights, socklen_t controllen,
int flags)
/*
 * This code only works because SCM_RIGHTS is the only supported
 * control message type on unix sockets. Enforce this here.
+* If it isn't, then return.
 */
if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET)
-   return EINVAL;
+   return (0);
 
nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
sizeof(struct fdpass);
@@ -831,10 +832,26 @@ unp_internalize(struct mbuf *control, struct proc *p)
 */
if (control->m_len < CMSG_LEN(0) || cm->cmsg_len < CMSG_LEN(0))
return (EINVAL);
-   if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
-   !(cm->cmsg_len == control->m_len ||
+   if (!(cm->cmsg_type == SCM_RIGHTS || cm->cmsg_type == SCM_CREDS) ||
+   cm->cmsg_level != SOL_SOCKET || !(cm->cmsg_len == control->m_len ||
control->m_len == CMSG_ALIGN(cm->cmsg_len)))
return (EINVAL);
+
+   if(cm->cmsg_type == SCM_CREDS) {
+   struct cmsgcred * cmcred = (struct cmsgcred *)CMSG_DATA(cm);
+
+   cmcred->cmcred_pid = p->p_p->ps_pid;
+   cmcred->cmcred_uid = p->p_ucred->cr_ruid;
+   cmcred->cmcred_euid = p->p_ucred->cr_uid;
+   cmcred->cmcred_gid = p->p_ucred->cr_rgid;
+   cmcred->cmcred_ngroups = MIN(p->p_ucred->cr_ngroups,
+
CMGROUP_MAX);
+   for (i = 0; i < cmcred->cmcred_ngroups; i++)
+   cmcred->cmcred_groups[i] = p->p_ucred->cr_groups[i];
+
+   return (0);
+   }
+
nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);
 
if (unp_rights + nfds > maxfiles / 10)
diff --git sys/sys/socket.h sys/sys/socket.h
index 927fd26..75c2319 100644
--- sys/sys/socket.h
+++ sys/sys/socket.h
@@ -486,6 +486,26 @@ struct cmsghdr {
 /* followed by u_char  cmsg_data[]; */
 };
 
+/*
+ * Only 16 groups are sent because struct mbuf is a fixed size; these must fit.
+ */
+#define CMGROUP_MAX 16
+
+/*
+ * Credentials as attached to control messages of type SCM_CREDS.
+ * The sender must specify this control message, but the kernel
+ * will overwrite anything the sender put in the actual structure
+ * with the real values.
+ */
+struct cmsgcred {
+   pid_t   cmcred_pid; /* sender PID */
+   uid_t   cmcred_uid; /* sender real UID */
+   uid_t   cmcred_euid;/* sender effective UID */
+   gid_t   cmcred_gid; /* sender real GID*/
+   short   cmcred_ngroups; /* number of GIDs */
+   gid_t   cmcred_groups[CMGROUP_MAX]; /* GIDs */
+}
+
 /* given pointer to struct cmsghdr, return pointer to data */
 #defineCMSG_DATA(cmsg) \
((unsigned char *)(cmsg) + _ALIGN(sizeof(struct cmsghdr)))
@@ -520,6 +540,7 @@ struct cmsghdr {
 
 /* "Socket"-level control message types: */
 #defineSCM_RIGHTS  0x01/* access rights (array of int)
*/
+#defineSCM_CREDS   0x02/* sender credentials (struct
cmsgcred) */
 #defineSCM_TIMESTAMP   0x04/* timestamp (struct timeval) */
 
 #ifndef _KERNEL



Patch: Sending credentials over Unix datagram sockets

2020-03-17 Thread David Mackay
Dear openbsd-tech,

On GNU HURD and FreeBSD, the control message SCM_CREDS may be allocated by a
client of a Unix datagram socket. When the kernel encounters this, it fills
out
a struct cmsgcred containing PID, UID, GID, effective UID, and effective
GIDs of
the sender.

This patch implements this for OpenBSD.

Kind regards,
David


diff --git sys/kern/uipc_usrreq.c sys/kern/uipc_usrreq.c
index 8a735f3..7df3846 100644
--- sys/kern/uipc_usrreq.c
+++ sys/kern/uipc_usrreq.c
@@ -688,9 +688,10 @@ unp_externalize(struct mbuf *rights, socklen_t
controllen, int flags)
/*
 * This code only works because SCM_RIGHTS is the only supported
 * control message type on unix sockets. Enforce this here.
+* If it isn't, then return.
 */
if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET)
-   return EINVAL;
+   return (0);
 
nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
sizeof(struct fdpass);
@@ -831,10 +832,26 @@ unp_internalize(struct mbuf *control, struct proc *p)
 */
if (control->m_len < CMSG_LEN(0) || cm->cmsg_len < CMSG_LEN(0))
return (EINVAL);
-   if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
-   !(cm->cmsg_len == control->m_len ||
+   if (!(cm->cmsg_type == SCM_RIGHTS || cm->cmsg_type == SCM_CREDS) ||
+   cm->cmsg_level != SOL_SOCKET || !(cm->cmsg_len == control->m_len
||
control->m_len == CMSG_ALIGN(cm->cmsg_len)))
return (EINVAL);
+
+   if(cm->cmsg_type == SCM_CREDS) {
+   struct cmsgcred * cmcred = (struct cmsgcred *)CMSG_DATA(cm);
+
+   cmcred->cmcred_pid = p->p_p->ps_pid;
+   cmcred->cmcred_uid = p->p_ucred->cr_ruid;
+   cmcred->cmcred_euid = p->p_ucred->cr_uid;
+   cmcred->cmcred_gid = p->p_ucred->cr_rgid;
+   cmcred->cmcred_ngroups = MIN(p->p_ucred->cr_ngroups,
+
CMGROUP_MAX);
+   for (i = 0; i < cmcred->cmcred_ngroups; i++)
+   cmcred->cmcred_groups[i] = p->p_ucred->cr_groups[i];
+
+   return (0);
+   }
+
nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);
 
if (unp_rights + nfds > maxfiles / 10)
diff --git sys/sys/socket.h sys/sys/socket.h
index 927fd26..75c2319 100644
--- sys/sys/socket.h
+++ sys/sys/socket.h
@@ -486,6 +486,26 @@ struct cmsghdr {
 /* followed by u_char  cmsg_data[]; */
 };
 
+/*
+ * Only 16 groups are sent because struct mbuf is a fixed size; these must
fit.
+ */
+#define CMGROUP_MAX 16
+
+/*
+ * Credentials as attached to control messages of type SCM_CREDS.
+ * The sender must specify this control message, but the kernel
+ * will overwrite anything the sender put in the actual structure
+ * with the real values.
+ */
+struct cmsgcred {
+   pid_t   cmcred_pid; /* sender PID */
+   uid_t   cmcred_uid; /* sender real UID */
+   uid_t   cmcred_euid;/* sender effective UID */
+   gid_t   cmcred_gid; /* sender real GID*/
+   short   cmcred_ngroups; /* number of GIDs */
+   gid_t   cmcred_groups[CMGROUP_MAX]; /* GIDs */
+}
+
 /* given pointer to struct cmsghdr, return pointer to data */
 #defineCMSG_DATA(cmsg) \
((unsigned char *)(cmsg) + _ALIGN(sizeof(struct cmsghdr)))
@@ -520,6 +540,7 @@ struct cmsghdr {
 
 /* "Socket"-level control message types: */
 #defineSCM_RIGHTS  0x01/* access rights (array of
int) */
+#defineSCM_CREDS   0x02/* sender credentials
(struct cmsgcred) */
 #defineSCM_TIMESTAMP   0x04/* timestamp (struct
timeval) */
 
 #ifndef _KERNEL



Re: Regarding the understanding of the malloc(3) code

2020-03-17 Thread Neeraj Pal
On Fri, Mar 13, 2020, at 11:45 AM Otto Moerbeek  wrote:
>
> Please indent your code snippets.
yeah, my apologies. I shall indent the code snippets.

>
> di_info is special. Having a guard page on both sides for regular
> allocation can be done, but would waste more pages. Note that
> allocations are already spread throughout the address space, so it is
> very likely that an allocation is surrounded by unmapped pages.
>
Okay, So, as from omalloc_poolinit() code it is not there, but we can do
but it will be wastage of one-page memory and also entire address space is
spread with unmapped pages. So, it is very likely that there will some
unmapped page beside dir_info in the memory.

>
> We need two pages to store dir_info.
>
>
> > Now, MMAPNONE maps up to len (8192 + (4096 * 2)) = 16384
>
> We allocate 4 pages prot none.
>
> > then, mprotecting the pages through p + MALLOC_PAGESIZE + DIR_INFO_RSZ
- 1
>
> the two middle pages are r/w.
>
> > d_avail = (8192 - 4824) >> 4 = 3368 >> 4 = 210
> >
> > Now, d = (p + MALLOC_PAGESIZE + (random_no_under_210 << 4)
>
> di_info ends up on an aligned address somewhere in the middle pages on
> an offset between 0 and (210<<4) = 0..3360, counting from the start of
> the two middle pages.
Thank you for the information. So, it is like [(p + MALLOC_PAGESIZE) +
(0..3360)]. It is kind of like array, For example, let's suppose, x = p +
MALLOC_PAGESIZE. So, it will be x[0..3360].
Am I right?
>
> MALLOC_CHUNK_LISTS could be increased at the cost of overhead.
> MALLOC_MAXSHIFT cannot, it is the shift of the max chunk size that fits in
> a page.
Okay. I understood. And, yeah more randomization means more cost overhead.

Thank you, Otto, for your detailed information.

Please find the code below:

948 /*
949  * Allocate a chunk
950  */
951 static void *
952 malloc_bytes(struct dir_info *d, size_t size, void *f)
953 {
954 u_int i, r;
955 int j, listnum;
956 size_t k;
957 u_short *lp;
958 struct chunk_info *bp;
959 void *p;
960 
961 if (mopts.malloc_canary != (d->canary1 ^ 
(u_int32_t)(uintptr_t)d) ||
962 d->canary1 != ~d->canary2)
963 wrterror(d, "internal struct corrupt");
964 
965 j = find_chunksize(size);
966 
967 r = ((u_int)getrbyte(d) << 8) | getrbyte(d);
968 listnum = r % MALLOC_CHUNK_LISTS;
969 /* If it's empty, make a page more of that size chunks */
970 if ((bp = LIST_FIRST(>chunk_dir[j][listnum])) == NULL) {
971 bp = omalloc_make_chunks(d, j, listnum);
972 if (bp == NULL)
973 return NULL;
974 }
975 
976 if (bp->canary != (u_short)d->canary1)
977 wrterror(d, "chunk info corrupted");


Here, in the code mentioned above, we can see that on line 961 and
line 976. I don't understand why it is checking for
canaries of malloc_readonly with d and then allocated chunk bp with d,
because I have seen that validation of canary
happens in free(3) not in malloc(3). So, it is like there may be some
cases where one can corrupt these also??


978 
979 i = (r / MALLOC_CHUNK_LISTS) & (bp->total - 1);
980 
981 /* start somewhere in a short */
982 lp = >bits[i / MALLOC_BITS];
983 if (*lp) {
984 j = i % MALLOC_BITS;
985 k = ffs(*lp >> j);
986 if (k != 0) {
987 k += j - 1;
988 goto found;
989 }
990 }
991 /* no bit halfway, go to next full short */
992 i /= MALLOC_BITS;
993 for (;;) {
994 if (++i >= bp->total / MALLOC_BITS)
995 i = 0;
996 lp = >bits[i];
997 if (*lp) {
998 k = ffs(*lp) - 1;
999 break;
1000}
1001}
1002found:
1003#ifdef MALLOC_STATS
1004if (i == 0 && k == 0) {
1005struct region_info *r = find(d, bp->page);
1006r->f = f;
1007}
1008#endif
1009
1010*lp ^= 1 << k;
1011
1012/* If there are no more free, remove from free-list */
1013if (--bp->free == 0)
1014LIST_REMOVE(bp, entries);
1015
1016/* Adjust to the real offset of that chunk */
1017k += (lp - bp->bits) * MALLOC_BITS;
1018
1019if (mopts.chunk_canaries && size > 0)
1020bp->bits[bp->offset + k] = size;
1021
1022k <<= bp->shift;
1023
1024p = (char *)bp->page + k;
1025if (bp->size > 0) {
1026if (d->malloc_junk == 2)
1027

Re: [PLEASE TEST] acpi(4): acpi_sleep(): tsleep(9) -> tsleep_nsec(9)

2020-03-17 Thread Lucas Raab
On Sun, Mar 15, 2020, at 21:55, Scott Cheloha wrote:
> Hi,
> 
> This is a straightforward ticks-to-milliseconds conversion, but IIRC
> pirofti@ wanted me to get some tests before committing it.
> 
> The only users of acpi_sleep() are (a) acpitz(4) and (b) any AML code
> that uses AMLOP_SLEEP.  AMLOP_SLEEP seems to trigger just before a
> suspend.  I don't know when else it is used.
> 
> If you have an acpi(4) laptop with suspend/resume support, please
> apply this patch and let me know if anything doesn't work,
> particularly with suspend/resume.
> 
> I've been running with this since November without apparent issue...
> But this driver impacts many people, so I think I need more tests.
> 
> Cheers,
> 
> Scott
> 

No problems so far on a Thinkpad X1C7

Lucas



vmm Linux guest crashes frequently

2020-03-17 Thread Johan Huldtgren
hello,

i am running vmd on current and one of the guests I have is an Ubuntu
18.04.4 LTS. I have two problems, first this guest will crash 
frequently and second when restarting this guest it will often crash
on restart.

Starting with the second issue first, when the guest has crashed and
is started back up it will sometimes crash, but only during fsck, this
makes me suspect disk I/O being the culprit. The output I get when
doing 'vmctl start -c 2' is:

[0.00] ACPI BIOS Error (bug): A valid RSDP was not found 
(20170831/tbxfroot-244) a command-line.   
[1.096305] mce: Unable to init MCE device (rc: -5) 
/dev/vda1: recovering journal  
/dev/vda1: clean, 291619/1966080 files, 4637993/7863808 blocks

[EOT]

I've attached the the full log displayed during startup (with VMM_DEBUG)
the one thing I noticed was this seems to happen just before each
crash:

Mar 17 11:08:40 absu /bsd: vmx_handle_wrmsr: wrmsr exit, msr=0xc90, discarding 
data written from guest=0x0:0xf
Mar 17 11:08:40 absu /bsd: vmx_handle_rdmsr: rdmsr exit, msr=0xc90, data 
returned to guest=0x0:0x0
Mar 17 11:08:40 absu vmd[74219]: vcpu_process_com_data: guest reading com1 when 
not ready
Mar 17 11:08:40 absu vmd[74219]: vcpu_process_com_data: guest reading com1 when 
not ready
Mar 17 11:08:40 absu /bsd: vmx_handle_rdmsr: rdmsr exit, msr=0x3b, data 
returned to guest=0x0:0x0
Mar 17 11:08:40 absu vmd[74219]: vioblk_notifyq: unsupported command 0x8
Mar 17 11:08:40 absu vmd[74219]: vioblk_notifyq: unsupported command 0x8
Mar 17 11:08:42 absu vmd[74219]: vmd: vm 3 event thread exited unexpectedly
Mar 17 11:08:42 absu /bsd: vmm_free_vpid: freed VPID/ASID 2

Generally after a few attempts it will get passed this and then start
up normally.

The issue of the guest crashing is harder to pin down, it's random,
I've gone days with everything running fine, sometimes it crashes
multiple times a day. Most common is about once a day or so. When
it did crash even with VMM_DEBUG I didn't see much:

Mar 17 14:27:13 absu /bsd: vmx_handle_rdmsr: rdmsr exit, msr=0x3b, data 
returned to guest=0x0:0x0
Mar 17 14:27:46 absu last message repeated 6 times
Mar 17 14:29:46 absu last message repeated 22 times
Mar 17 14:39:44 absu last message repeated 109 times
Mar 17 14:49:46 absu last message repeated 110 times
Mar 17 14:56:53 absu last message repeated 78 times
Mar 17 15:09:45 absu last message repeated 63 times
Mar 17 15:19:46 absu last message repeated 78 times
Mar 17 15:29:48 absu last message repeated 110 times
Mar 17 15:39:50 absu last message repeated 110 times
Mar 17 15:49:47 absu last message repeated 109 times
Mar 17 15:59:49 absu last message repeated 110 times
Mar 17 16:09:49 absu last message repeated 109 times
Mar 17 16:19:49 absu last message repeated 74 times
Mar 17 16:29:45 absu last message repeated 107 times
Mar 17 16:39:47 absu last message repeated 109 times
Mar 17 16:49:51 absu last message repeated 110 times
Mar 17 16:59:47 absu last message repeated 108 times
Mar 17 17:00:59 absu last message repeated 12 times
Mar 17 17:01:20 absu vmd[55361]: vmd: vm 5 event thread exited unexpectedly
Mar 17 17:01:21 absu /bsd: vmm_free_vpid: freed VPID/ASID 2

any hints on further debugging this?

thanks,

.jh
Mar 17 11:07:59 absu /bsd: vm_impl_init_vmx: created vm_map @ 0xfd9db272edd8
Mar 17 11:08:00 absu /bsd: vm_resetcpu: resetting vm 3 vcpu 0 to power on 
defaults
Mar 17 11:08:00 absu /bsd: Guest EPTP = 0x207da5c01e
Mar 17 11:08:00 absu /bsd: vmm_alloc_vpid: allocated VPID/ASID 2
Mar 17 11:08:04 absu /bsd: vmx_handle_cr: mov to cr0 @ b4e0008b, 
data=0x80050033
Mar 17 11:08:04 absu /bsd: vmm_handle_cpuid: function 0x06 (thermal/power mgt) 
not supported
Mar 17 11:08:04 absu /bsd: vmm_handle_cpuid: function 0x0f (QoS info) not 
supported
Mar 17 11:08:04 absu /bsd: vmm_handle_cpuid: function 0x06 (thermal/power mgt) 
not supported
Mar 17 11:08:04 absu /bsd: vmm_handle_cpuid: function 0x06 (thermal/power mgt) 
not supported
Mar 17 11:08:04 absu /bsd: vmm_handle_cpuid: invalid cpuid input leaf 0x10, 
guest rip=0xb4e6cbbf - resetting to 0xf
Mar 17 11:08:04 absu /bsd: vmm_handle_cpuid: function 0x0f (QoS info) not 
supported
Mar 17 11:08:05 absu /bsd: vmm_handle_cpuid: invalid cpuid input leaf 0x10, 
guest rip=0xb4e6cbbf - resetting to 0xf
Mar 17 11:08:05 absu /bsd: vmm_handle_cpuid: function 0x0f (QoS info) not 
supported
Mar 17 11:08:05 absu /bsd: vmm_handle_cpuid: invalid cpuid input leaf 0x10, 
guest rip=0xb4e6cbbf - resetting to 0xf
Mar 17 11:08:05 absu /bsd: vmm_handle_cpuid: function 0x0f (QoS info) not 
supported
Mar 17 11:08:05 absu /bsd: vmm_handle_cpuid: invalid cpuid input leaf 0x10, 
guest rip=0xb4e6cbbf - resetting to 0xf
Mar 17 11:08:05 absu /bsd: vmm_handle_cpuid: function 0x0f (QoS info) not 
supported
Mar 17 11:08:05 absu /bsd: vmm_handle_cpuid: invalid cpuid input leaf 0x10, 
guest 

Re: ldapd: fix return values for illegal passwords

2020-03-17 Thread Robert Klein
ping...

On Sun, 8 Mar 2020 12:18:39 +0100
Robert Klein  wrote:

> Hi,
> 
> I thought a bit more about using LDAP resultCode values and I think
> some intermediate values are needed so it is clearer what happens.
> 
> Also, I found out a clients connection hangs in the "Database is being
> reopened" case in ldap_auth_simple().
> 
> Below is a patch proposal that
> 
> 1. patches the original issue, so ldapd returns "invalid credentials"
>instead of "operations error" where appropriate.
> 2. use private "result Code" values for intermediate results (see also
>
> https://www.iana.org/assignments/ldap-parameters/ldap-parameters.xhtml#ldap-parameters-6):
>- LDAP_X_HANDLES_BY_LDAPE_IMSGEV for SASL binds and {BSDAUTH} style
>  passwords where the actual answer is provided via IMSGEV
>- LDAP_X_PASSWORD_OK in check_password() because there's still some
>  work left until it can be called LDAP_SUCCESS.
> 3. returns LDAP_BUSY for "Database is being reopened": the old code
> did not lead to a "return ldap_respond" in bind_ldap(), so nothing is
>returned to the client which is trying to bind and the client
> hangs.
> 
> 
> Best regards
> Robert
> 
> 
> diff 403185e43a653dece6518a28d0750f212ff40fc5 /usr/src
> blob - f6f542e4e4a0af0b815643d9f622913a46707709
> file + usr.sbin/ldapd/aldap.h
> --- usr.sbin/ldapd/aldap.h
> +++ usr.sbin/ldapd/aldap.h
> @@ -109,6 +109,8 @@ enum result_code {
>   LDAP_AFFECTS_MULTIPLE_DSAS  = 71,
>  
>   LDAP_OTHER  = 80,
> + LDAP_X_PASSWORD_OK  = 16384,
> + LDAP_X_HANDLED_BY_LDAPE_IMSGEV  = 16385,
>  };
>  
>  enum filter {
> blob - f8debff7a2d6a0feb9cf984905de385b029b8744
> file + usr.sbin/ldapd/auth.c
> --- usr.sbin/ldapd/auth.c
> +++ usr.sbin/ldapd/auth.c
> @@ -179,33 +179,35 @@ send_auth_request(struct request *req, const
> char *use const char *password)
>  {
>   struct auth_req  auth_req;
> + int  ret = LDAP_X_HANDLED_BY_LDAPE_IMSGEV;
>  
>   memset(_req, 0, sizeof(auth_req));
>   if (strlcpy(auth_req.name, username,
> - sizeof(auth_req.name)) >= sizeof(auth_req.name))
> + sizeof(auth_req.name)) >= sizeof(auth_req.name)) {
> + ret = LDAP_INVALID_CREDENTIALS;
>   goto fail;
> + }
>   if (strlcpy(auth_req.password, password,
> - sizeof(auth_req.password)) >= sizeof(auth_req.password))
> + sizeof(auth_req.password)) >= sizeof(auth_req.password))
> {
> + ret = LDAP_INVALID_CREDENTIALS;
>   goto fail;
> + }
>   auth_req.fd = req->conn->fd;
>   auth_req.msgid = req->msgid;
>  
>   if (imsgev_compose(iev_ldapd, IMSG_LDAPD_AUTH, 0, 0, -1,
> _req,
> - sizeof(auth_req)) == -1)
> + sizeof(auth_req)) == -1) {
> + ret = LDAP_OTHER;
>   goto fail;
> + }
>  
>   req->conn->bind_req = req;
> - return 0;
>  
>  fail:
>   memset(_req, 0, sizeof(auth_req));
> - return -1;
> + return ret;
>  }
>  
> -/*
> - * Check password. Returns 1 if password matches, 2 if password
> matching
> - * is in progress, 0 on mismatch and -1 on error.
> - */
>  static int
>  check_password(struct request *req, const char *stored_passwd,
>  const char *passwd)
> @@ -218,37 +220,39 @@ check_password(struct request *req, const char
> *stored intsz;
>  
>   if (stored_passwd == NULL)
> - return -1;
> + return LDAP_INVALID_CREDENTIALS;
>  
>   if (strncmp(stored_passwd, "{SHA}", 5) == 0) {
>   sz = b64_pton(stored_passwd + 5, tmp, sizeof(tmp));
>   if (sz != SHA_DIGEST_LENGTH)
> - return (-1);
> + return (LDAP_INVALID_CREDENTIALS);
>   SHA1_Init();
>   SHA1_Update(, passwd, strlen(passwd));
>   SHA1_Final(md, );
> - return (bcmp(md, tmp, SHA_DIGEST_LENGTH) == 0 ? 1 :
> 0);
> + return (bcmp(md, tmp, SHA_DIGEST_LENGTH) == 0 ?
> + LDAP_X_PASSWORD_OK : LDAP_INVALID_CREDENTIALS);
>   } else if (strncmp(stored_passwd, "{SSHA}", 6) == 0) {
>   sz = b64_pton(stored_passwd + 6, tmp, sizeof(tmp));
>   if (sz <= SHA_DIGEST_LENGTH)
> - return (-1);
> + return (LDAP_INVALID_CREDENTIALS);
>   salt = tmp + SHA_DIGEST_LENGTH;
>   SHA1_Init();
>   SHA1_Update(, passwd, strlen(passwd));
>   SHA1_Update(, salt, sz - SHA_DIGEST_LENGTH);
>   SHA1_Final(md, );
> - return (bcmp(md, tmp, SHA_DIGEST_LENGTH) == 0 ? 1 :
> 0);
> + return (bcmp(md, tmp, SHA_DIGEST_LENGTH) == 0 ?
> + LDAP_X_PASSWORD_OK : LDAP_INVALID_CREDENTIALS);
>   } else if (strncmp(stored_passwd, "{CRYPT}", 7) == 0) {
>   encpw = crypt(passwd, stored_passwd + 7);
>   if (encpw == NULL)
> - 

ikectl(8): Reset SAs by policy ID

2020-03-17 Thread Tobias Heider
Hi,

this diff adds a new command to ikectl(8) and iked(8) that allows to reset
SAs based on the peers ID, which is equivalent to resetting a single policy.
The expected ID format is the same as printed by 'ipsecctl -sf' in the
'dstid' field.

Example:
  $ ikectl reset id FQDN/peer1

ok?

diff --git a/sbin/iked/control.c b/sbin/iked/control.c
index 67466c6a555..42b14e261e1 100644
--- a/sbin/iked/control.c
+++ b/sbin/iked/control.c
@@ -50,7 +50,8 @@ void   control_imsg_forward(struct imsg *);
 voidcontrol_run(struct privsep *, struct privsep_proc *, void *);
 
 static struct privsep_proc procs[] = {
-   { "parent", PROC_PARENT, NULL }
+   { "parent", PROC_PARENT, NULL },
+   { "ikev2",  PROC_IKEV2, NULL }
 };
 
 pid_t
@@ -305,6 +306,9 @@ control_dispatch_imsg(int fd, short event, void *arg)
case IMSG_CTL_PASSIVE:
proc_forward_imsg(>sc_ps, , PROC_PARENT, -1);
break;
+   case IMSG_CTL_RESET_ID:
+   proc_forward_imsg(>sc_ps, , PROC_IKEV2, -1);
+   break;
default:
log_debug("%s: error handling imsg %d",
__func__, imsg.hdr.type);
diff --git a/sbin/iked/ikev2.c b/sbin/iked/ikev2.c
index 2dd2b2c648d..ebe0adce0e0 100644
--- a/sbin/iked/ikev2.c
+++ b/sbin/iked/ikev2.c
@@ -49,6 +49,7 @@
 voidikev2_run(struct privsep *, struct privsep_proc *, void *);
 int ikev2_dispatch_parent(int, struct privsep_proc *, struct imsg *);
 int ikev2_dispatch_cert(int, struct privsep_proc *, struct imsg *);
+int ikev2_dispatch_control(int, struct privsep_proc *, struct imsg *);
 
 struct iked_sa *
 ikev2_getimsgdata(struct iked *, struct imsg *, struct iked_sahdr *,
@@ -155,10 +156,12 @@ intikev2_update_sa_addresses(struct iked *, 
struct iked_sa *);
 int ikev2_resp_informational(struct iked *, struct iked_sa *,
struct iked_message *);
 
+void   ikev2_ctl_reset_id(struct iked *, struct imsg *, unsigned int);
 
 static struct privsep_proc procs[] = {
{ "parent", PROC_PARENT,ikev2_dispatch_parent },
-   { "certstore",  PROC_CERT,  ikev2_dispatch_cert }
+   { "certstore",  PROC_CERT,  ikev2_dispatch_cert },
+   { "control",PROC_CONTROL,   ikev2_dispatch_control }
 };
 
 pid_t
@@ -376,6 +379,22 @@ ikev2_dispatch_cert(int fd, struct privsep_proc *p, struct 
imsg *imsg)
return (0);
 }
 
+int
+ikev2_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
+{
+   struct iked *env = p->p_env;
+
+   switch (imsg->hdr.type) {
+   case IMSG_CTL_RESET_ID:
+   ikev2_ctl_reset_id(env, imsg, imsg->hdr.type);
+   break;
+   default:
+   return (-1);
+   }
+
+   return (0);
+}
+
 const char *
 ikev2_ikesa_info(uint64_t spi, const char *msg)
 {
@@ -390,6 +409,38 @@ ikev2_ikesa_info(uint64_t spi, const char *msg)
return buf;
 }
 
+void
+ikev2_ctl_reset_id(struct iked *env, struct imsg *imsg, unsigned int type)
+{
+   struct iked_sa  *sa;
+   char*reset_id = NULL;
+   char sa_id[IKED_ID_SIZE];
+
+   if ((reset_id = get_string(imsg->data, IMSG_DATA_SIZE(imsg))) == NULL)
+   return;
+
+   log_debug("%s: %s %d", __func__, reset_id, type);
+   RB_FOREACH(sa, iked_sas, >sc_sas) {
+   if (ikev2_print_id(IKESA_DSTID(sa), sa_id, sizeof(sa_id)) == -1)
+   continue;
+   if (strcmp(reset_id, sa_id) != 0)
+   continue;
+   if (sa->sa_state == IKEV2_STATE_CLOSED)
+   continue;
+   if (sa->sa_state == IKEV2_STATE_ESTABLISHED)
+   ikev2_disable_timer(env, sa);
+   log_info("%s: IKE SA %p id %s ispi %s rspi %s", __func__,
+   sa, sa_id,
+   print_spi(sa->sa_hdr.sh_ispi, 8),
+   print_spi(sa->sa_hdr.sh_rspi, 8));
+   ikev2_ikesa_delete(env, sa, 1);
+   /* default IKED_IKE_SA_DELETE_TIMEOUT is 120s, so switch to 6s 
*/
+   timer_add(env, >sa_timer, 3 * IKED_RETRANSMIT_TIMEOUT);
+   }
+   free(reset_id);
+}
+
+
 struct iked_sa *
 ikev2_getimsgdata(struct iked *env, struct imsg *imsg, struct iked_sahdr *sh,
 uint8_t *type, uint8_t **buf, size_t *size)
diff --git a/sbin/iked/types.h b/sbin/iked/types.h
index e8881a74e9a..07cec6a5902 100644
--- a/sbin/iked/types.h
+++ b/sbin/iked/types.h
@@ -105,6 +105,7 @@ enum imsg_type {
IMSG_CTL_MOBIKE,
IMSG_CTL_FRAGMENTATION,
IMSG_CTL_NATTPORT,
+   IMSG_CTL_RESET_ID,
IMSG_COMPILE,
IMSG_UDP_SOCKET,
IMSG_PFKEY_SOCKET,
diff --git a/usr.sbin/ikectl/ikectl.8 b/usr.sbin/ikectl/ikectl.8
index 40d30ac0e21..9956b42b3c0 100644
--- a/usr.sbin/ikectl/ikectl.8
+++ 

uvm_map_inentry diff for testing

2020-03-17 Thread Mark Kettenis
While playing with dt(4)/btrace(4) flamegraphs on a 32-core arm64
machine, I noticed that the kernel was spending a lot of time (6.84%)
in uvm_map_inentry().  This is caused by kernel lock contention.
Pushing baack the kernel lock further into the slow path reduces the
time to 0.05%.

Now mpi@ tried this before in sys/uvm/uvm_map.c rev 1.249.  That was
backed out because it caused issues with golang.  So I had a look
again and identified a potential issue.  When we "fix" the cached
entry, we use the serial number that has been passed down to us.  But
since that serial number was recorded before we grapped the vm_map
lock, it may be stale.  Instead we should be using the current map
serial number.

This makes me wonder whether having two serial numbers is a case of
premature optimization.  But that is a different discussion.

I'd like to see this tested, especially by people running/building
golang.  Does anybody remember specific details of the issue?  If it
manifests itself, it should be pretty obvious as it results in a
kernel printf and a killed process.


Index: uvm/uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.263
diff -u -p -r1.263 uvm_map.c
--- uvm/uvm_map.c   4 Mar 2020 21:15:38 -   1.263
+++ uvm/uvm_map.c   17 Mar 2020 18:57:19 -
@@ -161,7 +161,7 @@ void uvm_map_addr_augment(struct 
vm_m
 int uvm_map_inentry_recheck(u_long, vaddr_t,
 struct p_inentry *);
 boolean_t   uvm_map_inentry_fix(struct proc *, struct p_inentry *,
-vaddr_t, int (*)(vm_map_entry_t), u_long);
+vaddr_t, int (*)(vm_map_entry_t));
 /*
  * Tree management functions.
  */
@@ -1848,10 +1848,11 @@ uvm_map_inentry_recheck(u_long serial, v
  */
 boolean_t
 uvm_map_inentry_fix(struct proc *p, struct p_inentry *ie, vaddr_t addr,
-int (*fn)(vm_map_entry_t), u_long serial)
+int (*fn)(vm_map_entry_t))
 {
vm_map_t map = >p_vmspace->vm_map;
vm_map_entry_t entry;
+   u_long serial;
int ret;
 
if (addr < map->min_offset || addr >= map->max_offset)
@@ -1866,6 +1867,11 @@ uvm_map_inentry_fix(struct proc *p, stru
return (FALSE);
}
 
+   if (ie == >p_spinentry)
+   serial = map->sserial;
+   else
+   serial = map->wserial;
+
ret = (*fn)(entry);
if (ret == 0) {
vm_map_unlock_read(map);
@@ -1889,16 +1895,16 @@ uvm_map_inentry(struct proc *p, struct p
boolean_t ok = TRUE;
 
if (uvm_map_inentry_recheck(serial, addr, ie)) {
-   KERNEL_LOCK();
-   ok = uvm_map_inentry_fix(p, ie, addr, fn, serial);
+   ok = uvm_map_inentry_fix(p, ie, addr, fn);
if (!ok) {
printf(fmt, p->p_p->ps_comm, p->p_p->ps_pid, p->p_tid,
addr, ie->ie_start, ie->ie_end);
+   KERNEL_LOCK();
p->p_p->ps_acflag |= AMAP;
sv.sival_ptr = (void *)PROC_PC(p);
trapsignal(p, SIGSEGV, 0, SEGV_ACCERR, sv);
+   KERNEL_UNLOCK();
}
-   KERNEL_UNLOCK();
}
return (ok);
 }



Re: bug? in getopt(3) / possible [PATCH]

2020-03-17 Thread 0xef967c36
On Tue, Mar 17, 2020 at 02:14:58PM +0100, Martijn van Duren wrote:
> Included millert, since he imported this code.
> He might shed some more light on what's intended.
> 
> On 3/15/20 11:27 PM, 0xef967...@gmail.com wrote:
> > On Sun, Mar 15, 2020 at 07:32:52PM +0100, Martijn van Duren wrote:
> >> tl;dr new diff below
> > 
> > Check with:
> > 
> > OPTS=r- ./getopt-test-MvD2 -r-
> > getopt-test-MvD2: unknown option -- -
> >   #U- |
> > 
> > With the unpatched code, my patch, and the original 44BSD, it's:
> > 
> > OPTS=r- ./getopt-test-O -r-
> > {./getopt-test-O}  <-> |
> > 
> > See proposed fix at the end.
> 
> This is intentional. From getopt_long(3):
>
>  OpenBSD  a ‘-’ within the option string matches a ‘-’ (single
>   dash) on the command line.  This functionality is
>   provided for backward compatibility with programs,
>   such as su(1), that use ‘-’ as an option flag.  This
>   practice is wrong, and should not be used in any
>   current development.

I don't get your point, sorry. My patch didn't mean to change that.
Nor do I think that it should be changed (if that matters).

My stupid and unnecessary "(optchar == (int)'-' && oli == options)"
check was breaking the case where someone meant to use both the
GNU-like optstring-starting-with-dash and the BSD dashes-in-optstring
features, by using an optstring like "--xy" or "-x-y".

As I mentioned, that was a mistake.

For the reference, here is an updated patch, without that check:

--- getopt-long.c~  2020-03-12 02:23:29.028903616 +0200
+++ getopt-long.c   2020-03-15 23:46:07.988119523 +0200
@@ -418,15 +418,7 @@
}
 
if ((optchar = (int)*place++) == (int)':' ||
-   (optchar == (int)'-' && *place != '\0') ||
(oli = strchr(options, optchar)) == NULL) {
-   /*
-* If the user specified "-" and  '-' isn't listed in
-* options, return -1 (non-option) as per POSIX.
-* Otherwise, it is an unknown option character (or ':').
-*/
-   if (optchar == (int)'-' && *place == '\0')
-   return (-1);
if (!*place)
++optind;
if (PRINT_ERROR)



Re: macppc kernel and clang

2020-03-17 Thread George Koehler
On Mon, 16 Mar 2020 19:13:13 -0600
"Theo de Raadt"  wrote:

> How are the bootblocks faring?
> 
> And userland?

ofwboot with clang works for me.

I failed to make bsd.rd (in src/distrib/macppc/ramdisk) with clang,
but I don't remember exactly what I did, so I might have been making
it the wrong way.  My build failed at

clang -static -L.  -nopie -o instbin instbin.o dd.lo ...
/usr/bin/ld: dd.lo(.text+0x14): R_PPC_PLTREL24 reloc against local symbol

I was able to make bsd.rd with gcc.

"$CC -fno-pie -c something.c; objdump -dlr something.o" shows that
gcc -fno-pie uses R_PPC_REL24, but clang -fno-pie uses R_PPC_PLTREL24,
for calls to external functions.  The symbols would be global until
crunchgen(8) hides them by marking them local.  R_PPC_REL24 would
branch directly to a function, and R_PPC_PLTREL24 would go through
the PLT (for a function in a shared library).  The compiler can't
know whether the function is in a shared lib; the linker must decide.

clang -fno-pie on amd64 uses R_X86_64_PLT32, so I guess that clang
-fno-pie on powerpc isn't wrong to use R_PPC_PLTREL24.

Passing -M to crunchgen(8), as we do on {longsoon,octeon,sgi}, might
work around the problem, but I haven't tried it, and I don't know
whether this is the only problem.

--George



Re: ktrwriteraw & vget

2020-03-17 Thread Martin Pieuchot
On 17/03/20(Tue) 07:50, Philip Guenther wrote:
> On Tue, Mar 17, 2020 at 5:18 AM Martin Pieuchot  wrote:
> > On 17/03/20(Tue) 04:02, Philip Guenther wrote:
> > > On Tue, Mar 17, 2020 at 1:07 AM Martin Pieuchot  wrote:
> > > [...]
> > > > @@ -663,8 +668,6 @@ ktrwriteraw(struct proc *curp, struct vn
> > > > LIST_FOREACH(pr, , ps_list)
> > > > if (pr->ps_tracevp == vp && pr->ps_tracecred == cred)
> > > > ktrcleartrace(pr);
> > > > -
> > > > -   vput(vp);
> > > > return (error);
> > > >  }
> > > >
> > >
> > > This looks unsafe to me: isn't ktrcleartrace() only safe if the caller
> > > holds a reference to the vnode?  Once ktrcleartrace() clears the
> > reference
> > > from the current thread's process and it goes on the freelist, can't the
> > > vnode vp points to be invalidated and reused?
> >
> > As long as a process holds a reference to the vnode, via `ps_tracevp',
> > it wont be recycle.  Only the last call of ktrcleartrace() will release
> > the vnode via vrele(9).
> 
> ...and after that last reference is released this code will continue to
> walk the allprocess list, comparing a possibly-recycled pointer to
> ps_tracevp pointers in the remaining processes.  Good thing that vrele(9)
> is guaranteed to never sleep and thereby let another process start
> ktracing, reusing that vnode.??

Are you saying that `allprocess' should only be walked if the current
thread is still holding a valid reference to the vnode?

Index: kern/kern_ktrace.c
===
RCS file: /cvs/src/sys/kern/kern_ktrace.c,v
retrieving revision 1.100
diff -u -p -r1.100 kern_ktrace.c
--- kern/kern_ktrace.c  6 Oct 2019 16:24:14 -   1.100
+++ kern/kern_ktrace.c  17 Mar 2020 17:14:46 -
@@ -649,22 +649,29 @@ ktrwriteraw(struct proc *curp, struct vn
auio.uio_iovcnt++;
auio.uio_resid += kth->ktr_len;
}
-   vget(vp, LK_EXCLUSIVE | LK_RETRY);
+   error = vget(vp, LK_EXCLUSIVE | LK_RETRY);
+   if (error)
+   goto bad;
error = VOP_WRITE(vp, , IO_UNIT|IO_APPEND, cred);
-   if (!error) {
-   vput(vp);
-   return (0);
-   }
+   vput(vp);
+   if (error)
+   goto bad;
+
+   return (0);
+
+bad:
/*
 * If error encountered, give up tracing on this vnode.
 */
log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n",
error);
-   LIST_FOREACH(pr, , ps_list)
+   LIST_FOREACH(pr, , ps_list) {
+   if (pr == curp->p_p)
+   continue;
if (pr->ps_tracevp == vp && pr->ps_tracecred == cred)
ktrcleartrace(pr);
-
-   vput(vp);
+   }
+   ktrcleartrace(curp->p_p);
return (error);
 }
 



Expose a few more symbols in arm64 kernel

2020-03-17 Thread Mark Kettenis
While playing with dt(4) on arm64, I noticed that there were some
unrecognized functions.  Diff below fixes that.

ok?


Index: arch/arm64/arm64/exception.S
===
RCS file: /cvs/src/sys/arch/arm64/arm64/exception.S,v
retrieving revision 1.10
diff -u -p -r1.10 exception.S
--- arch/arm64/arm64/exception.S17 Dec 2019 22:25:56 -  1.10
+++ arch/arm64/arm64/exception.S17 Mar 2020 17:06:06 -
@@ -155,6 +155,8 @@
 1:
 .endm
 
+   .globl handle_el1h_sync
+   .type handle_el1h_sync,@function
 handle_el1h_sync:
save_registers 1
mov x0, sp
@@ -164,6 +166,8 @@ handle_el1h_sync:
dsb nsh
isb
 
+   .globl handle_el1h_irq
+   .type handle_el1h_irq,@function
 handle_el1h_irq:
save_registers 1
mov x0, sp
@@ -173,6 +177,8 @@ handle_el1h_irq:
dsb nsh
isb
 
+   .globl handle_el1h_error
+   .type handle_el1h_error,@function
 handle_el1h_error:
brk 0xf13
 



Re: ldomctl: list-io: print names

2020-03-17 Thread Mark Kettenis
> Date: Tue, 17 Mar 2020 17:27:21 +0100
> From: Klemens Nanni 
> 
> On Tue, Mar 17, 2020 at 03:26:56PM +0100, Mark Kettenis wrote:
> > > @@ -2889,7 +2896,8 @@ list_components(void)
> > >  
> > >   pri_init(pri);
> > >  
> > > + printf("PATH\t\tNAME\n");
> > 
> > Using tabs to make things line up isn't going to work very well.  So
> > better use field widths in the printf calls.
> Sure thing, assignable paths should not be longer than 15 characters, so
> I went with even 16 as column width.
> 
> OK?

ok kettenis@

> Index: config.c
> ===
> RCS file: /cvs/src/usr.sbin/ldomctl/config.c,v
> retrieving revision 1.35
> diff -u -p -r1.35 config.c
> --- config.c  7 Mar 2020 18:51:06 -   1.35
> +++ config.c  17 Mar 2020 16:00:29 -
> @@ -48,6 +48,7 @@ TAILQ_HEAD(, core) cores;
>  
>  struct component {
>   const char *path;
> + const char *nac;
>   int assigned;
>  
>   struct md_node *hv_node;
> @@ -216,6 +217,7 @@ pri_init_components(struct md *md)
>   struct component *component;
>   struct md_node *node;
>   const char *path;
> + const char *nac;
>   const char *type;
>  
>   TAILQ_INIT();
> @@ -228,6 +230,10 @@ pri_init_components(struct md *md)
>   if (md_get_prop_str(md, node, "assignable-path", )) {
>   component = xzalloc(sizeof(*component));
>   component->path = path;
> + if (md_get_prop_str(md, node, "nac", ))
> + component->nac = nac;
> + else
> + component->nac = "-";
>   TAILQ_INSERT_TAIL(, component, link);
>   }
>  
> @@ -2889,7 +2896,8 @@ list_components(void)
>  
>   pri_init_components(pri);
>  
> + printf("%-16s %s\n", "PATH", "NAME");
>   TAILQ_FOREACH(component, , link) {
> - printf("%s\n", component->path);
> + printf("%-16s %s\n", component->path, component->nac);
>   }
>  }
> 



Re: ktrwriteraw & vget

2020-03-17 Thread Philip Guenther
On Tue, Mar 17, 2020 at 5:18 AM Martin Pieuchot  wrote:

> On 17/03/20(Tue) 04:02, Philip Guenther wrote:
> > On Tue, Mar 17, 2020 at 1:07 AM Martin Pieuchot  wrote:
> > [...]
> > > @@ -663,8 +668,6 @@ ktrwriteraw(struct proc *curp, struct vn
> > > LIST_FOREACH(pr, , ps_list)
> > > if (pr->ps_tracevp == vp && pr->ps_tracecred == cred)
> > > ktrcleartrace(pr);
> > > -
> > > -   vput(vp);
> > > return (error);
> > >  }
> > >
> >
> > This looks unsafe to me: isn't ktrcleartrace() only safe if the caller
> > holds a reference to the vnode?  Once ktrcleartrace() clears the
> reference
> > from the current thread's process and it goes on the freelist, can't the
> > vnode vp points to be invalidated and reused?
>
> As long as a process holds a reference to the vnode, via `ps_tracevp',
> it wont be recycle.  Only the last call of ktrcleartrace() will release
> the vnode via vrele(9).
>

...and after that last reference is released this code will continue to
walk the allprocess list, comparing a possibly-recycled pointer to
ps_tracevp pointers in the remaining processes.  Good thing that vrele(9)
is guaranteed to never sleep and thereby let another process start
ktracing, reusing that vnode.??


Re: ldomctl: list-io: print names

2020-03-17 Thread Klemens Nanni
On Tue, Mar 17, 2020 at 03:26:56PM +0100, Mark Kettenis wrote:
> > @@ -2889,7 +2896,8 @@ list_components(void)
> >  
> > pri_init(pri);
> >  
> > +   printf("PATH\t\tNAME\n");
> 
> Using tabs to make things line up isn't going to work very well.  So
> better use field widths in the printf calls.
Sure thing, assignable paths should not be longer than 15 characters, so
I went with even 16 as column width.

OK?


Index: config.c
===
RCS file: /cvs/src/usr.sbin/ldomctl/config.c,v
retrieving revision 1.35
diff -u -p -r1.35 config.c
--- config.c7 Mar 2020 18:51:06 -   1.35
+++ config.c17 Mar 2020 16:00:29 -
@@ -48,6 +48,7 @@ TAILQ_HEAD(, core) cores;
 
 struct component {
const char *path;
+   const char *nac;
int assigned;
 
struct md_node *hv_node;
@@ -216,6 +217,7 @@ pri_init_components(struct md *md)
struct component *component;
struct md_node *node;
const char *path;
+   const char *nac;
const char *type;
 
TAILQ_INIT();
@@ -228,6 +230,10 @@ pri_init_components(struct md *md)
if (md_get_prop_str(md, node, "assignable-path", )) {
component = xzalloc(sizeof(*component));
component->path = path;
+   if (md_get_prop_str(md, node, "nac", ))
+   component->nac = nac;
+   else
+   component->nac = "-";
TAILQ_INSERT_TAIL(, component, link);
}
 
@@ -2889,7 +2896,8 @@ list_components(void)
 
pri_init_components(pri);
 
+   printf("%-16s %s\n", "PATH", "NAME");
TAILQ_FOREACH(component, , link) {
-   printf("%s\n", component->path);
+   printf("%-16s %s\n", component->path, component->nac);
}
 }



Re: ldomctl: list-io: print names

2020-03-17 Thread Mark Kettenis
> Date: Tue, 17 Mar 2020 14:12:14 +0100
> From: Klemens Nanni 
> Content-Type: text/plain; charset=us-ascii
> Content-Disposition: inline
> 
> On Sat, Mar 07, 2020 at 05:44:12PM +0100, Klemens Nanni wrote:
> > Next to the IO device path to be used in ldom.conf we can also print the
> > name which actually tells in human readable form what device we're
> > looking at.
> > 
> > Those names are printed in similar format by Solaris `ldm list-io' and
> > directly match the the structure seen in the iLOM shell.
> > 
> > $ doas ldomctl list-io
> > PATHNAME
> > /@400/@2/@0/@8  /SYS/MB/PCIE0
> > /@500/@2/@0/@a  /SYS/MB/PCIE1
> > /@400/@2/@0/@4  /SYS/MB/PCIE2
> > /@500/@2/@0/@6  /SYS/MB/PCIE3
> > /@400/@2/@0/@0  /SYS/MB/PCIE4
> > /@500/@2/@0/@0  /SYS/MB/PCIE5
> > /@400/@1/@0/@8  /SYS/MB/PCIE6
> > /@500/@1/@0/@6  /SYS/MB/PCIE7
> > /@400/@1/@0/@c  /SYS/MB/PCIE8
> > /@500/@1/@0/@0  /SYS/MB/PCIE9
> > /@400/@2/@0/@e  /SYS/MB/SASHBA
> > /@400/@1/@0/@4  /SYS/MB/NET0
> > /@500/@1/@0/@5  /SYS/MB/NET2
> > 
> > OK?
> Anyone?  I'll probably just commit in a few days unless someone objects,
> printing device names is a nice aid in partitioning machines, especially
> since picking the wrong iodevice needs power cycling and might even
> prevent the primary domain from booting since another guest domain
> accidentially got its root disk device.
> 
> PS: with the new mdprint port on ports@ one can easily inspect the PRI
> and verify:
> 
>   $ mdprint -pqng component.assignable-path,component.nac ./pri |
>   > grep assignable
>   component|assignable-path="/@400/@2/@0/@8"|nac="/SYS/MB/PCIE0"
>   component|assignable-path="/@500/@2/@0/@a"|nac="/SYS/MB/PCIE1"
>   component|assignable-path="/@400/@2/@0/@4"|nac="/SYS/MB/PCIE2"
>   component|assignable-path="/@500/@2/@0/@6"|nac="/SYS/MB/PCIE3"
>   component|assignable-path="/@400/@2/@0/@0"|nac="/SYS/MB/PCIE4"
>   component|assignable-path="/@500/@2/@0/@0"|nac="/SYS/MB/PCIE5"
>   component|assignable-path="/@400/@1/@0/@8"|nac="/SYS/MB/PCIE6"
>   component|assignable-path="/@500/@1/@0/@6"|nac="/SYS/MB/PCIE7"
>   component|assignable-path="/@400/@1/@0/@c"|nac="/SYS/MB/PCIE8"
>   component|assignable-path="/@500/@1/@0/@0"|nac="/SYS/MB/PCIE9"
>   component|assignable-path="/@400/@2/@0/@e"|nac="/SYS/MB/SASHBA"
>   component|assignable-path="/@400/@1/@0/@4"|nac="/SYS/MB/NET0"
>   component|assignable-path="/@500/@1/@0/@5"|nac="/SYS/MB/NET2"
> 
> That's how ldomctl works around iodevices: They're "component" nodes in
> the tree that have an "assignable-path" attribute;  ldomctl picks those,
> assigns whatever is specified in ldom.conf to guest domains, and assigns
> all remaining components to the primary domain.

Looks like I missed this.  I like this.  However:

> Index: config.c
> ===
> RCS file: /cvs/src/usr.sbin/ldomctl/config.c,v
> retrieving revision 1.34
> diff -u -p -r1.34 config.c
> --- config.c  21 Feb 2020 19:39:28 -  1.34
> +++ config.c  7 Mar 2020 16:39:25 -
> @@ -48,6 +48,7 @@ TAILQ_HEAD(, core) cores;
>  
>  struct component {
>   const char *path;
> + const char *nac;
>   int assigned;
>  
>   struct md_node *hv_node;
> @@ -216,6 +217,7 @@ pri_init_components(struct md *md)
>   struct component *component;
>   struct md_node *node;
>   const char *path;
> + const char *nac;
>   const char *type;
>  
>   TAILQ_INIT();
> @@ -228,6 +230,10 @@ pri_init_components(struct md *md)
>   if (md_get_prop_str(md, node, "assignable-path", )) {
>   component = xzalloc(sizeof(*component));
>   component->path = path;
> + if (md_get_prop_str(md, node, "nac", ))
> + component->nac = nac;
> + else
> + component->nac = "-";
>   TAILQ_INSERT_TAIL(, component, link);
>   }
>  
> @@ -2889,7 +2896,8 @@ list_components(void)
>  
>   pri_init(pri);
>  
> + printf("PATH\t\tNAME\n");

Using tabs to make things line up isn't going to work very well.  So
better use field widths in the printf calls.

>   TAILQ_FOREACH(component, , link) {
> - printf("%s\n", component->path);
> + printf("%s\t%s\n", component->path, component->nac);
>   }
>  }
> 



Re: ktrwriteraw & vget

2020-03-17 Thread Martin Pieuchot
On 17/03/20(Tue) 04:02, Philip Guenther wrote:
> On Tue, Mar 17, 2020 at 1:07 AM Martin Pieuchot  wrote:
> [...]
> > @@ -663,8 +668,6 @@ ktrwriteraw(struct proc *curp, struct vn
> > LIST_FOREACH(pr, , ps_list)
> > if (pr->ps_tracevp == vp && pr->ps_tracecred == cred)
> > ktrcleartrace(pr);
> > -
> > -   vput(vp);
> > return (error);
> >  }
> >
> 
> This looks unsafe to me: isn't ktrcleartrace() only safe if the caller
> holds a reference to the vnode?  Once ktrcleartrace() clears the reference
> from the current thread's process and it goes on the freelist, can't the
> vnode vp points to be invalidated and reused?

As long as a process holds a reference to the vnode, via `ps_tracevp',
it wont be recycle.  Only the last call of ktrcleartrace() will release
the vnode via vrele(9).



Re: bug? in getopt(3) / possible [PATCH]

2020-03-17 Thread Martijn van Duren
Included millert, since he imported this code.
He might shed some more light on what's intended.

On 3/15/20 11:27 PM, 0xef967...@gmail.com wrote:
> On Sun, Mar 15, 2020 at 07:32:52PM +0100, Martijn van Duren wrote:
>> tl;dr new diff below
> 
> Check with:
> 
> OPTS=r- ./getopt-test-MvD2 -r-
> getopt-test-MvD2: unknown option -- -
>   #U- |
> 
> With the unpatched code, my patch, and the original 44BSD, it's:
> 
> OPTS=r- ./getopt-test-O -r-
> {./getopt-test-O}  <-> |
> 
> See proposed fix at the end.

This is intentional. From getopt_long(3):
 OpenBSD  a ‘-’ within the option string matches a ‘-’ (single
  dash) on the command line.  This functionality is
  provided for backward compatibility with programs,
  such as su(1), that use ‘-’ as an option flag.  This
  practice is wrong, and should not be used in any
  current development.

The original might have accepted this, but if the getopt_long(3) manpage
is rather specific about this and the current code goes out of its way
to make sure that it only appears at the end of the string (-q-q doesn't
work). So why go out of your way to disallow that behaviour, but not
catch the case you describe here (and is not documented as such).

If we do care about this particular feature then why add the additional
checks at all? Why not let all the code prior to this section handle all
the common "-" cases and let the rest utilize "-" as any normal getopt
character, including an optstring with "-:" which would allow -q-
and "- ".

My reasoning is: Let's restrict this misfeature as much as possible and
only use it for where it's needed (su(1)). But let's see what millert
or any of the other more experienced devs have to say.
> 
>>optreset and flags. Note that on line 317 we increment the pointer on
>>options. This means that the "oli == options" check in your code is
>>not needed and even introduces a bug when someone would be stupid
>>enough to create an optstring which starts with "--", where a "-"
>>option stop to being recognized:
> 
> Yes, that check was dumb, it should be removed.
> 
>> On 3/15/20 11:21 AM, 0xef967...@gmail.com wrote:
>>> Notice that OpenBSD's getopt(3) explicitly allows to use "-" as an
>>> option character, and there may be a /separate/ bug related to that,
>>> but my patch doesn't change or affect it in any way.
>>
>> This is in line with getopt(3) and getopt_long(3) expands on that
>> saying it's (at least) for su(1). So if we want to remove this
>> (mis)feature a lot more scrutiny needs to be done and I'm not feeling
>> like going down that rab(bit|ies)-hole any time soon.
> 
> I didn't say that that feature should be removed, but that there may
> be another bug/incompat related to that. Since it was there since 44BSD,
> it should be left as is (IMVHO).
> 
>> I did some pointer dumping inside your test-application and inside
>> getopt_internal and found that the OPTS environment variable is 1 byte
>> from the last place pointer. When you overwrite every byte of the
>> original avs with ':' (including the terminating NUL) you expand the
>> string length of the place variable inside getopt_internal and it starts
>> to parse the "garbage" you see in your example.
> 
> That was of course intentional. It's perfectly legit to overwrite
> the args strings (including their terminating \0). And it makes for
> more dramatic testcases ;-)
>  
>> Since our manpage explicitly states that you need to set optreset I
>> don't know if it's worth adding a "fix" for, if such a thing is at
> 
> No need to add another fix for it; that return (-1) that my (and your
> current patch) eliminates is the only path through which -1 can be
> returned /without/ setting "place = EMSG".
> 
>>  if ((optchar = (int)*place++) == (int)':' ||
>> -(optchar == (int)'-' && *place != '\0') ||
>> -(oli = strchr(options, optchar)) == NULL) {
>> -/*
>> - * If the user specified "-" and  '-' isn't listed in
>> - * options, return -1 (non-option) as per POSIX.
>> - * Otherwise, it is an unknown option character (or ':').
>> - */
>> -if (optchar == (int)'-' && *place == '\0')
>> -return (-1);
>> +(oli = strchr(options, optchar)) == NULL ||
>> +(optchar == '-' &&
>> +(oli == NULL || strcmp(nargv[optind], "-") != 0))) {
>>  if (!*place)
>>  ++optind;
>>  if (PRINT_ERROR)
> 
> ... || (foo = bar) == NULL || (baz && (foo == NULL || quux))
> 
> is the same as
> 
> ... || (foo = bar) == NULL || (baz && quux)
> 
> since the "||" is shortcutting.
> 
> But I still don't see how "quux" (strcmp(nargv[optind], "-") != 0)
> may happen with a args which is just "-" (since it's already caught by
> the "If we have "-" do nothing" check above it).
> 
> It will however BREAK a case like that mentioned at the beginning
> of 

Re: ldomctl: list-io: print names

2020-03-17 Thread Klemens Nanni
On Sat, Mar 07, 2020 at 05:44:12PM +0100, Klemens Nanni wrote:
> Next to the IO device path to be used in ldom.conf we can also print the
> name which actually tells in human readable form what device we're
> looking at.
> 
> Those names are printed in similar format by Solaris `ldm list-io' and
> directly match the the structure seen in the iLOM shell.
> 
>   $ doas ldomctl list-io
>   PATHNAME
>   /@400/@2/@0/@8  /SYS/MB/PCIE0
>   /@500/@2/@0/@a  /SYS/MB/PCIE1
>   /@400/@2/@0/@4  /SYS/MB/PCIE2
>   /@500/@2/@0/@6  /SYS/MB/PCIE3
>   /@400/@2/@0/@0  /SYS/MB/PCIE4
>   /@500/@2/@0/@0  /SYS/MB/PCIE5
>   /@400/@1/@0/@8  /SYS/MB/PCIE6
>   /@500/@1/@0/@6  /SYS/MB/PCIE7
>   /@400/@1/@0/@c  /SYS/MB/PCIE8
>   /@500/@1/@0/@0  /SYS/MB/PCIE9
>   /@400/@2/@0/@e  /SYS/MB/SASHBA
>   /@400/@1/@0/@4  /SYS/MB/NET0
>   /@500/@1/@0/@5  /SYS/MB/NET2
> 
> OK?
Anyone?  I'll probably just commit in a few days unless someone objects,
printing device names is a nice aid in partitioning machines, especially
since picking the wrong iodevice needs power cycling and might even
prevent the primary domain from booting since another guest domain
accidentially got its root disk device.

PS: with the new mdprint port on ports@ one can easily inspect the PRI
and verify:

$ mdprint -pqng component.assignable-path,component.nac ./pri |
> grep assignable
component|assignable-path="/@400/@2/@0/@8"|nac="/SYS/MB/PCIE0"
component|assignable-path="/@500/@2/@0/@a"|nac="/SYS/MB/PCIE1"
component|assignable-path="/@400/@2/@0/@4"|nac="/SYS/MB/PCIE2"
component|assignable-path="/@500/@2/@0/@6"|nac="/SYS/MB/PCIE3"
component|assignable-path="/@400/@2/@0/@0"|nac="/SYS/MB/PCIE4"
component|assignable-path="/@500/@2/@0/@0"|nac="/SYS/MB/PCIE5"
component|assignable-path="/@400/@1/@0/@8"|nac="/SYS/MB/PCIE6"
component|assignable-path="/@500/@1/@0/@6"|nac="/SYS/MB/PCIE7"
component|assignable-path="/@400/@1/@0/@c"|nac="/SYS/MB/PCIE8"
component|assignable-path="/@500/@1/@0/@0"|nac="/SYS/MB/PCIE9"
component|assignable-path="/@400/@2/@0/@e"|nac="/SYS/MB/SASHBA"
component|assignable-path="/@400/@1/@0/@4"|nac="/SYS/MB/NET0"
component|assignable-path="/@500/@1/@0/@5"|nac="/SYS/MB/NET2"

That's how ldomctl works around iodevices: They're "component" nodes in
the tree that have an "assignable-path" attribute;  ldomctl picks those,
assigns whatever is specified in ldom.conf to guest domains, and assigns
all remaining components to the primary domain.

Index: config.c
===
RCS file: /cvs/src/usr.sbin/ldomctl/config.c,v
retrieving revision 1.34
diff -u -p -r1.34 config.c
--- config.c21 Feb 2020 19:39:28 -  1.34
+++ config.c7 Mar 2020 16:39:25 -
@@ -48,6 +48,7 @@ TAILQ_HEAD(, core) cores;
 
 struct component {
const char *path;
+   const char *nac;
int assigned;
 
struct md_node *hv_node;
@@ -216,6 +217,7 @@ pri_init_components(struct md *md)
struct component *component;
struct md_node *node;
const char *path;
+   const char *nac;
const char *type;
 
TAILQ_INIT();
@@ -228,6 +230,10 @@ pri_init_components(struct md *md)
if (md_get_prop_str(md, node, "assignable-path", )) {
component = xzalloc(sizeof(*component));
component->path = path;
+   if (md_get_prop_str(md, node, "nac", ))
+   component->nac = nac;
+   else
+   component->nac = "-";
TAILQ_INSERT_TAIL(, component, link);
}
 
@@ -2889,7 +2896,8 @@ list_components(void)
 
pri_init(pri);
 
+   printf("PATH\t\tNAME\n");
TAILQ_FOREACH(component, , link) {
-   printf("%s\n", component->path);
+   printf("%s\t%s\n", component->path, component->nac);
}
 }



Re: ktrwriteraw & vget

2020-03-17 Thread Philip Guenther
On Tue, Mar 17, 2020 at 1:07 AM Martin Pieuchot  wrote:

> On 16/03/20(Mon) 14:01, Martin Pieuchot wrote:
> > vget(9) might fail, stop right away if that happens.
> >
> > CID 1453020 Unchecked return value.
>
> Updated diff that stops tracing if vget(9) fails, similar to what's
> currently done if VOP_WRITE(9) fails, suggested by visa@.
>
> Code shuffling is there to avoid calling vput(9) if vget(9) doesn't
> succeed.
>
> ok?
>
> Index: kern/kern_ktrace.c
> ===
> RCS file: /cvs/src/sys/kern/kern_ktrace.c,v
> retrieving revision 1.100
> diff -u -p -r1.100 kern_ktrace.c
> --- kern/kern_ktrace.c  6 Oct 2019 16:24:14 -   1.100
> +++ kern/kern_ktrace.c  17 Mar 2020 09:46:03 -
> @@ -649,12 +649,17 @@ ktrwriteraw(struct proc *curp, struct vn
> auio.uio_iovcnt++;
> auio.uio_resid += kth->ktr_len;
> }
> -   vget(vp, LK_EXCLUSIVE | LK_RETRY);
> +   error = vget(vp, LK_EXCLUSIVE | LK_RETRY);
> +   if (error)
> +   goto bad;
> error = VOP_WRITE(vp, , IO_UNIT|IO_APPEND, cred);
> -   if (!error) {
> -   vput(vp);
> -   return (0);
> -   }
> +   vput(vp);
> +   if (error)
> +   goto bad;
> +
> +   return (0);
> +
> +bad:
> /*
>  * If error encountered, give up tracing on this vnode.
>  */
> @@ -663,8 +668,6 @@ ktrwriteraw(struct proc *curp, struct vn
> LIST_FOREACH(pr, , ps_list)
> if (pr->ps_tracevp == vp && pr->ps_tracecred == cred)
> ktrcleartrace(pr);
> -
> -   vput(vp);
> return (error);
>  }
>

This looks unsafe to me: isn't ktrcleartrace() only safe if the caller
holds a reference to the vnode?  Once ktrcleartrace() clears the reference
from the current thread's process and it goes on the freelist, can't the
vnode vp points to be invalidated and reused?

Maybe this vget() should be split into vref() + vn_lock(), and the vput()
similarly split into VOP_UNLOCK() + vrele(), so the vrele() can be left
after this LIST_FOREACH() while the VOP_UNLOCK() is hoisted to the
"vn_lock() succeeded" path.


Philip Guenther


Please test: ix(4)+em(4) MSI-X towards multiqueues

2020-03-17 Thread Martin Pieuchot
Diff below allows ix(4) and em(4) to establish two MSI-X handlers.
Multiqueue support is intentionally not included in this diff.

One handler is for the single queue (index ":0") and one for the
link interrupt:

  # vmstat -i |grep ix0
  irq114/ix0:0 73178178 3758
  irq115/ix0  20

A per-driver toggle allows to switch MSI-X support on/off.  My plan is
to commit with the switch "off" for the moment.  Switching it to "on"
should be better done in the beginning of a release cycle.  However it
is "on" in the diff below for testing purposes.

The em(4) diff doesn't include support for 82575 nor 82574.  If somebody
has such hardware and is interested I can lift the necessary bits from
FreeBSD.  That said em_setup_queues_msix() contains references to 82575
to help diffing with FreeBSD.

The ix(4) diff cannot be committed right now as it breaks the build on
architectures that define pci_intr_map_msix() to (-1).  The compiler
complains that the arguments of the macro are unused.

  /sys/dev/pci/if_ix.c:1789:21: error: unused variable 'dummy'
[-Werror,-Wunused-variable]
  pci_intr_handle_tdummy;
   ^
  /sys/dev/pci/if_ix.c:1788:26: error: unused variable 'pa'
[-Werror,-Wunused-variable]
  struct pci_attach_args  *pa = >os_pa;

So I'd like to know if that's the way to test if MSI-X can be used:

   /*
* Try a dummy map, maybe this bus doesn't like MSI, this function
* has no side effects.
*/
   if (pci_intr_map_msix(pa, 0, ))
   return (0);

I'd appreciate tests on many hardware as possible, especially for em(4).
If you have been testing an earlier version of these diffs, please do
it again :)

Thanks,
Martin

Index: dev/pci/if_em.c
===
RCS file: /cvs/src/sys/dev/pci/if_em.c,v
retrieving revision 1.347
diff -u -p -r1.347 if_em.c
--- dev/pci/if_em.c 8 Mar 2020 11:43:43 -   1.347
+++ dev/pci/if_em.c 17 Mar 2020 11:18:01 -
@@ -233,6 +233,7 @@ void em_defer_attach(struct device*);
 int  em_detach(struct device *, int);
 int  em_activate(struct device *, int);
 int  em_intr(void *);
+int  em_allocate_legacy(struct em_softc *);
 void em_start(struct ifqueue *);
 int  em_ioctl(struct ifnet *, u_long, caddr_t);
 void em_watchdog(struct ifnet *);
@@ -290,6 +291,13 @@ void em_flush_tx_ring(struct em_queue *)
 void em_flush_rx_ring(struct em_queue *);
 void em_flush_desc_rings(struct em_softc *);
 
+/* MSIX/Multiqueue functions */
+int  em_allocate_msix(struct em_softc *);
+int  em_setup_queues_msix(struct em_softc *);
+int  em_queue_intr_msix(void *);
+int  em_link_intr_msix(void *);
+void em_enable_queue_intr_msix(struct em_queue *);
+
 /*
  *  OpenBSD Device Interface Entry Points
  */
@@ -304,6 +312,7 @@ struct cfdriver em_cd = {
 };
 
 static int em_smart_pwr_down = FALSE;
+int em_enable_msix = 1;
 
 /*
  *  Device identification routine
@@ -919,6 +928,14 @@ em_init(void *arg)
}
em_initialize_receive_unit(sc);
 
+   if (sc->msix) {
+   if (em_setup_queues_msix(sc)) {
+   printf("%s: Can't setup msix queues\n", DEVNAME(sc));
+   splx(s);
+   return;
+   }
+   }
+
/* Program promiscuous mode and multicast filters. */
em_iff(sc);
 
@@ -1617,10 +1634,7 @@ int
 em_allocate_pci_resources(struct em_softc *sc)
 {
int val, rid;
-   pci_intr_handle_t   ih;
-   const char  *intrstr = NULL;
struct pci_attach_args *pa = >osdep.em_pa;
-   pci_chipset_tag_t   pc = pa->pa_pc;
struct em_queue*que = NULL;
 
val = pci_conf_read(pa->pa_pc, pa->pa_tag, EM_MMBA);
@@ -1691,6 +1705,9 @@ em_allocate_pci_resources(struct em_soft
}
 }
 
+   sc->osdep.dev = (struct device *)sc;
+   sc->hw.back = >osdep;
+
/* Only one queue for the moment. */
que = malloc(sizeof(struct em_queue), M_DEVBUF, M_NOWAIT | M_ZERO);
if (que == NULL) {
@@ -1703,29 +1720,10 @@ em_allocate_pci_resources(struct em_soft
 
sc->queues = que;
sc->num_queues = 1;
+   sc->msix = 0;
sc->legacy_irq = 0;
-   if (pci_intr_map_msi(pa, )) {
-   if (pci_intr_map(pa, )) {
-   printf(": couldn't map interrupt\n");
-   return (ENXIO);
-   }
-   sc->legacy_irq = 1;
-   }
-
-   sc->osdep.dev = (struct device *)sc;
-   sc->hw.back = >osdep;
-
-   intrstr = pci_intr_string(pc, ih);
-   sc->sc_intrhand = pci_intr_establish(pc, ih, IPL_NET | IPL_MPSAFE,
-   

Re: macppc kernel and clang

2020-03-17 Thread Mark Kettenis
> From: "Theo de Raadt" 
> Date: Mon, 16 Mar 2020 19:13:13 -0600
> 
> ok deraadt
> 
> Next up, to figure out the right plan for ofw.
> 
> Thank you so much for figuring out these two details.
> 
> How are the bootblocks faring?

I believe somebody already tested the bootloader.  Doesn't hurt to try
again though.

> And userland?

base was in pretty good shape already and I'm rebuilding now that the
ABI fix is in.  Haven't tried xenocara yet.

> George Koehler  wrote:
> 
> > On Mon, 16 Mar 2020 12:54:52 +0100 (CET)
> > Mark Kettenis  wrote:
> > 
> > > I had a look at what NetBSD does, and they use '%L0' instead of
> > > '%0+1'.  As far as I can tell this works on both gcc and clang.  The
> > > diff below produces a working kernel when building with gcc.  Not yet
> > > in a position to test a clang-built kernel myself yet.  But if this
> > > produces a working kernel with clang as well, I'd prefer this diff
> > > instead of yours.
> > 
> > Yes, the clang kernel is working with your %L0 diff and the noinline
> > stuff.  I now prefer your %L0 diff, ok gkoehler@
> > 
> > "mftb %L0" becomes "mftb ${0:L}" in LLVM IR (clang -S -emit-llvm),
> > then LLVM handles the 'L' in PPCAsmPrinter::PrintAsmOperand in
> > /usr/src/gnu/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
> > 
> > > Index: arch/powerpc/include/cpu.h
> > > ===
> > > RCS file: /cvs/src/sys/arch/powerpc/include/cpu.h,v
> > > retrieving revision 1.65
> > > diff -u -p -r1.65 cpu.h
> > > --- arch/powerpc/include/cpu.h23 Mar 2019 05:27:53 -  1.65
> > > +++ arch/powerpc/include/cpu.h16 Mar 2020 11:30:42 -
> > > @@ -336,7 +336,7 @@ ppc_mftb(void)
> > >   u_long scratch;
> > >   u_int64_t tb;
> > >  
> > > - __asm volatile ("1: mftbu %0; mftb %0+1; mftbu %1;"
> > > + __asm volatile ("1: mftbu %0; mftb %L0; mftbu %1;"
> > >   " cmpw 0,%0,%1; bne 1b" : "=r"(tb), "=r"(scratch));
> > >   return tb;
> > >  }
> > 
> > 
> > -- 
> > George Koehler 
> > 
> 



Re: ktrwriteraw & vget

2020-03-17 Thread Martin Pieuchot
On 16/03/20(Mon) 14:01, Martin Pieuchot wrote:
> vget(9) might fail, stop right away if that happens.
> 
> CID 1453020 Unchecked return value.

Updated diff that stops tracing if vget(9) fails, similar to what's
currently done if VOP_WRITE(9) fails, suggested by visa@.

Code shuffling is there to avoid calling vput(9) if vget(9) doesn't
succeed.

ok?

Index: kern/kern_ktrace.c
===
RCS file: /cvs/src/sys/kern/kern_ktrace.c,v
retrieving revision 1.100
diff -u -p -r1.100 kern_ktrace.c
--- kern/kern_ktrace.c  6 Oct 2019 16:24:14 -   1.100
+++ kern/kern_ktrace.c  17 Mar 2020 09:46:03 -
@@ -649,12 +649,17 @@ ktrwriteraw(struct proc *curp, struct vn
auio.uio_iovcnt++;
auio.uio_resid += kth->ktr_len;
}
-   vget(vp, LK_EXCLUSIVE | LK_RETRY);
+   error = vget(vp, LK_EXCLUSIVE | LK_RETRY);
+   if (error)
+   goto bad;
error = VOP_WRITE(vp, , IO_UNIT|IO_APPEND, cred);
-   if (!error) {
-   vput(vp);
-   return (0);
-   }
+   vput(vp);
+   if (error)
+   goto bad;
+
+   return (0);
+
+bad:
/*
 * If error encountered, give up tracing on this vnode.
 */
@@ -663,8 +668,6 @@ ktrwriteraw(struct proc *curp, struct vn
LIST_FOREACH(pr, , ps_list)
if (pr->ps_tracevp == vp && pr->ps_tracecred == cred)
ktrcleartrace(pr);
-
-   vput(vp);
return (error);
 }
 



Re: unbound 1.10.0

2020-03-17 Thread Renaud Allard



On 3/16/20 10:54 PM, Jordan Geoghegan wrote:


That's good to hear, I've had a number of issues with DoT stalling on 
1.9.4 and 1.9.6. I've found having multiple entries in my forwarders 
section has somewhat mitigated the issue. I wonder if the stalling 
problem is related to this:

https://github.com/NLnetLabs/unbound/issues/47


I was talking about DoT servers, I haven't really noticed any big issue 
with DoT forwarding.


That said, I had 2 stalls over the night, so unbound 1.10.0 still 
doesn't solve the issue.
Besides, I noticed that the android app "nebulo" wasn't happy with 
either 1.9.6 or 1.10.0, but works fine with dnsdist which I am now 
trying following sthen@ advise.




smime.p7s
Description: S/MIME Cryptographic Signature