Re: Do not spin on the NET_LOCK() in kqueue

2021-07-10 Thread Vitaliy Makkoveev
Thanks for the explanation. I missed the last commit to
sys/kern/uipc_socket.c.

> On 10 Jul 2021, at 22:56, Martin Pieuchot  wrote:
> 
> On 10/07/21(Sat) 21:53, Vitaliy Makkoveev wrote:
>> Hi,
>> 
>> In filt_solisten_common() you touch `so_qlen' only. It's not
>> related to a buffer and not protected by the introduced `sb_mtx', so
>> the solock() replacement in filt_solisten*() is wrong.
>> 
>> However, in filt_solisten_common() you only check the
>> `so_qlen' != 0 condition, and such a check could be performed locklessly.
>> I propose committing this as a separate diff.
> 
> The dance is there because knote needs a lock which could be the
> mutex.
> 
>>> @@ -208,8 +208,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
>>>  * Adjust backpressure on sender
>>>  * and wakeup any waiting to write.
>>>  */
>>> +   mtx_enter(&so2->so_snd.sb_mtx);
>>> so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
>>> so2->so_snd.sb_cc = so->so_rcv.sb_cc;
>>> +   mtx_leave(&so2->so_snd.sb_mtx);
>>> sowwakeup(so2);
>>> break;
>>> 
>> 
>> This is the 'PRU_RCVD' case, so you hold solock() on `so' and its receive
>> buffer is locked by sblock(). Is it assumed that modification of `sb_mbcnt'
>> and `sb_cc' in `so_rcv' is protected by solock()? Should both buffers
>> be locked here? I'm asking because you only remove solock() from the
>> kqueue(9) path and solock() still serialises the rest of the sockets, but
>> you are going to reduce solock() usage.
>> 
>> The same question applies to the 'PRU_SEND' case.
> 
> In this case the fields of "so2" are modified.  Modifications are always
> serialized by the solock.  The mutex is there to prevent another thread
> running the kqueue filters from reading between the updates of `sb_mbcnt'
> and `sb_cc'.
> 
>>> @@ -284,8 +286,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
>>> sbappendrecord(so2, &so2->so_rcv, m);
>>> else
>>> sbappend(so2, &so2->so_rcv, m);
>>> +   mtx_enter(&so2->so_snd.sb_mtx);
>>> so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
>>> so->so_snd.sb_cc = so2->so_rcv.sb_cc;
>>> +   mtx_leave(&so2->so_snd.sb_mtx);
>>> if (so2->so_rcv.sb_cc > 0)
>>> sorwakeup(so2);
>> 
>> 
>> Since you touch the 'so2->so_rcv' content here, you want to lock it
>> instead of 'so2->so_snd', right?
> 
> Indeed, that's a mistake, thanks!
> 



ipsec: remove unused `PolicyHead' from 'sockaddr_encap' structure

2021-07-10 Thread Vitaliy Makkoveev
This member is never set or used. I also kept the 'SENT_IP6' definition to
avoid potentially breaking third-party software. Is it ok to redefine it
to '0x0002'? At least openswan wants this [1].

1. 
https://github.com/xelerance/Openswan/blob/master/include/openswan/ipsec_encap.h#L20
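
For reference, the renumbering being asked about would amount to the
following (hypothetical, not part of the diff below):

	#define SENT_IP4	0x0001	/* data is two struct in_addr */
	#define SENT_IP6	0x0002	/* renumbered from 0x0004 */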

Index: sys/netinet/ip_ipsp.h
===
RCS file: /cvs/src/sys/netinet/ip_ipsp.h,v
retrieving revision 1.200
diff -u -p -r1.200 ip_ipsp.h
--- sys/netinet/ip_ipsp.h   8 Jul 2021 21:07:19 -   1.200
+++ sys/netinet/ip_ipsp.h   11 Jul 2021 02:25:17 -
@@ -113,8 +113,6 @@ struct sockaddr_encap {
u_int16_t   Sport;
u_int16_t   Dport;
} Sip6;
-
-   struct ipsec_policy *PolicyHead;/* SENT_IPSP */
} Sen;
 };
 
@@ -208,7 +206,6 @@ struct m_tag;
 #definesen_ip6_sport   Sen.Sip6.Sport
 #definesen_ip6_dport   Sen.Sip6.Dport
 #definesen_ip6_direction   Sen.Sip6.Direction
-#definesen_ipspSen.PolicyHead
 
 /*
  * The "type" is really part of the address as far as the routing
@@ -219,7 +216,6 @@ struct m_tag;
  */
 
 #defineSENT_IP40x0001  /* data is two struct in_addr */
-#defineSENT_IPSP   0x0002  /* data as in IP4/6 plus SPI */
 #defineSENT_IP60x0004
 
 #defineSENT_LENsizeof(struct sockaddr_encap)



ipsec: document locks of some structures

2021-07-10 Thread Vitaliy Makkoveev
The diff below documents the locks protecting members of the 'ipsec_ids',
'ipsec_acquire' and 'ipsec_policy' structures.

I marked `ipa_pcb' as immutable, but we never set or access it and I
want to remove it with the next diff.

Index: sys/netinet/ip_ipsp.h
===
RCS file: /cvs/src/sys/netinet/ip_ipsp.h,v
retrieving revision 1.200
diff -u -p -r1.200 ip_ipsp.h
--- sys/netinet/ip_ipsp.h   8 Jul 2021 21:07:19 -   1.200
+++ sys/netinet/ip_ipsp.h   11 Jul 2021 00:59:18 -
@@ -45,6 +45,12 @@
 #include 
 #include 
 
+/*
+ * Locks used to protect struct members in this file:
+ * I   Immutable after creation
+ * N   netlock
+ */
+
 union sockaddr_union {
struct sockaddr sa;
struct sockaddr_in  sin;
@@ -230,36 +236,37 @@ struct ipsec_id {
 };
 
 struct ipsec_ids {
-   RBT_ENTRY(ipsec_ids)id_node_id;
-   RBT_ENTRY(ipsec_ids)id_node_flow;
-   struct ipsec_id *id_local;
-   struct ipsec_id *id_remote;
-   u_int32_t   id_flow;
-   int id_refcount;
+   RBT_ENTRY(ipsec_ids)id_node_id; /* [N] */
+   RBT_ENTRY(ipsec_ids)id_node_flow;   /* [N] */
+   struct ipsec_id *id_local;  /* [I] */
+   struct ipsec_id *id_remote; /* [I] */
+   u_int32_t   id_flow;/* [N] */
+   int id_refcount;/* [N] */
struct timeout  id_timeout;
 };
 RBT_HEAD(ipsec_ids_flows, ipsec_ids);
 RBT_HEAD(ipsec_ids_tree, ipsec_ids);
 
 struct ipsec_acquire {
-   union sockaddr_unionipa_addr;
-   u_int32_t   ipa_seq;
-   struct sockaddr_encap   ipa_info;
-   struct sockaddr_encap   ipa_mask;
+   union sockaddr_unionipa_addr;   /* [I] */
+   u_int32_t   ipa_seq;/* [I] */ 
+   struct sockaddr_encap   ipa_info;   /* [I] */
+   struct sockaddr_encap   ipa_mask;   /* [I] */
struct timeout  ipa_timeout;
-   struct ipsec_policy *ipa_policy;
-   struct inpcb*ipa_pcb;
-   TAILQ_ENTRY(ipsec_acquire)  ipa_ipo_next;
-   TAILQ_ENTRY(ipsec_acquire)  ipa_next;
+   struct ipsec_policy *ipa_policy;/* [I] */
+   struct inpcb*ipa_pcb;   /* [I] */
+   TAILQ_ENTRY(ipsec_acquire)  ipa_ipo_next;   /* [N] */
+   TAILQ_ENTRY(ipsec_acquire)  ipa_next;   /* [N] */
 };
 
 struct ipsec_policy {
struct radix_node   ipo_nodes[2];   /* radix tree glue */
-   struct sockaddr_encap   ipo_addr;
-   struct sockaddr_encap   ipo_mask;
+   struct sockaddr_encap   ipo_addr;   /* [I] */
+   struct sockaddr_encap   ipo_mask;   /* [I] */
 
-   union sockaddr_unionipo_src;/* Local address to use */
-   union sockaddr_unionipo_dst;/* Remote gateway -- if it's zeroed:
+   union sockaddr_unionipo_src;/* [N] Local address to use */
+   union sockaddr_unionipo_dst;/* [N] Remote gateway --
+* if it's zeroed:
 * - on output, we try to
 * contact the remote host
 * directly (if needed).
@@ -270,22 +277,28 @@ struct ipsec_policy {
 * mode was used.
 */
 
-   u_int64_t   ipo_last_searched;  /* Timestamp of last lookup */
-
-   u_int8_tipo_flags;  /* See IPSP_POLICY_* definitions */
-   u_int8_tipo_type;   /* USE/ACQUIRE/... */
-   u_int8_tipo_sproto; /* ESP/AH; if zero, use system dflts */
-   u_int   ipo_rdomain;
-
-   int ipo_ref_count;
-
-   struct tdb  *ipo_tdb;   /* Cached entry */
-
-   struct ipsec_ids*ipo_ids;
+   u_int64_t   ipo_last_searched;  /* [N] Timestamp
+  of last lookup */

-   TAILQ_HEAD(ipo_acquires_head, ipsec_acquire) ipo_acquires; /* List of acquires */
-   TAILQ_ENTRY(ipsec_policy)   ipo_tdb_next;   /* List TDB policies */
-   TAILQ_ENTRY(ipsec_policy)   ipo_list;   /* List of all policies */
+   u_int8_tipo_flags;  /* [N] See IPSP_POLICY_*
+  definitions */
+   u_int8_tipo_type;   /* [N] USE/ACQUIRE/... */
+   u_int8_tipo_sproto; /* [N] ESP/AH; if zero,
+  use system dflts */
+ 

Re: Do not spin on the NET_LOCK() in kqueue

2021-07-10 Thread Martin Pieuchot
On 10/07/21(Sat) 21:53, Vitaliy Makkoveev wrote:
> Hi,
> 
> In filt_solisten_common() you touch `so_qlen' only. It's not
> related to a buffer and not protected by the introduced `sb_mtx', so
> the solock() replacement in filt_solisten*() is wrong.
> 
> However, in filt_solisten_common() you only check the
> `so_qlen' != 0 condition, and such a check could be performed locklessly.
> I propose committing this as a separate diff.

The dance is there because knote needs a lock which could be the
mutex.

> > @@ -208,8 +208,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
> >  * Adjust backpressure on sender
> >  * and wakeup any waiting to write.
> >  */
> > +   mtx_enter(&so2->so_snd.sb_mtx);
> > so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
> > so2->so_snd.sb_cc = so->so_rcv.sb_cc;
> > +   mtx_leave(&so2->so_snd.sb_mtx);
> > sowwakeup(so2);
> > break;
> > 
> 
> This is the 'PRU_RCVD' case, so you hold solock() on `so' and its receive
> buffer is locked by sblock(). Is it assumed that modification of `sb_mbcnt'
> and `sb_cc' in `so_rcv' is protected by solock()? Should both buffers
> be locked here? I'm asking because you only remove solock() from the
> kqueue(9) path and solock() still serialises the rest of the sockets, but
> you are going to reduce solock() usage.
> 
> The same question applies to the 'PRU_SEND' case.

In this case the fields of "so2" are modified.  Modifications are always
serialized by the solock.  The mutex is there to prevent another thread
running the kqueue filters from reading between the updates of `sb_mbcnt'
and `sb_cc'.

> > @@ -284,8 +286,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
> > sbappendrecord(so2, &so2->so_rcv, m);
> > else
> > sbappend(so2, &so2->so_rcv, m);
> > +   mtx_enter(&so2->so_snd.sb_mtx);
> > so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
> > so->so_snd.sb_cc = so2->so_rcv.sb_cc;
> > +   mtx_leave(&so2->so_snd.sb_mtx);
> > if (so2->so_rcv.sb_cc > 0)
> > sorwakeup(so2);
> 
> 
> Since you touch the 'so2->so_rcv' content here, you want to lock it
> instead of 'so2->so_snd', right?

Indeed, that's a mistake, thanks!



Re: xmm(4): WIP diff for Intel XMM7360 LTE modem

2021-07-10 Thread Stuart Henderson
On 2021/07/09 17:22, Stuart Henderson wrote:
> On 2021/07/09 16:33, Stuart Henderson wrote:
> > Notes so far:
> > 
> > > +xmmc*)
> > > + dev=${U%.*}
> > > + func=${U#*.}
> > > + M xmmc$U c 101 $(($(($dev*16))+$func)) 660
> > > + ;;
> > 
> > "sh MAKEDEV xmmc" isn't enough, it needs "sh MAKEDEV xmmc.0"
> > 
> > > + ret = EIO;
> > > + syslog(LOG_ERR, "FCC unlock not implemented, yet");
> > 
> > Thus ends my initial experimentation ;)
> > 
> 
> oh; it's just in xmmctl of course! With the horrible python thing, I have
> a working connection.

If anyone else wants to play, here's a port roughly based on the one in
pkgsrc. Obviously not optimal but it is nice to be able to see it working.



xmm7360.tgz
Description: application/tar-gz


Re: Do not spin on the NET_LOCK() in kqueue

2021-07-10 Thread Vitaliy Makkoveev
Hi,

In filt_solisten_common() you touch `so_qlen' only. It's not
related to a buffer and not protected by the introduced `sb_mtx', so
the solock() replacement in filt_solisten*() is wrong.

However, in filt_solisten_common() you only check the
`so_qlen' != 0 condition, and such a check could be performed locklessly.
I propose committing this as a separate diff.
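
For illustration only, the lockless check described above would presumably
reduce to a single word-sized load (a sketch, not the actual
filt_solisten_common() code):

	/* sketch: one word-sized read stays coherent without solock() */
	if (so->so_qlen != 0)
		return (1);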

Also, there are two places I want to point out:

> @@ -208,8 +208,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
>* Adjust backpressure on sender
>* and wakeup any waiting to write.
>*/
> + mtx_enter(&so2->so_snd.sb_mtx);
>   so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
>   so2->so_snd.sb_cc = so->so_rcv.sb_cc;
> + mtx_leave(&so2->so_snd.sb_mtx);
>   sowwakeup(so2);
>   break;
> 

This is the 'PRU_RCVD' case, so you hold solock() on `so' and its receive
buffer is locked by sblock(). Is it assumed that modification of `sb_mbcnt'
and `sb_cc' in `so_rcv' is protected by solock()? Should both buffers
be locked here? I'm asking because you only remove solock() from the
kqueue(9) path and solock() still serialises the rest of the sockets, but
you are going to reduce solock() usage.

The same question applies to the 'PRU_SEND' case.

> @@ -284,8 +286,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
>   sbappendrecord(so2, &so2->so_rcv, m);
>   else
>   sbappend(so2, &so2->so_rcv, m);
> + mtx_enter(&so2->so_snd.sb_mtx);
>   so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
>   so->so_snd.sb_cc = so2->so_rcv.sb_cc;
> + mtx_leave(&so2->so_snd.sb_mtx);
>   if (so2->so_rcv.sb_cc > 0)
>   sorwakeup(so2);


Since you touch the 'so2->so_rcv' content here, you want to lock it
instead of 'so2->so_snd', right?


> On 10 Jul 2021, at 18:26, Martin Pieuchot  wrote:
> 
> One of the reasons for the performance drop in the kqueue-based
> poll/select is the fact that kqueue filters are called up to 3 times
> per syscall and that they all spin on the NET_LOCK() for TCP/UDP
> packets.
> 
> The diff below is an RFC for improving the situation.
> 
> Socket kqueue filters mainly check the amount of items available to
> read/write.  This involves comparing various socket buffer fields (sb_cc,
> sb_lowat, etc).  The diff below introduces a new mutex to serialize
> updates of those fields with reads in the kqueue filters.
> 
> Since these fields are always modified with the socket lock held, either
> the mutex or the solock is enough to have a coherent view of them.
> Note that either of these locks is necessary only if multiple fields
> have to be read (like in sbspace()).
> 
> Other per-socket fields accessed in the kqueue filters are never
> combined (with &&) to determine a condition.  So, assuming it is fine to
> read register-sized fields w/o the socket lock, we can safely remove it
> there.
> 
> Could such mutex also be used to serialize klist updates?
> 
> Comments?
> 
> diff --git sys/kern/uipc_socket.c sys/kern/uipc_socket.c
> index dce20208828..d1cb9f4fc3b 100644
> --- sys/kern/uipc_socket.c
> +++ sys/kern/uipc_socket.c
> @@ -181,6 +181,8 @@ socreate(int dom, struct socket **aso, int type, int proto)
>   so->so_egid = p->p_ucred->cr_gid;
>   so->so_cpid = p->p_p->ps_pid;
>   so->so_proto = prp;
> + mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR);
> + mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR);
>   so->so_snd.sb_timeo_nsecs = INFSLP;
>   so->so_rcv.sb_timeo_nsecs = INFSLP;
> 
> @@ -276,7 +278,9 @@ sofree(struct socket *so, int s)
>   }
>   }
> #endif /* SOCKET_SPLICE */
> + mtx_enter(&so->so_snd.sb_mtx);
>   sbrelease(so, &so->so_snd);
> + mtx_leave(&so->so_snd.sb_mtx);
>   sorflush(so);
>   sounlock(so, s);
> #ifdef SOCKET_SPLICE
> @@ -1019,8 +1023,10 @@ dontblock:
>   *mp = m_copym(m, 0, len, M_WAIT);
>   m->m_data += len;
>   m->m_len -= len;
> + mtx_enter(&so->so_rcv.sb_mtx);
>   so->so_rcv.sb_cc -= len;
>   so->so_rcv.sb_datacc -= len;
> + mtx_leave(&so->so_rcv.sb_mtx);
>   }
>   }
>   if (so->so_oobmark) {
> @@ -1537,8 +1543,10 @@ somove(struct socket *so, int wait)
>   }
>   so->so_rcv.sb_mb->m_data += size;
>   so->so_rcv.sb_mb->m_len -= size;
> + mtx_enter(&so->so_rcv.sb_mtx);
>   so->so_rcv.sb_cc -= size;
>   so->so_rcv.sb_datacc -= size;
> + mtx_leave(&so->so_rcv.sb_mtx);
> 

Re: Cleanup of err(1, "unveil") pattern

2021-07-10 Thread Ashton Fagg
Friendly weekly ping.

Ashton Fagg  writes:

> Original thread and discussion here:
>
> https://marc.info/?l=openbsd-tech=162000231914017=2
>
> I started this a couple of months ago but realized I never actually
> finished this and submitted the full diff. So here it is, for whole
> src tree.
>
> I've run this through a `make build` here locally to ensure it builds -
> there are no behavioral changes anywhere, only to the formatting of the
> error strings. Hopefully I've not missed any.
>
> If it's preferred for me to break the patch up into smaller pieces,
> that's fine - just let me know. I figured it's probably fine to apply
> all at once since it's related but I could be wrong.
>
> Perhaps another question is whether it's worth documenting this
> convention somewhere - if that's of interest also I'm happy to take a
> stab at that.
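
For context, the convention being proposed simply adds the offending path
to the err(3) message, so a failure identifies which unveil(2) call
tripped. A hypothetical example (the path is made up, not from the diff):

	if (unveil("/var/db/example.db", "r") == -1)
		err(1, "unveil %s", "/var/db/example.db");

	/* before: prog: unveil: No such file or directory */
	/* after:  prog: unveil /var/db/example.db: No such file or directory */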

Index: bin/ps/ps.c
===
RCS file: /cvs/src/bin/ps/ps.c,v
retrieving revision 1.76
diff -u -p -u -p -r1.76 ps.c
--- bin/ps/ps.c	16 Dec 2019 19:21:16 -	1.76
+++ bin/ps/ps.c	2 Jul 2021 20:40:08 -
@@ -276,18 +276,18 @@ main(int argc, char *argv[])
 		errx(1, "%s", errbuf);
 
 	if (unveil(_PATH_DEVDB, "r") == -1 && errno != ENOENT)
-		err(1, "unveil");
+		err(1, "unveil %s", _PATH_DEVDB);
 	if (unveil(_PATH_DEV, "r") == -1 && errno != ENOENT)
-		err(1, "unveil");
+		err(1, "unveil %s", _PATH_DEV);
 	if (swapf)
 		if (unveil(swapf, "r") == -1)
-			err(1, "unveil");
+			err(1, "unveil %s", swapf);
 	if (nlistf)
 		if (unveil(nlistf, "r") == -1)
-			err(1, "unveil");
+			err(1, "unveil %s", nlistf);
 	if (memf)
 		if (unveil(memf, "r") == -1)
-			err(1, "unveil");
+			err(1, "unveil %s", memf);
 	if (pledge("stdio rpath getpw ps", NULL) == -1)
 		err(1, "pledge");
 
Index: games/tetris/tetris.c
===
RCS file: /cvs/src/games/tetris/tetris.c,v
retrieving revision 1.34
diff -u -p -u -p -r1.34 tetris.c
--- games/tetris/tetris.c	18 May 2019 19:38:25 -	1.34
+++ games/tetris/tetris.c	2 Jul 2021 20:40:09 -
@@ -234,7 +234,7 @@ main(int argc, char *argv[])
 	scr_init();
 
 	if (unveil(scorepath, "rwc") == -1)
-		err(1, "unveil");
+		err(1, "unveil %s", scorepath);
 
 	if (pledge("stdio rpath wpath cpath tty", NULL) == -1)
 		err(1, "pledge");
Index: libexec/comsat/comsat.c
===
RCS file: /cvs/src/libexec/comsat/comsat.c,v
retrieving revision 1.49
diff -u -p -u -p -r1.49 comsat.c
--- libexec/comsat/comsat.c	24 Sep 2018 22:56:54 -	1.49
+++ libexec/comsat/comsat.c	2 Jul 2021 20:40:17 -
@@ -92,13 +92,13 @@ main(int argc, char *argv[])
 	}
 
 	if (unveil(_PATH_MAILDIR, "r") == -1)
-		err(1, "unveil");
+		err(1, "unveil %s", _PATH_MAILDIR);
 	if (unveil(_PATH_UTMP, "r") == -1)
-		err(1, "unveil");
+		err(1, "unveil %s", _PATH_UTMP);
 	if (unveil("/tmp", "w") == -1)
-		err(1, "unveil");
+		err(1, "unveil /tmp");
 	if (unveil(_PATH_DEV, "rw") == -1)
-		err(1, "unveil");
+		err(1, "unveil %s", _PATH_DEV);
 	if (pledge("stdio rpath wpath proc tty", NULL) == -1)
 		err(1, "pledge");
 
Index: libexec/fingerd/fingerd.c
===
RCS file: /cvs/src/libexec/fingerd/fingerd.c,v
retrieving revision 1.41
diff -u -p -u -p -r1.41 fingerd.c
--- libexec/fingerd/fingerd.c	28 Jun 2019 13:32:53 -	1.41
+++ libexec/fingerd/fingerd.c	2 Jul 2021 20:40:17 -
@@ -109,7 +109,7 @@ main(int argc, char *argv[])
 		}
 
 	if (unveil(prog, "x") == -1)
-		err(1, "unveil");
+		err(1, "unveil %s", prog);
 	if (pledge("stdio inet dns proc exec", NULL) == -1)
 		err(1, "pledge");
 
Index: libexec/lockspool/lockspool.c
===
RCS file: /cvs/src/libexec/lockspool/lockspool.c,v
retrieving revision 1.21
diff -u -p -u -p -r1.21 lockspool.c
--- libexec/lockspool/lockspool.c	9 Feb 2020 14:59:20 -	1.21
+++ libexec/lockspool/lockspool.c	2 Jul 2021 20:40:17 -
@@ -54,7 +54,7 @@ main(int argc, char *argv[])
 	int holdfd;
 
 	if (unveil(_PATH_MAILDIR, "rwc") == -1)
-		err(1, "unveil");
+		err(1, "unveil %s", _PATH_MAILDIR);
 	if (pledge("stdio rpath wpath getpw cpath fattr", NULL) == -1)
 		err(1, "pledge");
 
Index: libexec/spamlogd/spamlogd.c
===
RCS file: /cvs/src/libexec/spamlogd/spamlogd.c,v
retrieving revision 1.31
diff -u -p -u -p -r1.31 spamlogd.c
--- libexec/spamlogd/spamlogd.c	25 Jul 2019 17:32:33 -	1.31
+++ libexec/spamlogd/spamlogd.c	2 Jul 2021 20:40:17 -
@@ -463,7 +463,7 @@ main(int argc, char **argv)
 	}
 
 	if (unveil(PATH_SPAMD_DB, "rw") == -1)
-		err(1, "unveil");
+		err(1, "unveil %s", PATH_SPAMD_DB);
 	if (syncsend) {
 		if (pledge("stdio rpath wpath inet flock", NULL) == -1)
 			err(1, "pledge");
Index: regress/sbin/ifconfig/ifaddr.c
===
RCS file: 

Re: cron(8): add '@' interval time specifier

2021-07-10 Thread Job Snijders
On Sat, Jul 10, 2021 at 09:05:15AM -0600, Theo de Raadt wrote:
> Job Snijders  wrote:
> > A use case could be running rpki-client more frequently than once an
> > hour:

Perhaps I chose a poor example: because of $work *I* run rpki-client
very often, but I do not recommend others follow that schedule. :)

> After cron has this support, I suspect you will propose that root's
> crontab be changed to use the feature, thus further increasing the
> compute done directly on a router.

It is not my goal to change root's example crontab. I think the
current 1-hour suggestion is appropriate for anno 2021. I agree the CPU
cost is considerable; we should work to reduce it, not to increase it.

> That then begs the question why cron needs this extension for this
> purpose.

I am not proposing it for 'this' purpose; the feature is for folks who
have jobs of unknown run time and wish to put some distance between two
jobs. I saw the feature described in the FreeBSD manual and just thought
to myself 'This looks doable, perhaps others like it too!' :)

> > @15 -n rpki-client && bgpctl reload
> > 
> > The above is equivalent to:
> > 
> > * * * * * -sn sleep 900 && rpki-client && bgpctl reload
> 
> Somewhat the same effect, but I think there are big differences in
> behaviour.  The second version creates a process hierarchy which sleeps
> for a long time.  I suspect crontab -e causes drastic difference in
> behaviour when the job is already running.

Good point. I hadn't thought of that.

> So I agree with getting away from the sleep pattern.  I have never
> used it myself, and we should discourage others from using it by
> providing something better.
> 
> > I borrowed the idea from FreeBSD's cron [1]. A difference between
> > the below changeset and the freebsd implementation is that they
> > specify the interval in seconds, while the below specifies in
> > minutes.
> 
> Why be incompatible?  Better have a strong justification.

OpenBSD cron would need to be extended to support 'up to 1 second
precise' scheduling, perhaps some code can be borrowed from 
https://github.com/freebsd/freebsd-src/commit/7a5c30c5b67555c9f76a053812ba80ae258b8874#diff-f8f4ba67b7df93eb4b436203d0c70670560029bd8c5b9a642691a914aa33d400
to accomplish this.

Alternatively, I can remove the below '* SECONDS_PER_MINUTE' multiplier,
so that from a syntax perspective we are equivalent to FreeBSD, but our
cron will have 'one minute precision' regarding when the job is
executed.

entry.c:
+   } else if (*cmd != '\0' &&
+   (interval = strtol(cmd, &endptr, 10)) > 0 &&
+   *endptr == '\0') {
+   e->interval = interval * SECONDS_PER_MINUTE;
+   e->flags |= INTERVAL | SINGLE_JOB;
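
For example, with the diff as posted, a crontab line starting with '@15'
parses as interval = 15 * SECONDS_PER_MINUTE = 900 seconds; dropping the
multiplier would instead store 15 seconds, which cron's one-minute tick
would still only act on at minute granularity.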

For me 'one minute' precision is good enough, but if others think it is
worth putting in the effort to get to '1 second' precision scheduling
I'm happy to try to hack it.

Suggestions welcome!

Kind regards,

Job



Do not spin on the NET_LOCK() in kqueue

2021-07-10 Thread Martin Pieuchot
One of the reasons for the performance drop in the kqueue-based
poll/select is the fact that kqueue filters are called up to 3 times
per syscall and that they all spin on the NET_LOCK() for TCP/UDP
packets.

The diff below is an RFC for improving the situation.

Socket kqueue filters mainly check the amount of items available to
read/write.  This involves comparing various socket buffer fields (sb_cc,
sb_lowat, etc).  The diff below introduces a new mutex to serialize
updates of those fields with reads in the kqueue filters.

Since these fields are always modified with the socket lock held, either
the mutex or the solock is enough to have a coherent view of them.
Note that either of these locks is necessary only if multiple fields
have to be read (like in sbspace()).

Other per-socket fields accessed in the kqueue filters are never
combined (with &&) to determine a condition.  So, assuming it is fine to
read register-sized fields w/o the socket lock, we can safely remove it
there.
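
For illustration, here is a minimal userland sketch of that locking rule
(an analogy, not the kernel code): two counters standing in for
`sb_mbcnt'/`sb_cc' are updated as a pair under a mutex, a reader that
compares both fields takes the same mutex, and a single word-sized field
may be read without it.

	#include <pthread.h>

	static pthread_mutex_t sb_mtx = PTHREAD_MUTEX_INITIALIZER;
	static unsigned long sb_cc, sb_mbcnt; /* stand-ins for sockbuf fields */

	void
	writer_update(unsigned long cc, unsigned long mbcnt)
	{
		pthread_mutex_lock(&sb_mtx);
		sb_mbcnt = mbcnt;	/* readers never see a torn pair */
		sb_cc = cc;
		pthread_mutex_unlock(&sb_mtx);
	}

	int
	reader_compare(void)
	{
		int res;

		pthread_mutex_lock(&sb_mtx);	/* two fields read together */
		res = (sb_mbcnt >= sb_cc);
		pthread_mutex_unlock(&sb_mtx);
		return res;
	}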

Could such mutex also be used to serialize klist updates?

Comments?

diff --git sys/kern/uipc_socket.c sys/kern/uipc_socket.c
index dce20208828..d1cb9f4fc3b 100644
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -181,6 +181,8 @@ socreate(int dom, struct socket **aso, int type, int proto)
so->so_egid = p->p_ucred->cr_gid;
so->so_cpid = p->p_p->ps_pid;
so->so_proto = prp;
+   mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR);
+   mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR);
so->so_snd.sb_timeo_nsecs = INFSLP;
so->so_rcv.sb_timeo_nsecs = INFSLP;
 
@@ -276,7 +278,9 @@ sofree(struct socket *so, int s)
}
}
 #endif /* SOCKET_SPLICE */
+   mtx_enter(&so->so_snd.sb_mtx);
sbrelease(so, &so->so_snd);
+   mtx_leave(&so->so_snd.sb_mtx);
sorflush(so);
sounlock(so, s);
 #ifdef SOCKET_SPLICE
@@ -1019,8 +1023,10 @@ dontblock:
*mp = m_copym(m, 0, len, M_WAIT);
m->m_data += len;
m->m_len -= len;
+   mtx_enter(&so->so_rcv.sb_mtx);
so->so_rcv.sb_cc -= len;
so->so_rcv.sb_datacc -= len;
+   mtx_leave(&so->so_rcv.sb_mtx);
}
}
if (so->so_oobmark) {
@@ -1537,8 +1543,10 @@ somove(struct socket *so, int wait)
}
so->so_rcv.sb_mb->m_data += size;
so->so_rcv.sb_mb->m_len -= size;
+   mtx_enter(&so->so_rcv.sb_mtx);
so->so_rcv.sb_cc -= size;
so->so_rcv.sb_datacc -= size;
+   mtx_leave(&so->so_rcv.sb_mtx);
} else {
*mp = so->so_rcv.sb_mb;
sbfree(&so->so_rcv, *mp);
@@ -1777,30 +1785,40 @@ sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
case SO_SNDBUF:
if (so->so_state & SS_CANTSENDMORE)
return (EINVAL);
+   mtx_enter(&so->so_snd.sb_mtx);
if (sbcheckreserve(cnt, so->so_snd.sb_wat) ||
sbreserve(so, &so->so_snd, cnt))
-   return (ENOBUFS);
-   so->so_snd.sb_wat = cnt;
+   error = ENOBUFS;
+   if (error == 0)
+   so->so_snd.sb_wat = cnt;
+   mtx_leave(&so->so_snd.sb_mtx);
break;
 
case SO_RCVBUF:
if (so->so_state & SS_CANTRCVMORE)
return (EINVAL);
+   mtx_enter(&so->so_rcv.sb_mtx);
if (sbcheckreserve(cnt, so->so_rcv.sb_wat) ||
sbreserve(so, &so->so_rcv, cnt))
-   return (ENOBUFS);
-   so->so_rcv.sb_wat = cnt;
+   error = ENOBUFS;
+   if (error == 0)
+   so->so_rcv.sb_wat = cnt;
+   mtx_leave(&so->so_rcv.sb_mtx);
break;
 
case SO_SNDLOWAT:
+   mtx_enter(&so->so_snd.sb_mtx);
so->so_snd.sb_lowat =
(cnt > so->so_snd.sb_hiwat) ?
so->so_snd.sb_hiwat : cnt;
+   mtx_leave(&so->so_snd.sb_mtx);
break;
case SO_RCVLOWAT:
+   mtx_leave(&so->so_rcv.sb_mtx);

Re: cron(8): add '@' interval time specifier

2021-07-10 Thread Theo de Raadt
Theo de Raadt  wrote:

> It will do 5 minutes of compute, 15 minutes of pause, and do it all
> again.

A recent rpki-client took longer:

# Processing time 989 seconds (199 seconds user, 155 seconds system)

So that would be 6 minutes cpu, over 16 minutes elapsed.
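
(For the arithmetic: 199 + 155 = 354 seconds of CPU, about 6 minutes, out
of 989 seconds, about 16.5 minutes, elapsed.)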

Followed by a 15 minute pause.  Then do it again?

I am worried by where this is heading.



Re: cron(8): add '@' interval time specifier

2021-07-10 Thread Theo de Raadt
Job Snijders  wrote:

> A use case could be running rpki-client more frequently than once an
> hour:

I want to say I think running rpki-client so often is misguided for
these rpki-processing-on-the-router configurations.

rpki-processing-on-the-router seems more like a technology demonstration
than a performant, practical solution.  An rpki-client run is a lot of x509
compute, roughly 5 minutes of full cpu.  After cron has this support, I
suspect you will propose that root's crontab be changed to use the
feature, thus further increasing the compute done directly on a router.
It will do 5 minutes of compute, 15 minutes of pause, and do it all
again.  This will have impact on traffic flow through the OpenBSD
software routing stack.

Surely the on-router approach is only acceptable in very narrow
circumstances, and those situations don't need to be updating RPKI every
20 minutes, as I doubt the internet at large is doing anything close to
20 minute updates.

The normal RPKI use pattern should be offline processing (on a different
dedicated system), then uploaded to a router.

How often RPKI is run should be sensitively scaled to the resources
available.

Are we actually helping people by proposing they use 5 minutes of total
cpu compute, every 20 minutes?  I have serious doubts.

That then begs the question why cron needs this extension for this purpose.
 
> @15 -n rpki-client && bgpctl reload
> 
> The above is equivalent to:
> 
> * * * * * -sn sleep 900 && rpki-client && bgpctl reload

Somewhat the same effect, but I think there are big differences in
behaviour.  The second version creates a process hierarchy which sleeps
for a long time.  I suspect crontab -e causes drastic difference in
behaviour when the job is already running.

I hate the sleep pattern, because you can't see what is happening from a
system perspective (ie. in ps), and if root decides to run rpki-client
or various bgpctl commands by hand, you can get surprising results.

So I agree with getting away from the sleep pattern.  I have never used
it myself, and we should discourage others from using it by providing
something better.

> I borrowed the idea from FreeBSD's cron [1]. A difference between the
> below changeset and the freebsd implementation is that they specify the
> interval in seconds, while the below specifies in minutes.

Why be incompatible?  Better have a strong justification.




Re: cron(8): add '@' interval time specifier

2021-07-10 Thread Jason McIntyre
On Sat, Jul 10, 2021 at 02:07:53PM +, Job Snijders wrote:
> Hi all,
> 
> The below patch adds a new kind of time specifier: an interval (in
> minutes). When used, cron(8) will schedule the next instance of a job
> after the previous job has completed and a full interval has passed. 
> 
> A crontab(5) configured as follows:
> 
> $ crontab -l
> @3 sleep 100
> 
> Will result in this schedule:
> 
> Jul 10 13:38:17 vurt cron[96937]: (CRON) STARTUP (V5.0)
> Jul 10 13:42:01 vurt cron[79385]: (job) CMD (sleep 100)
> Jul 10 13:47:01 vurt cron[3165]: (job) CMD (sleep 100)
> Jul 10 13:52:01 vurt cron[40539]: (job) CMD (sleep 100)
> Jul 10 13:57:01 vurt cron[84504]: (job) CMD (sleep 100)
> 
> A use case could be running rpki-client more frequently than once an
> hour:
> 
> @15 -n rpki-client && bgpctl reload
> 
> The above is equivalent to:
> 
> * * * * * -sn sleep 900 && rpki-client && bgpctl reload
> 
> I borrowed the idea from FreeBSD's cron [1]. A difference between the
> below changeset and the freebsd implementation is that they specify the
> interval in seconds, while the below specifies in minutes. I was able
> to leverage the 'singleton' infrastructure. And removed a comment that
> reads like a TODO nobody is going to do.
> 
> Thoughts?
> 
> Kind regards,
> 
> Job
> 

hi.

just to be clear - the point here is that you don't know how long a job
takes to run, so you can't specify the interval in the normal way?

the doc changes look fine, but i have some suggestions:

> Index: crontab.5
> ===
> RCS file: /cvs/src/usr.sbin/cron/crontab.5,v
> retrieving revision 1.41
> diff -u -p -r1.41 crontab.5
> --- crontab.5 18 Apr 2020 17:11:40 -  1.41
> +++ crontab.5 10 Jul 2021 13:38:13 -
> @@ -265,7 +265,7 @@ For example,
>  would cause a command to be run at 4:30 am on the 1st and 15th of each
>  month, plus every Friday.
>  .Pp
> -Instead of the first five fields, one of eight special strings may appear:
> +Instead of the first five fields, one of nine special strings may appear:

instead of being specific about numbers, we could just say "one of these
special strings". then we don't have to keep changing the text.

>  .Bl -column "@midnight" "meaning" -offset indent
>  .It Sy string Ta Sy meaning
>  .It @reboot Ta Run once, at startup.
> @@ -276,6 +276,14 @@ Instead of the first five fields, one of
>  .It @daily Ta Run every midnight (0 0 * * *).
>  .It @midnight Ta The same as @daily.
>  .It @hourly Ta Run every hour, on the hour (0 * * * *).
> +.It Pf @ Ar minutes Ta The

the field "minutes" could be confusing, because the other fields
are similar type (hourly, daily, etc) and literal, whereas this is
an argument name. would it be clearer to use sth like "n", a more
common arg name for an arbitrary number, which would also match
your text "followed by a numeric value"?

that's not a request, it's a question ;)

> +.Sq @
> +symbol followed by a numeric value has a special notion of running a job
> +after an interval specified in minutes has passed, and the previous
> +instance has completed.
> +The first run is scheduled at a full interval after
> +.Xr cron 8
> +started.

i hate how every other entry here is a succinct one-liner, and this
addition is a block. i wonder if we could be super trim here, and
be more expansive using an example. i already feel from the text
that i don't really understand when i might use this, so an example
might help.

for example:

...
@n  Run after previous job + "n" minutes have elapsed.
...
EXAMPLES
...
# the gories
#
#
@15 -n rpki-client && bgpctl reload

jmc

>  .El
>  .Sh ENVIRONMENT
>  .Bl -tag -width "LOGNAMEXXX"
> @@ -346,7 +354,9 @@ MAILTO=paul
>  5 4 * * sun echo "run at 5 after 4 every sunday"
>  
>  # run hourly at a random time within the first 30 minutes of the hour
> -0~30 * * * *   /usr/libexec/spamd-setup
> +0~30 * * * */usr/libexec/spamd-setup
> +
> +@10 sleep 180 && echo "starts every 13 minutes, implies -s"
>  .Ed
>  .Sh SEE ALSO
>  .Xr crontab 1 ,
> @@ -372,6 +382,8 @@ Random intervals are supported using the
>  character.
>  .It
>  Months or days of the week can be specified by name.
> +.It
> +Interval mode.
>  .It
>  Environment variables can be set in a crontab.
>  .It



cron(8): add '@' interval time specifier

2021-07-10 Thread Job Snijders
Hi all,

The below patch adds a new kind of time specifier: an interval (in
minutes). When used, cron(8) will schedule the next instance of a job
after the previous job has completed and a full interval has passed. 

A crontab(5) configured as follows:

$ crontab -l
@3 sleep 100

Will result in this schedule:

Jul 10 13:38:17 vurt cron[96937]: (CRON) STARTUP (V5.0)
Jul 10 13:42:01 vurt cron[79385]: (job) CMD (sleep 100)
Jul 10 13:47:01 vurt cron[3165]: (job) CMD (sleep 100)
Jul 10 13:52:01 vurt cron[40539]: (job) CMD (sleep 100)
Jul 10 13:57:01 vurt cron[84504]: (job) CMD (sleep 100)

A use case could be running rpki-client more frequently than once an
hour:

@15 -n rpki-client && bgpctl reload

The above is equivalent to:

* * * * * -sn sleep 900 && rpki-client && bgpctl reload

I borrowed the idea from FreeBSD's cron [1]. A difference between the
below changeset and the freebsd implementation is that they specify the
interval in seconds, while the below specifies in minutes. I was able
to leverage the 'singleton' infrastructure. And removed a comment that
reads like a TODO nobody is going to do.

Thoughts?

Kind regards,

Job

[1]: 
https://github.com/freebsd/freebsd-src/commit/a08d12d3f2d4f4dabfc01953be696fdc0750da9c#

Index: cron.c
===
RCS file: /cvs/src/usr.sbin/cron/cron.c,v
retrieving revision 1.79
diff -u -p -r1.79 cron.c
--- cron.c  16 Apr 2020 17:51:56 -  1.79
+++ cron.c  10 Jul 2021 13:38:13 -
@@ -273,6 +273,8 @@ run_reboot_jobs(cron_db *db)
SLIST_FOREACH(e, &u->crontab, entries) {
if (e->flags & WHEN_REBOOT)
job_add(e, u);
+   if (e->flags & INTERVAL)
+   e->lastexit = StartTime;
}
}
(void) job_runqueue();
@@ -303,6 +305,12 @@ find_jobs(time_t vtime, cron_db *db, int
 */
TAILQ_FOREACH(u, &db->users, entries) {
SLIST_FOREACH(e, &u->crontab, entries) {
+   if (e->flags & INTERVAL) {
+   if (e->lastexit > 0 &&
+   virtualSecond >= e->lastexit + e->interval)
+   job_add(e, u);
+   continue;
+   }
if (bit_test(e->minute, minute) &&
bit_test(e->hour, hour) &&
bit_test(e->month, month) &&
Index: crontab.5
===
RCS file: /cvs/src/usr.sbin/cron/crontab.5,v
retrieving revision 1.41
diff -u -p -r1.41 crontab.5
--- crontab.5   18 Apr 2020 17:11:40 -  1.41
+++ crontab.5   10 Jul 2021 13:38:13 -
@@ -265,7 +265,7 @@ For example,
 would cause a command to be run at 4:30 am on the 1st and 15th of each
 month, plus every Friday.
 .Pp
-Instead of the first five fields, one of eight special strings may appear:
+Instead of the first five fields, one of nine special strings may appear:
 .Bl -column "@midnight" "meaning" -offset indent
 .It Sy string Ta Sy meaning
 .It @reboot Ta Run once, at startup.
@@ -276,6 +276,14 @@ Instead of the first five fields, one of
 .It @daily Ta Run every midnight (0 0 * * *).
 .It @midnight Ta The same as @daily.
 .It @hourly Ta Run every hour, on the hour (0 * * * *).
+.It Pf @ Ar minutes Ta The
+.Sq @
+symbol followed by a numeric value has a special notion of running a job
+after an interval specified in minutes has passed, and the previous
+instance has completed.
+The first run is scheduled at a full interval after
+.Xr cron 8
+started.
 .El
 .Sh ENVIRONMENT
 .Bl -tag -width "LOGNAMEXXX"
@@ -346,7 +354,9 @@ MAILTO=paul
 5 4 * * sun echo "run at 5 after 4 every sunday"
 
 # run hourly at a random time within the first 30 minutes of the hour
-0~30 * * * *   /usr/libexec/spamd-setup
+0~30 * * * */usr/libexec/spamd-setup
+
+@10 sleep 180 && echo "starts every 13 minutes, implies -s"
 .Ed
 .Sh SEE ALSO
 .Xr crontab 1 ,
@@ -372,6 +382,8 @@ Random intervals are supported using the
 character.
 .It
 Months or days of the week can be specified by name.
+.It
+Interval mode.
 .It
 Environment variables can be set in a crontab.
 .It
Index: do_command.c
===
RCS file: /cvs/src/usr.sbin/cron/do_command.c,v
retrieving revision 1.61
diff -u -p -r1.61 do_command.c
--- do_command.c16 Apr 2020 17:51:56 -  1.61
+++ do_command.c10 Jul 2021 13:38:13 -
@@ -54,7 +54,8 @@ do_command(entry *e, user *u)
 
/* fork to become asynchronous -- parent process is done immediately,
 * and continues to run the normal cron code, which means return to
-* tick().  the child and grandchild don't leave this function, alive.
+* find_jobs().
+* The child and grandchild 

Re: pthread_cond, futex(2) & ECANCELED

2021-07-10 Thread Martin Pieuchot
On 19/01/20(Sun) 14:44, Martin Pieuchot wrote:
> On 18/01/20(Sat) 14:16, Martin Pieuchot wrote:
> > When futex(2) got imported it didn't return ECANCELED.  This was changed
> > later with futex-based semaphores.
> > 
> > This modification introduced a behavior change in pthread_cond_*wait(3).
> > The diff below restores the previous behavior by treating ECANCELED like
> > EINTR.
> > 
> > Note that the __thrsleep(2) version also doesn't completely check for
> > ECANCELED, this diff also changes that.
> 
> Updated version below includes a check missed previously in the
> __thrsleep(2) based implementation, pointed out by visa@
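
In other words, the intended error mapping after the diff is (a
restatement of the patch below, not new code):

	if (error == ETIMEDOUT)
		rv = ETIMEDOUT;		/* timeout is still reported */
	else if (error == EINTR || error == ECANCELED)
		canceled = 1;		/* both now act as an interruption */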

I still have these M in my tree, any ok?

Index: thread/rthread_cond.c
===
RCS file: /cvs/src/lib/libc/thread/rthread_cond.c,v
retrieving revision 1.5
diff -u -p -r1.5 rthread_cond.c
--- thread/rthread_cond.c   29 Jan 2019 17:40:26 -  1.5
+++ thread/rthread_cond.c   18 Jan 2020 13:10:56 -
@@ -109,13 +109,13 @@ _rthread_cond_timedwait(pthread_cond_t c
* we should just go back to sleep without changing state
* (timeouts, etc).
*/
-   } while ((error == EINTR) &&
+   } while ((error == EINTR || error == ECANCELED) &&
   (tib->tib_canceled == 0 || (tib->tib_cantcancel & CANCEL_DISABLED)));
 
/* if timeout or canceled, make note of that */
if (error == ETIMEDOUT)
rv = ETIMEDOUT;
-   else if (error == EINTR)
+   else if (error == EINTR || error == ECANCELED)
canceled = 1;
 
pthread_mutex_lock(mutexp);
Index: thread/rthread_sync.c
===
RCS file: /cvs/src/lib/libc/thread/rthread_sync.c,v
retrieving revision 1.5
diff -u -p -r1.5 rthread_sync.c
--- thread/rthread_sync.c   24 Apr 2018 16:28:42 -  1.5
+++ thread/rthread_sync.c   19 Jan 2020 09:50:06 -
@@ -407,7 +407,7 @@ pthread_cond_timedwait(pthread_cond_t *c
/* if timeout or canceled, make note of that */
if (error == EWOULDBLOCK)
rv = ETIMEDOUT;
-   else if (error == EINTR)
+   else if (error == EINTR || error == ECANCELED)
canceled = 1;
 
/* transfer between the queues */
@@ -544,7 +544,7 @@ pthread_cond_wait(pthread_cond_t *condp,
assert(self->blocking_cond == cond);
 
/* if canceled, make note of that */
-   if (error == EINTR)
+   if (error == EINTR || error == ECANCELED)
canceled = 1;
 
/* transfer between the queues */



Re: gprof: Profiling a multi-threaded application

2021-07-10 Thread Martin Pieuchot
Hello Yuichiro, thanks for your work!

> On 2021/06/16 16:34, Yuichiro NAITO wrote:
> > When I compile a multi-threaded application with the '-pg' option, I always
> > get no results in gmon.out. With bad luck, my application gets SIGSEGV
> > while running, because the buffer that holds the caller/callee counts is
> > the only one in the process and is corrupted by multi-threaded access.
> > 
> > I got the idea to solve this problem from NetBSD. NetBSD has individual
> > buffers for each thread and merges them at the end of profiling.

Note that the kernel uses a similar approach but doesn't merge the buffers;
instead it generates a file for each CPU.

> > 
> > NetBSD stores the reference to the individual buffer with
> > pthread_setspecific(3). I think this causes an infinite recursive call if
> > the whole libc library (except mcount.c) is compiled with -pg.
> > 
> > The compiler generates a '_mcount' call at the beginning of every
> > function. If '_mcount' calls pthread_getspecific(3) to get the individual
> > buffer, pthread_getspecific() calls '_mcount' again and causes infinite
> > recursion.
> > 
> > NetBSD prevents the infinite recursion by checking a global variable. But
> > this approach also prevents simultaneous calls of '_mcount' in a
> > multi-threaded application, which makes the profiling results slightly
> > wrong.
> > 
> > So I added a pointer to the buffer in `struct pthread' that is accessible
> > via a macro call, the same as pthread_self(3). This approach prevents the
> > infinite recursive call of '_mcount'.

Not calling a libc function for this makes sense.  However I'm not
convinced that accessing `tib_thread' before _rthread_init() has been
called is safe.

I'm not sure what the cleanest way to place the per-thread buffer is;
I'd suggest you ask guenther@ about this.

Maybe the initialization can be done outside of _mcount()?
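
For illustration, the difference between the two lookup paths could be
sketched like this (compiler TLS is used as a stand-in; the actual patch
uses a `struct pthread' member reached via a macro, and the names here
are hypothetical):

	#include <pthread.h>

	struct gmonparam;		/* per-thread profiling buffer */

	/* libc-call lookup: pthread_getspecific() is itself compiled
	 * with -pg, so its _mcount() prologue recurses */
	static pthread_key_t gmonkey;
	#define GP_VIA_LIBC() \
		((struct gmonparam *)pthread_getspecific(gmonkey))

	/* direct per-thread slot: no function call in the fast path,
	 * so _mcount() cannot recurse into itself */
	static _Thread_local struct gmonparam *gp_direct;
	#define GP_DIRECT()	(gp_direct)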

> > I obtained the merging function from NetBSD; it is called in the
> > '_mcleanup' function. The merging function needs to walk through all the
> > individual buffers, so I added an SLIST_ENTRY member in 'struct gmonparam'
> > to make a list of the buffers, and also added '#ifndef _KERNEL' so the
> > SLIST_ENTRY member is not used in the kernel.
> > 
> > But I still use pthread_getspecific(3) because it can call a destructor
> > when a thread is terminated. The individual buffer is moved to a free list
> > for reuse by a new thread.
> 
> Here is a patch for this approach.

I have various comments:

We choose not to use C++ style comments (// comment) in the tree; could you
fix yours?

There are two copies of mcount.c; the other one lies in sys/lib/libkern.
Could you keep them in sync?

gmon.c is only compiled in userland and doesn't need #ifndef _KERNEL

In libc the _MUTEX_LOCK/UNLOCK() macros are also used instead of
calling pthread_mutex* directly.  This might help reduce the differences
between ST and MT paths.

> diff --git a/lib/libc/gmon/gmon.c b/lib/libc/gmon/gmon.c
> index 1ce0a1c289e..6887f4f5987 100644
> --- a/lib/libc/gmon/gmon.c
> +++ b/lib/libc/gmon/gmon.c
> @@ -42,6 +42,16 @@
>  
>  struct gmonparam _gmonparam = { GMON_PROF_OFF };
>  
> +#ifndef _KERNEL
> +#include 
> +
> +SLIST_HEAD(, gmonparam) _gmonfree = SLIST_HEAD_INITIALIZER(_gmonfree);
> +SLIST_HEAD(, gmonparam) _gmoninuse = SLIST_HEAD_INITIALIZER(_gmoninuse);
> +pthread_mutex_t _gmonlock = PTHREAD_MUTEX_INITIALIZER;
> +pthread_key_t _gmonkey;
> +struct gmonparam _gmondummy;
> +#endif
> +
>  static int   s_scale;
>  /* see profil(2) where this is describe (incorrectly) */
>  #define  SCALE_1_TO_10x1L
> @@ -52,6 +62,13 @@ PROTO_NORMAL(moncontrol);
>  PROTO_DEPRECATED(monstartup);
>  static int hertz(void);
>  
> +#ifndef _KERNEL
> +static void _gmon_destructor(void *);
> +struct gmonparam *_gmon_alloc(void);
> +static void _gmon_merge(void);
> +static void _gmon_merge_two(struct gmonparam *, struct gmonparam *);
> +#endif
> +
>  void
>  monstartup(u_long lowpc, u_long highpc)
>  {
> @@ -114,6 +131,11 @@ monstartup(u_long lowpc, u_long highpc)
>   } else
>   s_scale = SCALE_1_TO_1;
>  
> +#ifndef _KERNEL
> + _gmondummy.state = GMON_PROF_BUSY;
> + pthread_key_create(&_gmonkey, _gmon_destructor);
> +#endif
> +
>   moncontrol(1);
>   return;
>  
> @@ -134,6 +156,194 @@ mapfailed:
>  }
>  __strong_alias(_monstartup,monstartup);
>  
> +#ifndef _KERNEL
> +static void
> +_gmon_destructor(void *arg)
> +{
> + struct gmonparam *p = arg, *q, **prev;
> +
> + if (p == &_gmondummy)
> + return;
> +
> + pthread_setspecific(_gmonkey, &_gmondummy);
> +
> + pthread_mutex_lock(&_gmonlock);
> + SLIST_REMOVE(&_gmoninuse, p, gmonparam, next);
> + SLIST_INSERT_HEAD(&_gmonfree, p, next);
> + pthread_mutex_unlock(&_gmonlock);
> +
> + pthread_setspecific(_gmonkey, NULL);
> +}
> +
> +struct gmonparam *
> +_gmon_alloc(void)
> +{
> + void *addr;
> + struct gmonparam *p;
> +
> +