date:20210726

Re: pipex(4): introduce mutexes to protect 'pipex_session' context

2021-07-26 Thread Alexander Bluhm

On Tue, Jul 27, 2021 at 12:30:06AM +0300, Vitaliy Makkoveev wrote:
> > The diff below makes pipex(4) a little bit more parallelization
> > reliable.

> > @@ -2238,10 +2240,6 @@ pipex_mppe_input(struct mbuf *m0, struct
> > coher_cnt &= PIPEX_COHERENCY_CNT_MASK;
> > pktloss = 0;
> > 
> > -   PIPEX_MPPE_DBG((session, LOG_DEBUG, "in coher_cnt=%03x %s%s",
> > -   mppe->coher_cnt, (flushed) ? "[flushed]" : "",
> > -   (encrypt) ? "[encrypt]" : ""));
> > -
> > if (encrypt == 0) {
> > pipex_session_log(session, LOG_DEBUG,
> > "Received unexpected MPPE packet.(no ecrypt)");
> > @@ -2251,6 +2249,12 @@ pipex_mppe_input(struct mbuf *m0, struct
> > /* adjust mbuf */
> > m_adj(m0, sizeof(coher_cnt));
> > 
> > +   mtx_enter(>pxm_mtx);
> > +
> > +   PIPEX_MPPE_DBG((session, LOG_DEBUG, "in coher_cnt=%03x %s%s",
> > +   mppe->coher_cnt, (flushed) ? "[flushed]" : "",
> > +   (encrypt) ? "[encrypt]" : ""));
> > +

Is is really necessary to move the debug print?  encrypt is always
0 now.  I guess the author of this line also wanted to see the
messages with encrypt == 1.  It is only a reading access to
mppe->coher_cnt in a debug print that is not MP safe.

> > +   mtx_enter(>pxs_mtx);
> > +   ccp_id=session->ccp_id;
> > +   session->ccp_id++;
> > +   mtx_leave(>pxs_mtx);

ccp_id=session->ccp_id, there must be spaces around =

> > int16_t stateless:1,/* [I] key change mode */
> > -   resetreq:1, /* [N] */
> > +   resetreq:1, /* [m] */
> > reserved:14;

Can we have differnet locks on bit fields?  I thought the minimum
granulatrity of locked access is an int.

anyway OK bluhm@

Re: ipsec(4): use per-CPU counters for tunnel descriptor block (tdb) statistics

2021-07-26 Thread Alexander Bluhm

On Tue, Jul 27, 2021 at 12:29:55AM +0300, Vitaliy Makkoveev wrote:
> > +tdb_stat_inc(struct tdb *tdb, enum tdb_counters c)
> > +tdb_stat_add(struct tdb *tdb, enum tdb_counters c, uint64_t v)
> > +tdb_stat_pkt(struct tdb *tdb, enum tdb_counters pc, enum tdb_counters bc,

All the other ...stat_inc only have a single _
Better tdbstat_inc() tdbstat_add() tdbstat_pkt() for consistency.

OK bluhm@

Re: ipsec: document locks of some structures

2021-07-26 Thread Vitaliy Makkoveev

anyone?

> On 19 Jul 2021, at 01:21, Vitaliy Makkoveev  wrote:
> 
> ping?
> 
> The diff below updated to the most recent source.
> 
> Index: sys/netinet/ip_ipsp.h
> ===
> RCS file: /cvs/src/sys/netinet/ip_ipsp.h,v
> retrieving revision 1.203
> diff -u -p -r1.203 ip_ipsp.h
> --- sys/netinet/ip_ipsp.h 18 Jul 2021 18:19:22 -  1.203
> +++ sys/netinet/ip_ipsp.h 18 Jul 2021 22:19:28 -
> @@ -45,6 +45,12 @@
> #include 
> #include 
> 
> +/*
> + * Locks used to protect struct members in this file:
> + *   I   Immutable after creation
> + *   N   netlock
> + */
> +
> union sockaddr_union {
>   struct sockaddr sa;
>   struct sockaddr_in  sin;
> @@ -226,37 +232,38 @@ struct ipsec_id {
> };
> 
> struct ipsec_ids {
> - LIST_ENTRY(ipsec_ids)   id_gc_list;
> - RBT_ENTRY(ipsec_ids)id_node_id;
> - RBT_ENTRY(ipsec_ids)id_node_flow;
> - struct ipsec_id *id_local;
> - struct ipsec_id *id_remote;
> - u_int32_t   id_flow;
> - int id_refcount;
> - u_int   id_gc_ttl;
> + LIST_ENTRY(ipsec_ids)   id_gc_list; /* [N] */
> + RBT_ENTRY(ipsec_ids)id_node_id; /* [N] */
> + RBT_ENTRY(ipsec_ids)id_node_flow;   /* [N] */
> + struct ipsec_id *id_local;  /* [I] */
> + struct ipsec_id *id_remote; /* [I] */
> + u_int32_t   id_flow;/* [I] */
> + int id_refcount;/* [N] */
> + u_int   id_gc_ttl;  /* [N] */
> };
> RBT_HEAD(ipsec_ids_flows, ipsec_ids);
> RBT_HEAD(ipsec_ids_tree, ipsec_ids);
> 
> struct ipsec_acquire {
> - union sockaddr_unionipa_addr;
> - u_int32_t   ipa_seq;
> - struct sockaddr_encap   ipa_info;
> - struct sockaddr_encap   ipa_mask;
> + union sockaddr_unionipa_addr;   /* [I] */
> + u_int32_t   ipa_seq;/* [I] */
> + struct sockaddr_encap   ipa_info;   /* [I] */
> + struct sockaddr_encap   ipa_mask;   /* [I] */
>   struct timeout  ipa_timeout;
> - struct ipsec_policy *ipa_policy;
> - struct inpcb*ipa_pcb;
> - TAILQ_ENTRY(ipsec_acquire)  ipa_ipo_next;
> - TAILQ_ENTRY(ipsec_acquire)  ipa_next;
> + struct ipsec_policy *ipa_policy;/* [I] */
> + struct inpcb*ipa_pcb;   /* [I] */
> + TAILQ_ENTRY(ipsec_acquire)  ipa_ipo_next;   /* [N] */
> + TAILQ_ENTRY(ipsec_acquire)  ipa_next;   /* [N] */
> };
> 
> struct ipsec_policy {
>   struct radix_node   ipo_nodes[2];   /* radix tree glue */
> - struct sockaddr_encap   ipo_addr;
> - struct sockaddr_encap   ipo_mask;
> + struct sockaddr_encap   ipo_addr;   /* [I] */
> + struct sockaddr_encap   ipo_mask;   /* [I] */
> 
> - union sockaddr_unionipo_src;/* Local address to use */
> - union sockaddr_unionipo_dst;/* Remote gateway -- if it's 
> zeroed:
> + union sockaddr_unionipo_src;/* [N] Local address to use */
> + union sockaddr_unionipo_dst;/* [N] Remote gateway --
> +  * if it's zeroed:
>* - on output, we try to
>* contact the remote host
>* directly (if needed).
> @@ -267,22 +274,28 @@ struct ipsec_policy {
>* mode was used.
>*/
> 
> - u_int64_t   ipo_last_searched;  /* Timestamp of last 
> lookup */
> -
> - u_int8_tipo_flags;  /* See IPSP_POLICY_* 
> definitions */
> - u_int8_tipo_type;   /* USE/ACQUIRE/... */
> - u_int8_tipo_sproto; /* ESP/AH; if zero, use system 
> dflts */
> - u_int   ipo_rdomain;
> -
> - int ipo_ref_count;
> -
> - struct tdb  *ipo_tdb;   /* Cached entry */
> -
> - struct ipsec_ids*ipo_ids;
> + u_int64_t   ipo_last_searched;  /* [N] Timestamp
> +of last lookup */
> 
> - TAILQ_HEAD(ipo_acquires_head, ipsec_acquire) ipo_acquires; /* List of 
> acquires */
> - TAILQ_ENTRY(ipsec_policy)   ipo_tdb_next;   /* List TDB policies */
> - TAILQ_ENTRY(ipsec_policy)   ipo_list;   /* List of all policies 
> */
> + u_int8_tipo_flags;  /* [N] See IPSP_POLICY_*
> +definitions */
> + u_int8_tipo_type;   /* [N] USE/ACQUIRE/... */

Re: ipsec(4): use per-CPU counters for tunnel descriptor block (tdb) statistics

2021-07-26 Thread Vitaliy Makkoveev

ping

> On 23 Jul 2021, at 03:12, Vitaliy Makkoveev  wrote:
> 
> This makes 'tdb' struct more MP compliant. 'tdb_data' struct became
> unused and was removed.
> 
> Index: sys/net/pfkeyv2_convert.c
> ===
> RCS file: /cvs/src/sys/net/pfkeyv2_convert.c,v
> retrieving revision 1.72
> diff -u -p -r1.72 pfkeyv2_convert.c
> --- sys/net/pfkeyv2_convert.c 14 Jul 2021 22:39:26 -  1.72
> +++ sys/net/pfkeyv2_convert.c 23 Jul 2021 00:01:58 -
> @@ -960,18 +960,21 @@ export_satype(void **p, struct tdb *tdb)
> void
> export_counter(void **p, struct tdb *tdb)
> {
> + uint64_t counters[tdb_ncounters];
>   struct sadb_x_counter *scnt = (struct sadb_x_counter *)*p;
> 
> + counters_read(tdb->tdb_counters, counters, tdb_ncounters);
> +
>   scnt->sadb_x_counter_len = sizeof(struct sadb_x_counter) /
>   sizeof(uint64_t);
>   scnt->sadb_x_counter_pad = 0;
> - scnt->sadb_x_counter_ipackets = tdb->tdb_ipackets;
> - scnt->sadb_x_counter_opackets = tdb->tdb_opackets;
> - scnt->sadb_x_counter_ibytes = tdb->tdb_ibytes;
> - scnt->sadb_x_counter_obytes = tdb->tdb_obytes;
> - scnt->sadb_x_counter_idrops = tdb->tdb_idrops;
> - scnt->sadb_x_counter_odrops = tdb->tdb_odrops;
> - scnt->sadb_x_counter_idecompbytes = tdb->tdb_idecompbytes;
> - scnt->sadb_x_counter_ouncompbytes = tdb->tdb_ouncompbytes;
> + scnt->sadb_x_counter_ipackets = counters[tdb_ipackets];
> + scnt->sadb_x_counter_opackets = counters[tdb_opackets];
> + scnt->sadb_x_counter_ibytes = counters[tdb_ibytes];
> + scnt->sadb_x_counter_obytes = counters[tdb_obytes];
> + scnt->sadb_x_counter_idrops = counters[tdb_idrops];
> + scnt->sadb_x_counter_odrops = counters[tdb_odrops];
> + scnt->sadb_x_counter_idecompbytes = counters[tdb_idecompbytes];
> + scnt->sadb_x_counter_ouncompbytes = counters[tdb_ouncompbytes];
>   *p += sizeof(struct sadb_x_counter);
> }
> Index: sys/netinet/ip_ah.c
> ===
> RCS file: /cvs/src/sys/netinet/ip_ah.c,v
> retrieving revision 1.151
> diff -u -p -r1.151 ip_ah.c
> --- sys/netinet/ip_ah.c   18 Jul 2021 14:38:20 -  1.151
> +++ sys/netinet/ip_ah.c   23 Jul 2021 00:01:58 -
> @@ -609,7 +609,7 @@ ah_input(struct mbuf *m, struct tdb *tdb
>   /* Update the counters. */
>   ibytes = (m->m_pkthdr.len - skip - hl * sizeof(u_int32_t));
>   tdb->tdb_cur_bytes += ibytes;
> - tdb->tdb_ibytes += ibytes;
> + tdb_stat_add(tdb, tdb_ibytes, ibytes);
>   ahstat_add(ahs_ibytes, ibytes);
> 
>   /* Hard expiration. */
> Index: sys/netinet/ip_esp.c
> ===
> RCS file: /cvs/src/sys/netinet/ip_esp.c,v
> retrieving revision 1.169
> diff -u -p -r1.169 ip_esp.c
> --- sys/netinet/ip_esp.c  18 Jul 2021 14:38:20 -  1.169
> +++ sys/netinet/ip_esp.c  23 Jul 2021 00:01:58 -
> @@ -426,7 +426,7 @@ esp_input(struct mbuf *m, struct tdb *td
>   /* Update the counters */
>   ibytes = m->m_pkthdr.len - skip - hlen - alen;
>   tdb->tdb_cur_bytes += ibytes;
> - tdb->tdb_ibytes += ibytes;
> + tdb_stat_add(tdb, tdb_ibytes, ibytes);
>   espstat_add(esps_ibytes, ibytes);
> 
>   /* Hard expiration */
> Index: sys/netinet/ip_ipcomp.c
> ===
> RCS file: /cvs/src/sys/netinet/ip_ipcomp.c,v
> retrieving revision 1.71
> diff -u -p -r1.71 ip_ipcomp.c
> --- sys/netinet/ip_ipcomp.c   8 Jul 2021 21:07:19 -   1.71
> +++ sys/netinet/ip_ipcomp.c   23 Jul 2021 00:01:58 -
> @@ -215,7 +215,7 @@ ipcomp_input_cb(struct tdb *tdb, struct 
>   /* update the counters */
>   ibytes = m->m_pkthdr.len - (skip + hlen);
>   tdb->tdb_cur_bytes += ibytes;
> - tdb->tdb_ibytes += ibytes;
> + tdb_stat_add(tdb, tdb_ibytes, ibytes);
>   ipcompstat_add(ipcomps_ibytes, ibytes);
> 
>   /* Hard expiration */
> Index: sys/netinet/ip_ipsp.c
> ===
> RCS file: /cvs/src/sys/netinet/ip_ipsp.c,v
> retrieving revision 1.242
> diff -u -p -r1.242 ip_ipsp.c
> --- sys/netinet/ip_ipsp.c 19 Jul 2021 14:49:55 -  1.242
> +++ sys/netinet/ip_ipsp.c 23 Jul 2021 00:01:58 -
> @@ -830,6 +830,9 @@ tdb_alloc(u_int rdomain)
>   tdbp->tdb_rdomain = rdomain;
>   tdbp->tdb_rdomain_post = rdomain;
> 
> + /* Initialize counters. */
> + tdbp->tdb_counters = counters_alloc(tdb_ncounters);
> +
>   /* Initialize timeouts. */
>   timeout_set_proc(>tdb_timer_tmo, tdb_timeout, tdbp);
>   timeout_set_proc(>tdb_first_tmo, tdb_firstuse, tdbp);
> @@ -881,6 +884,8 @@ tdb_free(struct tdb *tdbp)
> 
>   if ((tdbp->tdb_inext) && (tdbp->tdb_inext->tdb_onext == tdbp))
>   tdbp->tdb_inext->tdb_onext = NULL;
> +
> + counters_free(tdbp->tdb_counters, tdb_ncounters);
> 
>

Re: pipex(4): introduce mutexes to protect 'pipex_session' context

2021-07-26 Thread Vitaliy Makkoveev

ping

> On 22 Jul 2021, at 01:56, Vitaliy Makkoveev  wrote:
> 
> With bluhm@'s diff for parallel forwarding pipex(4) could be accessed in
> parallel through (*ifp->if_input)() -> ether_input() ->
> pipex_pppoe_input(). PPPOE pipex(4) sessions are mostly immutable except
> MPPE crypt. 
> 
> The diff below makes pipex(4) a little bit more parallelization
> reliable.
> 
> The new per-session `pxs_mtx' mutex(9) used to protect session's
> `ccp-id' which is incremented each time we send CCP reset-request.
> 
> The new per-MPPE context `pxm_mtx' mutex(9) used to protect MPPE
> context. We have two of them: one for the input and one for output path.
> With bluhm@'s diff those paths are mostly serialized, except the case
> when we send CCP reset-request. I don't see the reason to other
> pipex_pppoe_input() threads spin while we perform pipex_ccp_output(). 
> 
> Where is no lock order limitations because those new mutex(9)'es newer
> held together.
> 
> I'm not PPPOE user and I'll be happy if such user tests this diff in
> real workflow. 
> 
> Index: sys/net/pipex.c
> ===
> RCS file: /cvs/src/sys/net/pipex.c,v
> retrieving revision 1.134
> diff -u -p -r1.134 pipex.c
> --- sys/net/pipex.c   20 Jul 2021 16:44:55 -  1.134
> +++ sys/net/pipex.c   21 Jul 2021 22:27:47 -
> @@ -263,6 +263,7 @@ pipex_init_session(struct pipex_session 
> 
>   /* prepare a new session */
>   session = pool_get(_session_pool, PR_WAITOK | PR_ZERO);
> + mtx_init(>pxs_mtx, IPL_SOFTNET);
>   session->state = PIPEX_STATE_INITIAL;
>   session->protocol = req->pr_protocol;
>   session->session_id = req->pr_session_id;
> @@ -2099,6 +2100,7 @@ pipex_mppe_init(struct pipex_mppe *mppe,
> u_char *master_key, int has_oldkey)
> {
>   memset(mppe, 0, sizeof(struct pipex_mppe));
> + mtx_init(>pxm_mtx, IPL_SOFTNET);
>   if (stateless)
>   mppe->stateless = 1;
>   if (has_oldkey)
> @@ -2238,10 +2240,6 @@ pipex_mppe_input(struct mbuf *m0, struct
>   coher_cnt &= PIPEX_COHERENCY_CNT_MASK;
>   pktloss = 0;
> 
> - PIPEX_MPPE_DBG((session, LOG_DEBUG, "in coher_cnt=%03x %s%s",
> - mppe->coher_cnt, (flushed) ? "[flushed]" : "",
> - (encrypt) ? "[encrypt]" : ""));
> -
>   if (encrypt == 0) {
>   pipex_session_log(session, LOG_DEBUG,
>   "Received unexpected MPPE packet.(no ecrypt)");
> @@ -2251,6 +2249,12 @@ pipex_mppe_input(struct mbuf *m0, struct
>   /* adjust mbuf */
>   m_adj(m0, sizeof(coher_cnt));
> 
> + mtx_enter(>pxm_mtx);
> +
> + PIPEX_MPPE_DBG((session, LOG_DEBUG, "in coher_cnt=%03x %s%s",
> + mppe->coher_cnt, (flushed) ? "[flushed]" : "",
> + (encrypt) ? "[encrypt]" : ""));
> +
>   /*
>* L2TP data session may be used without sequencing, PPP frames may
>* arrive in disorder.  The 'coherency counter' of MPPE detects such
> @@ -2274,6 +2278,7 @@ pipex_mppe_input(struct mbuf *m0, struct
>   pipex_session_log(session, LOG_DEBUG,
>   "Workaround the out-of-sequence PPP framing 
> problem: "
>   "%d => %d", mppe->coher_cnt, coher_cnt);
> + mtx_leave(>pxm_mtx);
>   goto drop;
>   }
>   rewind = 1;
> @@ -2305,10 +2310,19 @@ pipex_mppe_input(struct mbuf *m0, struct
>   coher_cnt &= PIPEX_COHERENCY_CNT_MASK;
>   mppe->coher_cnt = coher_cnt;
>   } else if (mppe->coher_cnt != coher_cnt) {
> + int ccp_id;
> +
> + mtx_leave(>pxm_mtx);
> +
>   /* Send CCP ResetReq */
>   PIPEX_DBG((session, LOG_DEBUG, "CCP SendResetReq"));
> - pipex_ccp_output(session, CCP_RESETREQ,
> - session->ccp_id++);
> +
> + mtx_enter(>pxs_mtx);
> + ccp_id=session->ccp_id;
> + session->ccp_id++;
> + mtx_leave(>pxs_mtx);
> +
> + pipex_ccp_output(session, CCP_RESETREQ, ccp_id);
>   goto drop;
>   }
>   if ((coher_cnt & 0xff) == 0xff) {
> @@ -2336,6 +2350,9 @@ pipex_mppe_input(struct mbuf *m0, struct
>   mppe->coher_cnt++;
>   mppe->coher_cnt &= PIPEX_COHERENCY_CNT_MASK;
>   }
> +
> + mtx_leave(>pxm_mtx);
> +
>   if (m0->m_pkthdr.len < PIPEX_PPPMINLEN)
>   goto drop;
> 
> @@ -2387,6 +2404,8 @@ pipex_mppe_output(struct mbuf *m0, struc
>   flushed = 0;
>   encrypt = 1;
> 
> + mtx_enter(>pxm_mtx);
> +
>   if (mppe->stateless != 0) {
>   flushed = 1;
>   mppe_key_change(mppe);
> @@ -2429,6 +2448,8 @@ pipex_mppe_output(struct mbuf *m0, struc
>   pipex_mppe_crypt(mppe, len, cp, cp);
>   }
> 
> + mtx_leave(>pxm_mtx);
>

Re: pf icmp reflect

2021-07-26 Thread Klemens Nanni

On Mon, Jul 26, 2021 at 06:41:42PM +0200, Alexander Bluhm wrote:
> The mbuf header cleanup I added in revision 1.173 of ip_icmp.c is
> too strict.  ICMP error packets generated by pf are not passed
> immediately, but may be blocked.  Preserve PF_TAG_GENERATED flag
> in icmp_reflect() and icmp6_reflect().

OK kn

Re: pf icmp reflect

2021-07-26 Thread Patrick Wildt

On Mon, Jul 26, 2021 at 06:41:42PM +0200, Alexander Bluhm wrote:
> Hi,
> 
> The mbuf header cleanup I added in revision 1.173 of ip_icmp.c is
> too strict.  ICMP error packets generated by pf are not passed
> immediately, but may be blocked.  Preserve PF_TAG_GENERATED flag
> in icmp_reflect() and icmp6_reflect().
> 
> ok?

While I do prefer uint8_t, the struct member is defined as u_int8_t, so
I guess for consistency we can use that.

ok patrick@

> bluhm
> 
> Index: netinet/ip_icmp.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_icmp.c,v
> retrieving revision 1.186
> diff -u -p -r1.186 ip_icmp.c
> --- netinet/ip_icmp.c 30 Mar 2021 08:37:10 -  1.186
> +++ netinet/ip_icmp.c 26 Jul 2021 14:10:37 -
> @@ -691,6 +691,7 @@ icmp_reflect(struct mbuf *m, struct mbuf
>   struct rtentry *rt = NULL;
>   int optlen = (ip->ip_hl << 2) - sizeof(struct ip);
>   u_int rtableid;
> + u_int8_t pfflags;
>  
>   if (!in_canforward(ip->ip_src) &&
>   ((ip->ip_src.s_addr & IN_CLASSA_NET) !=
> @@ -704,8 +705,10 @@ icmp_reflect(struct mbuf *m, struct mbuf
>   return (ELOOP);
>   }
>   rtableid = m->m_pkthdr.ph_rtableid;
> + pfflags = m->m_pkthdr.pf.flags;
>   m_resethdr(m);
>   m->m_pkthdr.ph_rtableid = rtableid;
> + m->m_pkthdr.pf.flags = pfflags & PF_TAG_GENERATED;
>  
>   /*
>* If the incoming packet was addressed directly to us,
> Index: netinet6/icmp6.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/icmp6.c,v
> retrieving revision 1.235
> diff -u -p -r1.235 icmp6.c
> --- netinet6/icmp6.c  10 Mar 2021 10:21:49 -  1.235
> +++ netinet6/icmp6.c  26 Jul 2021 15:42:33 -
> @@ -1052,6 +1052,7 @@ icmp6_reflect(struct mbuf **mp, size_t o
>   struct in6_addr t, *src = NULL;
>   struct sockaddr_in6 sa6_src, sa6_dst;
>   u_int rtableid;
> + u_int8_t pfflags;
>  
>   CTASSERT(sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) <= MHLEN);
>  
> @@ -1069,8 +1070,10 @@ icmp6_reflect(struct mbuf **mp, size_t o
>   return (ELOOP);
>   }
>   rtableid = m->m_pkthdr.ph_rtableid;
> + pfflags = m->m_pkthdr.pf.flags;
>   m_resethdr(m);
>   m->m_pkthdr.ph_rtableid = rtableid;
> + m->m_pkthdr.pf.flags = pfflags & PF_TAG_GENERATED;
>  
>   /*
>* If there are extra headers between IPv6 and ICMPv6, strip
>

pf icmp reflect

2021-07-26 Thread Alexander Bluhm

Hi,

The mbuf header cleanup I added in revision 1.173 of ip_icmp.c is
too strict.  ICMP error packets generated by pf are not passed
immediately, but may be blocked.  Preserve PF_TAG_GENERATED flag
in icmp_reflect() and icmp6_reflect().

ok?

bluhm

Index: netinet/ip_icmp.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_icmp.c,v
retrieving revision 1.186
diff -u -p -r1.186 ip_icmp.c
--- netinet/ip_icmp.c   30 Mar 2021 08:37:10 -  1.186
+++ netinet/ip_icmp.c   26 Jul 2021 14:10:37 -
@@ -691,6 +691,7 @@ icmp_reflect(struct mbuf *m, struct mbuf
struct rtentry *rt = NULL;
int optlen = (ip->ip_hl << 2) - sizeof(struct ip);
u_int rtableid;
+   u_int8_t pfflags;
 
if (!in_canforward(ip->ip_src) &&
((ip->ip_src.s_addr & IN_CLASSA_NET) !=
@@ -704,8 +705,10 @@ icmp_reflect(struct mbuf *m, struct mbuf
return (ELOOP);
}
rtableid = m->m_pkthdr.ph_rtableid;
+   pfflags = m->m_pkthdr.pf.flags;
m_resethdr(m);
m->m_pkthdr.ph_rtableid = rtableid;
+   m->m_pkthdr.pf.flags = pfflags & PF_TAG_GENERATED;
 
/*
 * If the incoming packet was addressed directly to us,
Index: netinet6/icmp6.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/icmp6.c,v
retrieving revision 1.235
diff -u -p -r1.235 icmp6.c
--- netinet6/icmp6.c10 Mar 2021 10:21:49 -  1.235
+++ netinet6/icmp6.c26 Jul 2021 15:42:33 -
@@ -1052,6 +1052,7 @@ icmp6_reflect(struct mbuf **mp, size_t o
struct in6_addr t, *src = NULL;
struct sockaddr_in6 sa6_src, sa6_dst;
u_int rtableid;
+   u_int8_t pfflags;
 
CTASSERT(sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) <= MHLEN);
 
@@ -1069,8 +1070,10 @@ icmp6_reflect(struct mbuf **mp, size_t o
return (ELOOP);
}
rtableid = m->m_pkthdr.ph_rtableid;
+   pfflags = m->m_pkthdr.pf.flags;
m_resethdr(m);
m->m_pkthdr.ph_rtableid = rtableid;
+   m->m_pkthdr.pf.flags = pfflags & PF_TAG_GENERATED;
 
/*
 * If there are extra headers between IPv6 and ICMPv6, strip

Re: sleep.3: miscellaneous cleanup and rewrites

2021-07-26 Thread Todd C . Miller

On Mon, 26 Jul 2021 07:55:00 -0500, Scott Cheloha wrote:

> We don't know if it is still in the wild or not.
>
> There are lots of libc implementations.  We can't account for all of
> them.

Right.  We don't know who is still using a signal or itimer-based
sleep.

> > Being proscriptive in OpenBSD manual pages isn't going to stop someone
> > from creating the precise problem you describe in some other body of
> > code.  Except, you are saying they don't create that problem.
>
> Right, we can't prevent it, which is why I wanted to say "beware".

POSIX doesn't allow a SIGALRM-based implementation when threads are
used anyway.  A future version of POSIX will likely make a SIGALRM-based
implementation non-conforming event for single-threaded processes.

> > So why foam at the mouth over it?
>
> Because up until this very moment we referenced the historical
> implementation approach in the manpage.

The 4.4BSD man page had even more detail about this :-)

> Anyway, updated patch.  No mention of SIGALRM or alarm(3) except in
> the History section.

I think this is OK for now.

> Still wondering whether we need an Errors section to mention that
> sleep(3) can set errno.

POSIX doesn't define any errors.  Of the errors returned by nanosleep,
only EINTR is really possible.  I guess we could save & restore
errno in sleep.c but I don't see any implementations that actually
do that.  I only checked illumos, glibc and musl though.

 - todd

iwx(4) firmware update to -63

2021-07-26 Thread Stefan Sperling

This patch implements support for new iwx(4) -63 firmware images
available in the iwx-firmware-20210512 package (via fw_update).

Please test this patch and report back. If testing works out well
then I will commit these changes to CVS incrementally. Thanks!

For those testing my iwx Tx agg patch: Please remove that patch first.
Tx agg will come back later. Getting new firmware images to work is more
important right now because these images contain fixes for fragattacks.

diff 54a415eecf0903b6663209706a0fc6e2e9d40044 refs/heads/iwxfw
blob - 48bf14d7a8a421c6f1bf2b4bfc6c1792396b5400
blob + 91be6f7f188e6d6754571b7dc1f4fb38c72546da
--- sys/dev/pci/if_iwx.c
+++ sys/dev/pci/if_iwx.c
@@ -232,6 +232,7 @@ const int iwx_mcs2ridx[] = {
 };
 
 uint8_tiwx_lookup_cmd_ver(struct iwx_softc *, uint8_t, uint8_t);
+uint8_tiwx_lookup_notif_ver(struct iwx_softc *, uint8_t, uint8_t);
 intiwx_is_mimo_ht_plcp(uint8_t);
 intiwx_is_mimo_mcs(int);
 intiwx_store_cscheme(struct iwx_softc *, uint8_t *, size_t);
@@ -297,6 +298,8 @@ int iwx_nic_rx_init(struct iwx_softc *);
 intiwx_nic_init(struct iwx_softc *);
 intiwx_enable_txq(struct iwx_softc *, int, int, int, int);
 void   iwx_post_alive(struct iwx_softc *);
+intiwx_schedule_session_protection(struct iwx_softc *, struct iwx_node *,
+   uint32_t);
 void   iwx_protect_session(struct iwx_softc *, struct iwx_node *, uint32_t,
uint32_t);
 void   iwx_unprotect_session(struct iwx_softc *, struct iwx_node *);
@@ -356,6 +359,10 @@ void   iwx_rx_tx_cmd(struct iwx_softc *, struct 
iwx_rx_p
 void   iwx_rx_bmiss(struct iwx_softc *, struct iwx_rx_packet *,
struct iwx_rx_data *);
 intiwx_binding_cmd(struct iwx_softc *, struct iwx_node *, uint32_t);
+intiwx_phy_ctxt_cmd_uhb_v3(struct iwx_softc *, struct iwx_phy_ctxt *, 
uint8_t,
+   uint8_t, uint32_t);
+intiwx_phy_ctxt_cmd_v3(struct iwx_softc *, struct iwx_phy_ctxt *, uint8_t,
+   uint8_t, uint32_t);
 intiwx_phy_ctxt_cmd_uhb(struct iwx_softc *, struct iwx_phy_ctxt *, uint8_t,
uint8_t, uint32_t, uint32_t);
 intiwx_phy_ctxt_cmd(struct iwx_softc *, struct iwx_phy_ctxt *, uint8_t,
@@ -387,7 +394,16 @@ intiwx_add_sta_cmd(struct iwx_softc *, struct 
iwx_nod
 intiwx_add_aux_sta(struct iwx_softc *);
 intiwx_rm_sta_cmd(struct iwx_softc *, struct iwx_node *);
 intiwx_fill_probe_req(struct iwx_softc *, struct iwx_scan_probe_req *);
+intiwx_config_umac_scan_reduced(struct iwx_softc *);
 intiwx_config_umac_scan(struct iwx_softc *);
+uint16_t iwx_scan_umac_flags_v2(struct iwx_softc *, int);
+void   iwx_scan_umac_dwell_v10(struct iwx_softc *,
+   struct iwx_scan_general_params_v10 *, int);
+void   iwx_scan_umac_fill_general_p_v10(struct iwx_softc *,
+   struct iwx_scan_general_params_v10 *, uint16_t, int);
+void   iwx_scan_umac_fill_ch_p_v6(struct iwx_softc *,
+   struct iwx_scan_channel_params_v6 *, uint32_t, int, int);
+intiwx_umac_scan_v14(struct iwx_softc *, int);
 intiwx_umac_scan(struct iwx_softc *, int);
 void   iwx_mcc_update(struct iwx_softc *, struct iwx_mcc_chub_notif *);
 uint8_tiwx_ridx2rate(struct ieee80211_rateset *, int);
@@ -490,6 +506,21 @@ iwx_lookup_cmd_ver(struct iwx_softc *sc, uint8_t grp, 
return IWX_FW_CMD_VER_UNKNOWN;
 }
 
+uint8_t
+iwx_lookup_notif_ver(struct iwx_softc *sc, uint8_t grp, uint8_t cmd)
+{
+   const struct iwx_fw_cmd_version *entry;
+   int i;
+
+   for (i = 0; i < sc->n_cmd_versions; i++) {
+   entry = >cmd_versions[i];
+   if (entry->group == grp && entry->cmd == cmd)
+   return entry->notif_ver;
+   }
+
+   return IWX_FW_CMD_VER_UNKNOWN;
+}
+
 int
 iwx_is_mimo_ht_plcp(uint8_t ht_plcp)
 {
@@ -924,7 +955,7 @@ iwx_firmware_store_section(struct iwx_softc *sc, enum 
 
 #define IWX_DEFAULT_SCAN_CHANNELS  40
 /* Newer firmware might support more channels. Raise this value if needed. */
-#define IWX_MAX_SCAN_CHANNELS  52 /* as of 8265-34 firmware image */
+#define IWX_MAX_SCAN_CHANNELS  67 /* as of iwx-cc-a0-62 firmware */
 
 struct iwx_tlv_calib_data {
uint32_t ucode_type;
@@ -1304,6 +1335,8 @@ iwx_read_firmware(struct iwx_softc *sc)
break;
 
case IWX_UCODE_TLV_FW_FSEQ_VERSION:
+   case IWX_UCODE_TLV_PHY_INTEGRATION_VERSION:
+   case IWX_UCODE_TLV_FW_NUM_STATIONS:
break;
 
/* undocumented TLVs found in iwx-cc-a0-46 image */
@@ -1317,6 +1350,13 @@ iwx_read_firmware(struct iwx_softc *sc)
case 0x102:
break;
 
+   case IWX_UCODE_TLV_TYPE_DEBUG_INFO:
+   case IWX_UCODE_TLV_TYPE_BUFFER_ALLOCATION:
+   case IWX_UCODE_TLV_TYPE_HCMD:
+   case IWX_UCODE_TLV_TYPE_REGIONS:
+   case IWX_UCODE_TLV_TYPE_TRIGGERS:
+   break;
+

Re: bgpd refactor struct prefix

2021-07-26 Thread Sebastian Benoit

Claudio Jeker(cje...@diehard.n-r-g.com) on 2021.07.14 09:33:19 +0200:
> On Tue, Jun 29, 2021 at 12:00:24PM +0200, Claudio Jeker wrote:
> > This diff moves the rib_entry pointer re into the union to safe some
> > space. For add-path I need to add a few more u_int32_t and that would
> > blow the size of struct prefix from 128 to 132 bytes. malloc would round
> > that up to 256bytes and that is bad for the struct that is allocted in
> > millions in bgpd.
> > 
> > To make this somewhat save introduce PREFIX_FLAG_ADJOUT to mark prefixes
> > that live in the adj-rib-out. Those prefixes can not access the re pointer
> > also use a wrapper prefix_re() which returns the re pointer or NULL.
> > Also add some assertions to make sure that prefixes don't end up in the
> > wrong tree.
> > 
> > This change shrinks the struct back to 120bytes and gives me the space
> > needed for add-path.
> > 
> > Please test
> 

reads ok, and works for me.

I don't like all those fatals. We are bound to overlook something and hit.
But of course those are hard to work around.

> -- 
> :wq Claudio
> 
> Index: rde.c
> ===
> RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
> retrieving revision 1.530
> diff -u -p -r1.530 rde.c
> --- rde.c 25 Jun 2021 09:25:48 -  1.530
> +++ rde.c 29 Jun 2021 08:28:33 -
> @@ -2298,6 +2298,7 @@ rde_dump_rib_as(struct prefix *p, struct
>   struct ibuf *wbuf;
>   struct attr *a;
>   struct nexthop  *nexthop;
> + struct rib_entry*re;
>   void*bp;
>   time_t   staletime;
>   size_t   aslen;
> @@ -2330,7 +2331,8 @@ rde_dump_rib_as(struct prefix *p, struct
>   rib.origin = asp->origin;
>   rib.validation_state = p->validation_state;
>   rib.flags = 0;
> - if (p->re != NULL && p->re->active == p)
> + re = prefix_re(p);
> + if (re != NULL && re->active == p)
>   rib.flags |= F_PREF_ACTIVE;
>   if (!prefix_peer(p)->conf.ebgp)
>   rib.flags |= F_PREF_INTERNAL;
> @@ -2412,14 +2414,16 @@ static void
>  rde_dump_filter(struct prefix *p, struct ctl_show_rib_request *req)
>  {
>   struct rde_aspath   *asp;
> + struct rib_entry*re;
>  
>   if (!rde_match_peer(prefix_peer(p), >neighbor))
>   return;
>  
>   asp = prefix_aspath(p);
> + re = prefix_re(p);
>   if (asp == NULL)/* skip pending withdraw in Adj-RIB-Out */
>   return;
> - if ((req->flags & F_CTL_ACTIVE) && p->re->active != p)
> + if ((req->flags & F_CTL_ACTIVE) && re != NULL && re->active != p)
>   return;
>   if ((req->flags & F_CTL_INVALID) &&
>   (asp->flags & F_ATTR_PARSE_ERR) == 0)
> Index: rde.h
> ===
> RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
> retrieving revision 1.240
> diff -u -p -r1.240 rde.h
> --- rde.h 17 Jun 2021 16:05:26 -  1.240
> +++ rde.h 29 Jun 2021 08:33:18 -v
> @@ -56,10 +56,10 @@ struct rib {
>   struct filter_head  *in_rules_tmp;
>   u_int   rtableid;
>   u_int   rtableid_tmp;
> + enum reconf_action  state, fibstate;
> + u_int16_t   id;
>   u_int16_t   flags;
>   u_int16_t   flags_tmp;
> - u_int16_t   id;
> - enum reconf_action  state, fibstate;
>  };
>  
>  #define RIB_ADJ_IN   0
> @@ -317,13 +317,13 @@ struct prefix {
>   union {
>   struct {
>   LIST_ENTRY(prefix)   rib, nexthop;
> + struct rib_entry*re;
>   } list;
>   struct {
>   RB_ENTRY(prefix) index, update;
>   } tree;
>   }entry;
>   struct pt_entry *pt;
> - struct rib_entry*re;
>   struct rde_aspath   *aspath;
>   struct rde_community*communities;
>   struct rde_peer *peer;
> @@ -338,6 +338,7 @@ struct prefix {
>  #define  PREFIX_FLAG_DEAD0x04/* locked but removed */
>  #define  PREFIX_FLAG_STALE   0x08/* stale entry (graceful 
> reload) */
>  #define  PREFIX_FLAG_MASK0x0f/* mask for the prefix types */
> +#define  PREFIX_FLAG_ADJOUT  0x10/* prefix is in the adj-out rib 
> */
>  #define  PREFIX_NEXTHOP_LINKED   0x40/* prefix is linked onto 
> nexthop list */
>  #define  PREFIX_FLAG_LOCKED  0x80/* locked by rib walker */
>  };
> @@ -637,6 +638,14 @@ static inline u_int8_t
>  prefix_vstate(struct prefix *p)
>  {
>   return (p->validation_state & ROA_MASK);
> +}
> +
> +static inline struct rib_entry *
> +prefix_re(struct prefix *p)
> +{
> + if (p->flags & PREFIX_FLAG_ADJOUT)
> + return

Re: [please test] amd64: schedule clock interrupts against system clock

2021-07-26 Thread Scott Cheloha

On Fri, Jun 25, 2021 at 06:09:27PM -0500, Scott Cheloha wrote:
> 
> [...]

1 month bump.  I really appreciate the tests I've gotten so far, thank
you.

The test reports I've received suggest that suspend/resume is not
broken by the patch.  The reports also don't show any other big ticket
regressions.  I have multiple reports of successful make builds, ports
builds, and some performance tests.

Two things I'm still unsure about are hibernate/unhibernate and
behavior atop a hypervisor.

Nobody who has submitted a test report has a machine with working
hibernate/unhibernate.  I myself don't have such a machine.  If
someone with known working hibernate support would test the patch out
I'd really appreciate it.

I also have no test reports on OpenBSD guests.  If someone could test
this on a KVM guest (or ESXi, or Hyper-V), I'd really appreciate it.

Then we can move on to other platforms.

Re: sleep.3: miscellaneous cleanup and rewrites

2021-07-26 Thread Scott Cheloha

On Sun, Jul 25, 2021 at 07:41:47PM -0600, Theo de Raadt wrote:
> Scott Cheloha  wrote:
> 
> > Given this, I want to tell the reader, roughly:
> > 
> > "hey!  it's plausible there is a SIGALRM-based sleep() implementation
> >  using still floating around out there in the wild.  If you find one,
> >  you'll want to avoid using it because there are unfixable bugs in
> >  such an implementation.  Maybe use nanosleep() instead.  If you *do*
> >  use it, just know that it will behave differently from OpenBSD's
> >  sleep() in some corner cases."
> > 
> > But if you really think there is no point in mentioning that, and
> > others agree with you, then we won't mention it.
> 
> I don't think the manual pages need to be proscriptive about a concern
> which doesn't occur in the wild.

We don't know if it is still in the wild or not.

There are lots of libc implementations.  We can't account for all of
them.

> Being proscriptive in OpenBSD manual pages isn't going to stop someone
> from creating the precise problem you describe in some other body of
> code.  Except, you are saying they don't create that problem.

Right, we can't prevent it, which is why I wanted to say "beware".

> So why foam at the mouth over it?

Because up until this very moment we referenced the historical
implementation approach in the manpage.

--

Anyway, updated patch.  No mention of SIGALRM or alarm(3) except in
the History section.

Still wondering whether we need an Errors section to mention that
sleep(3) can set errno.

Otherwise I think this is about done.

Index: sleep.3
===
RCS file: /cvs/src/lib/libc/gen/sleep.3,v
retrieving revision 1.16
diff -u -p -r1.16 sleep.3
--- sleep.3 8 Feb 2020 01:09:57 -   1.16
+++ sleep.3 26 Jul 2021 12:52:23 -
@@ -32,7 +32,7 @@
 .Os
 .Sh NAME
 .Nm sleep
-.Nd suspend process execution for interval measured in seconds
+.Nd suspend execution for an interval of seconds
 .Sh SYNOPSIS
 .In unistd.h
 .Ft unsigned int
@@ -40,41 +40,38 @@
 .Sh DESCRIPTION
 The
 .Fn sleep
-function suspends execution of the calling process until either
+function suspends execution until at least the given number of
 .Fa seconds
-seconds have elapsed or a signal is delivered to the process and its
-action is to invoke a signal-catching function or to terminate the
-process.
-The suspension time may be longer than requested due to the
-scheduling of other activity by the system.
+have elapsed or an unmasked signal is delivered to the calling thread.
 .Pp
-This function is implemented using
-.Xr nanosleep 2
-by pausing for
-.Fa seconds
-seconds or until a signal occurs.
-Consequently, in this implementation,
-sleeping has no effect on the state of process timers,
-and there is no special handling for
-.Dv SIGALRM .
+This version of
+.Fn sleep
+is implemented with
+.Xr nanosleep 2 ,
+so delivery of any unmasked signal will terminate the sleep early,
+even if
+.Dv SA_RESTART
+is set with
+.Xr sigaction 2
+for the interrupting signal.
 .Sh RETURN VALUES
-If the
+If
 .Fn sleep
-function returns because the requested time has elapsed, the value
-returned will be zero.
-If the
+sleeps for the full count of
+.Fa seconds
+it returns 0.
+Otherwise,
 .Fn sleep
-function returns due to the delivery of a signal, the value returned
-will be the unslept amount (the request time minus the time actually
-slept) in seconds.
+returns the number of seconds remaining from the original request.
 .Sh SEE ALSO
 .Xr sleep 1 ,
-.Xr nanosleep 2
+.Xr nanosleep 2 ,
+.Xr sigaction 2
 .Sh STANDARDS
 The
 .Fn sleep
 function conforms to
-.St -p1003.1-90 .
+.St -p1003.1-2008 .
 .Sh HISTORY
 A
 .Fn sleep

Re: ix(4): fix Rx hash type

2021-07-26 Thread Jonathan Matthew

On Wed, Jul 14, 2021 at 01:46:37PM +0800, Kevin Lo wrote:
> Hi,
> 
> The diff below fixes Rx desc RSS type.  This matches what Linux and FreeBSD 
> do.
> ok?

ok jmatthew@

> 
> Index: sys/dev/pci/if_ix.c
> ===
> RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
> retrieving revision 1.178
> diff -u -p -u -p -r1.178 if_ix.c
> --- sys/dev/pci/if_ix.c   22 Dec 2020 23:25:37 -  1.178
> +++ sys/dev/pci/if_ix.c   14 Jul 2021 05:41:08 -
> @@ -3071,7 +3071,8 @@ ixgbe_rxeof(struct rx_ring *rxr)
>  
>   i = rxr->next_to_check;
>   while (if_rxr_inuse(>rx_ring) > 0) {
> - uint32_t hash, hashtype;
> + uint32_t hash;
> + uint16_t hashtype;
>  
>   bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
>   dsize * i, dsize, BUS_DMASYNC_POSTREAD);
> @@ -3101,7 +3102,8 @@ ixgbe_rxeof(struct rx_ring *rxr)
>   vtag = letoh16(rxdesc->wb.upper.vlan);
>   eop = ((staterr & IXGBE_RXD_STAT_EOP) != 0);
>   hash = lemtoh32(>wb.lower.hi_dword.rss);
> - hashtype = lemtoh32(>wb.lower.lo_dword.data) &
> + hashtype =
> + lemtoh16(>wb.lower.lo_dword.hs_rss.pkt_info) &
>   IXGBE_RXDADV_RSSTYPE_MASK;
>  
>   if (staterr & IXGBE_RXDADV_ERR_FRAME_ERR_MASK) {
>

Driver and kernel recognition for intel AX210 wifi chip

2021-07-26 Thread Alex Beakes

I figured it would be more appropriate mailing tech then misc, I hope such 
emails are allowed.

Intel wifi card: AX210 in the T14s gen2 intel

If anything there is an email in misc with all the details of the probmem. The 
subject: 
>Device not recognized: T14s gen 2 (intel); soldered intel wifi chip isn't 
>recognized by openbsd.


Can anyone help me with writing the needed software?

Or at least if there are some books or websites I can read and understand, so 
that I can understand and maybe start the process

And how hard is it to write drivers and dev type programs for openbsd?

How does one start to implement stuff like that?

Just want to start using openbsd as soon as possible, start learning the system 
and maybe if I get good start patching it.

I have a crude understanding of C, C++. Better of rust. Don't know much about 
deep kernel and OS stuff. Want to learn more.

Thanks

Re: bgpctl add support for RFC8050 (add-path support for MRT parser)

2021-07-26 Thread Sebastian Benoit

ok

Claudio Jeker(cje...@diehard.n-r-g.com) on 2021.07.13 15:37:36 +0200:
> This diff adds support to read MRT files using the new introduced _ADDPATH
> types as defined in RFC8050. I also started adding MRT support to bgpd but
> that depends on ADD-PATH itself.
> 
> There are a few gotchas, especially the MRT_DUMP_V2 RIB_GENERIC_ADDPATH
> handling is different from all other RIB entry handling. This is a major
> pain point for bgpd less so for the bgpctl parser.
> 
> Some MRT update dumps that can be downloaded and use ADDPATH do actually
> use the BGP4MP_MESSAGE _ADDPATH variant for non-addpath enabled sessions.
> The update messages can not be parsed because the NLRI encoding is incorrect.
> 
> I tested with a few RIB and UPDATE dumps from RIS, route-views and other
> open collectors and it works for me.
> -- 
> :wq Claudio
> 
> Index: usr.sbin/bgpctl/bgpctl.c
> ===
> RCS file: /cvs/src/usr.sbin/bgpctl/bgpctl.c,v
> retrieving revision 1.269
> diff -u -p -r1.269 bgpctl.c
> --- usr.sbin/bgpctl/bgpctl.c  16 Jun 2021 16:24:11 -  1.269
> +++ usr.sbin/bgpctl/bgpctl.c  13 Jul 2021 13:20:51 -
> @@ -470,7 +470,7 @@ show(struct imsg *imsg, struct parse_res
>   warnx("bad IMSG_CTL_SHOW_RIB_ATTR received");
>   break;
>   }
> - output->attr(imsg->data, ilen, res->flags);
> + output->attr(imsg->data, ilen, res->flags, 0);
>   break;
>   case IMSG_CTL_SHOW_RIB_MEM:
>   if (imsg->hdr.len < IMSG_HEADER_SIZE + sizeof(stats))
> @@ -1150,6 +1150,10 @@ show_mrt_dump(struct mrt_rib *mr, struct
>   ctl.local_pref = mre->local_pref;
>   ctl.med = mre->med;
>   /* weight is not part of the mrt dump so it can't be set */
> + if (mr->add_path) {
> + ctl.flags |= F_PREF_PATH_ID;
> + ctl.path_id = mre->path_id;
> + }
>  
>   if (mre->peer_idx < mp->npeers) {
>   ctl.remote_addr = mp->peers[mre->peer_idx].addr;
> @@ -1195,7 +1199,7 @@ show_mrt_dump(struct mrt_rib *mr, struct
>   if (req->flags & F_CTL_DETAIL) {
>   for (j = 0; j < mre->nattrs; j++)
>   output->attr(mre->attrs[j].attr,
> - mre->attrs[j].attr_len, req->flags);
> + mre->attrs[j].attr_len, req->flags, 0);
>   }
>   }
>  }
> @@ -1211,6 +1215,10 @@ network_mrt_dump(struct mrt_rib *mr, str
>   time_t   now;
>   u_int16_ti, j;
>  
> + /* can't announce more than one path so ignore add-path */
> + if (mr->add_path)
> + return;
> +
>   now = time(NULL);
>   for (i = 0; i < mr->nentries; i++) {
>   mre = >entries[i];
> @@ -1586,10 +1594,11 @@ show_mrt_notification(u_char *p, u_int16
>  
>  /* XXX this function does not handle JSON output */
>  static void
> -show_mrt_update(u_char *p, u_int16_t len, int reqflags)
> +show_mrt_update(u_char *p, u_int16_t len, int reqflags, int addpath)
>  {
>   struct bgpd_addr prefix;
>   int pos;
> + u_int32_t pathid;
>   u_int16_t wlen, alen;
>   u_int8_t prefixlen;
>  
> @@ -1609,12 +1618,25 @@ show_mrt_update(u_char *p, u_int16_t len
>   if (wlen > 0) {
>   printf("\n Withdrawn prefixes:");
>   while (wlen > 0) {
> + if (addpath) {
> + if (wlen <= sizeof(pathid)) {
> + printf("bad withdraw prefix");
> + return;
> + }
> + memcpy(, p, sizeof(pathid));
> + pathid = ntohl(pathid);
> + p += sizeof(pathid);
> + len -= sizeof(pathid);
> + wlen -= sizeof(pathid);
> + }
>   if ((pos = nlri_get_prefix(p, wlen, ,
>   )) == -1) {
>   printf("bad withdraw prefix");
>   return;
>   }
>   printf(" %s/%u", log_addr(), prefixlen);
> + if (addpath)
> + printf(" path-id %u", pathid);
>   p += pos;
>   len -= pos;
>   wlen -= pos;
> @@ -1655,7 +1677,7 @@ show_mrt_update(u_char *p, u_int16_t len
>   attrlen += 1 + 2;
>   }
>  
> - output->attr(p, attrlen, reqflags);
> + output->attr(p, attrlen, reqflags, addpath);
>   p += attrlen;
>   alen -= attrlen;
>   len -= attrlen;
> @@ -1664,12 +1686,24 @@ show_mrt_update(u_char *p, u_int16_t

Re: remove efibind.h cruft

2021-07-26 Thread Mark Kettenis

> Date: Mon, 26 Jul 2021 18:45:31 +1000
> From: Jonathan Gray 
> 
> Follow what was done with riscv64 and replace efibind.h with just the
> defines we need.
> 
> Tested on armv7 arm64 and amd64 (bootx64).

ok kettenis@
 
> Index: sys/stand/efi/include/amd64/efibind.h
> ===
> RCS file: /cvs/src/sys/stand/efi/include/amd64/efibind.h,v
> retrieving revision 1.2
> diff -u -p -r1.2 efibind.h
> --- sys/stand/efi/include/amd64/efibind.h 4 Jun 2021 00:09:34 -   
> 1.2
> +++ sys/stand/efi/include/amd64/efibind.h 26 Jul 2021 05:39:34 -
> @@ -1,271 +1,22 @@
> -/* $FreeBSD: head/sys/boot/efi/include/amd64/efibind.h 279038 2015-02-20 
> 01:40:55Z imp $ */
> -/*++
> +/* Public Domain. */
>  
> -Copyright (c)  1999 - 2003 Intel Corporation. All rights reserved
> -This software and associated documentation (if any) is furnished
> -under a license and may only be used or copied in accordance
> -with the terms of the license. Except as permitted by such
> -license, no part of this software or documentation may be
> -reproduced, stored in a retrieval system, or transmitted in any
> -form or by any means without the express written consent of
> -Intel Corporation.
> -
> -Module Name:
> -
> -efefind.h
> -
> -Abstract:
> -
> -EFI to compile bindings
> -
> -
> -
> -
> -Revision History
> -
> ---*/
> -
> -#pragma pack()
> -
> -
> -#if defined(__FreeBSD__) || defined(__OpenBSD__)
>  #include 
> -#else
> -//
> -// Basic int types of various widths
> -//
> -
> -#if (__STDC_VERSION__ < 199901L )
> -
> -// No ANSI C 1999/2000 stdint.h integer width declarations
> -
> -#if _MSC_EXTENSIONS
> -
> -// Use Microsoft C compiler integer width declarations
> -
> -typedef unsigned __int64uint64_t;
> -typedef __int64 int64_t;
> -typedef unsigned __int32uint32_t;
> -typedef __int32 int32_t;
> -typedef unsigned short  uint16_t;
> -typedef short   int16_t;
> -typedef unsigned char   uint8_t;
> -typedef charint8_t;
> -#else
> -#ifdef UNIX_LP64
> -
> -// Use LP64 programming model from C_FLAGS for integer width 
> declarations
> -
> -typedef unsigned long   uint64_t;
> -typedef longint64_t;
> -typedef unsigned intuint32_t;
> -typedef int int32_t;
> -typedef unsigned short  uint16_t;
> -typedef short   int16_t;
> -typedef unsigned char   uint8_t;
> -typedef charint8_t;
> -#else
> -
> -// Assume P64 programming model from C_FLAGS for integer width 
> declarations
> -
> -typedef unsigned long long  uint64_t;
> -typedef long long   int64_t;
> -typedef unsigned intuint32_t;
> -typedef int int32_t;
> -typedef unsigned short  uint16_t;
> -typedef short   int16_t;
> -typedef unsigned char   uint8_t;
> -typedef charint8_t;
> -#endif
> -#endif
> -#endif
> -#endif   /* __FreeBSD__ || __OpenBSD__ */
> -
> -//
> -// Basic EFI types of various widths
> -//
> -
> -#ifndef ACPI_THREAD_ID   /* ACPI's definitions are fine */
> -#define ACPI_USE_SYSTEM_INTTYPES 1   /* Tell ACPI we've defined types */
> -
> -typedef uint64_t   UINT64;
> -typedef int64_tINT64;
> -
> -#ifndef _BASETSD_H_
> -typedef uint32_t   UINT32;
> -typedef int32_tINT32;
> -#endif
> -
> -typedef uint16_t   UINT16;
> -typedef int16_tINT16;
> -typedef uint8_tUINT8;
> -typedef int8_t INT8;
> -
> -#endif
> -
> -#undef VOID
> -#define VOIDvoid
> -
> -
> -typedef int64_tINTN;
> -typedef uint64_t   UINTN;
> -
> -#ifdef EFI_NT_EMULATOR
> -#define POST_CODE(_Data)
> -#else
> -#ifdef EFI_DEBUG
> -#define POST_CODE(_Data)__asm mov eax,(_Data) __asm out 0x80,al
> -#else
> -#define POST_CODE(_Data)
> -#endif
> -#endif
> -
> -#define EFIERR(a)   (0x8000 | a)
> -#define EFI_ERROR_MASK  0x8000
> -#define EFIERR_OEM(a)   (0xc000 | a)
> -
> -
> -#define BAD_POINTER 0xFBFBFBFBFBFBFBFB
> -#define MAX_ADDRESS 0x
> -
> -#define BREAKPOINT()__asm { int 3 }
> -
> -//
> -// Pointers must be aligned to these address to function
> -//
> -
> -#define MIN_ALIGNMENT_SIZE  4
> -
> -#define ALIGN_VARIABLE(Value ,Adjustment) \
> -(UINTN)Adjustment = 0; \
> -if((UINTN)Value % MIN_ALIGNMENT_SIZE) \
> -(UINTN)Adjustment = MIN_ALIGNMENT_SIZE - ((UINTN)Value % 
> MIN_ALIGNMENT_SIZE); \
> -Value = (UINTN)Value + (UINTN)Adjustment
> -
> -
> -//
> -// Define macros to build data structure

remove efibind.h cruft

2021-07-26 Thread Jonathan Gray

Follow what was done with riscv64 and replace efibind.h with just the
defines we need.

Tested on armv7 arm64 and amd64 (bootx64).

Index: sys/stand/efi/include/amd64/efibind.h
===
RCS file: /cvs/src/sys/stand/efi/include/amd64/efibind.h,v
retrieving revision 1.2
diff -u -p -r1.2 efibind.h
--- sys/stand/efi/include/amd64/efibind.h   4 Jun 2021 00:09:34 -   
1.2
+++ sys/stand/efi/include/amd64/efibind.h   26 Jul 2021 05:39:34 -
@@ -1,271 +1,22 @@
-/* $FreeBSD: head/sys/boot/efi/include/amd64/efibind.h 279038 2015-02-20 
01:40:55Z imp $ */
-/*++
+/* Public Domain. */
 
-Copyright (c)  1999 - 2003 Intel Corporation. All rights reserved
-This software and associated documentation (if any) is furnished
-under a license and may only be used or copied in accordance
-with the terms of the license. Except as permitted by such
-license, no part of this software or documentation may be
-reproduced, stored in a retrieval system, or transmitted in any
-form or by any means without the express written consent of
-Intel Corporation.
-
-Module Name:
-
-efefind.h
-
-Abstract:
-
-EFI to compile bindings
-
-
-
-
-Revision History
-
---*/
-
-#pragma pack()
-
-
-#if defined(__FreeBSD__) || defined(__OpenBSD__)
 #include 
-#else
-//
-// Basic int types of various widths
-//
-
-#if (__STDC_VERSION__ < 199901L )
-
-// No ANSI C 1999/2000 stdint.h integer width declarations
-
-#if _MSC_EXTENSIONS
-
-// Use Microsoft C compiler integer width declarations
-
-typedef unsigned __int64uint64_t;
-typedef __int64 int64_t;
-typedef unsigned __int32uint32_t;
-typedef __int32 int32_t;
-typedef unsigned short  uint16_t;
-typedef short   int16_t;
-typedef unsigned char   uint8_t;
-typedef charint8_t;
-#else
-#ifdef UNIX_LP64
-
-// Use LP64 programming model from C_FLAGS for integer width 
declarations
-
-typedef unsigned long   uint64_t;
-typedef longint64_t;
-typedef unsigned intuint32_t;
-typedef int int32_t;
-typedef unsigned short  uint16_t;
-typedef short   int16_t;
-typedef unsigned char   uint8_t;
-typedef charint8_t;
-#else
-
-// Assume P64 programming model from C_FLAGS for integer width 
declarations
-
-typedef unsigned long long  uint64_t;
-typedef long long   int64_t;
-typedef unsigned intuint32_t;
-typedef int int32_t;
-typedef unsigned short  uint16_t;
-typedef short   int16_t;
-typedef unsigned char   uint8_t;
-typedef charint8_t;
-#endif
-#endif
-#endif
-#endif /* __FreeBSD__ || __OpenBSD__ */
-
-//
-// Basic EFI types of various widths
-//
-
-#ifndef ACPI_THREAD_ID /* ACPI's definitions are fine */
-#define ACPI_USE_SYSTEM_INTTYPES 1 /* Tell ACPI we've defined types */
-
-typedef uint64_t   UINT64;
-typedef int64_tINT64;
-
-#ifndef _BASETSD_H_
-typedef uint32_t   UINT32;
-typedef int32_tINT32;
-#endif
-
-typedef uint16_t   UINT16;
-typedef int16_tINT16;
-typedef uint8_tUINT8;
-typedef int8_t INT8;
-
-#endif
-
-#undef VOID
-#define VOIDvoid
-
-
-typedef int64_tINTN;
-typedef uint64_t   UINTN;
-
-#ifdef EFI_NT_EMULATOR
-#define POST_CODE(_Data)
-#else
-#ifdef EFI_DEBUG
-#define POST_CODE(_Data)__asm mov eax,(_Data) __asm out 0x80,al
-#else
-#define POST_CODE(_Data)
-#endif
-#endif
-
-#define EFIERR(a)   (0x8000 | a)
-#define EFI_ERROR_MASK  0x8000
-#define EFIERR_OEM(a)   (0xc000 | a)
-
-
-#define BAD_POINTER 0xFBFBFBFBFBFBFBFB
-#define MAX_ADDRESS 0x
-
-#define BREAKPOINT()__asm { int 3 }
-
-//
-// Pointers must be aligned to these address to function
-//
-
-#define MIN_ALIGNMENT_SIZE  4
-
-#define ALIGN_VARIABLE(Value ,Adjustment) \
-(UINTN)Adjustment = 0; \
-if((UINTN)Value % MIN_ALIGNMENT_SIZE) \
-(UINTN)Adjustment = MIN_ALIGNMENT_SIZE - ((UINTN)Value % 
MIN_ALIGNMENT_SIZE); \
-Value = (UINTN)Value + (UINTN)Adjustment
-
-
-//
-// Define macros to build data structure signatures from characters.
-//
-
-#define EFI_SIGNATURE_16(A,B) ((A) | (B<<8))
-#define EFI_SIGNATURE_32(A,B,C,D) (EFI_SIGNATURE_16(A,B) | 
(EFI_SIGNATURE_16(C,D) << 16))
-#define EFI_SIGNATURE_64(A,B,C,D,E,F,G,H) (EFI_SIGNATURE_32(A,B,C,D) | 
((UINT64)(EFI_SIGNATURE_32(E,F,G,H)) << 32))
-
-//
-// EFIAPI - prototype calling convention for EFI function pointers
-// BOOTSERVICE - prototype for

Re: Driver and kernel recognition for intel AX210 wifi chip

2021-07-26 Thread Timo Myyrä

Alex Beakes  [2021-07-26, 07:24 +]:

> I figured it would be more appropriate mailing tech then misc, I hope such 
> emails are allowed.
>
> Intel wifi card: AX210 in the T14s gen2 intel
>
> If anything there is an email in misc with all the details of the probmem. 
> The subject: 
>>Device not recognized: T14s gen 2 (intel); soldered intel wifi chip isn't 
>>recognized by openbsd.
>
>
> Can anyone help me with writing the needed software?
>
> Or at least if there are some books or websites I can read and understand, so 
> that I can understand and maybe start the process
>
> And how hard is it to write drivers and dev type programs for openbsd?
>
> How does one start to implement stuff like that?
>
> Just want to start using openbsd as soon as possible, start learning the 
> system and maybe if I get good start patching it.
>
> I have a crude understanding of C, C++. Better of rust. Don't know much about 
> deep kernel and OS stuff. Want to learn more.
>
> Thanks

Check out following paper to get started:
https://www.openbsd.org/papers/eurobsdcon2017-device-drivers.pdf

Timo

Re: new kqueue-based select(2) implementation

2021-07-26 Thread Martin Pieuchot

On 21/07/21(Wed) 09:19, Martin Pieuchot wrote:
> On 23/06/21(Wed) 15:53, Alexander Bluhm wrote:
> > On Wed, Jun 23, 2021 at 11:40:18AM +0200, Martin Pieuchot wrote:
> > > Our previous attempt [0] to replace the current select(2) implementation
> > > has been reverted due to non-acceptable latency increase on sockets [1].
> > 
> > I have measured the performance difference.
> > 
> > http://bluhm.genua.de/perform/results/2021-06-21T09%3A44%3A18Z/perform.html
> > 
> > Worst 20% throughput drop is in 'iperf3 -c10.3.45.35 -u -b10G -w1m
> > -t10 -R' which can be seen here.
> > 
> > http://bluhm.genua.de/perform/results/2021-06-21T09%3A44%3A18Z/gnuplot/udp.html
> > 
> > Note that iperf3 calls select(2) multiple times per UDP packet.
> > 
> > As a new feature I have links to btrace kstack flame graphs in the
> > table.
> 
> Thanks a lot for the tests.  The FlameGraphs have shown that lazy
> removal wasn't working correctly.  Updated diff below now works as
> expected.
> 
> I'm aware of the throughput drop in the UDP iperf3 test, this is not a
> real case scenario so I don't consider it as a blocker.  However it is
> very useful to check the contention on the NET_LOCK() in select(2).  I'm
> working on this issue on another thread, but there's an interdependency
> between the two diffs, due to lock ordering. 

Updated diff after recent commits from visa@.

Index: kern/sys_generic.c
===
RCS file: /cvs/src/sys/kern/sys_generic.c,v
retrieving revision 1.135
diff -u -p -r1.135 sys_generic.c
--- kern/sys_generic.c  8 Jan 2021 09:29:04 -   1.135
+++ kern/sys_generic.c  26 Jul 2021 06:56:22 -
@@ -55,6 +55,7 @@
 #include 
 #include 
 #include 
+#include 
 #ifdef KTRACE
 #include 
 #endif
@@ -66,8 +67,21 @@
 
 #include 
 
-int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
-void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
+/*
+ * Debug values:
+ *  1 - print implementation errors, things that should not happen.
+ *  2 - print ppoll(2) information, somewhat verbose
+ *  3 - print pselect(2) and ppoll(2) information, very verbose
+ */
+int kqpoll_debug = 0;
+#define DPRINTFN(v, x...) if (kqpoll_debug > v) {  \
+   printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid);  \
+   printf(x);  \
+}
+
+int pselregister(struct proc *, fd_set *[], int, int *);
+int pselcollect(struct proc *, struct kevent *, fd_set *[], int *);
+
 int pollout(struct pollfd *, struct pollfd *, u_int);
 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
 struct timespec *, const sigset_t *, register_t *);
@@ -582,13 +596,12 @@ sys_pselect(struct proc *p, void *v, reg
 
 int
 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
-struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
+struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
 {
+   struct kqueue_scan_state scan;
fd_mask bits[6];
fd_set *pibits[3], *pobits[3];
-   struct timespec elapsed, start, stop;
-   uint64_t nsecs;
-   int s, ncoll, error = 0;
+   int error, n, ncollected = 0, nevents = 0;
u_int ni;
 
if (nd < 0)
@@ -618,6 +631,8 @@ dopselect(struct proc *p, int nd, fd_set
pobits[2] = (fd_set *)[5];
}
 
+   kqpoll_init();
+
 #definegetbits(name, x) \
if (name && (error = copyin(name, pibits[x], ni))) \
goto done;
@@ -636,43 +651,65 @@ dopselect(struct proc *p, int nd, fd_set
if (sigmask)
dosigsuspend(p, *sigmask &~ sigcantmask);
 
-retry:
-   ncoll = nselcoll;
-   atomic_setbits_int(>p_flag, P_SELECT);
-   error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
-   if (error || *retval)
+   /* Register kqueue events */
+   error = pselregister(p, pibits, nd, );
+   if (error != 0)
goto done;
-   if (timeout == NULL || timespecisset(timeout)) {
-   if (timeout != NULL) {
-   getnanouptime();
-   nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
-   } else
-   nsecs = INFSLP;
-   s = splhigh();
-   if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
-   splx(s);
-   goto retry;
-   }
-   atomic_clearbits_int(>p_flag, P_SELECT);
-   error = tsleep_nsec(, PSOCK | PCATCH, "select", nsecs);
-   splx(s);
-   if (timeout != NULL) {
-   getnanouptime();
-   timespecsub(, , );
-   timespecsub(timeout, , timeout);
-   if (timeout->tv_sec < 0)
-   timespecclear(timeout);
+
+   /*
+* The poll/select family of syscalls has

Re: Do not spin on the NET_LOCK() in kqueue

2021-07-26 Thread Martin Pieuchot

On 26/07/21(Mon) 08:55, Martin Pieuchot wrote:
> On 21/07/21(Wed) 10:18, Martin Pieuchot wrote:
> > On 11/07/21(Sun) 14:45, Visa Hankala wrote:
> > > On Sat, Jul 10, 2021 at 05:26:57PM +0200, Martin Pieuchot wrote:
> > > > One of the reasons for the drop of performances in the kqueue-based
> > > > poll/select is the fact that kqueue filters are called up to 3 times
> > > > per syscall and that they all spin on the NET_LOCK() for TCP/UDP
> > > > packets.
> > > > 
> > > > Diff below is a RFC for improving the situation.
> > > > 
> > > > socket kqueue filters mainly check for the amount of available items to
> > > > read/write.  This involves comparing various socket buffer fields 
> > > > (sb_cc,
> > > > sb_lowat, etc).  The diff below introduces a new mutex to serialize
> > > > updates of those fields with reads in the kqueue filters.
> > > > 
> > > > Since these fields are always modified with the socket lock held, either
> > > > the mutex or the solock are enough to have a coherent view of them.
> > > > Note that either of these locks is necessary only if multiple fields
> > > > have to be read (like in sbspace()).
> > > > 
> > > > Other per-socket fields accessed in the kqueue filters are never
> > > > combined (with &&) to determine a condition.  So assuming it is fine to
> > > > read register-sized fields w/o the socket lock we can safely remove it
> > > > there.
> > > > 
> > > > Could such mutex also be used to serialize klist updates?
> > > 
> > > I think the lock should be such that it can serialize socket klists.
> > > 
> > > As the main motivator for this change is kqueue, the viability of using
> > > the mutex for the klist locking should be checked now. The mutex has to
> > > be held whenever calling KNOTE() on sb_sel.si_note, or selwakeup() on
> > > sb_sel. Then the socket f_event callbacks will not need to lock the
> > > mutex themselves.
> > > 
> > > I had a diff that serialized socket klists using solock(). It did not
> > > work well because it increased lock contention, especially when using
> > > kqueue as backend for poll(2) and select(2). The diff is not even
> > > correct any longer since recent changes to socket locking have
> > > introduced new lock order constraints that conflict with it.
> > 
> > Updated diff below does that.  It also uses a single per-socket mutex as
> > suggested by bluhm@.
> > 
> > Note that as long poll(2) & select(2) use the current implementation a
> > KERNEL_LOCK()/UNLOCK() dance is necessary in sowakeup().  The goal of
> > this change combined with the poll/select rewrite is to get rid of this
> > dance.
> 
> Updated diff after recent commits, more comments?  Oks?

Previous diff had a double mtx_enter() in filt_fifowrite_common(), this
one use the *locked() version of sbspace() to prevent it.


Index: kern/uipc_socket.c
===
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.264
diff -u -p -r1.264 uipc_socket.c
--- kern/uipc_socket.c  26 Jul 2021 05:51:13 -  1.264
+++ kern/uipc_socket.c  26 Jul 2021 07:20:58 -
@@ -84,7 +84,7 @@ int   filt_solistenprocess(struct knote *k
 intfilt_solisten_common(struct knote *kn, struct socket *so);
 
 const struct filterops solisten_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_solisten,
@@ -93,7 +93,7 @@ const struct filterops solisten_filtops 
 };
 
 const struct filterops soread_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_soread,
@@ -102,7 +102,7 @@ const struct filterops soread_filtops = 
 };
 
 const struct filterops sowrite_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sowdetach,
.f_event= filt_sowrite,
@@ -111,7 +111,7 @@ const struct filterops sowrite_filtops =
 };
 
 const struct filterops soexcept_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_soread,
@@ -181,6 +181,9 @@ socreate(int dom, struct socket **aso, i
so->so_egid = p->p_ucred->cr_gid;
so->so_cpid = p->p_p->ps_pid;
so->so_proto = prp;
+   mtx_init(>so_mtx, IPL_MPFLOOR);
+   klist_init_mutex(>so_snd.sb_sel.si_note, >so_mtx);
+   klist_init_mutex(>so_rcv.sb_sel.si_note, >so_mtx);
so->so_snd.sb_timeo_nsecs = INFSLP;
so->so_rcv.sb_timeo_nsecs = INFSLP;
 
@@ -276,7 +279,9 @@ sofree(struct socket *so, int s)
}
}
 #endif /* SOCKET_SPLICE */
+

Re: Do not spin on the NET_LOCK() in kqueue

2021-07-26 Thread Martin Pieuchot

On 21/07/21(Wed) 10:18, Martin Pieuchot wrote:
> On 11/07/21(Sun) 14:45, Visa Hankala wrote:
> > On Sat, Jul 10, 2021 at 05:26:57PM +0200, Martin Pieuchot wrote:
> > > One of the reasons for the drop of performances in the kqueue-based
> > > poll/select is the fact that kqueue filters are called up to 3 times
> > > per syscall and that they all spin on the NET_LOCK() for TCP/UDP
> > > packets.
> > > 
> > > Diff below is a RFC for improving the situation.
> > > 
> > > socket kqueue filters mainly check for the amount of available items to
> > > read/write.  This involves comparing various socket buffer fields (sb_cc,
> > > sb_lowat, etc).  The diff below introduces a new mutex to serialize
> > > updates of those fields with reads in the kqueue filters.
> > > 
> > > Since these fields are always modified with the socket lock held, either
> > > the mutex or the solock are enough to have a coherent view of them.
> > > Note that either of these locks is necessary only if multiple fields
> > > have to be read (like in sbspace()).
> > > 
> > > Other per-socket fields accessed in the kqueue filters are never
> > > combined (with &&) to determine a condition.  So assuming it is fine to
> > > read register-sized fields w/o the socket lock we can safely remove it
> > > there.
> > > 
> > > Could such mutex also be used to serialize klist updates?
> > 
> > I think the lock should be such that it can serialize socket klists.
> > 
> > As the main motivator for this change is kqueue, the viability of using
> > the mutex for the klist locking should be checked now. The mutex has to
> > be held whenever calling KNOTE() on sb_sel.si_note, or selwakeup() on
> > sb_sel. Then the socket f_event callbacks will not need to lock the
> > mutex themselves.
> > 
> > I had a diff that serialized socket klists using solock(). It did not
> > work well because it increased lock contention, especially when using
> > kqueue as backend for poll(2) and select(2). The diff is not even
> > correct any longer since recent changes to socket locking have
> > introduced new lock order constraints that conflict with it.
> 
> Updated diff below does that.  It also uses a single per-socket mutex as
> suggested by bluhm@.
> 
> Note that as long poll(2) & select(2) use the current implementation a
> KERNEL_LOCK()/UNLOCK() dance is necessary in sowakeup().  The goal of
> this change combined with the poll/select rewrite is to get rid of this
> dance.

Updated diff after recent commits, more comments?  Oks?

Index: kern/uipc_socket.c
===
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.264
diff -u -p -r1.264 uipc_socket.c
--- kern/uipc_socket.c  26 Jul 2021 05:51:13 -  1.264
+++ kern/uipc_socket.c  26 Jul 2021 05:57:45 -
@@ -84,7 +84,7 @@ int   filt_solistenprocess(struct knote *k
 intfilt_solisten_common(struct knote *kn, struct socket *so);
 
 const struct filterops solisten_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_solisten,
@@ -93,7 +93,7 @@ const struct filterops solisten_filtops 
 };
 
 const struct filterops soread_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_soread,
@@ -102,7 +102,7 @@ const struct filterops soread_filtops = 
 };
 
 const struct filterops sowrite_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sowdetach,
.f_event= filt_sowrite,
@@ -111,7 +111,7 @@ const struct filterops sowrite_filtops =
 };
 
 const struct filterops soexcept_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_sordetach,
.f_event= filt_soread,
@@ -181,6 +181,9 @@ socreate(int dom, struct socket **aso, i
so->so_egid = p->p_ucred->cr_gid;
so->so_cpid = p->p_p->ps_pid;
so->so_proto = prp;
+   mtx_init(>so_mtx, IPL_MPFLOOR);
+   klist_init_mutex(>so_snd.sb_sel.si_note, >so_mtx);
+   klist_init_mutex(>so_rcv.sb_sel.si_note, >so_mtx);
so->so_snd.sb_timeo_nsecs = INFSLP;
so->so_rcv.sb_timeo_nsecs = INFSLP;
 
@@ -276,7 +279,9 @@ sofree(struct socket *so, int s)
}
}
 #endif /* SOCKET_SPLICE */
+   mtx_enter(>so_mtx);
sbrelease(so, >so_snd);
+   mtx_leave(>so_mtx);
sorflush(so);
sounlock(so, s);
 #ifdef SOCKET_SPLICE
@@ -1019,8 +1024,10 @@ dontblock:
*mp = m_copym(m, 0, len, M_WAIT);

OpenBSD Errata: July 25, 2021 (libc, mips64)

2021-07-26 Thread Sebastian Benoit

An errata patch for the libc library on the mips64 architecture has  
been released for OpenBSD 6.8 and OpenBSD 6.9.

On mips64, the strchr/index/strrchr/rindex functions in libc handled
signed characters incorrectly.

Source code patches can be found on the respective errata pages:

  https://www.openbsd.org/errata68.html
  https://www.openbsd.org/errata69.html

Re: pipex(4): introduce mutexes to protect 'pipex_session' context

Re: ipsec(4): use per-CPU counters for tunnel descriptor block (tdb) statistics

Re: ipsec: document locks of some structures

Re: ipsec(4): use per-CPU counters for tunnel descriptor block (tdb) statistics

Re: pipex(4): introduce mutexes to protect 'pipex_session' context

Re: pf icmp reflect

Re: pf icmp reflect

pf icmp reflect

Re: sleep.3: miscellaneous cleanup and rewrites

iwx(4) firmware update to -63

Re: bgpd refactor struct prefix

Re: [please test] amd64: schedule clock interrupts against system clock

Re: sleep.3: miscellaneous cleanup and rewrites

Re: ix(4): fix Rx hash type

Driver and kernel recognition for intel AX210 wifi chip

Re: bgpctl add support for RFC8050 (add-path support for MRT parser)

Re: remove efibind.h cruft

remove efibind.h cruft

Re: Driver and kernel recognition for intel AX210 wifi chip

Re: new kqueue-based select(2) implementation

Re: Do not spin on the NET_LOCK() in kqueue

Re: Do not spin on the NET_LOCK() in kqueue

OpenBSD Errata: July 25, 2021 (libc, mips64)

23 matches

Site Navigation

Mail list logo

Footer information