On 08/12/21(Wed) 22:39, Alexander Bluhm wrote:
> On Wed, Dec 08, 2021 at 03:28:34PM -0300, Martin Pieuchot wrote:
> > On 04/12/21(Sat) 01:02, Alexander Bluhm wrote:
> > > Hi,
> > >
> > > As I want a read-only net lock for forwarding, all paths should be
> > > checked for the correct net lock and asserts. I found code in
> > > in_pcbhashlookup() where reading the PCB table results in a write
> > > to optimize the cache.
> > >
> > > Porperly protecting PCB hashes is out of scope for parallel forwarding.
> > > Can we get away with this hack? Only update the cache when we are
> > > in TCP oder UDP stack with the write lock. The access from pf is
> > > read-only.
> > >
> > > NET_WLOCKED() indicates whether we may change data structures.
> >
> > I recall that we currently do not want to introduce such idiom: change
> > the behavior of the code depending on which lock is held by the caller.
> >
> > Can we instead assert that a write-lock is held before modifying the
> > list?
>
> We could also pass down the kind of lock that is used. Goal is
> that pf uses shared net lock. TCP and UDP will keep the exclusice
> net lock for a while.
Changing the logic of a function based on the type of a lock is not
different from the previous approach.
> Diff gets longer but perhaps a bit clearer what is going on.
I believe we want to split in_pcblookup_listen() into two functions.
One which is read-only and one which modifies the head of the hash
chain.
The read-only asserts for any lock and the one that modifies the hash
calls the former and assert for a write lock.
Alternatively, we could protect the PCB hash with a mutex. This has the
advantage of not making the scope of the NET_LOCK() more complex. In
the end we all know something like that will be done. I don't know how
other BSD did this but I'm sure this will help getting the remaining
socket layer out of the NET_LOCK().
> Index: net/pf.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
> retrieving revision 1.1122
> diff -u -p -r1.1122 pf.c
> --- net/pf.c 7 Jul 2021 18:38:25 -0000 1.1122
> +++ net/pf.c 8 Dec 2021 21:16:16 -0000
> @@ -3317,14 +3317,12 @@ pf_socket_lookup(struct pf_pdesc *pd)
> sport = pd->hdr.tcp.th_sport;
> dport = pd->hdr.tcp.th_dport;
> PF_ASSERT_LOCKED();
> - NET_ASSERT_LOCKED();
> tb = &tcbtable;
> break;
> case IPPROTO_UDP:
> sport = pd->hdr.udp.uh_sport;
> dport = pd->hdr.udp.uh_dport;
> PF_ASSERT_LOCKED();
> - NET_ASSERT_LOCKED();
> tb = &udbtable;
> break;
> default:
> @@ -3348,22 +3346,24 @@ pf_socket_lookup(struct pf_pdesc *pd)
> * Fails when rtable is changed while evaluating the ruleset
> * The socket looked up will not match the one hit in the end.
> */
> - inp = in_pcbhashlookup(tb, saddr->v4, sport, daddr->v4, dport,
> - pd->rdomain);
> + NET_ASSERT_LOCKED();
> + inp = in_pcbhashlookup_wlocked(tb, saddr->v4, sport, daddr->v4,
> + dport, pd->rdomain, 0);
> if (inp == NULL) {
> - inp = in_pcblookup_listen(tb, daddr->v4, dport,
> - NULL, pd->rdomain);
> + inp = in_pcblookup_listen_wlocked(tb, daddr->v4, dport,
> + NULL, pd->rdomain, 0);
> if (inp == NULL)
> return (-1);
> }
> break;
> #ifdef INET6
> case AF_INET6:
> - inp = in6_pcbhashlookup(tb, &saddr->v6, sport, &daddr->v6,
> - dport, pd->rdomain);
> + NET_ASSERT_LOCKED();
> + inp = in6_pcbhashlookup_wlocked(tb, &saddr->v6, sport,
> + &daddr->v6, dport, pd->rdomain, 0);
> if (inp == NULL) {
> - inp = in6_pcblookup_listen(tb, &daddr->v6, dport,
> - NULL, pd->rdomain);
> + inp = in6_pcblookup_listen_wlocked(tb, &daddr->v6,
> + dport, NULL, pd->rdomain, 0);
> if (inp == NULL)
> return (-1);
> }
> Index: netinet/in_pcb.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
> retrieving revision 1.256
> diff -u -p -r1.256 in_pcb.c
> --- netinet/in_pcb.c 25 Oct 2021 22:20:47 -0000 1.256
> +++ netinet/in_pcb.c 8 Dec 2021 21:16:16 -0000
> @@ -1051,6 +1051,15 @@ in_pcbresize(struct inpcbtable *table, i
> int in_pcbnotifymiss = 0;
> #endif
>
> +struct inpcb *
> +in_pcbhashlookup(struct inpcbtable *table, struct in_addr faddr,
> + u_int fport_arg, struct in_addr laddr, u_int lport_arg, u_int rtable)
> +{
> + NET_ASSERT_WLOCKED();
> + return in_pcbhashlookup_wlocked(table, faddr ,fport_arg, laddr,
> + lport_arg, rtable, 1);
> +}
> +
> /*
> * The in(6)_pcbhashlookup functions are used to locate connected sockets
> * quickly:
> @@ -1061,8 +1070,9 @@ int in_pcbnotifymiss = 0;
> * After those two lookups no other are necessary.
> */
> struct inpcb *
> -in_pcbhashlookup(struct inpcbtable *table, struct in_addr faddr,
> - u_int fport_arg, struct in_addr laddr, u_int lport_arg, u_int rtable)
> +in_pcbhashlookup_wlocked(struct inpcbtable *table, struct in_addr faddr,
> + u_int fport_arg, struct in_addr laddr, u_int lport_arg, u_int rtable,
> + int wlocked)
> {
> struct inpcbhead *head;
> struct inpcb *inp;
> @@ -1093,7 +1103,7 @@ in_pcbhashlookup(struct inpcbtable *tabl
> }
> }
> #ifdef DIAGNOSTIC
> - if (inp == NULL && in_pcbnotifymiss) {
> + if (wlocked && inp == NULL && in_pcbnotifymiss) {
> printf("%s: faddr=%08x fport=%d laddr=%08x lport=%d rdom=%u\n",
> __func__, ntohl(faddr.s_addr), ntohs(fport),
> ntohl(laddr.s_addr), ntohs(lport), rdomain);
> @@ -1102,6 +1112,15 @@ in_pcbhashlookup(struct inpcbtable *tabl
> return (inp);
> }
>
> +struct inpcb *
> +in_pcblookup_listen(struct inpcbtable *table, struct in_addr laddr,
> + u_int lport_arg, struct mbuf *m, u_int rtable)
> +{
> + NET_ASSERT_WLOCKED();
> + return in_pcblookup_listen_wlocked(table, laddr, lport_arg, m, rtable,
> + 1);
> +}
> +
> /*
> * The in(6)_pcblookup_listen functions are used to locate listening
> * sockets quickly. This are sockets with unspecified foreign address
> @@ -1110,8 +1129,8 @@ in_pcbhashlookup(struct inpcbtable *tabl
> * *.* <-> *.lport
> */
> struct inpcb *
> -in_pcblookup_listen(struct inpcbtable *table, struct in_addr laddr,
> - u_int lport_arg, struct mbuf *m, u_int rtable)
> +in_pcblookup_listen_wlocked(struct inpcbtable *table, struct in_addr laddr,
> + u_int lport_arg, struct mbuf *m, u_int rtable, int wlocked)
> {
> struct inpcbhead *head;
> const struct in_addr *key1, *key2;
> @@ -1185,7 +1204,7 @@ in_pcblookup_listen(struct inpcbtable *t
> * repeated accesses are quicker. This is analogous to
> * the historic single-entry PCB cache.
> */
> - if (inp != NULL && inp != LIST_FIRST(head)) {
> + if (wlocked && inp != NULL && inp != LIST_FIRST(head)) {
> LIST_REMOVE(inp, inp_hash);
> LIST_INSERT_HEAD(head, inp, inp_hash);
> }
> Index: netinet/in_pcb.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
> retrieving revision 1.121
> diff -u -p -r1.121 in_pcb.h
> --- netinet/in_pcb.h 25 Jan 2021 03:40:46 -0000 1.121
> +++ netinet/in_pcb.h 8 Dec 2021 21:16:16 -0000
> @@ -274,20 +274,32 @@ void in_pcbunref(struct inpcb *);
> void in_pcbdisconnect(struct inpcb *);
> struct inpcb *
> in_pcbhashlookup(struct inpcbtable *, struct in_addr,
> - u_int, struct in_addr, u_int, u_int);
> + u_int, struct in_addr, u_int, u_int);
> struct inpcb *
> in_pcblookup_listen(struct inpcbtable *, struct in_addr, u_int,
> struct mbuf *, u_int);
> +struct inpcb *
> + in_pcbhashlookup_wlocked(struct inpcbtable *, struct in_addr,
> + u_int, struct in_addr, u_int, u_int, int);
> +struct inpcb *
> + in_pcblookup_listen_wlocked(struct inpcbtable *, struct in_addr, u_int,
> + struct mbuf *, u_int, int);
> #ifdef INET6
> struct inpcbhead *
> in6_pcbhash(struct inpcbtable *, int, const struct in6_addr *,
> u_short, const struct in6_addr *, u_short);
> struct inpcb *
> in6_pcbhashlookup(struct inpcbtable *, const struct in6_addr *,
> - u_int, const struct in6_addr *, u_int, u_int);
> + u_int, const struct in6_addr *, u_int, u_int);
> struct inpcb *
> in6_pcblookup_listen(struct inpcbtable *, struct in6_addr *, u_int,
> struct mbuf *, u_int);
> +struct inpcb *
> + in6_pcbhashlookup_wlocked(struct inpcbtable *, const struct in6_addr *,
> + u_int, const struct in6_addr *, u_int, u_int, int);
> +struct inpcb *
> + in6_pcblookup_listen_wlocked(struct inpcbtable *, struct in6_addr *,
> + u_int, struct mbuf *, u_int, int);
> int in6_pcbaddrisavail(struct inpcb *, struct sockaddr_in6 *, int,
> struct proc *);
> int in6_pcbconnect(struct inpcb *, struct mbuf *);
> Index: netinet6/in6_pcb.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_pcb.c,v
> retrieving revision 1.112
> diff -u -p -r1.112 in6_pcb.c
> --- netinet6/in6_pcb.c 11 Feb 2021 10:41:19 -0000 1.112
> +++ netinet6/in6_pcb.c 8 Dec 2021 21:16:16 -0000
> @@ -500,6 +500,16 @@ in6_pcbhashlookup(struct inpcbtable *tab
> u_int fport_arg, const struct in6_addr *laddr, u_int lport_arg,
> u_int rtable)
> {
> + NET_ASSERT_WLOCKED();
> + return in6_pcbhashlookup_wlocked(table, faddr, fport_arg,
> + laddr, lport_arg, rtable, 1);
> +}
> +
> +struct inpcb *
> +in6_pcbhashlookup_wlocked(struct inpcbtable *table,
> + const struct in6_addr *faddr, u_int fport_arg,
> + const struct in6_addr *laddr, u_int lport_arg, u_int rtable, int wlocked)
> +{
> struct inpcbhead *head;
> struct inpcb *inp;
> u_int16_t fport = fport_arg, lport = lport_arg;
> @@ -539,6 +549,14 @@ struct inpcb *
> in6_pcblookup_listen(struct inpcbtable *table, struct in6_addr *laddr,
> u_int lport_arg, struct mbuf *m, u_int rtable)
> {
> + return in6_pcblookup_listen_wlocked(table, laddr, lport_arg, m, rtable,
> + 1);
> +}
> +
> +struct inpcb *
> +in6_pcblookup_listen_wlocked(struct inpcbtable *table, struct in6_addr
> *laddr,
> + u_int lport_arg, struct mbuf *m, u_int rtable, int wlocked)
> +{
> struct inpcbhead *head;
> const struct in6_addr *key1, *key2;
> struct inpcb *inp;
> @@ -604,7 +622,7 @@ in6_pcblookup_listen(struct inpcbtable *
> * repeated accesses are quicker. This is analogous to
> * the historic single-entry PCB cache.
> */
> - if (inp != NULL && inp != LIST_FIRST(head)) {
> + if (wlocked && inp != NULL && inp != LIST_FIRST(head)) {
> LIST_REMOVE(inp, inp_hash);
> LIST_INSERT_HEAD(head, inp, inp_hash);
> }
>