On Wed, Dec 08, 2021 at 03:28:34PM -0300, Martin Pieuchot wrote:
> On 04/12/21(Sat) 01:02, Alexander Bluhm wrote:
> > Hi,
> >
> > As I want a read-only net lock for forwarding, all paths should be
> > checked for the correct net lock and asserts. I found code in
> > in_pcbhashlookup() where reading the PCB table results in a write
> > to optimize the cache.
> >
> > Porperly protecting PCB hashes is out of scope for parallel forwarding.
> > Can we get away with this hack? Only update the cache when we are
> > in TCP oder UDP stack with the write lock. The access from pf is
> > read-only.
> >
> > NET_WLOCKED() indicates whether we may change data structures.
>
> I recall that we currently do not want to introduce such idiom: change
> the behavior of the code depending on which lock is held by the caller.
>
> Can we instead assert that a write-lock is held before modifying the
> list?
We could also pass down the kind of lock that is used. Goal is
that pf uses shared net lock. TCP and UDP will keep the exclusice
net lock for a while.
Diff gets longer but perhaps a bit clearer what is going on.
bluhm
Index: net/pf.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
retrieving revision 1.1122
diff -u -p -r1.1122 pf.c
--- net/pf.c 7 Jul 2021 18:38:25 -0000 1.1122
+++ net/pf.c 8 Dec 2021 21:16:16 -0000
@@ -3317,14 +3317,12 @@ pf_socket_lookup(struct pf_pdesc *pd)
sport = pd->hdr.tcp.th_sport;
dport = pd->hdr.tcp.th_dport;
PF_ASSERT_LOCKED();
- NET_ASSERT_LOCKED();
tb = &tcbtable;
break;
case IPPROTO_UDP:
sport = pd->hdr.udp.uh_sport;
dport = pd->hdr.udp.uh_dport;
PF_ASSERT_LOCKED();
- NET_ASSERT_LOCKED();
tb = &udbtable;
break;
default:
@@ -3348,22 +3346,24 @@ pf_socket_lookup(struct pf_pdesc *pd)
* Fails when rtable is changed while evaluating the ruleset
* The socket looked up will not match the one hit in the end.
*/
- inp = in_pcbhashlookup(tb, saddr->v4, sport, daddr->v4, dport,
- pd->rdomain);
+ NET_ASSERT_LOCKED();
+ inp = in_pcbhashlookup_wlocked(tb, saddr->v4, sport, daddr->v4,
+ dport, pd->rdomain, 0);
if (inp == NULL) {
- inp = in_pcblookup_listen(tb, daddr->v4, dport,
- NULL, pd->rdomain);
+ inp = in_pcblookup_listen_wlocked(tb, daddr->v4, dport,
+ NULL, pd->rdomain, 0);
if (inp == NULL)
return (-1);
}
break;
#ifdef INET6
case AF_INET6:
- inp = in6_pcbhashlookup(tb, &saddr->v6, sport, &daddr->v6,
- dport, pd->rdomain);
+ NET_ASSERT_LOCKED();
+ inp = in6_pcbhashlookup_wlocked(tb, &saddr->v6, sport,
+ &daddr->v6, dport, pd->rdomain, 0);
if (inp == NULL) {
- inp = in6_pcblookup_listen(tb, &daddr->v6, dport,
- NULL, pd->rdomain);
+ inp = in6_pcblookup_listen_wlocked(tb, &daddr->v6,
+ dport, NULL, pd->rdomain, 0);
if (inp == NULL)
return (-1);
}
Index: netinet/in_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
retrieving revision 1.256
diff -u -p -r1.256 in_pcb.c
--- netinet/in_pcb.c 25 Oct 2021 22:20:47 -0000 1.256
+++ netinet/in_pcb.c 8 Dec 2021 21:16:16 -0000
@@ -1051,6 +1051,15 @@ in_pcbresize(struct inpcbtable *table, i
int in_pcbnotifymiss = 0;
#endif
+struct inpcb *
+in_pcbhashlookup(struct inpcbtable *table, struct in_addr faddr,
+ u_int fport_arg, struct in_addr laddr, u_int lport_arg, u_int rtable)
+{
+ NET_ASSERT_WLOCKED();
+ return in_pcbhashlookup_wlocked(table, faddr ,fport_arg, laddr,
+ lport_arg, rtable, 1);
+}
+
/*
* The in(6)_pcbhashlookup functions are used to locate connected sockets
* quickly:
@@ -1061,8 +1070,9 @@ int in_pcbnotifymiss = 0;
* After those two lookups no other are necessary.
*/
struct inpcb *
-in_pcbhashlookup(struct inpcbtable *table, struct in_addr faddr,
- u_int fport_arg, struct in_addr laddr, u_int lport_arg, u_int rtable)
+in_pcbhashlookup_wlocked(struct inpcbtable *table, struct in_addr faddr,
+ u_int fport_arg, struct in_addr laddr, u_int lport_arg, u_int rtable,
+ int wlocked)
{
struct inpcbhead *head;
struct inpcb *inp;
@@ -1093,7 +1103,7 @@ in_pcbhashlookup(struct inpcbtable *tabl
}
}
#ifdef DIAGNOSTIC
- if (inp == NULL && in_pcbnotifymiss) {
+ if (wlocked && inp == NULL && in_pcbnotifymiss) {
printf("%s: faddr=%08x fport=%d laddr=%08x lport=%d rdom=%u\n",
__func__, ntohl(faddr.s_addr), ntohs(fport),
ntohl(laddr.s_addr), ntohs(lport), rdomain);
@@ -1102,6 +1112,15 @@ in_pcbhashlookup(struct inpcbtable *tabl
return (inp);
}
+struct inpcb *
+in_pcblookup_listen(struct inpcbtable *table, struct in_addr laddr,
+ u_int lport_arg, struct mbuf *m, u_int rtable)
+{
+ NET_ASSERT_WLOCKED();
+ return in_pcblookup_listen_wlocked(table, laddr, lport_arg, m, rtable,
+ 1);
+}
+
/*
* The in(6)_pcblookup_listen functions are used to locate listening
* sockets quickly. This are sockets with unspecified foreign address
@@ -1110,8 +1129,8 @@ in_pcbhashlookup(struct inpcbtable *tabl
* *.* <-> *.lport
*/
struct inpcb *
-in_pcblookup_listen(struct inpcbtable *table, struct in_addr laddr,
- u_int lport_arg, struct mbuf *m, u_int rtable)
+in_pcblookup_listen_wlocked(struct inpcbtable *table, struct in_addr laddr,
+ u_int lport_arg, struct mbuf *m, u_int rtable, int wlocked)
{
struct inpcbhead *head;
const struct in_addr *key1, *key2;
@@ -1185,7 +1204,7 @@ in_pcblookup_listen(struct inpcbtable *t
* repeated accesses are quicker. This is analogous to
* the historic single-entry PCB cache.
*/
- if (inp != NULL && inp != LIST_FIRST(head)) {
+ if (wlocked && inp != NULL && inp != LIST_FIRST(head)) {
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
Index: netinet/in_pcb.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
retrieving revision 1.121
diff -u -p -r1.121 in_pcb.h
--- netinet/in_pcb.h 25 Jan 2021 03:40:46 -0000 1.121
+++ netinet/in_pcb.h 8 Dec 2021 21:16:16 -0000
@@ -274,20 +274,32 @@ void in_pcbunref(struct inpcb *);
void in_pcbdisconnect(struct inpcb *);
struct inpcb *
in_pcbhashlookup(struct inpcbtable *, struct in_addr,
- u_int, struct in_addr, u_int, u_int);
+ u_int, struct in_addr, u_int, u_int);
struct inpcb *
in_pcblookup_listen(struct inpcbtable *, struct in_addr, u_int,
struct mbuf *, u_int);
+struct inpcb *
+ in_pcbhashlookup_wlocked(struct inpcbtable *, struct in_addr,
+ u_int, struct in_addr, u_int, u_int, int);
+struct inpcb *
+ in_pcblookup_listen_wlocked(struct inpcbtable *, struct in_addr, u_int,
+ struct mbuf *, u_int, int);
#ifdef INET6
struct inpcbhead *
in6_pcbhash(struct inpcbtable *, int, const struct in6_addr *,
u_short, const struct in6_addr *, u_short);
struct inpcb *
in6_pcbhashlookup(struct inpcbtable *, const struct in6_addr *,
- u_int, const struct in6_addr *, u_int, u_int);
+ u_int, const struct in6_addr *, u_int, u_int);
struct inpcb *
in6_pcblookup_listen(struct inpcbtable *, struct in6_addr *, u_int,
struct mbuf *, u_int);
+struct inpcb *
+ in6_pcbhashlookup_wlocked(struct inpcbtable *, const struct in6_addr *,
+ u_int, const struct in6_addr *, u_int, u_int, int);
+struct inpcb *
+ in6_pcblookup_listen_wlocked(struct inpcbtable *, struct in6_addr *,
+ u_int, struct mbuf *, u_int, int);
int in6_pcbaddrisavail(struct inpcb *, struct sockaddr_in6 *, int,
struct proc *);
int in6_pcbconnect(struct inpcb *, struct mbuf *);
Index: netinet6/in6_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_pcb.c,v
retrieving revision 1.112
diff -u -p -r1.112 in6_pcb.c
--- netinet6/in6_pcb.c 11 Feb 2021 10:41:19 -0000 1.112
+++ netinet6/in6_pcb.c 8 Dec 2021 21:16:16 -0000
@@ -500,6 +500,16 @@ in6_pcbhashlookup(struct inpcbtable *tab
u_int fport_arg, const struct in6_addr *laddr, u_int lport_arg,
u_int rtable)
{
+ NET_ASSERT_WLOCKED();
+ return in6_pcbhashlookup_wlocked(table, faddr, fport_arg,
+ laddr, lport_arg, rtable, 1);
+}
+
+struct inpcb *
+in6_pcbhashlookup_wlocked(struct inpcbtable *table,
+ const struct in6_addr *faddr, u_int fport_arg,
+ const struct in6_addr *laddr, u_int lport_arg, u_int rtable, int wlocked)
+{
struct inpcbhead *head;
struct inpcb *inp;
u_int16_t fport = fport_arg, lport = lport_arg;
@@ -539,6 +549,14 @@ struct inpcb *
in6_pcblookup_listen(struct inpcbtable *table, struct in6_addr *laddr,
u_int lport_arg, struct mbuf *m, u_int rtable)
{
+ return in6_pcblookup_listen_wlocked(table, laddr, lport_arg, m, rtable,
+ 1);
+}
+
+struct inpcb *
+in6_pcblookup_listen_wlocked(struct inpcbtable *table, struct in6_addr *laddr,
+ u_int lport_arg, struct mbuf *m, u_int rtable, int wlocked)
+{
struct inpcbhead *head;
const struct in6_addr *key1, *key2;
struct inpcb *inp;
@@ -604,7 +622,7 @@ in6_pcblookup_listen(struct inpcbtable *
* repeated accesses are quicker. This is analogous to
* the historic single-entry PCB cache.
*/
- if (inp != NULL && inp != LIST_FIRST(head)) {
+ if (wlocked && inp != NULL && inp != LIST_FIRST(head)) {
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}