On Mon, Dec 05, 2022 at 11:32:06AM +1000, David Gwynne wrote: > On Mon, Dec 05, 2022 at 12:45:29AM +0100, Alexandr Nedvedicky wrote: > > Hello, > > > > > > On Sat, Dec 03, 2022 at 09:53:45AM +1000, David Gwynne wrote: > > > we (mostly sashan@ and me) have a problem where pfsync can be holding a > > > reference to a pf_state that pf has decided to purge, and then pfsync > > > crashes because it tries to read the pf_state_key parts of the state, > > > but they been removed by the purge process. > > > > > > the background to this is that pf_state structures don't contain ip > > > addresses, protocols, ports, etc. that information is stored in a > > > pf_state_key struct, which is used to wire a state into the state table. > > > when pfsync wants to export information about a state, particularly the > > > addresses on it, it needs the pf_state_key struct to read from. > > > > > > the pf purge code current assumes that when a state is removed from the > > > state table, the pf_state_key structs that are used to wire it in can > > > also be removed and freed. this has obvious bad consequences for pfsync, > > > which could still be holding a reference to the pf_state struct with the > > > intention of reading from it. > > > > > > this diff tweaks the lifetime of pf_state_structs so they at least match > > > the lifetime of pf_states. > > > > > > just so i'm clear, the main idea is that once pf_state_insert succeeds, > > > a pf_state struct will always have pf_state_key structs hanging off it. > > > it's only after the pf_state struct itself is being torn down that its > > > references to pf_state_keys are released. if you're holding a > > > pf_state reference, you can use that to access the pf_state_key > > > structs hanging off it. > > > > > > this is largely accomplished by taking more advantage of the pf_state > > > and pf_state_key refcnts. pf_states get properly accounted references to > > > the pf_state_keys, and the state table gets its own reference counts to > > > the pf_state_keys too. when a state is removed from the state table, the > > > state table reference counts drop. however, it's not until all pf_state > > > references drop that pf_state will give up its references to the > > > pf_states. > > > > > > this is a very new diff, but i've been kicking it around with pfsync a > > > bit and the pf_state_export crashes i used to get are gone now. > > > > > > if you're going to test the things to look for are if NAT still works, > > > and if the pfstate and pfstkey pool numbers still look right. > > > > all above makes sense to me. I'm fine with the approach. > > I think I could just one single glitch here in pf_state_key_attach(): > > > > > > > > Index: pf.c > > > =================================================================== > > > RCS file: /cvs/src/sys/net/pf.c,v > > > retrieving revision 1.1156 > > > diff -u -p -r1.1156 pf.c > > > --- pf.c 25 Nov 2022 20:27:53 -0000 1.1156 > > > +++ pf.c 2 Dec 2022 23:20:36 -0000 > > </snip> > > > @@ -766,44 +777,52 @@ pf_state_key_attach(struct pf_state_key > > > /* remove late or sks can go away */ > > > olds = si->s; > > > } else { > > > - pool_put(&pf_state_key_pl, sk); > > > - return (-1); /* collision! */ > > > + pf_state_key_unref(sk); > > > + return (NULL); /* collision! */ > > > } > > > } > > > - pool_put(&pf_state_key_pl, sk); > > > - s->key[idx] = cur; > > > - } else > > > - s->key[idx] = sk; > > > + } > > > + > > > + /* reuse the existing state key */ > > > + pf_state_key_unref(sk); > > > + sk = cur; > > > + } > > > > > > if ((si = pool_get(&pf_state_item_pl, PR_NOWAIT)) == NULL) { > > > - pf_state_key_detach(s, idx); > > > - return (-1); > > > + if (TAILQ_EMPTY(&sk->states)) { > > > + RB_REMOVE(pf_state_tree, &pf_statetbl, sk); > > > + sk->removed = 1; > > > + } > > > + > > > + pf_state_key_unref(sk); > > > > I think pf_state_key_unref() above needs to happen if and only if > > sk != cur. > > ah, yes. lemme think about this a sec.
yes, you're right. the diff below includes the simple fix to that. this demonstrates how subtle the reference counting around the trees is though. maybe it would be more obvious to take another refcnt before giving the pf_state_key to the tree(s), and then give the pf_state_key_attach callers reference to the pf_state struct. at the moment we give the callers reference to the tree while holding the PF_LOCK, and borrow it to give another one to the state while the lock is still held. Index: pf.c =================================================================== RCS file: /cvs/src/sys/net/pf.c,v retrieving revision 1.1156 diff -u -p -r1.1156 pf.c --- pf.c 25 Nov 2022 20:27:53 -0000 1.1156 +++ pf.c 5 Dec 2022 01:41:15 -0000 @@ -185,6 +185,8 @@ int pf_translate_icmp_af(struct pf_pd void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t, int, sa_family_t, struct pf_rule *, u_int); void pf_detach_state(struct pf_state *); +struct pf_state_key *pf_state_key_attach(struct pf_state_key *, + struct pf_state *, int); void pf_state_key_detach(struct pf_state *, int); u_int32_t pf_tcp_iss(struct pf_pdesc *); void pf_rule_to_actions(struct pf_rule *, @@ -379,7 +381,7 @@ pf_set_protostate(struct pf_state *s, in if (s->src.state == newstate) return; - if (s->creatorid == pf_status.hostid && s->key[PF_SK_STACK] != NULL && + if (s->creatorid == pf_status.hostid && s->key[PF_SK_STACK]->proto == IPPROTO_TCP && !(TCPS_HAVEESTABLISHED(s->src.state) || s->src.state == TCPS_CLOSED) && @@ -720,17 +722,26 @@ pf_state_compare_id(struct pf_state *a, return (0); } -int +/* + * on failure, pf_state_key_attach() releases the pf_state_key + * reference and returns NULL. + */ +struct pf_state_key * pf_state_key_attach(struct pf_state_key *sk, struct pf_state *s, int idx) { struct pf_state_item *si; struct pf_state_key *cur; struct pf_state *olds = NULL; + PF_ASSERT_LOCKED(); + KASSERT(s->key[idx] == NULL); - if ((cur = RB_INSERT(pf_state_tree, &pf_statetbl, sk)) != NULL) { + sk->removed = 0; + cur = RB_INSERT(pf_state_tree, &pf_statetbl, sk); + if (cur != NULL) { + sk->removed = 1; /* key exists. check for same kif, if none, add to key */ - TAILQ_FOREACH(si, &cur->states, entry) + TAILQ_FOREACH(si, &cur->states, entry) { if (si->s->kif == s->kif && ((si->s->key[PF_SK_WIRE]->af == sk->af && si->s->direction == s->direction) || @@ -766,44 +777,53 @@ pf_state_key_attach(struct pf_state_key /* remove late or sks can go away */ olds = si->s; } else { - pool_put(&pf_state_key_pl, sk); - return (-1); /* collision! */ + pf_state_key_unref(sk); + return (NULL); /* collision! */ } } - pool_put(&pf_state_key_pl, sk); - s->key[idx] = cur; - } else - s->key[idx] = sk; + } + + /* reuse the existing state key */ + pf_state_key_unref(sk); + sk = cur; + } if ((si = pool_get(&pf_state_item_pl, PR_NOWAIT)) == NULL) { - pf_state_key_detach(s, idx); - return (-1); + if (TAILQ_EMPTY(&sk->states)) { + KASSERT(cur == NULL); + RB_REMOVE(pf_state_tree, &pf_statetbl, sk); + sk->removed = 1; + pf_state_key_unref(sk); + } + + return (NULL); } - si->s = s; + + s->key[idx] = pf_state_key_ref(sk); /* give a ref to state */ + si->s = pf_state_ref(s); /* list is sorted, if-bound states before floating */ if (s->kif == pfi_all) - TAILQ_INSERT_TAIL(&s->key[idx]->states, si, entry); + TAILQ_INSERT_TAIL(&sk->states, si, entry); else - TAILQ_INSERT_HEAD(&s->key[idx]->states, si, entry); + TAILQ_INSERT_HEAD(&sk->states, si, entry); if (olds) pf_remove_state(olds); - return (0); + /* caller owns the pf_state ref, which owns a pf_state_key ref now */ + return (sk); } void pf_detach_state(struct pf_state *s) { - if (s->key[PF_SK_WIRE] == s->key[PF_SK_STACK]) - s->key[PF_SK_WIRE] = NULL; + KASSERT(s->key[PF_SK_WIRE] != NULL); + pf_state_key_detach(s, PF_SK_WIRE); - if (s->key[PF_SK_STACK] != NULL) + KASSERT(s->key[PF_SK_STACK] != NULL); + if (s->key[PF_SK_STACK] != s->key[PF_SK_WIRE]) pf_state_key_detach(s, PF_SK_STACK); - - if (s->key[PF_SK_WIRE] != NULL) - pf_state_key_detach(s, PF_SK_WIRE); } void @@ -812,20 +832,23 @@ pf_state_key_detach(struct pf_state *s, struct pf_state_item *si; struct pf_state_key *sk; - if (s->key[idx] == NULL) + PF_ASSERT_LOCKED(); + + sk = s->key[idx]; + if (sk == NULL) return; - si = TAILQ_FIRST(&s->key[idx]->states); - while (si && si->s != s) - si = TAILQ_NEXT(si, entry); - - if (si) { - TAILQ_REMOVE(&s->key[idx]->states, si, entry); - pool_put(&pf_state_item_pl, si); + TAILQ_FOREACH(si, &sk->states, entry) { + if (si->s == s) + break; } + if (si == NULL) + return; - sk = s->key[idx]; - s->key[idx] = NULL; + TAILQ_REMOVE(&sk->states, si, entry); + pool_put(&pf_state_item_pl, si); + pf_state_unref(s); + if (TAILQ_EMPTY(&sk->states)) { RB_REMOVE(pf_state_tree, &pf_statetbl, sk); sk->removed = 1; @@ -842,7 +865,10 @@ pf_alloc_state_key(int pool_flags) if ((sk = pool_get(&pf_state_key_pl, pool_flags)) == NULL) return (NULL); + + PF_REF_INIT(sk->refcnt); TAILQ_INIT(&sk->states); + sk->removed = 1; return (sk); } @@ -916,8 +942,6 @@ pf_state_key_setup(struct pf_pdesc *pd, sk1->proto = pd->proto; sk1->af = pd->af; sk1->rdomain = pd->rdomain; - PF_REF_INIT(sk1->refcnt); - sk1->removed = 0; if (rtableid >= 0) wrdom = rtable_l2(rtableid); @@ -926,7 +950,7 @@ pf_state_key_setup(struct pf_pdesc *pd, pd->nsport != pd->osport || pd->ndport != pd->odport || wrdom != pd->rdomain || afto) { /* NAT/NAT64 */ if ((sk2 = pf_alloc_state_key(PR_NOWAIT | PR_ZERO)) == NULL) { - pool_put(&pf_state_key_pl, sk1); + pf_state_key_unref(sk1); return (ENOMEM); } pf_state_key_addr_setup(pd, sk2, afto ? pd->didx : pd->sidx, @@ -949,10 +973,8 @@ pf_state_key_setup(struct pf_pdesc *pd, sk2->proto = pd->proto; sk2->af = pd->naf; sk2->rdomain = wrdom; - PF_REF_INIT(sk2->refcnt); - sk2->removed = 0; } else - sk2 = sk1; + sk2 = pf_state_key_ref(sk1); if (pd->dir == PF_IN) { *skw = sk1; @@ -971,34 +993,47 @@ pf_state_key_setup(struct pf_pdesc *pd, return (0); } +/* + * pf_state_insert() does the following: + * - links the pf_state up with pf_state_key(s). + * - inserts the pf_state_keys into pf_state_tree. + * - inserts the pf_state into the into pf_state_tree_id. + * - tells pfsync about the state. + * + * pf_state_insert() owns the references to the pf_state_key structs + * it is given. on failure to insert, these references are released. + * on success, the caller owns a pf_state reference that allows it + * to access the state keys. + */ + int -pf_state_insert(struct pfi_kif *kif, struct pf_state_key **skw, - struct pf_state_key **sks, struct pf_state *s) +pf_state_insert(struct pfi_kif *kif, struct pf_state_key **skwp, + struct pf_state_key **sksp, struct pf_state *s) { + struct pf_state_key *skw = *skwp; + struct pf_state_key *sks = *sksp; + int same = (skw == sks); + PF_ASSERT_LOCKED(); s->kif = kif; PF_STATE_ENTER_WRITE(); - if (*skw == *sks) { - if (pf_state_key_attach(*skw, s, PF_SK_WIRE)) { - PF_STATE_EXIT_WRITE(); - return (-1); - } - *skw = *sks = s->key[PF_SK_WIRE]; - s->key[PF_SK_STACK] = s->key[PF_SK_WIRE]; - } else { - if (pf_state_key_attach(*skw, s, PF_SK_WIRE)) { - pool_put(&pf_state_key_pl, *sks); - PF_STATE_EXIT_WRITE(); - return (-1); - } - *skw = s->key[PF_SK_WIRE]; - if (pf_state_key_attach(*sks, s, PF_SK_STACK)) { - pf_state_key_detach(s, PF_SK_WIRE); - PF_STATE_EXIT_WRITE(); - return (-1); - } - *sks = s->key[PF_SK_STACK]; + + skw = pf_state_key_attach(skw, s, PF_SK_WIRE); + if (skw == NULL) { + pf_state_key_unref(sks); + PF_STATE_EXIT_WRITE(); + return (-1); + } + + if (same) { + /* pf_state_key_attach might have swapped skw */ + pf_state_key_unref(sks); + s->key[PF_SK_STACK] = sks = pf_state_key_ref(skw); + } else if (pf_state_key_attach(sks, s, PF_SK_STACK) == NULL) { + pf_state_key_detach(s, PF_SK_WIRE); + PF_STATE_EXIT_WRITE(); + return (-1); } if (s->id == 0 && s->creatorid == 0) { @@ -1024,6 +1059,10 @@ pf_state_insert(struct pfi_kif *kif, str pfsync_insert_state(s); #endif /* NPFSYNC > 0 */ PF_STATE_EXIT_WRITE(); + + *skwp = skw; + *sksp = sks; + return (0); } @@ -1374,7 +1413,7 @@ pf_state_import(const struct pfsync_stat if ((sks = pf_alloc_state_key(pool_flags)) == NULL) goto cleanup; } else - sks = skw; + sks = pf_state_key_ref(skw); /* allocate memory for scrub info */ if (pf_state_alloc_scrub_memory(&sp->src, &st->src) || @@ -1387,7 +1426,6 @@ pf_state_import(const struct pfsync_stat skw->port[0] = sp->key[PF_SK_WIRE].port[0]; skw->port[1] = sp->key[PF_SK_WIRE].port[1]; skw->rdomain = ntohs(sp->key[PF_SK_WIRE].rdomain); - PF_REF_INIT(skw->refcnt); skw->proto = sp->proto; if (!(skw->af = sp->key[PF_SK_WIRE].af)) skw->af = sp->af; @@ -1397,7 +1435,6 @@ pf_state_import(const struct pfsync_stat sks->port[0] = sp->key[PF_SK_STACK].port[0]; sks->port[1] = sp->key[PF_SK_STACK].port[1]; sks->rdomain = ntohs(sp->key[PF_SK_STACK].rdomain); - PF_REF_INIT(sks->refcnt); if (!(sks->af = sp->key[PF_SK_STACK].af)) sks->af = sp->af; if (sks->af != skw->af) { @@ -1499,12 +1536,10 @@ pf_state_import(const struct pfsync_stat return (0); cleanup: - if (skw == sks) - sks = NULL; if (skw != NULL) - pool_put(&pf_state_key_pl, skw); + pf_state_key_unref(skw); if (sks != NULL) - pool_put(&pf_state_key_pl, sks); + pf_state_key_unref(sks); cleanup_state: /* pf_state_insert frees the state keys */ if (st) { @@ -1587,10 +1622,9 @@ pf_state_expires(const struct pf_state * */ /* handle all PFTM_* > PFTM_MAX here */ - if (stimeout == PFTM_PURGE) + if (stimeout > PFTM_MAX) return (0); - KASSERT(stimeout != PFTM_UNLINKED); KASSERT(stimeout < PFTM_MAX); timeout = state->rule.ptr->timeout[stimeout]; @@ -4460,7 +4494,6 @@ pf_create_state(struct pf_pdesc *pd, str } if (pf_state_insert(BOUND_IFACE(r, pd->kif), skw, sks, s)) { - pf_detach_state(s); *sks = *skw = NULL; REASON_SET(&reason, PFRES_STATEINS); goto csfailed; @@ -7993,8 +8026,9 @@ pf_state_unref(struct pf_state *s) #endif /* NPFSYNC */ KASSERT((TAILQ_NEXT(s, entry_list) == NULL) || (TAILQ_NEXT(s, entry_list) == _Q_INVALID)); - KASSERT((s->key[PF_SK_WIRE] == NULL) && - (s->key[PF_SK_STACK] == NULL)); + + pf_state_key_unref(s->key[PF_SK_WIRE]); + pf_state_key_unref(s->key[PF_SK_STACK]); pool_put(&pf_state_pl, s); } Index: pfvar.h =================================================================== RCS file: /cvs/src/sys/net/pfvar.h,v retrieving revision 1.521 diff -u -p -r1.521 pfvar.h --- pfvar.h 25 Nov 2022 20:27:53 -0000 1.521 +++ pfvar.h 5 Dec 2022 01:41:15 -0000 @@ -1731,7 +1731,6 @@ void pf_pkt_addr_changed(struct mbuf *); struct inpcb *pf_inp_lookup(struct mbuf *); void pf_inp_link(struct mbuf *, struct inpcb *); void pf_inp_unlink(struct inpcb *); -int pf_state_key_attach(struct pf_state_key *, struct pf_state *, int); int pf_translate(struct pf_pdesc *, struct pf_addr *, u_int16_t, struct pf_addr *, u_int16_t, u_int16_t, int); int pf_translate_af(struct pf_pdesc *); Index: pfvar_priv.h =================================================================== RCS file: /cvs/src/sys/net/pfvar_priv.h,v retrieving revision 1.23 diff -u -p -r1.23 pfvar_priv.h --- pfvar_priv.h 25 Nov 2022 20:27:53 -0000 1.23 +++ pfvar_priv.h 5 Dec 2022 01:41:15 -0000 @@ -42,7 +42,7 @@ /* * Protection/ownership of pf_state members: - * I immutable after creation + * I immutable after pf_state_insert() * M pf_state mtx * P PF_STATE_LOCK * S pfsync mutex @@ -69,7 +69,7 @@ struct pf_state { union pf_rule_ptr natrule; /* [I] */ struct pf_addr rt_addr; /* [I] */ struct pf_sn_head src_nodes; /* [I] */ - struct pf_state_key *key[2]; /* stack and wire */ + struct pf_state_key *key[2]; /* [I] stack and wire */ struct pfi_kif *kif; /* [I] */ struct mutex mtx; pf_refcnt_t refcnt;