Hello,
below is a diff which hopes to fix the issue. Although diff is fairly
large the change itself is kind of straightforward. Let me briefly
explain what's going on here. Diff introduces a mutex to pf_state,
which protects array of keys (pf_state::key) bound to state.
The panic which diff below hopes to fix is caused by a race between timer
thread, which expires state and pfsync dispatch task, which updates a peer.
According to data provided by Hrvoje we panic due to NULL pointer dereference
in pf_state_export(), which finds sk->key[] to be NULL. This may happen because
purge state mechanism detaches state key from state under protection
of PF_STATE_LOCK, while pfsync dispatch task just keeps a reference to state
without using a PF_STATE_LOCK to access a state instance.
In order to synchronize access to pf_statey::key between purge thread
and pfsync dispatch task diff below introduces pf_state::mtx.
pfsync uses pf_state::mtx to attempt to grab references to keys bound
to state, while purge task uses mtx to safely invalidate state keys
in pf_detach_state().
Such change requires pfsync(4) to deal with situation when state
got detached while waiting in dispatch queue to update a peer.
We have to a .write() operation on sync-queue to indicate a failure
so pfsync_sendout() will just skip the state when processing dispatch
queue.
Also diff changes pf_state_key_detach() such caller must pass pointer to state
key instead of key index to be detached from state. It also requires caller to
invalidate a state key entry in pf_state::key member.
I've just smoked tested the diff _without_ pfsync.
thanks and
regards
sashan
--------8<---------------8<---------------8<------------------8<--------
diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
index c78ca62766e..02d45673e20 100644
--- a/sys/net/if_pfsync.c
+++ b/sys/net/if_pfsync.c
@@ -157,16 +157,16 @@ struct {
};
struct pfsync_q {
- void (*write)(struct pf_state *, void *);
+ int (*write)(struct pf_state *, void *);
size_t len;
u_int8_t action;
};
/* we have one of these for every PFSYNC_S_ */
-void pfsync_out_state(struct pf_state *, void *);
-void pfsync_out_iack(struct pf_state *, void *);
-void pfsync_out_upd_c(struct pf_state *, void *);
-void pfsync_out_del(struct pf_state *, void *);
+int pfsync_out_state(struct pf_state *, void *);
+int pfsync_out_iack(struct pf_state *, void *);
+int pfsync_out_upd_c(struct pf_state *, void *);
+int pfsync_out_del(struct pf_state *, void *);
struct pfsync_q pfsync_qs[] = {
{ pfsync_out_iack, sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
@@ -516,10 +516,10 @@ pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
return (0);
}
-void
+int
pfsync_state_export(struct pfsync_state *sp, struct pf_state *st)
{
- pf_state_export(sp, st);
+ return (pf_state_export(sp, st));
}
int
@@ -1529,24 +1529,25 @@ pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
return (0);
}
-void
+int
pfsync_out_state(struct pf_state *st, void *buf)
{
struct pfsync_state *sp = buf;
- pfsync_state_export(sp, st);
+ return (pfsync_state_export(sp, st));
}
-void
+int
pfsync_out_iack(struct pf_state *st, void *buf)
{
struct pfsync_ins_ack *iack = buf;
iack->id = st->id;
iack->creatorid = st->creatorid;
+ return (0);
}
-void
+int
pfsync_out_upd_c(struct pf_state *st, void *buf)
{
struct pfsync_upd_c *up = buf;
@@ -1557,9 +1558,10 @@ pfsync_out_upd_c(struct pf_state *st, void *buf)
pf_state_peer_hton(&st->dst, &up->dst);
up->creatorid = st->creatorid;
up->timeout = st->timeout;
+ return (0);
}
-void
+int
pfsync_out_del(struct pf_state *st, void *buf)
{
struct pfsync_del_c *dp = buf;
@@ -1568,6 +1570,7 @@ pfsync_out_del(struct pf_state *st, void *buf)
dp->creatorid = st->creatorid;
SET(st->state_flags, PFSTATE_NOSYNC);
+ return (0);
}
void
@@ -1881,8 +1884,8 @@ pfsync_sendout(void)
KASSERT(st->snapped == 1);
st->sync_state = PFSYNC_S_NONE;
st->snapped = 0;
- pfsync_qs[q].write(st, m->m_data + offset);
- offset += pfsync_qs[q].len;
+ if (pfsync_qs[q].write(st, m->m_data + offset) == 0)
+ offset += pfsync_qs[q].len;
pf_state_unref(st);
count++;
diff --git a/sys/net/if_pfsync.h b/sys/net/if_pfsync.h
index bee6c77f228..4440a561854 100644
--- a/sys/net/if_pfsync.h
+++ b/sys/net/if_pfsync.h
@@ -326,7 +326,7 @@ int pfsync_sysctl(int *, u_int, void *,
size_t *,
#define PFSYNC_SI_CKSUM 0x02
#define PFSYNC_SI_ACK 0x04
int pfsync_state_import(struct pfsync_state *, int);
-void pfsync_state_export(struct pfsync_state *,
+int pfsync_state_export(struct pfsync_state *,
struct pf_state *);
void pfsync_insert_state(struct pf_state *);
diff --git a/sys/net/pf.c b/sys/net/pf.c
index 93fe5702625..c76f4c71263 100644
--- a/sys/net/pf.c
+++ b/sys/net/pf.c
@@ -185,7 +185,8 @@ int pf_translate_icmp_af(struct pf_pdesc*,
int, void *);
void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t, int,
sa_family_t, struct pf_rule *, u_int);
void pf_detach_state(struct pf_state *);
-void pf_state_key_detach(struct pf_state *, int);
+void pf_state_key_detach(struct pf_state *,
+ struct pf_state_key *);
u_int32_t pf_tcp_iss(struct pf_pdesc *);
void pf_rule_to_actions(struct pf_rule *,
struct pf_rule_actions *);
@@ -260,6 +261,9 @@ void
pf_state_key_unlink_inpcb(struct pf_state_key *);
void pf_inpcb_unlink_state_key(struct inpcb *);
void pf_pktenqueue_delayed(void *);
int32_t pf_state_expires(const struct pf_state *,
uint8_t);
+void pf_state_keys_take(struct pf_state *,
+ struct pf_state_key **);
+void pf_state_keys_rele(struct pf_state_key **);
#if NPFLOG > 0
void pf_log_matches(struct pf_pdesc *, struct pf_rule *,
@@ -778,7 +782,8 @@ pf_state_key_attach(struct pf_state_key *sk, struct
pf_state *s, int idx)
s->key[idx] = sk;
if ((si = pool_get(&pf_state_item_pl, PR_NOWAIT)) == NULL) {
- pf_state_key_detach(s, idx);
+ pf_state_key_detach(s, s->key[idx]);
+ s->key[idx] = NULL;
return (-1);
}
si->s = s;
@@ -798,42 +803,50 @@ pf_state_key_attach(struct pf_state_key *sk, struct
pf_state *s, int idx)
void
pf_detach_state(struct pf_state *s)
{
- if (s->key[PF_SK_WIRE] == s->key[PF_SK_STACK])
- s->key[PF_SK_WIRE] = NULL;
+ struct pf_state_key *key[2];
+
+ mtx_enter(&s->mtx);
+ key[PF_SK_WIRE] = s->key[PF_SK_WIRE];
+ key[PF_SK_STACK] = s->key[PF_SK_STACK];
+ s->key[PF_SK_WIRE] = NULL;
+ s->key[PF_SK_STACK] = NULL;
+ mtx_leave(&s->mtx);
+
+ if (key[PF_SK_WIRE] == key[PF_SK_STACK])
+ key[PF_SK_WIRE] = NULL;
- if (s->key[PF_SK_STACK] != NULL)
- pf_state_key_detach(s, PF_SK_STACK);
+ if (key[PF_SK_STACK] != NULL)
+ pf_state_key_detach(s, key[PF_SK_STACK]);
- if (s->key[PF_SK_WIRE] != NULL)
- pf_state_key_detach(s, PF_SK_WIRE);
+ if (key[PF_SK_WIRE] != NULL)
+ pf_state_key_detach(s, key[PF_SK_WIRE]);
}
void
-pf_state_key_detach(struct pf_state *s, int idx)
+pf_state_key_detach(struct pf_state *s, struct pf_state_key *key)
{
struct pf_state_item *si;
- struct pf_state_key *sk;
- if (s->key[idx] == NULL)
+ PF_STATE_ASSERT_LOCKED();
+
+ if (key == NULL)
return;
- si = TAILQ_FIRST(&s->key[idx]->states);
+ si = TAILQ_FIRST(&key->states);
while (si && si->s != s)
si = TAILQ_NEXT(si, entry);
if (si) {
- TAILQ_REMOVE(&s->key[idx]->states, si, entry);
+ TAILQ_REMOVE(&key->states, si, entry);
pool_put(&pf_state_item_pl, si);
}
- sk = s->key[idx];
- s->key[idx] = NULL;
- if (TAILQ_EMPTY(&sk->states)) {
- RB_REMOVE(pf_state_tree, &pf_statetbl, sk);
- sk->removed = 1;
- pf_state_key_unlink_reverse(sk);
- pf_state_key_unlink_inpcb(sk);
- pf_state_key_unref(sk);
+ if (TAILQ_EMPTY(&key->states)) {
+ RB_REMOVE(pf_state_tree, &pf_statetbl, key);
+ key->removed = 1;
+ pf_state_key_unlink_reverse(key);
+ pf_state_key_unlink_inpcb(key);
+ pf_state_key_unref(key);
}
}
@@ -996,7 +1009,9 @@ pf_state_insert(struct pfi_kif *kif, struct pf_state_key
**skw,
}
*skw = s->key[PF_SK_WIRE];
if (pf_state_key_attach(*sks, s, PF_SK_STACK)) {
- pf_state_key_detach(s, PF_SK_WIRE);
+ pf_state_key_detach(s, s->key[PF_SK_WIRE]);
+ s->key[PF_SK_WIRE] = NULL;
+ *skw = NULL;
PF_STATE_EXIT_WRITE();
return (-1);
}
@@ -1185,30 +1200,35 @@ pf_find_state_all(struct pf_state_key_cmp *key, u_int
dir, int *more)
return (ret ? ret->s : NULL);
}
-void
+int
pf_state_export(struct pfsync_state *sp, struct pf_state *st)
{
int32_t expire;
+ struct pf_state_key *key[2] = { NULL, NULL };
memset(sp, 0, sizeof(struct pfsync_state));
/* copy from state key */
- sp->key[PF_SK_WIRE].addr[0] = st->key[PF_SK_WIRE]->addr[0];
- sp->key[PF_SK_WIRE].addr[1] = st->key[PF_SK_WIRE]->addr[1];
- sp->key[PF_SK_WIRE].port[0] = st->key[PF_SK_WIRE]->port[0];
- sp->key[PF_SK_WIRE].port[1] = st->key[PF_SK_WIRE]->port[1];
- sp->key[PF_SK_WIRE].rdomain = htons(st->key[PF_SK_WIRE]->rdomain);
- sp->key[PF_SK_WIRE].af = st->key[PF_SK_WIRE]->af;
- sp->key[PF_SK_STACK].addr[0] = st->key[PF_SK_STACK]->addr[0];
- sp->key[PF_SK_STACK].addr[1] = st->key[PF_SK_STACK]->addr[1];
- sp->key[PF_SK_STACK].port[0] = st->key[PF_SK_STACK]->port[0];
- sp->key[PF_SK_STACK].port[1] = st->key[PF_SK_STACK]->port[1];
- sp->key[PF_SK_STACK].rdomain = htons(st->key[PF_SK_STACK]->rdomain);
- sp->key[PF_SK_STACK].af = st->key[PF_SK_STACK]->af;
+ pf_state_keys_take(st, key);
+ if ((key[PF_SK_WIRE] == NULL) || (key[PF_SK_STACK] == NULL))
+ return (-1);
+
+ sp->key[PF_SK_WIRE].addr[0] = key[PF_SK_WIRE]->addr[0];
+ sp->key[PF_SK_WIRE].addr[1] = key[PF_SK_WIRE]->addr[1];
+ sp->key[PF_SK_WIRE].port[0] = key[PF_SK_WIRE]->port[0];
+ sp->key[PF_SK_WIRE].port[1] = key[PF_SK_WIRE]->port[1];
+ sp->key[PF_SK_WIRE].rdomain = htons(key[PF_SK_WIRE]->rdomain);
+ sp->key[PF_SK_WIRE].af = key[PF_SK_WIRE]->af;
+ sp->key[PF_SK_STACK].addr[0] = key[PF_SK_STACK]->addr[0];
+ sp->key[PF_SK_STACK].addr[1] = key[PF_SK_STACK]->addr[1];
+ sp->key[PF_SK_STACK].port[0] = key[PF_SK_STACK]->port[0];
+ sp->key[PF_SK_STACK].port[1] = key[PF_SK_STACK]->port[1];
+ sp->key[PF_SK_STACK].rdomain = htons(key[PF_SK_STACK]->rdomain);
+ sp->key[PF_SK_STACK].af = key[PF_SK_STACK]->af;
sp->rtableid[PF_SK_WIRE] = htonl(st->rtableid[PF_SK_WIRE]);
sp->rtableid[PF_SK_STACK] = htonl(st->rtableid[PF_SK_STACK]);
- sp->proto = st->key[PF_SK_WIRE]->proto;
- sp->af = st->key[PF_SK_WIRE]->af;
+ sp->proto = key[PF_SK_WIRE]->proto;
+ sp->af = key[PF_SK_WIRE]->af;
/* copy from state */
strlcpy(sp->ifname, st->kif->pfik_name, sizeof(sp->ifname));
@@ -1255,6 +1275,10 @@ pf_state_export(struct pfsync_state *sp, struct pf_state
*st)
sp->set_tos = st->set_tos;
sp->set_prio[0] = st->set_prio[0];
sp->set_prio[1] = st->set_prio[1];
+
+ pf_state_keys_rele(key);
+
+ return (0);
}
/* END state table stuff */
@@ -4108,6 +4132,7 @@ pf_create_state(struct pf_pdesc *pd, struct pf_rule *r,
struct pf_rule *a,
* pf_state_inserts() grabs reference for pfsync!
*/
refcnt_init(&s->refcnt);
+ mtx_init(&s->mtx, IPL_NET);
switch (pd->proto) {
case IPPROTO_TCP:
@@ -7789,3 +7814,19 @@ pf_pktenqueue_delayed(void *arg)
pool_put(&pf_pktdelay_pl, pdy);
}
+
+void
+pf_state_keys_take(struct pf_state *st, struct pf_state_key *keys[])
+{
+ mtx_enter(&st->mtx);
+ keys[PF_SK_WIRE] = pf_state_key_ref(st->key[PF_SK_WIRE]);
+ keys[PF_SK_STACK] = pf_state_key_ref(st->key[PF_SK_STACK]);
+ mtx_leave(&st->mtx);
+}
+
+void
+pf_state_keys_rele(struct pf_state_key *keys[])
+{
+ pf_state_key_unref(keys[PF_SK_WIRE]);
+ pf_state_key_unref(keys[PF_SK_STACK]);
+}
diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h
index 93394e6da5c..4604869ede3 100644
--- a/sys/net/pfvar.h
+++ b/sys/net/pfvar.h
@@ -763,6 +763,7 @@ struct pf_state {
struct pf_sn_head src_nodes;
struct pf_state_key *key[2]; /* addresses stack and wire */
struct pfi_kif *kif;
+ struct mutex mtx;
u_int64_t packets[2];
u_int64_t bytes[2];
int32_t creation;
@@ -1737,7 +1738,7 @@ void
pf_state_rm_src_node(struct pf_state *,
extern struct pf_state *pf_find_state_byid(struct pf_state_cmp *);
extern struct pf_state *pf_find_state_all(struct pf_state_key_cmp *,
u_int, int *);
-extern void pf_state_export(struct pfsync_state *,
+extern int pf_state_export(struct pfsync_state *,
struct pf_state *);
extern void pf_print_state(struct pf_state *);
extern void pf_print_flags(u_int8_t);