Hello,

</snip>
> 
> 
> Hi,
> 
> here's panic with WITNESS and this diff on tech@
> https://www.mail-archive.com/tech@openbsd.org/msg72582.html
> 
> I will stop now because I'm not sure what I'm doing and which diffs I'm
> testing...
> 
> 
> r620-1# uvm_fault(0xffffffff8248ea28, 0x17, 0, 2) -> e
> kernel: page fault trap, code=0
> Stopped at      pfsync_q_del+0x96:      movq    %rdx,0x8(%rax)
>     TID    PID    UID     PRFLAGS     PFLAGS  CPU  COMMAND
> *300703  35643      0     0x14000      0x200    1K systq
>  237790  10061      0     0x14000 0x40000200    0  softclock
> pfsync_q_del(fffffd8323dc3900) at pfsync_q_del+0x96
> pfsync_delete_state(fffffd8323dc3900) at pfsync_delete_state+0x118
> pf_remove_state(fffffd8323dc3900) at pf_remove_state+0x14e
> pf_purge_expired_states(c3501) at pf_purge_expired_states+0x1b3
> pf_purge(ffffffff823ae080) at pf_purge+0x28
> taskq_thread(ffffffff822cbe30) at taskq_thread+0x11a
> end trace frame: 0x0, count: 9
> https://www.openbsd.org/ddb.html describes the minimum info required in
> bug reports.  Insufficient info makes it difficult to find and fix bugs.
> ddb{1}>
> 
> 

    diff below should avoid panic above (and similar panics in pfsync_q_del().
    It also prints some 'error' system message buffer (a.k.a. dmesg)

    We panic because we attempt to remove state from psync queue which is
    already empty. the pfsync_q_del() must be acting on some stale information
    kept in `st` argument (state).

    the information we print into dmesg buffer should help us understand
    what's going on. At the moment I can not explain how does it come
    there is a state which claims its presence on state queue, while the
    queue in question is empty.

    I'd like to ask you to give a try diff below and repeat your test.
    Let it run for some time and collect 'dmesg' output for me after usual
    uptime-to-panic elapses during a test run.

thanks a lot
regards
sashan

--------8<---------------8<---------------8<------------------8<--------
diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
index f69790ee98d..d4be84a1f57 100644
--- a/sys/net/if_pfsync.c
+++ b/sys/net/if_pfsync.c
@@ -2254,6 +2254,74 @@ pfsync_q_ins(struct pf_state *st, int q)
        } while (0);
 }
 
+const char *
+pfsync_qname(int q)
+{
+       switch (q) {
+       case PFSYNC_S_IACK:
+               return ("PFSYNC_S_IACK");
+       
+       case PFSYNC_S_UPD_C:
+               return ("PFSYNC_S_UPD_C");
+
+       case PFSYNC_S_DEL:
+               return ("PFSYNC_S_DEL");
+
+       case PFSYNC_S_INS:
+               return ("PFSYNC_S_INS");
+
+       case PFSYNC_S_UPD:
+               return ("PFSYNC_S_UPD");
+
+       case PFSYNC_S_COUNT:
+               return ("PFSYNC_S_COUNT");
+
+       case PFSYNC_S_DEFER:
+               return ("PFSYNC_S_DEFER");
+
+       case PFSYNC_S_NONE:
+               return ("PFSYNC_S_NONE");
+
+       default:
+               return ("???");
+       }
+}
+
+const char *
+pfsync_timeout_name(unsigned int timeout)
+{
+       const char *timeout_name[] = {
+           "PFTM_TCP_FIRST_PACKET",
+           "PFTM_TCP_OPENING",
+           "PFTM_TCP_ESTABLISHED",
+           "PFTM_TCP_CLOSING",
+           "PFTM_TCP_FIN_WAIT",
+           "PFTM_TCP_CLOSED",
+           "PFTM_UDP_FIRST_PACKET",
+           "PFTM_UDP_SINGLE",
+           "PFTM_UDP_MULTIPLE",
+           "PFTM_ICMP_FIRST_PACKET",
+           "PFTM_ICMP_ERROR_REPLY",
+           "PFTM_OTHER_FIRST_PACKET",
+           "PFTM_OTHER_SINGLE",
+           "PFTM_OTHER_MULTIPLE",
+           "PFTM_FRAG",
+           "PFTM_INTERVAL",
+           "PFTM_ADAPTIVE_START",
+           "PFTM_ADAPTIVE_END",
+           "PFTM_SRC_NODE",
+           "PFTM_TS_DIFF",
+           "PFTM_MAX",
+           "PFTM_PURGE",
+           "PFTM_UNLINKED"
+       };
+
+       if (timeout > PFTM_UNLINKED)
+               return ("???");
+       else
+               return (timeout_name[timeout]);
+}
+
 void
 pfsync_q_del(struct pf_state *st)
 {
@@ -2273,6 +2341,19 @@ pfsync_q_del(struct pf_state *st)
                mtx_leave(&sc->sc_st_mtx);
                return;
        }
+
+       if (TAILQ_EMPTY(&sc->sc_qs[q])) {
+               mtx_leave(&sc->sc_st_mtx);
+               DPFPRINTF(LOG_ERR,
+                   "%s: stale queue (%s) in state (id %llu)"
+                   "nosync: %s unlinked: %s timeout: %s", __func__,
+                   pfsync_qname(q), st->id,
+                   (st->state_flags & PFSTATE_NOSYNC) ? "yes" : "no",
+                   (st->timeout == PFTM_UNLINKED) ? "yes" : "no",
+                   pfsync_timeout_name(st->timeout));
+               return;
+       }
+
        atomic_sub_long(&sc->sc_len, pfsync_qs[q].len);
        TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
        if (TAILQ_EMPTY(&sc->sc_qs[q]))

Reply via email to