Re: pfsync slpassert on boot and panic

2022-11-25 Thread Mark Kettenis
> Date: Fri, 25 Nov 2022 10:12:12 +0100
> From: Alexandr Nedvedicky 
> 
> Hello Hrvoje,
> 
> On Fri, Nov 25, 2022 at 09:57:15AM +0100, Hrvoje Popovski wrote:
> > Hi,
> > 
> > I think that this is similar problem as what David Hill send on tech@
> > with subject "splassert on boot"
> > 
> > I've checkout tree few minutes ago and in there should be
> > mvs@ "Remove netlock assertion within PF_LOCK()" and
> > dlg@ "get rid of NET_LOCK in the pf purge work" diffs.
> > 
> > on boot I'm getting this splassert
> > 
> > splassert: pfsync_delete_state: want 2 have 256
> > Starting stack trace...
> > pfsync_delete_state(fd83a66644d8) at pfsync_delete_state+0x58
> > pf_remove_state(fd83a66644d8) at pf_remove_state+0x14b
> > pf_purge_expired_states(1fdb,40) at pf_purge_expired_states+0x202
> > pf_purge_states(0) at pf_purge_states+0x1c
> > taskq_thread(822f69c8) at taskq_thread+0x11a
> > end trace frame: 0x0, count: 252
> > End of stack trace.
> 
> I've sent a diff yesterday to David Hill [1]. It looks like
> I've forgot to add cc' to tach@
> 
> 
> > splassert: pfsync_delete_state: want 2 have 0
> > Starting stack trace...
> > pfsync_delete_state(fd83a6676628) at pfsync_delete_state+0x58
> > pf_remove_state(fd83a6676628) at pf_remove_state+0x14b
> > pf_purge_expired_states(1f9c,40) at pf_purge_expired_states+0x202
> > pf_purge_states(0) at pf_purge_states+0x1c
> > taskq_thread(822f69c8) at taskq_thread+0x11a
> > end trace frame: 0x0, count: 252
> > End of stack trace.
> > 
> > 
> > and if i destroy pfsync interface and then sh /etc/netstart box panic
> > 
> > uvm_fault(0x823d3250, 0x810, 0, 1) -> e
> > kernel: page fault trap, code=0
> > Stopped at  pfsync_q_ins+0x1a:  movq0x810(%r13),%rsi
> > TIDPIDUID PRFLAGS PFLAGS  CPU  COMMAND
> > * 68977  95532  0 0x14000  0x2003K systqmp
> > pfsync_q_ins(fd83a6676628,2) at pfsync_q_ins+0x1a
> > pf_remove_state(fd83a6676628) at pf_remove_state+0x14b
> > pf_purge_expired_states(1f9c,40) at pf_purge_expired_states+0x202
> > pf_purge_states(0) at pf_purge_states+0x1c
> > taskq_thread(822f69c8) at taskq_thread+0x11a
> > end trace frame: 0x0, count: 10
> > https://www.openbsd.org/ddb.html describes the minimum info required in
> > bug reports.  Insufficient info makes it difficult to find and fix bugs.
> > ddb{3}>
> > 
> 
> Looks like we need to synchronize pfsync destroy with timer thread.
> 
> thanks for great testing.
> 
> regards
> sashan
> 
> 8<---8<---8<--8<
> diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
> index f69790ee98d..24963a546de 100644
> --- a/sys/net/if_pfsync.c
> +++ b/sys/net/if_pfsync.c
> @@ -1865,8 +1865,6 @@ pfsync_undefer(struct pfsync_deferral *pd, int drop)
>  {
>   struct pfsync_softc *sc = pfsyncif;
>  
> - NET_ASSERT_LOCKED();
> -
>   if (sc == NULL)
>   return;
>  
> @@ -2128,8 +2126,6 @@ pfsync_delete_state(struct pf_state *st)
>  {
>   struct pfsync_softc *sc = pfsyncif;
>  
> - NET_ASSERT_LOCKED();
> -
>   if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
>   return;

But if_flags is listed as being protected by the net lock.  So this
isn't safe.

> @@ -2188,8 +2184,6 @@ pfsync_clear_states(u_int32_t creatorid, const char 
> *ifname)
>   struct pfsync_clr clr;
>   } __packed r;
>  
> - NET_ASSERT_LOCKED();
> -
>   if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
>   return;

Same here.



Re: pfsync slpassert on boot and panic

2022-11-25 Thread Hrvoje Popovski
On 25.11.2022. 10:12, Alexandr Nedvedicky wrote:
> Looks like we need to synchronize pfsync destroy with timer thread.
> 
> thanks for great testing.
> 
> regards
> sashan
> 
> 8<---8<---8<--8<
> diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
> index f69790ee98d..24963a546de 100644
> --- a/sys/net/if_pfsync.c
> +++ b/sys/net/if_pfsync.c
> @@ -1865,8 +1865,6 @@ pfsync_undefer(struct pfsync_deferral *pd, int drop)
>  {
>   struct pfsync_softc *sc = pfsyncif;
>  
> - NET_ASSERT_LOCKED();
> -
>   if (sc == NULL)
>   return;
>  
> @@ -2128,8 +2126,6 @@ pfsync_delete_state(struct pf_state *st)
>  {
>   struct pfsync_softc *sc = pfsyncif;
>  
> - NET_ASSERT_LOCKED();
> -
>   if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
>   return;
>  
> @@ -2188,8 +2184,6 @@ pfsync_clear_states(u_int32_t creatorid, const char 
> *ifname)
>   struct pfsync_clr clr;
>   } __packed r;
>  
> - NET_ASSERT_LOCKED();
> -
>   if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
>   return;
>  


Hi,

yes, this diff helps and I can't panic box in the way i can without it.
I will play with this diff and try to trigger more panics ...

Thank you guys for work on unlocking pf and pfsync ...





Re: pfsync slpassert on boot and panic

2022-11-25 Thread Alexandr Nedvedicky
Hello Hrvoje,


On Fri, Nov 25, 2022 at 09:57:15AM +0100, Hrvoje Popovski wrote:
> Hi,
> 
> I think that this is similar problem as what David Hill send on tech@
> with subject "splassert on boot"
> 
> I've checkout tree few minutes ago and in there should be
> mvs@ "Remove netlock assertion within PF_LOCK()" and
> dlg@ "get rid of NET_LOCK in the pf purge work" diffs.
> 
> on boot I'm getting this splassert
> 
> splassert: pfsync_delete_state: want 2 have 256
> Starting stack trace...
> pfsync_delete_state(fd83a66644d8) at pfsync_delete_state+0x58
> pf_remove_state(fd83a66644d8) at pf_remove_state+0x14b
> pf_purge_expired_states(1fdb,40) at pf_purge_expired_states+0x202
> pf_purge_states(0) at pf_purge_states+0x1c
> taskq_thread(822f69c8) at taskq_thread+0x11a
> end trace frame: 0x0, count: 252
> End of stack trace.

I've sent a diff yesterday to David Hill [1]. It looks like
I've forgot to add cc' to tach@


> splassert: pfsync_delete_state: want 2 have 0
> Starting stack trace...
> pfsync_delete_state(fd83a6676628) at pfsync_delete_state+0x58
> pf_remove_state(fd83a6676628) at pf_remove_state+0x14b
> pf_purge_expired_states(1f9c,40) at pf_purge_expired_states+0x202
> pf_purge_states(0) at pf_purge_states+0x1c
> taskq_thread(822f69c8) at taskq_thread+0x11a
> end trace frame: 0x0, count: 252
> End of stack trace.
> 
> 
> and if i destroy pfsync interface and then sh /etc/netstart box panic
> 
> uvm_fault(0x823d3250, 0x810, 0, 1) -> e
> kernel: page fault trap, code=0
> Stopped at  pfsync_q_ins+0x1a:  movq0x810(%r13),%rsi
> TIDPIDUID PRFLAGS PFLAGS  CPU  COMMAND
> * 68977  95532  0 0x14000  0x2003K systqmp
> pfsync_q_ins(fd83a6676628,2) at pfsync_q_ins+0x1a
> pf_remove_state(fd83a6676628) at pf_remove_state+0x14b
> pf_purge_expired_states(1f9c,40) at pf_purge_expired_states+0x202
> pf_purge_states(0) at pf_purge_states+0x1c
> taskq_thread(822f69c8) at taskq_thread+0x11a
> end trace frame: 0x0, count: 10
> https://www.openbsd.org/ddb.html describes the minimum info required in
> bug reports.  Insufficient info makes it difficult to find and fix bugs.
> ddb{3}>
> 

Looks like we need to synchronize pfsync destroy with timer thread.

thanks for great testing.

regards
sashan

8<---8<---8<--8<
diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
index f69790ee98d..24963a546de 100644
--- a/sys/net/if_pfsync.c
+++ b/sys/net/if_pfsync.c
@@ -1865,8 +1865,6 @@ pfsync_undefer(struct pfsync_deferral *pd, int drop)
 {
struct pfsync_softc *sc = pfsyncif;
 
-   NET_ASSERT_LOCKED();
-
if (sc == NULL)
return;
 
@@ -2128,8 +2126,6 @@ pfsync_delete_state(struct pf_state *st)
 {
struct pfsync_softc *sc = pfsyncif;
 
-   NET_ASSERT_LOCKED();
-
if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
return;
 
@@ -2188,8 +2184,6 @@ pfsync_clear_states(u_int32_t creatorid, const char 
*ifname)
struct pfsync_clr clr;
} __packed r;
 
-   NET_ASSERT_LOCKED();
-
if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
return;
 



pfsync slpassert on boot and panic

2022-11-25 Thread Hrvoje Popovski
Hi,

I think that this is similar problem as what David Hill send on tech@
with subject "splassert on boot"

I've checkout tree few minutes ago and in there should be
mvs@ "Remove netlock assertion within PF_LOCK()" and
dlg@ "get rid of NET_LOCK in the pf purge work" diffs.

on boot I'm getting this splassert

splassert: pfsync_delete_state: want 2 have 256
Starting stack trace...
pfsync_delete_state(fd83a66644d8) at pfsync_delete_state+0x58
pf_remove_state(fd83a66644d8) at pf_remove_state+0x14b
pf_purge_expired_states(1fdb,40) at pf_purge_expired_states+0x202
pf_purge_states(0) at pf_purge_states+0x1c
taskq_thread(822f69c8) at taskq_thread+0x11a
end trace frame: 0x0, count: 252
End of stack trace.

splassert: pfsync_delete_state: want 2 have 0
Starting stack trace...
pfsync_delete_state(fd83a6676628) at pfsync_delete_state+0x58
pf_remove_state(fd83a6676628) at pf_remove_state+0x14b
pf_purge_expired_states(1f9c,40) at pf_purge_expired_states+0x202
pf_purge_states(0) at pf_purge_states+0x1c
taskq_thread(822f69c8) at taskq_thread+0x11a
end trace frame: 0x0, count: 252
End of stack trace.


and if i destroy pfsync interface and then sh /etc/netstart box panic

uvm_fault(0x823d3250, 0x810, 0, 1) -> e
kernel: page fault trap, code=0
Stopped at  pfsync_q_ins+0x1a:  movq0x810(%r13),%rsi
TIDPIDUID PRFLAGS PFLAGS  CPU  COMMAND
* 68977  95532  0 0x14000  0x2003K systqmp
pfsync_q_ins(fd83a6676628,2) at pfsync_q_ins+0x1a
pf_remove_state(fd83a6676628) at pf_remove_state+0x14b
pf_purge_expired_states(1f9c,40) at pf_purge_expired_states+0x202
pf_purge_states(0) at pf_purge_states+0x1c
taskq_thread(822f69c8) at taskq_thread+0x11a
end trace frame: 0x0, count: 10
https://www.openbsd.org/ddb.html describes the minimum info required in
bug reports.  Insufficient info makes it difficult to find and fix bugs.
ddb{3}>


ddb{3}> show panic
*cpu3: uvm_fault(0x823d3250, 0x810, 0, 1) -> e


ddb{3}> show reg
rdi   0xfd83a6676628
rsi  0x2
rbp   0x800022d5ef90
rbx   0xfd83a6676628
rdx   0xfe0f
rcx0x282
rax 0xff
r80x8233fa38w_locklistdata+0x43e68
r90x800022d5f100
r10   0x3925934c5d55f628
r11   0xba2a637b8a7a5b53
r12 0x40
r130
r14  0x2
r15   0xfd83a6676628
rip   0x81b88c8apfsync_q_ins+0x1a
cs   0x8
rflags   0x10282__ALIGN_SIZE+0xf282
rsp   0x800022d5ef50
ss  0x10
pfsync_q_ins+0x1a:  movq0x810(%r13),%rsi



ddb{3}> show locks
exclusive kernel_lock &kernel_lock r = 0 (0x82438590)
#0  witness_lock+0x311
#1  kpageflttrap+0x1b2
#2  kerntrap+0x91
#3  alltraps_kern_meltdown+0x7b
#4  pfsync_q_ins+0x1a
#5  pf_remove_state+0x14b
#6  pf_purge_expired_states+0x202
#7  pf_purge_states+0x1c
#8  taskq_thread+0x11a
#9  proc_trampoline+0x1c
exclusive rwlock pf_state_lock r = 0 (0x822c05a0)
#0  witness_lock+0x311
#1  pf_purge_expired_states+0x1d5
#2  pf_purge_states+0x1c
#3  taskq_thread+0x11a
#4  proc_trampoline+0x1c
exclusive rwlock pf_lock r = 0 (0x822c0570)
#0  witness_lock+0x311
#1  pf_purge_expired_states+0x1c9
#2  pf_purge_states+0x1c
#3  taskq_thread+0x11a
#4  proc_trampoline+0x1c
exclusive rwlock pfstates r = 0 (0x822b4210)
#0  witness_lock+0x311
#1  pf_purge_expired_states+0x1bd
#2  pf_purge_states+0x1c
#3  taskq_thread+0x11a
#4  proc_trampoline+0x1c
shared rwlock systqmp r = 0 (0x822f6a38)
#0  witness_lock+0x311
#1  taskq_thread+0x10d
#2  proc_trampoline+0x1c
ddb{3}>


ddb{3}> ps
   PID TID   PPIDUID  S   FLAGS  WAIT  COMMAND
 83757   73556  1  0  30x100083  ttyin ksh
   230  204977  73028  0  30x100083  ttyin ksh
 73028  319173  85298   1000  30x10008b  sigsusp   ksh
 85298  521998  43878   1000  30x10  pf_state_loc  sshd
 43878  290809  21538  0  30x82  kqreadsshd
 50517  491548  1  0  30x100098  kqreadcron
 91327  121436  26507 95  3   0x1100092  kqreadsmtpd
 17435  384685  26507103  3   0x1100092  kqreadsmtpd
 67349  520840  26507 95  3   0x1100092  kqreadsmtpd
 97353  483313  26507 95  30x100092  kqreadsmtpd
 44262  496860  26507 95  3   0x1100092  kqreadsmtpd
52  255319  26507 95  3   0x1100092  kqreadsmtpd
 26507   41672  1  0  30x100080  kqreadsmtpd
 21538  173086  1  0  30x88  kqreadsshd
 44849  364511  1  0  30x100080  kqreadntpd
 33978  331427  77612 83  30x100092  kqread