Hello,
skip reading if you are not interested in L2 switching combined
with bluhm's diff [1], which enables parallel forwarding.
Hrvoje gave it a try and soon discovered some panics. Diff below
fixes a panic indicated by stack as follows:
login: panic: kernel diagnostic assertion "smr->smr_func == NULL"\
fai(ed: file "/home/sasha/src.sashan/sys/kern/kern_smr.c", line 247
Stopped at db_enter+0x10: popq %rbp
TID PID UID PRFLAGS PFLAGS CPU COMMAND
168970 15734 0 0x14000 0x200 3 softnet
92200 58362 0 0x14000 0x200 5 softnet
195539 36092 0 0x14000 0x200 2 softnet
*162819 18587 0 0x14000 0x200 1 softnet
db_enter() at db_enter+0x10
panic(ffffffff81e0483c) at panic+0xbf
__assert(ffffffff81e6d854,ffffffff81e6b008,f7,ffffffff81e6b033) at __assert+0x2b
smr_call_impl(fffffd83936c7160,ffffffff810ef0f0,fffffd83936c7100,0) \
at smr_call_impl+0xd4
veb_port_input(ffff800000082048,fffffd80cccaef00,90e2ba33b4a1,ffff80000015f900)\
at veb_port_input+0x2fa
ether_input(ffff800000082048,fffffd80cccaef00) at ether_input+0xf5
if_input_process(ffff800000082048,ffff800022c62388) at if_input_process+0x6f
ifiq_process(ffff800000082458) at ifiq_process+0x69
taskq_thread(ffff80000002f080) at taskq_thread+0x81
end trace frame: 0x0, count: 6
https://www.openbsd.org/ddb.html describes the minimum info required in bug
reports. Insufficient info makes it difficult to find and fix bugs.
Hrvoje knows all details [2] how to wire things up to trigger the
crash. I'm just using all HW and scripts he kindly provided me
to reproduce those panics reliably.
I think the crash comes from combination of SMR and reference
counting done by atomic ops. Let's assume two cpus
are trying to update the same entry.
Let's assume one CPU (cpu0) just found oebe using ebl_find() at line 311:
310 smr_read_enter();
311 oebe = ebl_find(ebl, eba);
312 if (oebe == NULL)
313 new = 1;
314 else {
315 if (oebe->ebe_age != now)
316 oebe->ebe_age = now;
317
318 /* does this entry need to be replaced? */
319 if (oebe->ebe_type == EBE_DYNAMIC &&
320 !eb_port_eq(eb, oebe->ebe_port, port)) {
321 new = 1;
322 ebe_take(oebe);
323 } else
few ticks later the other CPU (cpu1) runs ahead. It just removed
the same entry found by cpu0 at line 360:
353 mtx_enter(&eb->eb_lock);
354 num = eb->eb_num + (oebe == NULL);
355 if (num <= eb->eb_max && ebt_insert(eb, nebe) == oebe) {
356 /* we won, do the update */
357 ebl_insert(ebl, nebe);
358
359 if (oebe != NULL) {
360 ebl_remove(ebl, oebe);
361 ebt_replace(eb, oebe, nebe);
362
let's further assume cpu1 reaches line 389:
383 if (oebe != NULL) {
384 /*
385 * the old entry could be referenced in
386 * multiple places, including an smr read
387 * section, so release it properly.
388 */
389 ebe_rele(oebe);
390 }
before cpu0 raches line 322 (ebe_take()). If that happens the cpu1 drops
the last reference to oebe, which is in fact shared between cpu1 and cpu0.
if cpu1 sees reference count is zero, it does smr_call(), which schedules
ebe_free() on oebe, so oebe can be freed when cpu0 is done with it.
few ticks later cpu0 reaches line 322 and takes its reference to oebe.
the cpu0 enters critical section and sees it lost race (because
ebt_insert() != oebe). cpu0 continues at line 389. it drops last
reference and calls smr_call(). It trips the assert, because ebe_free()
is scheduled already by cpu1.
diff below fixes the flaw by introducing `cebe` (conflicting ebe) local
variable:
345 mtx_enter(&eb->eb_lock);
346 num = eb->eb_num + (oebe == NULL);
347 cebe = NULL;
348 if (num <= eb->eb_max) {
349 cebe = ebt_insert(eb, nebe);
350
351 if (cebe == NULL) {
352 /* nebe got inserted without conflict */
353 eb->eb_num++;
354 ebl_insert(ebl, nebe);
355 nebe = NULL;
356 } else if ((oebe != NULL) && (oebe == cebe)) {
357 /* we won, do the update */
358 ebl_insert(ebl, nebe);
359 ebl_remove(ebl, cebe);
360 ebt_replace(eb, cebe, nebe);
361 nebe = NULL;
362 } else {
363 cebe = NULL;
364 }
365 }
366 mtx_leave(&eb->eb_lock);
367
368 if (nebe != NULL) {
369 /*
370 * the new entry didn't make it into the
371 * table, so it can be freed directly.
372 */
373 ebe_free(nebe);
374 }
375
376 if (cebe != NULL) {
377 /*
378 * the old entry could be referenced in
379 * multiple places, including an smr read
380 * section, so release it properly.
381 */
382 ebe_rele(cebe);
383 }
as you can see we use `oebe` to perform eb_max boundary check. The newly
introduced variable `cebe` is used to detect and resolve conflict, when
two CPUs are trying to insert entry. The key part is to make sure
the looser does not try to drop reference to cebe.
diff seems to solve the issue. vmstat(8) does not seem to indicate
a memory leak.
OK?
thanks and
regards
sashan
[1] https://marc.info/?l=openbsd-tech&m=161903387904923&w=2
[2] https://marc.info/?l=openbsd-tech&m=162306191225590&w=
--------8<---------------8<---------------8<------------------8<--------
diff --git a/sys/net/if_etherbridge.c b/sys/net/if_etherbridge.c
index b4a07b5eefc..c1d8109af6e 100644
--- a/sys/net/if_etherbridge.c
+++ b/sys/net/if_etherbridge.c
@@ -44,7 +44,6 @@
#include <net/if_etherbridge.h>
-static inline void ebe_take(struct eb_entry *);
static inline void ebe_rele(struct eb_entry *);
static void ebe_free(void *);
@@ -232,12 +231,6 @@ ebt_remove(struct etherbridge *eb, struct eb_entry *ebe)
RBT_REMOVE(eb_tree, &eb->eb_tree, ebe);
}
-static inline void
-ebe_take(struct eb_entry *ebe)
-{
- refcnt_take(&ebe->ebe_refs);
-}
-
static void
ebe_rele(struct eb_entry *ebe)
{
@@ -295,7 +288,7 @@ void
etherbridge_map(struct etherbridge *eb, void *port, uint64_t eba)
{
struct eb_list *ebl;
- struct eb_entry *oebe, *nebe;
+ struct eb_entry *oebe, *nebe, *cebe;
unsigned int num;
void *nport;
int new = 0;
@@ -317,10 +310,9 @@ etherbridge_map(struct etherbridge *eb, void *port,
uint64_t eba)
/* does this entry need to be replaced? */
if (oebe->ebe_type == EBE_DYNAMIC &&
- !eb_port_eq(eb, oebe->ebe_port, port)) {
+ !eb_port_eq(eb, oebe->ebe_port, port))
new = 1;
- ebe_take(oebe);
- } else
+ else
oebe = NULL;
}
smr_read_leave();
@@ -352,23 +344,24 @@ etherbridge_map(struct etherbridge *eb, void *port,
uint64_t eba)
mtx_enter(&eb->eb_lock);
num = eb->eb_num + (oebe == NULL);
- if (num <= eb->eb_max && ebt_insert(eb, nebe) == oebe) {
- /* we won, do the update */
- ebl_insert(ebl, nebe);
-
- if (oebe != NULL) {
- ebl_remove(ebl, oebe);
- ebt_replace(eb, oebe, nebe);
-
- /* take the table reference away */
- if (refcnt_rele(&oebe->ebe_refs)) {
- panic("%s: eb %p oebe %p refcnt",
- __func__, eb, oebe);
- }
+ cebe = NULL;
+ if (num <= eb->eb_max) {
+ cebe = ebt_insert(eb, nebe);
+
+ if (cebe == NULL) {
+ /* nebe got inserted without conflict */
+ eb->eb_num++;
+ ebl_insert(ebl, nebe);
+ nebe = NULL;
+ } else if ((oebe != NULL) && (oebe == cebe)) {
+ /* we won, do the update */
+ ebl_insert(ebl, nebe);
+ ebl_remove(ebl, cebe);
+ ebt_replace(eb, cebe, nebe);
+ nebe = NULL;
+ } else {
+ cebe = NULL;
}
-
- nebe = NULL;
- eb->eb_num = num;
}
mtx_leave(&eb->eb_lock);
@@ -380,13 +373,13 @@ etherbridge_map(struct etherbridge *eb, void *port,
uint64_t eba)
ebe_free(nebe);
}
- if (oebe != NULL) {
+ if (cebe != NULL) {
/*
* the old entry could be referenced in
* multiple places, including an smr read
* section, so release it properly.
*/
- ebe_rele(oebe);
+ ebe_rele(cebe);
}
}