On Sat, Jun 19, 2021 at 12:32:04AM +0200, Alexandr Nedvedicky wrote:
> Hello,
> 
> skip reading if you are not interested in L2 switching combined
> with bluhm's diff [1], which enables parallel forwarding.
> 
> Hrvoje gave it a try and soon discovered some panics. Diff below
> fixes a panic indicated by stack as follows:

nice.

>     login: panic: kernel diagnostic assertion "smr->smr_func == NULL"\
>     fai(ed: file "/home/sasha/src.sashan/sys/kern/kern_smr.c", line 247
> Stopped at      db_enter+0x10:  popq    %rbp
>     TID    PID    UID     PRFLAGS     PFLAGS  CPU  COMMAND
>  168970  15734      0     0x14000      0x200    3  softnet
>   92200  58362      0     0x14000      0x200    5  softnet
>  195539  36092      0     0x14000      0x200    2  softnet
> *162819  18587      0     0x14000      0x200    1  softnet
> db_enter() at db_enter+0x10
> panic(ffffffff81e0483c) at panic+0xbf
> __assert(ffffffff81e6d854,ffffffff81e6b008,f7,ffffffff81e6b033) at 
> __assert+0x2b
> smr_call_impl(fffffd83936c7160,ffffffff810ef0f0,fffffd83936c7100,0) \
>     at smr_call_impl+0xd4
> veb_port_input(ffff800000082048,fffffd80cccaef00,90e2ba33b4a1,ffff80000015f900)\
>     at veb_port_input+0x2fa
> ether_input(ffff800000082048,fffffd80cccaef00) at ether_input+0xf5
> if_input_process(ffff800000082048,ffff800022c62388) at if_input_process+0x6f
> ifiq_process(ffff800000082458) at ifiq_process+0x69
> taskq_thread(ffff80000002f080) at taskq_thread+0x81
> end trace frame: 0x0, count: 6
> https://www.openbsd.org/ddb.html describes the minimum info required in bug
> reports.  Insufficient info makes it difficult to find and fix bugs.
> 
> Hrvoje knows all details [2] how to wire things up to trigger the
> crash. I'm just using all HW and scripts he kindly provided me
> to reproduce those panics reliably.
> 
> I think the crash comes from combination of SMR and reference
> counting done by atomic ops. Let's assume two cpus
> are trying to update the same entry.
> 
> Let's assume one CPU (cpu0) just found oebe using ebl_find() at line 311:
> 
> 310         smr_read_enter();
> 311         oebe = ebl_find(ebl, eba);
> 312         if (oebe == NULL)
> 313                 new = 1;
> 314         else {
> 315                 if (oebe->ebe_age != now)
> 316                         oebe->ebe_age = now;
> 317 
> 318                 /* does this entry need to be replaced? */
> 319                 if (oebe->ebe_type == EBE_DYNAMIC &&
> 320                     !eb_port_eq(eb, oebe->ebe_port, port)) {
> 321                         new = 1;
> 322                         ebe_take(oebe);
> 323                 } else
> 
> few ticks later the other CPU (cpu1) runs ahead. It just removed
> the same entry found by cpu0 at line 360:
> 353         mtx_enter(&eb->eb_lock);
> 354         num = eb->eb_num + (oebe == NULL);
> 355         if (num <= eb->eb_max && ebt_insert(eb, nebe) == oebe) {
> 356                 /* we won, do the update */
> 357                 ebl_insert(ebl, nebe);
> 358 
> 359                 if (oebe != NULL) {
> 360                         ebl_remove(ebl, oebe);
> 361                         ebt_replace(eb, oebe, nebe);
> 362 
> 
> let's further assume cpu1 reaches line 389:
> 383         if (oebe != NULL) {
> 384                 /*
> 385                  * the old entry could be referenced in
> 386                  * multiple places, including an smr read
> 387                  * section, so release it properly.
> 388                  */
> 389                 ebe_rele(oebe);
> 390         }
> before cpu0 raches line 322 (ebe_take()). If that happens the cpu1 drops
> the last reference to oebe, which is in fact shared between cpu1 and cpu0.
> if cpu1 sees reference count is zero, it does smr_call(), which schedules
> ebe_free() on oebe, so oebe can be freed when cpu0 is done with it.
> 
> few ticks later cpu0 reaches line 322 and takes its reference to oebe.
> the cpu0 enters critical section and sees it lost race (because
> ebt_insert() != oebe). cpu0 continues at line 389. it drops last
> reference and calls smr_call(). It trips the assert, because ebe_free()
> is scheduled already by cpu1.
> 
> diff below fixes the flaw by introducing `cebe` (conflicting ebe) local
> variable:

i think your real fix is where you stop taking an oebe reference
from the smr critical section.

> 345         mtx_enter(&eb->eb_lock);
> 346         num = eb->eb_num + (oebe == NULL);
> 347         cebe = NULL;
> 348         if (num <= eb->eb_max) {
> 349                 cebe = ebt_insert(eb, nebe);
> 350 
> 351                 if (cebe == NULL) {
> 352                         /* nebe got inserted without conflict */
> 353                         eb->eb_num++;
> 354                         ebl_insert(ebl, nebe);
> 355                         nebe = NULL;
> 356                 } else if ((oebe != NULL) && (oebe == cebe)) {
> 357                         /* we won, do the update */
> 358                         ebl_insert(ebl, nebe);
> 359                         ebl_remove(ebl, cebe);
> 360                         ebt_replace(eb, cebe, nebe);
> 361                         nebe = NULL;
> 362                 } else {
> 363                         cebe = NULL;
> 364                 }
> 365         }
> 366         mtx_leave(&eb->eb_lock);
> 367 
> 368         if (nebe != NULL) {
> 369                 /*
> 370                  * the new entry didn't make it into the
> 371                  * table, so it can be freed directly.
> 372                  */
> 373                 ebe_free(nebe);
> 374         }
> 375 
> 376         if (cebe != NULL) {
> 377                 /*
> 378                  * the old entry could be referenced in
> 379                  * multiple places, including an smr read
> 380                  * section, so release it properly.
> 381                  */
> 382                 ebe_rele(cebe);
> 383         }
> 
> as you can see we use `oebe` to perform eb_max boundary check. The newly
> introduced variable `cebe` is used to detect and resolve conflict, when
> two CPUs are trying to insert entry. The key part is to make sure
> the looser does not try to drop reference to cebe.
> 
> diff seems to solve the issue. vmstat(8) does not seem to indicate
> a memory leak.
> 
> OK?

i think we can get away with not refcounting eb_entry structures at all.
either they're in the etherbridge map/table or they're not, and the
thing that takes them out of the map while holding the eb_lock mutex
becomes responsible for their cleanup.

i feel like most of the original logic can still hold up if we fix my
stupid refcnting mistake(s) and do a better job of avoiding a double
free.

> 
> thanks and
> regards
> sashan
> 
> [1] https://marc.info/?l=openbsd-tech&m=161903387904923&w=2
> 
> [2] https://marc.info/?l=openbsd-tech&m=162306191225590&w=

does this make sense? can hrvoje try it?

Index: if_etherbridge.c
===================================================================
RCS file: /cvs/src/sys/net/if_etherbridge.c,v
retrieving revision 1.6
diff -u -p -r1.6 if_etherbridge.c
--- if_etherbridge.c    10 Mar 2021 10:21:47 -0000      1.6
+++ if_etherbridge.c    24 Jun 2021 03:51:15 -0000
@@ -44,7 +44,6 @@
 
 #include <net/if_etherbridge.h>
 
-static inline void     ebe_take(struct eb_entry *);
 static inline void     ebe_rele(struct eb_entry *);
 static void            ebe_free(void *);
 
@@ -233,16 +232,9 @@ ebt_remove(struct etherbridge *eb, struc
 }
 
 static inline void
-ebe_take(struct eb_entry *ebe)
-{
-       refcnt_take(&ebe->ebe_refs);
-}
-
-static void
 ebe_rele(struct eb_entry *ebe)
 {
-       if (refcnt_rele(&ebe->ebe_refs))
-               smr_call(&ebe->ebe_smr_entry, ebe_free, ebe);
+       smr_call(&ebe->ebe_smr_entry, ebe_free, ebe);
 }
 
 static void
@@ -317,14 +309,20 @@ etherbridge_map(struct etherbridge *eb, 
 
                /* does this entry need to be replaced? */
                if (oebe->ebe_type == EBE_DYNAMIC &&
-                   !eb_port_eq(eb, oebe->ebe_port, port)) {
+                   !eb_port_eq(eb, oebe->ebe_port, port))
                        new = 1;
-                       ebe_take(oebe);
-               } else
+               else
                        oebe = NULL;
        }
        smr_read_leave();
 
+        /*
+         * note that we don't own or hold a valid reference to oebe
+         * here until we get it back again inside the eb->eb_lock
+         * critical section below. until then it's just a value we
+         * can compare against, not a pointer we can deref.
+        */
+
        if (!new)
                return;
 
@@ -342,7 +340,6 @@ etherbridge_map(struct etherbridge *eb, 
        }
 
        smr_init(&nebe->ebe_smr_entry);
-       refcnt_init(&nebe->ebe_refs);
        nebe->ebe_etherbridge = eb;
 
        nebe->ebe_addr = eba;
@@ -359,16 +356,13 @@ etherbridge_map(struct etherbridge *eb, 
                if (oebe != NULL) {
                        ebl_remove(ebl, oebe);
                        ebt_replace(eb, oebe, nebe);
-
-                       /* take the table reference away */
-                       if (refcnt_rele(&oebe->ebe_refs)) {
-                               panic("%s: eb %p oebe %p refcnt",
-                                   __func__, eb, oebe);
-                       }
                }
 
-               nebe = NULL;
+               nebe = NULL; /* give nebe reference to the table */
                eb->eb_num = num;
+       } else {
+               /* we lost, we didn't end up replacing oebe */
+               oebe = NULL;
        }
        mtx_leave(&eb->eb_lock);
 
@@ -381,11 +375,7 @@ etherbridge_map(struct etherbridge *eb, 
        }
 
        if (oebe != NULL) {
-               /*
-                * the old entry could be referenced in
-                * multiple places, including an smr read
-                * section, so release it properly.
-                */
+               /* we removed this entry, so it's ours to release. */
                ebe_rele(oebe);
        }
 }
@@ -415,7 +405,6 @@ etherbridge_add_addr(struct etherbridge 
        }
 
        smr_init(&nebe->ebe_smr_entry);
-       refcnt_init(&nebe->ebe_refs);
        nebe->ebe_etherbridge = eb;
 
        nebe->ebe_addr = eba;
@@ -551,12 +540,18 @@ etherbridge_detach_port(struct etherbrid
                mtx_leave(&eb->eb_lock);
        }
 
-       smr_barrier(); /* try and do it once for all the entries */
+       if (TAILQ_EMPTY(&ebq))
+               return;
+
+       /*
+        * do one smr barrier for all the entries rather than an
+        * smr_call each.
+        */
+       smr_barrier();
 
        TAILQ_FOREACH_SAFE(ebe, &ebq, ebe_qentry, nebe) {
                TAILQ_REMOVE(&ebq, ebe, ebe_qentry);
-               if (refcnt_rele(&ebe->ebe_refs))
-                       ebe_free(ebe);
+               ebe_free(ebe);
        }
 }
 
@@ -587,12 +582,18 @@ etherbridge_flush(struct etherbridge *eb
                mtx_leave(&eb->eb_lock);
        }
 
-       smr_barrier(); /* try and do it once for all the entries */
+       if (TAILQ_EMPTY(&ebq))
+               return;
+
+       /*
+        * do one smr barrier for all the entries rather than an
+        * smr_call each.
+        */
+       smr_barrier();
 
        TAILQ_FOREACH_SAFE(ebe, &ebq, ebe_qentry, nebe) {
                TAILQ_REMOVE(&ebq, ebe, ebe_qentry);
-               if (refcnt_rele(&ebe->ebe_refs))
-                       ebe_free(ebe);
+               ebe_free(ebe);
        }
 }
 
Index: if_etherbridge.h
===================================================================
RCS file: /cvs/src/sys/net/if_etherbridge.h,v
retrieving revision 1.3
diff -u -p -r1.3 if_etherbridge.h
--- if_etherbridge.h    26 Feb 2021 01:28:51 -0000      1.3
+++ if_etherbridge.h    24 Jun 2021 03:51:15 -0000
@@ -51,7 +51,6 @@ struct eb_entry {
        time_t                           ebe_age;
 
        struct etherbridge              *ebe_etherbridge;
-       struct refcnt                    ebe_refs;
        struct smr_entry                 ebe_smr_entry;
 };
 

Reply via email to