Our running ovs-vswitchd was cored with stack [1]. It shows a
concurrent race condition.

When the revalidator thread is performing rule revalidation/translation,
another thread has already deleted the rule tag and reduced the reference
count to 0. Using ofproto_rule_ref() at this point will trigger an assertion,
as it assumes the object is still alive (refcount > 0).

Replace the mandatory reference to the rule in xlate_table_action()
with a non-fatal try-ref: ofproto_rule_try_ref().
If the reference fails (the rule is being/has been released),
treat it as a table miss path, avoiding referencing "dead" rules
and causing assertion errors.

[1] call stack:
*0  raise () from /lib64/libc.so.6
*1  abort () from /lib64/libc.so.6
*2  ovs_abort_valist () at lib/util.c:499
*3  vlog_abort_valist () at lib/vlog.c:1249
*4  vlog_abort () at lib/vlog.c:1263
*5  ovs_assert_failure () at lib/util.c:86
*6  ovs_refcount_ref () at lib/ovs-atomic.h:547
*7  ovs_refcount_ref () at ofproto/ofproto.c:3010
*8  ofproto_rule_ref () at ofproto/ofproto.c:3012
*9  xlate_table_action () at ofproto/ofproto-dpif-xlate.c:4506
*10 do_xlate_actions () at ofproto/ofproto-dpif-xlate.c:7183
*11 xlate_recursively () at ofproto/ofproto-dpif-xlate.c:4382
*12 xlate_table_action () at ofproto/ofproto-dpif-xlate.c:4511
*13 do_xlate_actions () at ofproto/ofproto-dpif-xlate.c:7183
*14 clone_xlate_actions () at ofproto/ofproto-dpif-xlate.c:5809
*15 xlate_recursively () at ofproto/ofproto-dpif-xlate.c:4382
*16 xlate_table_action () at ofproto/ofproto-dpif-xlate.c:4511
*17 patch_port_output () at ofproto/ofproto-dpif-xlate.c:3889
*18 compose_output_action__ () at ofproto/ofproto-dpif-xlate.c:4204
*19 compose_output_action () at ofproto/ofproto-dpif-xlate.c:4359
*20 xlate_output_action () at ofproto/ofproto-dpif-xlate.c:5304
*21 do_xlate_actions () at ofproto/ofproto-dpif-xlate.c:7033
*22 xlate_actions () at ofproto/ofproto-dpif-xlate.c:8001
*23 xlate_key () at ofproto/ofproto-dpif-upcall.c:2321
*24 xlate_ukey () at ofproto/ofproto-dpif-upcall.c:2336
*25 revalidate_ukey__ () at ofproto/ofproto-dpif-upcall.c:2382
*26 revalidate_ukey () at ofproto/ofproto-dpif-upcall.c:2491
*27 revalidate () at ofproto/ofproto-dpif-upcall.c:2945
*28 udpif_revalidator () at ofproto/ofproto-dpif-upcall.c:1089
*29 ovsthread_wrapper () at lib/ovs-thread.c:422
*30 start_thread () from /lib64/libpthread.so.0
*31 clone () from /lib64/libc.so.6

Signed-off-by: LIU Yulong <[email protected]>
---
V3:
- Adressed Addressed Ilya's and Adrián's comments:
- release the reference unconditionally
- Adressed Adrián's comments:
- make code consistent in xlate_actions()
V2:
- Addressed Eelco's comments:
- release the reference after ofproto_rule_try_ref()
- Addressed Ilya's comments:
- remove the commit message function args and mem addresses
---
 ofproto/ofproto-dpif-xlate.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index 920d998e6..f6a9e97e6 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -4793,7 +4793,7 @@ xlate_table_action(struct xlate_ctx *ctx, ofp_port_t 
in_port, uint8_t table_id,
             tuple_swap(&ctx->xin->flow, ctx->wc);
         }
 
-        if (rule) {
+        if (rule && ofproto_rule_try_ref(&rule->up)) {
             /* Fill in the cache entry here instead of xlate_recursively
              * to make the reference counting more explicit.  We take a
              * reference in the lookups above if we are going to cache the
@@ -4805,6 +4805,7 @@ xlate_table_action(struct xlate_ctx *ctx, ofp_port_t 
in_port, uint8_t table_id,
                 entry->rule = rule;
                 ofproto_rule_ref(&rule->up);
             }
+            ofproto_rule_unref(&rule->up);
 
             struct ovs_list *old_trace = ctx->xin->trace;
             xlate_report_table(ctx, rule, table_id);
@@ -8457,12 +8458,14 @@ xlate_actions(struct xlate_in *xin, struct xlate_out 
*xout)
         if (ctx.xin->resubmit_stats) {
             rule_dpif_credit_stats(ctx.rule, ctx.xin->resubmit_stats, false);
         }
-        if (ctx.xin->xcache) {
-            struct xc_entry *entry;
-
-            entry = xlate_cache_add_entry(ctx.xin->xcache, XC_RULE);
-            entry->rule = ctx.rule;
-            ofproto_rule_ref(&ctx.rule->up);
+        if (ctx.rule && ofproto_rule_try_ref(&ctx.rule->up)) {
+            if (ctx.xin->xcache) {
+                struct xc_entry *entry;
+                entry = xlate_cache_add_entry(ctx.xin->xcache, XC_RULE);
+                entry->rule = ctx.rule;
+                ofproto_rule_ref(&ctx.rule->up);
+            }
+            ofproto_rule_unref(&ctx.rule->up);
         }
 
         xlate_report_table(&ctx, ctx.rule, ctx.table_id);
-- 
2.50.1 (Apple Git-155)

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to