Our running ovs-vswitchd was cored with stack [1]. It shows a
concurrent race condition.
When the revalidator thread is performing rule revalidation/translation,
another thread has already deleted the rule tag and reduced the reference
count to 0. Using ofproto_rule_ref() at this point will trigger an assertion,
as it assumes the object is still alive (refcount > 0).
Replace the mandatory reference to the rule in xlate_table_action()
with a non-fatal try-ref: ofproto_rule_try_ref().
If the reference fails (the rule is being/has been released),
treat it as a table miss path, avoiding referencing "dead" rules
and causing assertion errors.
[1] call stack:
*0 0x00007f2e2950c337 in raise () from /lib64/libc.so.6
*1 0x00007f2e2950da28 in abort () from /lib64/libc.so.6
*2 0x00005581a8eb3c4e in ovs_abort_valist (err_no=err_no@entry=0,
format=format@entry=0x5581a93191d0 "%s: assertion %s failed in %s()",
args=args@entry=0x7f2e17327700) at lib/util.c:499
*3 0x00005581a8ebb7a0 in vlog_abort_valist (module_=<optimized out>,
message=0x5581a93191d0 "%s: assertion %s failed in %s()",
args=args@entry=0x7f2e17327700) at lib/vlog.c:1249
*4 0x00005581a8ebb834 in vlog_abort (module=module@entry=0x5581a9888960
<this_module>,
message=message@entry=0x5581a93191d0 "%s: assertion %s failed in %s()") at
lib/vlog.c:1263
*5 0x00005581a8eb386c in ovs_assert_failure (where=where@entry=0x5581a92e95e8
"./lib/ovs-atomic.h:547",
function=function@entry=0x5581a92eb7a0 <__func__.6720> "ovs_refcount_ref",
condition=condition@entry=0x5581a92e95d7 "old_refcount > 0") at
lib/util.c:86
*6 0x00005581a2b33dd1 in ovs_refcount_ref (refcount=0x5581aca1ef28) at
lib/ovs-atomic.h:547
*7 0x00005581a8d82afb in ovs_refcount_ref (refcount=0x5581aca1ef28) at
ofproto/ofproto.c:3010
*8 ofproto_rule_ref (rule=rule@entry=0x5581aca1eeb0) at ofproto/ofproto.c:3012
*9 0x00005581a8dacfbd in xlate_table_action (ctx=0x7f2e1732bcb0,
in_port=<optimized out>,
table_id=<optimized out>, may_packet_in=<optimized out>,
honor_table_miss=<optimized out>,
with_ct_orig=<optimized out>, is_last_action=true,
xlator=0x5581a8daf850 <do_xlate_actions>) at
ofproto/ofproto-dpif-xlate.c:4506
*10 0x00005581a8db0fdc in do_xlate_actions (ofpacts=<optimized out>,
ofpacts_len=<optimized out>,
ctx=0x7f2e1732bcb0, is_last_action=<optimized out>,
group_bucket_action=<optimized out>) at ofproto/ofproto-dpif-xlate.c:7183
*11 0x00005581a8dad045 in xlate_recursively (actions_xlator=0x5581a8daf850
<do_xlate_actions>,
is_last_action=true, deepens=false, rule=0x5581ac88e440,
ctx=0x7f2e1732bcb0) at ofproto/ofproto-dpif-xlate.c:4382
*12 xlate_table_action (ctx=0x7f2e1732bcb0, in_port=<optimized out>,
table_id=<optimized out>,
may_packet_in=<optimized out>, honor_table_miss=<optimized out>,
with_ct_orig=<optimized out>, is_last_action=true,
xlator=0x5581a8daf850 <do_xlate_actions>) at
ofproto/ofproto-dpif-xlate.c:4511
*13 0x00005581a8db0fdc in do_xlate_actions
(ofpacts=ofpacts@entry=0x5581ac8912d8,
ofpacts_len=ofpacts_len@entry=8, ctx=ctx@entry=0x7f2e1732bcb0,
is_last_action=is_last_action@entry=true,
group_bucket_action=group_bucket_action@entry=false) at
ofproto/ofproto-dpif-xlate.c:7183
*14 0x00005581a8db74a8 in clone_xlate_actions (actions=0x5581ac8912d8,
actions_len=8,
ctx=0x7f2e1732bcb0, is_last_action=<optimized out>,
group_bucket_action=<optimized out>) at ofproto/ofproto-dpif-xlate.c:5809
*15 0x00005581a8dad045 in xlate_recursively (actions_xlator=0x5581a8db72b0
<clone_xlate_actions>,
is_last_action=true, deepens=true, rule=0x5581ac88d990,
ctx=0x7f2e1732bcb0) at ofproto/ofproto-dpif-xlate.c:4382
*16 xlate_table_action (ctx=0x7f2e1732bcb0, in_port=<optimized out>,
table_id=<optimized out>,
may_packet_in=<optimized out>, honor_table_miss=<optimized out>,
with_ct_orig=<optimized out>, is_last_action=true,
xlator=0x5581a8db72b0 <clone_xlate_actions>) at
ofproto/ofproto-dpif-xlate.c:4511
*17 0x00005581a8db3470 in patch_port_output (ctx=ctx@entry=0x7f2e1732bcb0,
out_dev=0x5581acbef790, is_last_action=is_last_action@entry=true,
in_dev=0x5581acce4210, in_dev=0x5581acce4210) at
ofproto/ofproto-dpif-xlate.c:3889
*18 0x00005581a8db3847 in compose_output_action__ (ctx=ctx@entry=0x7f2e1732bcb0,
ofp_port=2, xr=xr@entry=0x0, check_stp=check_stp@entry=true,
is_last_action=is_last_action@entry=true, truncate=truncate@entry=false)
at ofproto/ofproto-dpif-xlate.c:4204
*19 0x00005581a8db6310 in compose_output_action (truncate=false,
is_last_action=true, xr=0x0, ofp_port=<optimized out>,
ctx=0x7f2e1732bcb0) at ofproto/ofproto-dpif-xlate.c:4359
*20 xlate_output_action (ctx=ctx@entry=0x7f2e1732bcb0,
port=<optimized out>, controller_len=<optimized out>,
may_packet_in=may_packet_in@entry=true,
is_last_action=is_last_action@entry=true,
truncate=truncate@entry=false,
group_bucket_action=group_bucket_action@entry=false) at
ofproto/ofproto-dpif-xlate.c:5304
*21 0x00005581a8dafae4 in do_xlate_actions
(ofpacts=ofpacts@entry=0x5581ac64a918,
ofpacts_len=ofpacts_len@entry=48, ctx=ctx@entry=0x7f2e1732bcb0,
is_last_action=is_last_action@entry=true,
group_bucket_action=group_bucket_action@entry=false) at
ofproto/ofproto-dpif-xlate.c:7033
*22 0x00005581a8db9616 in xlate_actions (xin=xin@entry=0x7f2e1732c570,
xout=xout@entry=0x7f2e1732cea0) at ofproto/ofproto-dpif-xlate.c:8001
*23 0x00005581a8da54a0 in xlate_key (key=<optimized out>, len=<optimized out>,
push=push@entry=0x7f2e1732c920, ctx=ctx@entry=0x7f2e1732ce80,
udpif=<optimized out>) at ofproto/ofproto-dpif-upcall.c:2321
*24 0x00005581a8da5b54 in xlate_ukey (ukey=0x7f2ddea72f40,
ukey=0x7f2ddea72f40, ctx=0x7f2e1732ce80, tcp_flags=<optimized out>,
udpif=0x5581ac626e00) at ofproto/ofproto-dpif-upcall.c:2336
*25 revalidate_ukey__ (udpif=udpif@entry=0x5581ac626e00,
ukey=ukey@entry=0x7f2ddea72f40, tcp_flags=<optimized out>,
odp_actions=0x7f2e1732d340, recircs=recircs@entry=0x7f2e1732d310,
xcache=<optimized out>) at ofproto/ofproto-dpif-upcall.c:2382
*26 0x00005581a8da5dd1 in revalidate_ukey (udpif=udpif@entry=0x5581ac626e00,
ukey=ukey@entry=0x7f2ddea72f40, stats=stats@entry=0x7f2e1732d320,
odp_actions=odp_actions@entry=0x7f2e1732d340,
reval_seq=reval_seq@entry=3127840410,
recircs=recircs@entry=0x7f2e1732d310) at ofproto/ofproto-dpif-upcall.c:2491
*27 0x00005581a8daa1d2 in revalidate
(revalidator=revalidator@entry=0x5581ac672690) at
ofproto/ofproto-dpif-upcall.c:2945
*28 0x00005581a8daa514 in udpif_revalidator (arg=0x5581ac672690) at
ofproto/ofproto-dpif-upcall.c:1089
*29 0x00005581a8e7f61f in ovsthread_wrapper (aux_=<optimized out>) at
lib/ovs-thread.c:422
*30 0x00007f2e2b662e65 in start_thread () from /lib64/libpthread.so.0
*31 0x00007f2e295d488d in clone () from /lib64/libc.so.6
Signed-off-by: LIU Yulong <[email protected]>
---
ofproto/ofproto-dpif-xlate.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index 2c8197fb7..cc8c75b90 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -4793,6 +4793,10 @@ xlate_table_action(struct xlate_ctx *ctx, ofp_port_t
in_port, uint8_t table_id,
tuple_swap(&ctx->xin->flow, ctx->wc);
}
+ if (rule && !ofproto_rule_try_ref(&rule->up)) {
+ rule = NULL;
+ }
+
if (rule) {
/* Fill in the cache entry here instead of xlate_recursively
* to make the reference counting more explicit. We take a
--
2.50.1 (Apple Git-155)
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev