Hmm, that seems odd. Also, the code change you propose below does not make sense and would likely cause similar crashes itself.
Maybe, you explain what you are trying to do in your testing ? Can you say what traffic are you sending and from which ports ? I’ll take another look at the related code. Darrell On 9/6/17, 6:14 AM, "Huanglili (lee)" <[email protected]> wrote: Hi, We met another vswitchd crash when we use ct(nat) (ovs+dpdk). Program terminated with signal 11, Segmentation fault. #0 0x0000000000574a0b in hmap_remove (node=0x7f150c6e60a8, hmap=0x7f1553c40780) at lib/hmap.h:270 while (*bucket != node) { (gdb) bt #0 0x0000000000574a0b in hmap_remove (node=0x7f150c6e60a8, hmap=0x7f1553c40780) #1 sweep_bucket (limit=1808, now=563303851, ctb=0x7f1553c40778, ct=0x7f1553c3f9a8) #2 conntrack_clean (now=563303851, ct=0x7f1553c3f9a8) #3 clean_thread_main (f_=0x7f1553c3f9a8) This crash can be triggered by using following flows, maybe the flows are not reasonable, but shouldn't trigger crash "table=0,priority=2,in_port=1 actions=resubmit(,2) table=0,priority=2,in_port=4 actions=resubmit(,2) table=0,priority=0 actions=drop table=0,priority=1 actions=resubmit(,10) table=1,priority=0 actions=resubmit(,14) table=2,priority=0 actions=resubmit(,4) table=4,priority=0 actions=resubmit(,14) table=10,priority=2,arp actions=resubmit(,12) table=10,priority=1,dl_src=90:E2:BA:69:CD:61 actions=resubmit(,1) table=10,priority=0 actions=drop table=12,priority=3,arp,dl_src=90:E2:BA:69:CD:61,arp_spa=194.168.100.1,arp_sha=90:E2:BA:69:CD:61 actions=resubmit(,1) table=12,priority=2,arp actions=drop table=14,priority=6,ip actions=ct(table=16,zone=1) table=14,priority=0 actions=resubmit(,20) table=14,priority=20,ip,ip_frag=yes,actions=resubmit(,18) table=16,priority=20,ct_state=+est+trk,ip actions=resubmit(,20) table=16,priority=15,ct_state=+rel+trk,ip actions=resubmit(,20) table=16,priority=10,ct_mark=0x80000000/0x80000000,udp actions=resubmit(,20) table=16,priority=5,ct_state=+new+trk,ip,in_port=3 actions=resubmit(,18) table=16,priority=5,ct_state=+new+trk,ip,in_port=4 actions=resubmit(,18) table=16,priority=5,ct_state=+new+trk,ip,in_port=2 actions=ct(commit,zone=1,exec(load:0x1->NXM_NX_CT_MARK[31])),output:4 table=16,priority=5,ct_state=+new+trk,ip,in_port=1 actions=ct(commit,zone=1,exec(load:0x1->NXM_NX_CT_MARK[31])),output:3 table=18,priority=0,in_port=3 actions=ct(zone=1,table=24) table=18,priority=0,in_port=2 actions=output:4 table=18,priority=0,in_port=4,ip actions=ct(commit,zone=1,nat(dst=194.168.100.1)),2 table=18,priority=0,in_port=1 actions=output:3 table=20,priority=10,in_port=3,ip actions=ct(zone=1,table=22) table=20,priority=10,in_port=4,ip actions=ct(zone=1,table=23) table=20,priority=1 actions=ct(zone=1,table=18) table=22,priority=10,in_port=3 action=4 table=23,priority=10,in_port=4 action=3 table=24,priority=10,in_port=3 action=1" The networking: vm | br-ply - br-linux | br-int We find rev_conn is in the list of ctb->exp_lists[] sometimes. The following change will solve this problem, but we can't explain why $ git diff diff --git a/lib/conntrack.c b/lib/conntrack.c index 419cb1d..d5141c4 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ nat_clean(struct conntrack *ct, struct conn *conn, if (rev_conn && (!nat_conn_key_node || conn_key_cmp(&nat_conn_key_node->value, &rev_conn->rev_key))) { + ovs_list_remove(&rev_conn->exp_node); hmap_remove(&ct->buckets[bucket_rev_conn].connections, &rev_conn->node); free(rev_conn); @@ create_un_nat_conn(struct conntrack *ct, struct conn *conn_f or_un_nat_copy, nat_conn_keys_lookup(&ct->nat_conn_keys, &nc->key, ct->hash_basis); if (nat_conn_key_node && !conn_key_cmp(&nat_conn_key_node->value, &nc->rev_key) && !rev_conn) { - + ovs_list_init(&nc->exp_node); hmap_insert(&ct->buckets[un_nat_conn_bucket].connections, &nc->node, un_nat_hash); Any idea? Thanks. ---------------------------------------------------------------------------------------------------------------------------- On 8/24/17, 3:36 AM, "[email protected] on behalf of huanglili" <[email protected] on behalf of [email protected]> wrote: From: Lili Huang <[email protected]> Conn should be removed from the list before freed. This crash will be triggered when a established flow do ct(nat) again, like "ip,actions=ct(table=1) table=1,in_port=1,ip,actions=ct(commit,nat(dst=5.5.5.5)),2 table=1,in_port=2,ip,ct_state=+est,actions=1 table=1,in_port=1,ip,ct_state=+est,actions=2" Signed-off-by: Lili Huang <[email protected]> --- lib/conntrack.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/conntrack.c b/lib/conntrack.c index 1c0e023..dd73e1a 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -779,6 +779,8 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, ct, nc, conn_for_un_nat_copy); if (!nat_res) { + ovs_list_remove(&nc->exp_node); + ctx->conn = NULL; goto nat_res_exhaustion; } _______________________________________________ discuss mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-discuss
