Hi, just found I forget to answer other questions.
Ilya Maximets <i.maxim...@ovn.org> 于2022年2月15日周二 02:11写道: > On 1/28/22 17:14, Aaron Conole wrote: > > From: Peng He <xnhp0...@gmail.com> > > > > ipf_postprocess will emit packets into the datapath pipeline ignoring > > the conntrack context, this might casuse weird issues when a packet > > batch has less space to hold all the fragments belonging to single > > packet. > > > > Given the below ruleest and consider sending a 64K ICMP packet which > > is splitted into 64 fragments. > > > > priority=1,action=drop > > priority=10,arp,action=normal > > priority=100,in_port=1,ct_state=-trk,icmp,action=ct(zone=9,table=0) > > priority=100,in_port=1,ct_state=+new+trk,icmp,action=ct(zone=9,commit),2 > > priority=100,in_port=1,ct_state=-new+est+trk,icmp,action=2 > > priority=100,in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9) > > priority=100,in_port=2,ct_state=+trk+est-new,icmp,action=1 > > > > Batch 1: > > the first 32 packets will be buffered in the ipf preprocessing, nothing > > more proceeds. > > > > Batch 2: > > the second 32 packets succeed the fragment reassembly and goes to ct > > and ipf_post will emits the first 32 packets due to the limit of batch > > size. > > > > the first 32 packets goes to the datapath again due to the > > recirculation, and again buffered at ipf preprocessing before ct commit, > > then the ovs tries to call ct commit as well as ipf_postprocessing which > emits > > the last 32 packets, in this case the last 32 packets will follow > > the current actions which will be sent to port 2 directly without > > recirculation and going to ipf preprocssing and ct commit again. > > > > This will cause the first 32 packets never get the chance to > > reassemble and evevntually this large ICMP packets fail to transmit. > > > > this patch fixes this issue by adding firstly ipf context to avoid > > ipf_postprocessing emits packets in the wrong context. Then by > > re-executing the action list again to emit the last 32 packets > > in the right context to correctly transmitting multiple fragments. > > > > This might be one of the root cause why the OVS log warns: > > > > " > > skipping output to input port on bridge br-int while processing > > " > > > > As a pkt might enter into different ipf context. > > > > One implementation trick of implementing ipf_ctx_eq, is to use ipf_list's > > key instead of the first frag's metadata, this can reduce quite a lot of > > cache misses as at the time of calling ipf_ctx_eq, ipf_list is cache-hot. > > On x86_64, this trick saves 2% of CPU usage of ipf processing. > > > > Signed-off-by: Peng He <hepeng.0...@bytedance.com> > > Co-authored-by: Mike Pattrick <m...@redhat.com> > > Signed-off-by: Aaron Conole <acon...@redhat.com> > > --- > > lib/conntrack.c | 9 ++++--- > > lib/conntrack.h | 15 ++++++------ > > lib/dpif-netdev.c | 52 +++++++++++++++++++++++++++++++++++++---- > > lib/ipf.c | 39 ++++++++++++++++++++++++------- > > lib/ipf.h | 11 +++++++-- > > tests/system-traffic.at | 33 ++++++++++++++++++++++++++ > > tests/test-conntrack.c | 6 ++--- > > 7 files changed, 135 insertions(+), 30 deletions(-) > > > > diff --git a/lib/conntrack.c b/lib/conntrack.c > > index 33a1a92953..72999f1aed 100644 > > --- a/lib/conntrack.c > > +++ b/lib/conntrack.c > > @@ -1434,14 +1434,14 @@ process_one(struct conntrack *ct, struct > dp_packet *pkt, > > * connection tables. 'setmark', if not NULL, should point to a two > > * elements array containing a value and a mask to set the connection > mark. > > * 'setlabel' behaves similarly for the connection label.*/ > > -int > > +bool > > conntrack_execute(struct conntrack *ct, struct dp_packet_batch > *pkt_batch, > > ovs_be16 dl_type, bool force, bool commit, uint16_t > zone, > > const uint32_t *setmark, > > const struct ovs_key_ct_labels *setlabel, > > ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper, > > const struct nat_action_info_t *nat_action_info, > > - long long now, uint32_t tp_id) > > + long long now, uint32_t tp_id, struct ipf_ctx > *ipf_ctx) > > { > > ipf_preprocess_conntrack(ct->ipf, pkt_batch, now, dl_type, zone, > > ct->hash_basis); > > @@ -1468,9 +1468,8 @@ conntrack_execute(struct conntrack *ct, struct > dp_packet_batch *pkt_batch, > > } > > } > > > > - ipf_postprocess_conntrack(ct->ipf, pkt_batch, now, dl_type); > > - > > - return 0; > > + return ipf_postprocess_conntrack(ct->ipf, pkt_batch, ipf_ctx, \ > > + now, dl_type); > > } > > > > void > > diff --git a/lib/conntrack.h b/lib/conntrack.h > > index 9553b188a4..211efad3f3 100644 > > --- a/lib/conntrack.h > > +++ b/lib/conntrack.h > > @@ -64,6 +64,7 @@ > > struct dp_packet_batch; > > > > struct conntrack; > > +struct ipf_ctx; > > > > union ct_addr { > > ovs_be32 ipv4; > > @@ -88,13 +89,13 @@ struct nat_action_info_t { > > struct conntrack *conntrack_init(void); > > void conntrack_destroy(struct conntrack *); > > > > -int conntrack_execute(struct conntrack *ct, struct dp_packet_batch > *pkt_batch, > > - ovs_be16 dl_type, bool force, bool commit, > uint16_t zone, > > - const uint32_t *setmark, > > - const struct ovs_key_ct_labels *setlabel, > > - ovs_be16 tp_src, ovs_be16 tp_dst, const char > *helper, > > - const struct nat_action_info_t *nat_action_info, > > - long long now, uint32_t tp_id); > > +bool conntrack_execute(struct conntrack *ct, struct dp_packet_batch > *pkt_batch, > > + ovs_be16 dl_type, bool force, bool commit, > > + uint16_t zone, const uint32_t *setmark, > > + const struct ovs_key_ct_labels *setlabel, > > + ovs_be16 tp_src, ovs_be16 tp_dst, const char > *helper, > > + const struct nat_action_info_t *nat_action_info, > > + long long now, uint32_t tp_id, struct ipf_ctx > *ipf_ctx); > > void conntrack_clear(struct dp_packet *packet); > > > > struct conntrack_dump { > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > > index e28e0b5543..8cebdfab56 100644 > > --- a/lib/dpif-netdev.c > > +++ b/lib/dpif-netdev.c > > @@ -8500,6 +8500,7 @@ dp_netdev_recirculate(struct dp_netdev_pmd_thread > *pmd, > > struct dp_netdev_execute_aux { > > struct dp_netdev_pmd_thread *pmd; > > const struct flow *flow; > > + const struct nlattr *redo_actions; > > }; > > > > static void > > @@ -9029,10 +9030,23 @@ dp_execute_cb(void *aux_, struct dp_packet_batch > *packets_, > > VLOG_WARN_RL(&rl, "NAT specified without commit."); > > } > > > > - conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, > force, > > - commit, zone, setmark, setlabel, > aux->flow->tp_src, > > - aux->flow->tp_dst, helper, > nat_action_info_ref, > > - pmd->ctx.now / 1000, tp_id); > > + bool more_pkts; > > + struct ipf_ctx ipf_ctx = { aux->flow->recirc_id, > > + aux->flow->in_port, > > packets_->packets[0]->md.in_port.odp_port > We cannot use the packets in the batch, as the batch might be empty, it might be feed by the redo_actions. > > > + zone }; > > + more_pkts = conntrack_execute(dp->conntrack, packets_, > > + aux->flow->dl_type, > > + force, commit, zone, > > + setmark, setlabel, > > + aux->flow->tp_src, > > + aux->flow->tp_dst, > > + helper, nat_action_info_ref, > > + pmd->ctx.now / 1000, tp_id, > &ipf_ctx); > > + if (more_pkts) { > > + aux->redo_actions = a; > > + } else { > > + aux->redo_actions = NULL; > > + } > > Assuming we have 2 conntrack actions in the same action list, > in this case while executing a secont ct() action we will > overwrite the 'redo_actions' context of the first action even > though it might have more packets to process. > IIUC, these remaining fragments from the first ct() action > will not be sent out until some other traffic hists it and > we actually have a room in the batch. > > It should, probably, be a list, where you can add new entry > points, so dp_netdev_execute_actions() can iterate over all > of them. > > > break; > > } > > > > @@ -9067,16 +9081,44 @@ dp_execute_cb(void *aux_, struct dp_packet_batch > *packets_, > > dp_packet_delete_batch(packets_, should_steal); > > } > > > > +static size_t > > +dp_netdev_find_actions_left(const struct nlattr *actions, > > + size_t actions_len, > > + const struct nlattr *target) > > +{ > > + const struct nlattr *a; > > + size_t left; > > + NL_ATTR_FOR_EACH_UNSAFE (a, left, actions, actions_len) { > > + if (a == target) { > > + break; > > + } > > + } > > + return left; > > +} > > + > > static void > > dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd, > > struct dp_packet_batch *packets, > > bool should_steal, const struct flow *flow, > > const struct nlattr *actions, size_t > actions_len) > > { > > - struct dp_netdev_execute_aux aux = { pmd, flow }; > > + int max_reexec = 2; > > I'm sorry if I missed the discussin, but why only 2? > IIUC, in theory we may have up to 1K fragments that needs to be sent. > These will also be stuck waiting for the next batch of packets to hit > the same ct() action. > Normally, a 64K IP pkts will generate 64 frag pkts ( for 1500) there might be 1K frags in one ctx, but it's more like a frag attack? 2 can be used as a kind of protection, I guess. > > > + struct dp_netdev_execute_aux aux = { pmd, flow, NULL }; > > > > odp_execute_actions(&aux, packets, should_steal, actions, > > actions_len, dp_execute_cb); > > + > > + while (OVS_UNLIKELY(aux.redo_actions && max_reexec --)) { > > + struct dp_packet_batch batch; > > + dp_packet_batch_init(&batch); > > + size_t redo_actions_len = \ > > Please, don't use '\' in a normal C code. > here and in other parts of that patch. > will fix it in the next version. > > > + dp_netdev_find_actions_left(actions, > > + actions_len, > > + aux.redo_actions); > > + > > + odp_execute_actions(&aux, &batch, should_steal, > > If 'should_steal' is 'false', we will leak all the packets here. > e.g. if the last fragment came through the packet_out. > > > + aux.redo_actions, redo_actions_len, dp_execute_cb); > > + } > > } > > > > struct dp_netdev_ct_dump { > > diff --git a/lib/ipf.c b/lib/ipf.c > > index 507db2aea2..2ff04c6156 100644 > > --- a/lib/ipf.c > > +++ b/lib/ipf.c > > @@ -1046,30 +1046,51 @@ ipf_send_frags_in_list(struct ipf *ipf, struct > ipf_list *ipf_list, > > OVS_NOT_REACHED(); > > } > > > > +static bool > > +ipf_ctx_eq(struct ipf_list *ipf_list, struct ipf_ctx *ctx, > > + struct dp_packet *pkt) > > +{ > > + if (ipf_list->key.recirc_id != ctx->recirc_id || > > + pkt->md.in_port.odp_port != ctx->in_port.odp_port || > > + ipf_list->key.zone != ctx->zone) { > > + return false; > > + } > > + return true; > > +} > > + > > /* Adds fragments associated with a completed fragment list to a packet > batch > > * to be processed by the calling application, typically conntrack. Also > > * cleans up the list context when it is empty.*/ > > -static void > > +static bool > > ipf_send_completed_frags(struct ipf *ipf, struct dp_packet_batch *pb, > > - long long now, bool v6) > > + struct ipf_ctx *ctx, long long now, bool v6) > > { > > if (ovs_list_is_empty(&ipf->frag_complete_list)) { > > - return; > > + return false; > > } > > > > + bool more_pkts = false; > > ovs_mutex_lock(&ipf->ipf_lock); > > struct ipf_list *ipf_list, *next; > > > > LIST_FOR_EACH_SAFE (ipf_list, next, list_node, > &ipf->frag_complete_list) { > > + > > + if (ctx && !ipf_ctx_eq(ipf_list, ctx, \ > > + ipf_list->frag_list[ipf_list->last_sent_idx + > 1].pkt)) { > > + continue; > > + } > > + > > if (ipf_send_frags_in_list(ipf, ipf_list, pb, > IPF_FRAG_COMPLETED_LIST, > > v6, now)) { > > ipf_completed_list_clean(&ipf->frag_lists, ipf_list); > > } else { > > + more_pkts = true; > > break; > > } > > } > > > > ovs_mutex_unlock(&ipf->ipf_lock); > > + return more_pkts; > > } > > > > /* Conservatively adds fragments associated with a expired fragment > list to > > @@ -1225,8 +1246,8 @@ ipf_post_execute_reass_pkts(struct ipf *ipf, > > * be added to the batch to be sent through conntrack. */ > > void > > ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb, > > - long long now, ovs_be16 dl_type, uint16_t zone, > > - uint32_t hash_basis) > > + long long now, ovs_be16 dl_type, > > + uint16_t zone, uint32_t hash_basis) > > { > > if (ipf_get_enabled(ipf)) { > > ipf_extract_frags_from_batch(ipf, pb, dl_type, zone, now, > hash_basis); > > @@ -1241,16 +1262,18 @@ ipf_preprocess_conntrack(struct ipf *ipf, struct > dp_packet_batch *pb, > > * through conntrack and adds these fragments to any batches seen. > Expired > > * fragments are marked as invalid and also added to the batches seen > > * with low priority. Reassembled packets are freed. */ > > -void > > +bool > > ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb, > > - long long now, ovs_be16 dl_type) > > + struct ipf_ctx *ctx, long long now, ovs_be16 > dl_type) > > { > > + bool more_pkts = false; > > if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) { > > bool v6 = dl_type == htons(ETH_TYPE_IPV6); > > ipf_post_execute_reass_pkts(ipf, pb, v6); > > - ipf_send_completed_frags(ipf, pb, now, v6); > > + more_pkts = ipf_send_completed_frags(ipf, pb, ctx, now, v6); > > ipf_send_expired_frags(ipf, pb, now, v6); > > } > > + return more_pkts; > > } > > > > static void * > > diff --git a/lib/ipf.h b/lib/ipf.h > > index 6ac91b2708..7bbf453afe 100644 > > --- a/lib/ipf.h > > +++ b/lib/ipf.h > > @@ -40,14 +40,21 @@ struct ipf_status { > > unsigned int nfrag_max; > > }; > > > > +struct ipf_ctx { > > + uint32_t recirc_id; > > + union flow_in_port in_port; /* Input port. */ > > We don't seem to ever need an OF port number, so should be just: > > odp_port_t in_port; > > The comment also doesn't seem to be useful. Agree. will fix it. > > > + uint16_t zone; > > +}; > > + > > struct ipf *ipf_init(void); > > void ipf_destroy(struct ipf *ipf); > > void ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch > *pb, > > long long now, ovs_be16 dl_type, uint16_t > zone, > > uint32_t hash_basis); > > > > -void ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch > *pb, > > - long long now, ovs_be16 dl_type); > > +bool ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch > *pb, > > + struct ipf_ctx *ctx, long long now, > > + ovs_be16 dl_type); > > > > int ipf_set_enabled(struct ipf *ipf, bool v6, bool enable); > > int ipf_set_min_frag(struct ipf *ipf, bool v6, uint32_t value); > > diff --git a/tests/system-traffic.at b/tests/system-traffic.at > > index 9da84a0734..4765513747 100644 > > --- a/tests/system-traffic.at > > +++ b/tests/system-traffic.at > > @@ -3118,6 +3118,39 @@ DPCTL_CHECK_FRAGMENTATION_PASS() > > OVS_TRAFFIC_VSWITCHD_STOP > > AT_CLEANUP > > > > +AT_SETUP([conntrack - IPv4 large fragmentation]) > > +CHECK_CONNTRACK() > > +OVS_TRAFFIC_VSWITCHD_START() > > + > > +ADD_NAMESPACES(at_ns0, at_ns1) > > + > > +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") > > +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") > > + > > +dnl Sending ping through conntrack > > +AT_DATA([flows.txt], [dnl > > +priority=1,action=drop > > +priority=10,arp,action=normal > > +priority=100,in_port=1,ct_state=-trk,icmp,action=ct(zone=9,table=0) > > +priority=100,in_port=1,ct_state=+new+trk,icmp,action=ct(zone=9,commit),2 > > +priority=100,in_port=1,ct_state=-new+est+trk,icmp,action=2 > > +priority=100,in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9) > > +priority=100,in_port=2,ct_state=+trk+est-new,icmp,action=1 > > +]) > > + > > +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) > > + > > +dnl Modify userspace conntrack fragmentation handling. > > +DPCTL_MODIFY_FRAGMENTATION() > > + > > +dnl Ipv4 larger fragmentation connectivity check. > > +NS_CHECK_EXEC([at_ns0], [ping -s 65000 -q -c 3 -i 0.3 -w 2 10.1.1.2 | > FORMAT_PING], [0], [dnl > > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > > +]) > > + > > +OVS_TRAFFIC_VSWITCHD_STOP > > +AT_CLEANUP > > + > > AT_SETUP([conntrack - IPv4 fragmentation expiry]) > > CHECK_CONNTRACK() > > OVS_TRAFFIC_VSWITCHD_START() > > diff --git a/tests/test-conntrack.c b/tests/test-conntrack.c > > index 24c93e4a48..f90b7aa78b 100644 > > --- a/tests/test-conntrack.c > > +++ b/tests/test-conntrack.c > > @@ -91,7 +91,7 @@ ct_thread_main(void *aux_) > > ovs_barrier_block(&barrier); > > for (i = 0; i < n_pkts; i += batch_size) { > > conntrack_execute(ct, pkt_batch, dl_type, false, true, 0, NULL, > NULL, > > - 0, 0, NULL, NULL, now, 0); > > + 0, 0, NULL, NULL, now, 0, NULL); > > DP_PACKET_BATCH_FOR_EACH (j, pkt, pkt_batch) { > > pkt_metadata_init_conn(&pkt->md); > > } > > @@ -178,7 +178,7 @@ pcap_batch_execute_conntrack(struct conntrack *ct_, > > > > if (flow.dl_type != dl_type) { > > conntrack_execute(ct_, &new_batch, dl_type, false, true, 0, > > - NULL, NULL, 0, 0, NULL, NULL, now, 0); > > + NULL, NULL, 0, 0, NULL, NULL, now, 0, > NULL); > > dp_packet_batch_init(&new_batch); > > } > > dp_packet_batch_add(&new_batch, packet); > > @@ -186,7 +186,7 @@ pcap_batch_execute_conntrack(struct conntrack *ct_, > > > > if (!dp_packet_batch_is_empty(&new_batch)) { > > conntrack_execute(ct_, &new_batch, dl_type, false, true, 0, > NULL, NULL, > > - 0, 0, NULL, NULL, now, 0); > > + 0, 0, NULL, NULL, now, 0, NULL); > > } > > > > } > > _______________________________________________ > dev mailing list > d...@openvswitch.org > https://mail.openvswitch.org/mailman/listinfo/ovs-dev > -- hepeng _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev