Hello Peng,
Great patch! How do you feel about a patch like this to get away from
recursion?
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index fa56d86e0..935e416ea 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -8303,6 +8303,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch
*packets_,
pmd->ctx.now / 1000, tp_id,
&ipf_ctx);
if (more_pkts) {
aux->redo_actions = a;
+ } else {
+ aux->redo_actions = NULL;
}
break;
}
@@ -8358,28 +8360,23 @@ dp_netdev_execute_actions(struct
dp_netdev_pmd_thread *pmd,
bool should_steal, const struct flow *flow,
const struct nlattr *actions, size_t
actions_len)
{
+ int max_reexec = 2;
struct dp_netdev_execute_aux aux = { pmd, flow, NULL };
odp_execute_actions(&aux, packets, should_steal, actions,
actions_len, dp_execute_cb);
- if (OVS_UNLIKELY(aux.redo_actions)) {
- uint32_t *depth = recirc_depth_get();
- /* reuse depth to limit re-execution times to avoid too
- * large fragments.
- */
- if (*depth < MAX_RECIRC_DEPTH) {
- struct dp_packet_batch batch;
- dp_packet_batch_init(&batch);
- size_t redo_actions_len = \
- dp_netdev_find_actions_left(acti
ons,
- actions_len,
- aux.redo_actions);
- (*depth) ++;
- dp_netdev_execute_actions(pmd, &batch, should_steal, flow,
- aux.redo_actions, redo_actions_len);
- (*depth) --;
- }
+ while (OVS_UNLIKELY(aux.redo_actions && max_reexec--)) {
+ struct dp_packet_batch batch;
+ dp_packet_batch_init(&batch);
+ size_t redo_actions_len = \
+ dp_netdev_find_actions_left(actions,
+ actions_len,
+ aux.redo_actions);
+
+ odp_execute_actions(&aux, &batch, should_steal,
aux.redo_actions,
+ redo_actions_len, dp_execute_cb);
+
}
}
Cheers,
Michael
On Wed, 2021-11-17 at 09:03 -0500, Aaron Conole wrote:
> Hi Peng,
>
> Peng He <[email protected]> writes:
>
> > From: Peng He <[email protected]>
> >
> > ipf_postprocess will emit packets into the datapath pipeline
> > ignoring
> > the conntrack context, this might casuse weird issues when a packet
> > batch has less space to hold all the fragments belonging to single
> > packet.
> >
> > Given the below ruleest and consider sending a 64K ICMP packet
> > which
> > is splitted into 64 fragments.
> >
> > priority=1,action=drop
> > priority=10,arp,action=normal
> > priority=100,in_port=1,ct_state=-trk,icmp,action=ct(zone=9,table=0)
> > priority=100,in_port=1,ct_state=+new+trk,icmp,action=ct(zone=9,comm
> > it),2
> > priority=100,in_port=1,ct_state=-new+est+trk,icmp,action=2
> > priority=100,in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9)
> > priority=100,in_port=2,ct_state=+trk+est-new,icmp,action=1
> >
> > Batch 1:
> > the first 32 packets will be buffered in the ipf preprocessing,
> > nothing
> > more proceeds.
> >
> > Batch 2:
> > the second 32 packets succeed the fragment reassembly and goes to
> > ct
> > and ipf_post will emits the first 32 packets due to the limit of
> > batch
> > size.
> >
> > the first 32 packets goes to the datapath again due to the
> > recirculation, and again buffered at ipf preprocessing before ct
> > commit,
> > then the ovs tries to call ct commit as well as ipf_postprocessing
> > which emits
> > the last 32 packets, in this case the last 32 packets will follow
> > the current actions which will be sent to port 2 directly without
> > recirculation and going to ipf preprocssing and ct commit again.
> >
> > This will cause the first 32 packets never get the chance to
> > reassemble and evevntually this large ICMP packets fail to
> > transmit.
> >
> > this patch fixes this issue by adding firstly ipf context to avoid
> > ipf_postprocessing emits packets in the wrong context. Then by
> > re-executing the action list again to emit the last 32 packets
> > in the right context to correctly transmitting multiple fragments.
> >
> > Last, we reuse the recirc_depth to limit the times of re-execution
> > as re-execution is implemented by recursive function call.
> >
> > Signed-off-by: Peng He <[email protected]>
> > ---
>
> Thanks for looking into this problem - it's a current problem with
> the
> userspace ipf implementation.
>
> > lib/conntrack.c | 9 +++----
> > lib/conntrack.h | 15 +++++------
> > lib/dpif-netdev.c | 56
> > +++++++++++++++++++++++++++++++++++++----
> > lib/ipf.c | 40 +++++++++++++++++++++++------
> > lib/ipf.h | 11 ++++++--
> > tests/system-traffic.at | 33 ++++++++++++++++++++++++
> > tests/test-conntrack.c | 6 ++---
> > 7 files changed, 140 insertions(+), 30 deletions(-)
> >
> > diff --git a/lib/conntrack.c b/lib/conntrack.c
> > index 33a1a9295..72999f1ae 100644
> > --- a/lib/conntrack.c
> > +++ b/lib/conntrack.c
> > @@ -1434,14 +1434,14 @@ process_one(struct conntrack *ct, struct
> > dp_packet *pkt,
> > * connection tables. 'setmark', if not NULL, should point to a
> > two
> > * elements array containing a value and a mask to set the
> > connection mark.
> > * 'setlabel' behaves similarly for the connection label.*/
> > -int
> > +bool
> > conntrack_execute(struct conntrack *ct, struct dp_packet_batch
> > *pkt_batch,
> > ovs_be16 dl_type, bool force, bool commit,
> > uint16_t zone,
> > const uint32_t *setmark,
> > const struct ovs_key_ct_labels *setlabel,
> > ovs_be16 tp_src, ovs_be16 tp_dst, const char
> > *helper,
> > const struct nat_action_info_t *nat_action_info,
> > - long long now, uint32_t tp_id)
> > + long long now, uint32_t tp_id, struct ipf_ctx
> > *ipf_ctx)
> > {
> > ipf_preprocess_conntrack(ct->ipf, pkt_batch, now, dl_type,
> > zone,
> > ct->hash_basis);
> > @@ -1468,9 +1468,8 @@ conntrack_execute(struct conntrack *ct,
> > struct dp_packet_batch *pkt_batch,
> > }
> > }
> >
> > - ipf_postprocess_conntrack(ct->ipf, pkt_batch, now, dl_type);
> > -
> > - return 0;
> > + return ipf_postprocess_conntrack(ct->ipf, pkt_batch, ipf_ctx,
> > \
> > + now, dl_type);
> > }
> >
> > void
> > diff --git a/lib/conntrack.h b/lib/conntrack.h
> > index 9553b188a..211efad3f 100644
> > --- a/lib/conntrack.h
> > +++ b/lib/conntrack.h
> > @@ -64,6 +64,7 @@
> > struct dp_packet_batch;
> >
> > struct conntrack;
> > +struct ipf_ctx;
> >
> > union ct_addr {
> > ovs_be32 ipv4;
> > @@ -88,13 +89,13 @@ struct nat_action_info_t {
> > struct conntrack *conntrack_init(void);
> > void conntrack_destroy(struct conntrack *);
> >
> > -int conntrack_execute(struct conntrack *ct, struct dp_packet_batch
> > *pkt_batch,
> > - ovs_be16 dl_type, bool force, bool commit,
> > uint16_t zone,
> > - const uint32_t *setmark,
> > - const struct ovs_key_ct_labels *setlabel,
> > - ovs_be16 tp_src, ovs_be16 tp_dst, const char
> > *helper,
> > - const struct nat_action_info_t
> > *nat_action_info,
> > - long long now, uint32_t tp_id);
> > +bool conntrack_execute(struct conntrack *ct, struct
> > dp_packet_batch *pkt_batch,
> > + ovs_be16 dl_type, bool force, bool commit,
> > + uint16_t zone, const uint32_t *setmark,
> > + const struct ovs_key_ct_labels *setlabel,
> > + ovs_be16 tp_src, ovs_be16 tp_dst, const
> > char *helper,
> > + const struct nat_action_info_t
> > *nat_action_info,
> > + long long now, uint32_t tp_id, struct
> > ipf_ctx *ipf_ctx);
> > void conntrack_clear(struct dp_packet *packet);
> >
> > struct conntrack_dump {
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index 98453a206..c4c837e2e 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -7756,6 +7756,7 @@ dp_netdev_recirculate(struct
> > dp_netdev_pmd_thread *pmd,
> > struct dp_netdev_execute_aux {
> > struct dp_netdev_pmd_thread *pmd;
> > const struct flow *flow;
> > + const struct nlattr *redo_actions;
> > };
> >
> > static void
> > @@ -8285,10 +8286,21 @@ dp_execute_cb(void *aux_, struct
> > dp_packet_batch *packets_,
> > VLOG_WARN_RL(&rl, "NAT specified without commit.");
> > }
> >
> > - conntrack_execute(dp->conntrack, packets_, aux->flow-
> > >dl_type, force,
> > - commit, zone, setmark, setlabel, aux-
> > >flow->tp_src,
> > - aux->flow->tp_dst, helper,
> > nat_action_info_ref,
> > - pmd->ctx.now / 1000, tp_id);
> > + bool more_pkts;
> > + struct ipf_ctx ipf_ctx = { aux->flow->recirc_id,
> > + aux->flow->in_port,
> > + zone };
> > + more_pkts = conntrack_execute(dp->conntrack, packets_,
> > + aux->flow->dl_type,
> > + force, commit, zone,
> > + setmark, setlabel,
> > + aux->flow->tp_src,
> > + aux->flow->tp_dst,
> > + helper, nat_action_info_ref,
> > + pmd->ctx.now / 1000, tp_id,
> > &ipf_ctx);
> > + if (more_pkts) {
> > + aux->redo_actions = a;
> > + }
> > break;
> > }
> >
> > @@ -8322,16 +8334,50 @@ dp_execute_cb(void *aux_, struct
> > dp_packet_batch *packets_,
> > dp_packet_delete_batch(packets_, should_steal);
> > }
> >
> > +static size_t
> > +dp_netdev_find_actions_left(const struct nlattr *actions,
> > + size_t actions_len,
> > + const struct nlattr *target)
> > +{
> > + const struct nlattr *a;
> > + size_t left;
> > + NL_ATTR_FOR_EACH_UNSAFE (a, left, actions, actions_len) {
> > + if (a == target) {
> > + break;
> > + }
> > + }
> > + return left;
> > +}
> > +
> > static void
> > dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
> > struct dp_packet_batch *packets,
> > bool should_steal, const struct flow
> > *flow,
> > const struct nlattr *actions, size_t
> > actions_len)
> > {
> > - struct dp_netdev_execute_aux aux = { pmd, flow };
> > + struct dp_netdev_execute_aux aux = { pmd, flow, NULL };
> >
> > odp_execute_actions(&aux, packets, should_steal, actions,
> > actions_len, dp_execute_cb);
>
> I'm concerned here - we currently have an issue with stack explosion
> in
> some corner cases during normal packet processing. I'm worried that
> we
> just expanded it to another place.
>
> Any thought to doing this iteratively instead?
>
> > +
> > + if (OVS_UNLIKELY(aux.redo_actions)) {
> > + uint32_t *depth = recirc_depth_get();
> > + /* reuse depth to limit re-execution times to avoid too
> > + * large fragments.
> > + */
> > + if (*depth < MAX_RECIRC_DEPTH) {
> > + struct dp_packet_batch batch;
> > + dp_packet_batch_init(&batch);
> > + size_t redo_actions_len = \
> > + dp_netdev_find_actions_left(
> > actions,
> > + actions_len,
> > + aux.redo_actions);
> > + (*depth) ++;
> > + dp_netdev_execute_actions(pmd, &batch, should_steal,
> > flow,
> > + aux.redo_actions, redo_actions_len);
> > + (*depth) --;
> > + }
> > + }
> > }
> >
> > struct dp_netdev_ct_dump {
> > diff --git a/lib/ipf.c b/lib/ipf.c
> > index 507db2aea..1fd3d8d30 100644
> > --- a/lib/ipf.c
> > +++ b/lib/ipf.c
> > @@ -1046,30 +1046,52 @@ ipf_send_frags_in_list(struct ipf *ipf,
> > struct ipf_list *ipf_list,
> > OVS_NOT_REACHED();
> > }
> >
> > +static bool
> > +ipf_ctx_eq(struct ipf_list *ipf_list, struct ipf_ctx *ctx)
> > +{
> > + struct dp_packet *pkt =
> > + ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
> > +
> > + if (pkt->md.recirc_id != ctx->recirc_id ||
> > + pkt->md.in_port.odp_port != ctx->in_port.odp_port ||
> > + pkt->md.ct_zone != ctx->zone) {
> > + return false;
> > + }
> > + return true;
> > +}
> > +
> > /* Adds fragments associated with a completed fragment list to a
> > packet batch
> > * to be processed by the calling application, typically
> > conntrack. Also
> > * cleans up the list context when it is empty.*/
> > -static void
> > +static bool
> > ipf_send_completed_frags(struct ipf *ipf, struct dp_packet_batch
> > *pb,
> > - long long now, bool v6)
> > + struct ipf_ctx *ctx, long long now, bool
> > v6)
> > {
> > if (ovs_list_is_empty(&ipf->frag_complete_list)) {
> > - return;
> > + return false;
> > }
> >
> > + bool more_pkts = false;
> > ovs_mutex_lock(&ipf->ipf_lock);
> > struct ipf_list *ipf_list, *next;
> >
> > LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf-
> > >frag_complete_list) {
> > +
> > + if (ctx && !ipf_ctx_eq(ipf_list, ctx)) {
> > + continue;
> > + }
> > +
> > if (ipf_send_frags_in_list(ipf, ipf_list, pb,
> > IPF_FRAG_COMPLETED_LIST,
> > v6, now)) {
> > ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
> > } else {
> > + more_pkts = true;
> > break;
> > }
> > }
> >
> > ovs_mutex_unlock(&ipf->ipf_lock);
> > + return more_pkts;
> > }
> >
> > /* Conservatively adds fragments associated with a expired
> > fragment list to
> > @@ -1225,8 +1247,8 @@ ipf_post_execute_reass_pkts(struct ipf *ipf,
> > * be added to the batch to be sent through conntrack. */
> > void
> > ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch
> > *pb,
> > - long long now, ovs_be16 dl_type, uint16_t
> > zone,
> > - uint32_t hash_basis)
> > + long long now, ovs_be16 dl_type,
> > + uint16_t zone, uint32_t hash_basis)
> > {
> > if (ipf_get_enabled(ipf)) {
> > ipf_extract_frags_from_batch(ipf, pb, dl_type, zone, now,
> > hash_basis);
> > @@ -1241,16 +1263,18 @@ ipf_preprocess_conntrack(struct ipf *ipf,
> > struct dp_packet_batch *pb,
> > * through conntrack and adds these fragments to any batches
> > seen. Expired
> > * fragments are marked as invalid and also added to the batches
> > seen
> > * with low priority. Reassembled packets are freed. */
> > -void
> > +bool
> > ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch
> > *pb,
> > - long long now, ovs_be16 dl_type)
> > + struct ipf_ctx *ctx, long long now,
> > ovs_be16 dl_type)
> > {
> > + bool more_pkts = false;
> > if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
> > bool v6 = dl_type == htons(ETH_TYPE_IPV6);
> > ipf_post_execute_reass_pkts(ipf, pb, v6);
> > - ipf_send_completed_frags(ipf, pb, now, v6);
> > + more_pkts = ipf_send_completed_frags(ipf, pb, ctx, now,
> > v6);
> > ipf_send_expired_frags(ipf, pb, now, v6);
> > }
> > + return more_pkts;
> > }
> >
> > static void *
> > diff --git a/lib/ipf.h b/lib/ipf.h
> > index 6ac91b270..7bbf453af 100644
> > --- a/lib/ipf.h
> > +++ b/lib/ipf.h
> > @@ -40,14 +40,21 @@ struct ipf_status {
> > unsigned int nfrag_max;
> > };
> >
> > +struct ipf_ctx {
> > + uint32_t recirc_id;
> > + union flow_in_port in_port; /* Input port. */
> > + uint16_t zone;
> > +};
> > +
> > struct ipf *ipf_init(void);
> > void ipf_destroy(struct ipf *ipf);
> > void ipf_preprocess_conntrack(struct ipf *ipf, struct
> > dp_packet_batch *pb,
> > long long now, ovs_be16 dl_type,
> > uint16_t zone,
> > uint32_t hash_basis);
> >
> > -void ipf_postprocess_conntrack(struct ipf *ipf, struct
> > dp_packet_batch *pb,
> > - long long now, ovs_be16 dl_type);
> > +bool ipf_postprocess_conntrack(struct ipf *ipf, struct
> > dp_packet_batch *pb,
> > + struct ipf_ctx *ctx, long long now,
> > + ovs_be16 dl_type);
> >
> > int ipf_set_enabled(struct ipf *ipf, bool v6, bool enable);
> > int ipf_set_min_frag(struct ipf *ipf, bool v6, uint32_t value);
> > diff --git a/tests/system-traffic.at b/tests/system-traffic.at
> > index a69896d33..a09318882 100644
> > --- a/tests/system-traffic.at
> > +++ b/tests/system-traffic.at
> > @@ -2741,6 +2741,39 @@ DPCTL_CHECK_FRAGMENTATION_PASS()
> > OVS_TRAFFIC_VSWITCHD_STOP
> > AT_CLEANUP
> >
> > +AT_SETUP([conntrack - IPv4 large fragmentation])
> > +CHECK_CONNTRACK()
> > +OVS_TRAFFIC_VSWITCHD_START()
> > +
> > +ADD_NAMESPACES(at_ns0, at_ns1)
> > +
> > +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24")
> > +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
> > +
> > +dnl Sending ping through conntrack
> > +AT_DATA([flows.txt], [dnl
> > +priority=1,action=drop
> > +priority=10,arp,action=normal
> > +priority=100,in_port=1,ct_state=-
> > trk,icmp,action=ct(zone=9,table=0)
> > +priority=100,in_port=1,ct_state=+new+trk,icmp,action=ct(zone=9,com
> > mit),2
> > +priority=100,in_port=1,ct_state=-new+est+trk,icmp,action=2
> > +priority=100,in_port=2,ct_state=-
> > trk,icmp,action=ct(table=0,zone=9)
> > +priority=100,in_port=2,ct_state=+trk+est-new,icmp,action=1
> > +])
> > +
> > +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
> > +
> > +dnl Modify userspace conntrack fragmentation handling.
> > +DPCTL_MODIFY_FRAGMENTATION()
> > +
> > +dnl Ipv4 larger fragmentation connectivity check.
> > +NS_CHECK_EXEC([at_ns0], [ping -s 65000 -q -c 3 -i 0.3 -w 2
> > 10.1.1.2 | FORMAT_PING], [0], [dnl
> > +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> > +])
> > +
> > +OVS_TRAFFIC_VSWITCHD_STOP
> > +AT_CLEANUP
> > +
> > AT_SETUP([conntrack - IPv4 fragmentation expiry])
> > CHECK_CONNTRACK()
> > OVS_TRAFFIC_VSWITCHD_START()
> > diff --git a/tests/test-conntrack.c b/tests/test-conntrack.c
> > index 24c93e4a4..f90b7aa78 100644
> > --- a/tests/test-conntrack.c
> > +++ b/tests/test-conntrack.c
> > @@ -91,7 +91,7 @@ ct_thread_main(void *aux_)
> > ovs_barrier_block(&barrier);
> > for (i = 0; i < n_pkts; i += batch_size) {
> > conntrack_execute(ct, pkt_batch, dl_type, false, true, 0,
> > NULL, NULL,
> > - 0, 0, NULL, NULL, now, 0);
> > + 0, 0, NULL, NULL, now, 0, NULL);
> > DP_PACKET_BATCH_FOR_EACH (j, pkt, pkt_batch) {
> > pkt_metadata_init_conn(&pkt->md);
> > }
> > @@ -178,7 +178,7 @@ pcap_batch_execute_conntrack(struct conntrack
> > *ct_,
> >
> > if (flow.dl_type != dl_type) {
> > conntrack_execute(ct_, &new_batch, dl_type, false,
> > true, 0,
> > - NULL, NULL, 0, 0, NULL, NULL, now,
> > 0);
> > + NULL, NULL, 0, 0, NULL, NULL, now,
> > 0, NULL);
> > dp_packet_batch_init(&new_batch);
> > }
> > dp_packet_batch_add(&new_batch, packet);
> > @@ -186,7 +186,7 @@ pcap_batch_execute_conntrack(struct conntrack
> > *ct_,
> >
> > if (!dp_packet_batch_is_empty(&new_batch)) {
> > conntrack_execute(ct_, &new_batch, dl_type, false, true,
> > 0, NULL, NULL,
> > - 0, 0, NULL, NULL, now, 0);
> > + 0, 0, NULL, NULL, now, 0, NULL);
> > }
> >
> > }
>
> _______________________________________________
> dev mailing list
> [email protected]
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev