Hi,

just found I forget to answer other questions.

Ilya Maximets <i.maxim...@ovn.org> 于2022年2月15日周二 02:11写道:

> On 1/28/22 17:14, Aaron Conole wrote:
> > From: Peng He <xnhp0...@gmail.com>
> >
> > ipf_postprocess will emit packets into the datapath pipeline ignoring
> > the conntrack context, this might casuse weird issues when a packet
> > batch has less space to hold all the fragments belonging to single
> > packet.
> >
> > Given the below ruleest and consider sending a 64K ICMP packet which
> > is splitted into 64 fragments.
> >
> > priority=1,action=drop
> > priority=10,arp,action=normal
> > priority=100,in_port=1,ct_state=-trk,icmp,action=ct(zone=9,table=0)
> > priority=100,in_port=1,ct_state=+new+trk,icmp,action=ct(zone=9,commit),2
> > priority=100,in_port=1,ct_state=-new+est+trk,icmp,action=2
> > priority=100,in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9)
> > priority=100,in_port=2,ct_state=+trk+est-new,icmp,action=1
> >
> > Batch 1:
> > the first 32 packets will be buffered in the ipf preprocessing, nothing
> > more proceeds.
> >
> > Batch 2:
> > the second 32 packets succeed the fragment reassembly and goes to ct
> > and ipf_post will emits the first 32 packets due to the limit of batch
> > size.
> >
> > the first 32 packets goes to the datapath again due to the
> > recirculation, and again buffered at ipf preprocessing before ct commit,
> > then the ovs tries to call ct commit as well as ipf_postprocessing which
> emits
> > the last 32 packets, in this case the last 32 packets will follow
> > the current actions which will be sent to port 2 directly without
> > recirculation and going to ipf preprocssing and ct commit again.
> >
> > This will cause the first 32 packets never get the chance to
> > reassemble and evevntually this large ICMP packets fail to transmit.
> >
> > this patch fixes this issue by adding firstly ipf context to avoid
> > ipf_postprocessing emits packets in the wrong context. Then by
> > re-executing the action list again to emit the last 32 packets
> > in the right context to correctly transmitting multiple fragments.
> >
> > This might be one of the root cause why the OVS log warns:
> >
> > "
> > skipping output to input port on bridge br-int while processing
> > "
> >
> > As a pkt might enter into different ipf context.
> >
> > One implementation trick of implementing ipf_ctx_eq, is to use ipf_list's
> > key instead of the first frag's metadata, this can reduce quite a lot of
> > cache misses as at the time of calling ipf_ctx_eq, ipf_list is cache-hot.
> > On x86_64, this trick saves 2% of CPU usage of ipf processing.
> >
> > Signed-off-by: Peng He <hepeng.0...@bytedance.com>
> > Co-authored-by: Mike Pattrick <m...@redhat.com>
> > Signed-off-by: Aaron Conole <acon...@redhat.com>
> > ---
> >  lib/conntrack.c         |  9 ++++---
> >  lib/conntrack.h         | 15 ++++++------
> >  lib/dpif-netdev.c       | 52 +++++++++++++++++++++++++++++++++++++----
> >  lib/ipf.c               | 39 ++++++++++++++++++++++++-------
> >  lib/ipf.h               | 11 +++++++--
> >  tests/system-traffic.at | 33 ++++++++++++++++++++++++++
> >  tests/test-conntrack.c  |  6 ++---
> >  7 files changed, 135 insertions(+), 30 deletions(-)
> >
> > diff --git a/lib/conntrack.c b/lib/conntrack.c
> > index 33a1a92953..72999f1aed 100644
> > --- a/lib/conntrack.c
> > +++ b/lib/conntrack.c
> > @@ -1434,14 +1434,14 @@ process_one(struct conntrack *ct, struct
> dp_packet *pkt,
> >   * connection tables.  'setmark', if not NULL, should point to a two
> >   * elements array containing a value and a mask to set the connection
> mark.
> >   * 'setlabel' behaves similarly for the connection label.*/
> > -int
> > +bool
> >  conntrack_execute(struct conntrack *ct, struct dp_packet_batch
> *pkt_batch,
> >                    ovs_be16 dl_type, bool force, bool commit, uint16_t
> zone,
> >                    const uint32_t *setmark,
> >                    const struct ovs_key_ct_labels *setlabel,
> >                    ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper,
> >                    const struct nat_action_info_t *nat_action_info,
> > -                  long long now, uint32_t tp_id)
> > +                  long long now, uint32_t tp_id, struct ipf_ctx
> *ipf_ctx)
> >  {
> >      ipf_preprocess_conntrack(ct->ipf, pkt_batch, now, dl_type, zone,
> >                               ct->hash_basis);
> > @@ -1468,9 +1468,8 @@ conntrack_execute(struct conntrack *ct, struct
> dp_packet_batch *pkt_batch,
> >          }
> >      }
> >
> > -    ipf_postprocess_conntrack(ct->ipf, pkt_batch, now, dl_type);
> > -
> > -    return 0;
> > +    return ipf_postprocess_conntrack(ct->ipf, pkt_batch, ipf_ctx, \
> > +                                     now, dl_type);
> >  }
> >
> >  void
> > diff --git a/lib/conntrack.h b/lib/conntrack.h
> > index 9553b188a4..211efad3f3 100644
> > --- a/lib/conntrack.h
> > +++ b/lib/conntrack.h
> > @@ -64,6 +64,7 @@
> >  struct dp_packet_batch;
> >
> >  struct conntrack;
> > +struct ipf_ctx;
> >
> >  union ct_addr {
> >      ovs_be32 ipv4;
> > @@ -88,13 +89,13 @@ struct nat_action_info_t {
> >  struct conntrack *conntrack_init(void);
> >  void conntrack_destroy(struct conntrack *);
> >
> > -int conntrack_execute(struct conntrack *ct, struct dp_packet_batch
> *pkt_batch,
> > -                      ovs_be16 dl_type, bool force, bool commit,
> uint16_t zone,
> > -                      const uint32_t *setmark,
> > -                      const struct ovs_key_ct_labels *setlabel,
> > -                      ovs_be16 tp_src, ovs_be16 tp_dst, const char
> *helper,
> > -                      const struct nat_action_info_t *nat_action_info,
> > -                      long long now, uint32_t tp_id);
> > +bool conntrack_execute(struct conntrack *ct, struct dp_packet_batch
> *pkt_batch,
> > +                       ovs_be16 dl_type, bool force, bool commit,
> > +                       uint16_t zone, const uint32_t *setmark,
> > +                       const struct ovs_key_ct_labels *setlabel,
> > +                       ovs_be16 tp_src, ovs_be16 tp_dst, const char
> *helper,
> > +                       const struct nat_action_info_t *nat_action_info,
> > +                       long long now, uint32_t tp_id, struct ipf_ctx
> *ipf_ctx);
> >  void conntrack_clear(struct dp_packet *packet);
> >
> >  struct conntrack_dump {
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index e28e0b5543..8cebdfab56 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -8500,6 +8500,7 @@ dp_netdev_recirculate(struct dp_netdev_pmd_thread
> *pmd,
> >  struct dp_netdev_execute_aux {
> >      struct dp_netdev_pmd_thread *pmd;
> >      const struct flow *flow;
> > +    const struct nlattr *redo_actions;
> >  };
> >
> >  static void
> > @@ -9029,10 +9030,23 @@ dp_execute_cb(void *aux_, struct dp_packet_batch
> *packets_,
> >              VLOG_WARN_RL(&rl, "NAT specified without commit.");
> >          }
> >
> > -        conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type,
> force,
> > -                          commit, zone, setmark, setlabel,
> aux->flow->tp_src,
> > -                          aux->flow->tp_dst, helper,
> nat_action_info_ref,
> > -                          pmd->ctx.now / 1000, tp_id);
> > +        bool more_pkts;
> > +        struct ipf_ctx ipf_ctx = { aux->flow->recirc_id,
> > +                                   aux->flow->in_port,
>
> packets_->packets[0]->md.in_port.odp_port
>

We cannot use the packets in the batch, as the batch might be
empty, it might be feed by the redo_actions.



>
> > +                                   zone };
> > +        more_pkts = conntrack_execute(dp->conntrack, packets_,
> > +                                      aux->flow->dl_type,
> > +                                      force, commit, zone,
> > +                                      setmark, setlabel,
> > +                                      aux->flow->tp_src,
> > +                                      aux->flow->tp_dst,
> > +                                      helper, nat_action_info_ref,
> > +                                      pmd->ctx.now / 1000, tp_id,
> &ipf_ctx);
> > +        if (more_pkts) {
> > +            aux->redo_actions = a;
> > +        } else {
> > +            aux->redo_actions = NULL;
> > +        }
>
> Assuming we have 2 conntrack actions in the same action list,
> in this case while executing a secont ct() action we will
> overwrite the 'redo_actions' context of the first action even
> though it might have more packets to process.
> IIUC, these remaining fragments from the first ct() action
> will not be sent out until some other traffic hists it and
> we actually have a room in the batch.
>
> It should, probably, be a list, where you can add new entry
> points, so dp_netdev_execute_actions() can iterate over all
> of them.
>
> >          break;
> >      }
> >
> > @@ -9067,16 +9081,44 @@ dp_execute_cb(void *aux_, struct dp_packet_batch
> *packets_,
> >      dp_packet_delete_batch(packets_, should_steal);
> >  }
> >
> > +static size_t
> > +dp_netdev_find_actions_left(const struct nlattr *actions,
> > +                            size_t actions_len,
> > +                            const struct nlattr *target)
> > +{
> > +    const struct nlattr *a;
> > +    size_t left;
> > +    NL_ATTR_FOR_EACH_UNSAFE (a, left, actions, actions_len) {
> > +        if (a == target) {
> > +            break;
> > +        }
> > +    }
> > +    return left;
> > +}
> > +
> >  static void
> >  dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
> >                            struct dp_packet_batch *packets,
> >                            bool should_steal, const struct flow *flow,
> >                            const struct nlattr *actions, size_t
> actions_len)
> >  {
> > -    struct dp_netdev_execute_aux aux = { pmd, flow };
> > +    int max_reexec = 2;
>
> I'm sorry if I missed the discussin, but why only 2?
> IIUC, in theory we may have up to 1K fragments that needs to be sent.
> These will also be stuck waiting for the next batch of packets to hit
> the same ct() action.
>

Normally, a 64K IP pkts will generate 64 frag pkts ( for 1500)
there might be 1K frags in one ctx, but it's more like a frag attack?
2 can be used as a kind of protection, I guess.


>
> > +    struct dp_netdev_execute_aux aux = { pmd, flow, NULL };
> >
> >      odp_execute_actions(&aux, packets, should_steal, actions,
> >                          actions_len, dp_execute_cb);
> > +
> > +    while (OVS_UNLIKELY(aux.redo_actions && max_reexec --)) {
> > +        struct dp_packet_batch batch;
> > +        dp_packet_batch_init(&batch);
> > +        size_t redo_actions_len = \
>
> Please, don't use '\' in a normal C code.
> here and in other parts of that patch.
>

will fix it in the next version.

>
> > +                                  dp_netdev_find_actions_left(actions,
> > +                                          actions_len,
> > +                                          aux.redo_actions);
> > +
> > +        odp_execute_actions(&aux, &batch, should_steal,
>
> If 'should_steal' is 'false', we will leak all the packets here.
> e.g. if the last fragment came through the packet_out.
>
> > +                aux.redo_actions, redo_actions_len, dp_execute_cb);
> > +    }
> >  }
> >
> >  struct dp_netdev_ct_dump {
> > diff --git a/lib/ipf.c b/lib/ipf.c
> > index 507db2aea2..2ff04c6156 100644
> > --- a/lib/ipf.c
> > +++ b/lib/ipf.c
> > @@ -1046,30 +1046,51 @@ ipf_send_frags_in_list(struct ipf *ipf, struct
> ipf_list *ipf_list,
> >      OVS_NOT_REACHED();
> >  }
> >
> > +static bool
> > +ipf_ctx_eq(struct ipf_list *ipf_list, struct ipf_ctx *ctx,
> > +           struct dp_packet *pkt)
> > +{
> > +    if (ipf_list->key.recirc_id != ctx->recirc_id ||
> > +            pkt->md.in_port.odp_port != ctx->in_port.odp_port ||
> > +            ipf_list->key.zone != ctx->zone) {
> > +        return false;
> > +    }
> > +    return true;
> > +}
> > +
> >  /* Adds fragments associated with a completed fragment list to a packet
> batch
> >   * to be processed by the calling application, typically conntrack. Also
> >   * cleans up the list context when it is empty.*/
> > -static void
> > +static bool
> >  ipf_send_completed_frags(struct ipf *ipf, struct dp_packet_batch *pb,
> > -                         long long now, bool v6)
> > +                         struct ipf_ctx *ctx, long long now, bool v6)
> >  {
> >      if (ovs_list_is_empty(&ipf->frag_complete_list)) {
> > -        return;
> > +        return false;
> >      }
> >
> > +    bool more_pkts = false;
> >      ovs_mutex_lock(&ipf->ipf_lock);
> >      struct ipf_list *ipf_list, *next;
> >
> >      LIST_FOR_EACH_SAFE (ipf_list, next, list_node,
> &ipf->frag_complete_list) {
> > +
> > +        if (ctx && !ipf_ctx_eq(ipf_list, ctx, \
> > +                    ipf_list->frag_list[ipf_list->last_sent_idx +
> 1].pkt)) {
> > +            continue;
> > +        }
> > +
> >          if (ipf_send_frags_in_list(ipf, ipf_list, pb,
> IPF_FRAG_COMPLETED_LIST,
> >                                     v6, now)) {
> >              ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
> >          } else {
> > +            more_pkts = true;
> >              break;
> >          }
> >      }
> >
> >      ovs_mutex_unlock(&ipf->ipf_lock);
> > +    return more_pkts;
> >  }
> >
> >  /* Conservatively adds fragments associated with a expired fragment
> list to
> > @@ -1225,8 +1246,8 @@ ipf_post_execute_reass_pkts(struct ipf *ipf,
> >   * be added to the batch to be sent through conntrack. */
> >  void
> >  ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
> > -                         long long now, ovs_be16 dl_type, uint16_t zone,
> > -                         uint32_t hash_basis)
> > +                         long long now, ovs_be16 dl_type,
> > +                         uint16_t zone, uint32_t hash_basis)
> >  {
> >      if (ipf_get_enabled(ipf)) {
> >          ipf_extract_frags_from_batch(ipf, pb, dl_type, zone, now,
> hash_basis);
> > @@ -1241,16 +1262,18 @@ ipf_preprocess_conntrack(struct ipf *ipf, struct
> dp_packet_batch *pb,
> >   * through conntrack and adds these fragments to any batches seen.
> Expired
> >   * fragments are marked as invalid and also added to the batches seen
> >   * with low priority.  Reassembled packets are freed. */
> > -void
> > +bool
> >  ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
> > -                          long long now, ovs_be16 dl_type)
> > +                          struct ipf_ctx *ctx, long long now, ovs_be16
> dl_type)
> >  {
> > +    bool more_pkts = false;
> >      if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
> >          bool v6 = dl_type == htons(ETH_TYPE_IPV6);
> >          ipf_post_execute_reass_pkts(ipf, pb, v6);
> > -        ipf_send_completed_frags(ipf, pb, now, v6);
> > +        more_pkts = ipf_send_completed_frags(ipf, pb, ctx, now, v6);
> >          ipf_send_expired_frags(ipf, pb, now, v6);
> >      }
> > +    return more_pkts;
> >  }
> >
> >  static void *
> > diff --git a/lib/ipf.h b/lib/ipf.h
> > index 6ac91b2708..7bbf453afe 100644
> > --- a/lib/ipf.h
> > +++ b/lib/ipf.h
> > @@ -40,14 +40,21 @@ struct ipf_status {
> >     unsigned int nfrag_max;
> >  };
> >
> > +struct ipf_ctx {
> > +    uint32_t recirc_id;
> > +    union flow_in_port in_port; /* Input port. */
>
> We don't seem to ever need an OF port number, so should be just:
>
>     odp_port_t in_port;
>
> The comment also doesn't seem to be useful.


Agree.  will fix it.

>


> > +    uint16_t zone;
> > +};
> > +
> >  struct ipf *ipf_init(void);
> >  void ipf_destroy(struct ipf *ipf);
> >  void ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch
> *pb,
> >                                long long now, ovs_be16 dl_type, uint16_t
> zone,
> >                                uint32_t hash_basis);
> >
> > -void ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch
> *pb,
> > -                               long long now, ovs_be16 dl_type);
> > +bool ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch
> *pb,
> > +                               struct ipf_ctx *ctx, long long now,
> > +                               ovs_be16 dl_type);
> >
> >  int ipf_set_enabled(struct ipf *ipf, bool v6, bool enable);
> >  int ipf_set_min_frag(struct ipf *ipf, bool v6, uint32_t value);
> > diff --git a/tests/system-traffic.at b/tests/system-traffic.at
> > index 9da84a0734..4765513747 100644
> > --- a/tests/system-traffic.at
> > +++ b/tests/system-traffic.at
> > @@ -3118,6 +3118,39 @@ DPCTL_CHECK_FRAGMENTATION_PASS()
> >  OVS_TRAFFIC_VSWITCHD_STOP
> >  AT_CLEANUP
> >
> > +AT_SETUP([conntrack - IPv4 large fragmentation])
> > +CHECK_CONNTRACK()
> > +OVS_TRAFFIC_VSWITCHD_START()
> > +
> > +ADD_NAMESPACES(at_ns0, at_ns1)
> > +
> > +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24")
> > +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
> > +
> > +dnl Sending ping through conntrack
> > +AT_DATA([flows.txt], [dnl
> > +priority=1,action=drop
> > +priority=10,arp,action=normal
> > +priority=100,in_port=1,ct_state=-trk,icmp,action=ct(zone=9,table=0)
> > +priority=100,in_port=1,ct_state=+new+trk,icmp,action=ct(zone=9,commit),2
> > +priority=100,in_port=1,ct_state=-new+est+trk,icmp,action=2
> > +priority=100,in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9)
> > +priority=100,in_port=2,ct_state=+trk+est-new,icmp,action=1
> > +])
> > +
> > +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
> > +
> > +dnl Modify userspace conntrack fragmentation handling.
> > +DPCTL_MODIFY_FRAGMENTATION()
> > +
> > +dnl Ipv4 larger fragmentation connectivity check.
> > +NS_CHECK_EXEC([at_ns0], [ping -s 65000 -q -c 3 -i 0.3 -w 2 10.1.1.2 |
> FORMAT_PING], [0], [dnl
> > +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> > +])
> > +
> > +OVS_TRAFFIC_VSWITCHD_STOP
> > +AT_CLEANUP
> > +
> >  AT_SETUP([conntrack - IPv4 fragmentation expiry])
> >  CHECK_CONNTRACK()
> >  OVS_TRAFFIC_VSWITCHD_START()
> > diff --git a/tests/test-conntrack.c b/tests/test-conntrack.c
> > index 24c93e4a48..f90b7aa78b 100644
> > --- a/tests/test-conntrack.c
> > +++ b/tests/test-conntrack.c
> > @@ -91,7 +91,7 @@ ct_thread_main(void *aux_)
> >      ovs_barrier_block(&barrier);
> >      for (i = 0; i < n_pkts; i += batch_size) {
> >          conntrack_execute(ct, pkt_batch, dl_type, false, true, 0, NULL,
> NULL,
> > -                          0, 0, NULL, NULL, now, 0);
> > +                          0, 0, NULL, NULL, now, 0, NULL);
> >          DP_PACKET_BATCH_FOR_EACH (j, pkt, pkt_batch) {
> >              pkt_metadata_init_conn(&pkt->md);
> >          }
> > @@ -178,7 +178,7 @@ pcap_batch_execute_conntrack(struct conntrack *ct_,
> >
> >          if (flow.dl_type != dl_type) {
> >              conntrack_execute(ct_, &new_batch, dl_type, false, true, 0,
> > -                              NULL, NULL, 0, 0, NULL, NULL, now, 0);
> > +                              NULL, NULL, 0, 0, NULL, NULL, now, 0,
> NULL);
> >              dp_packet_batch_init(&new_batch);
> >          }
> >          dp_packet_batch_add(&new_batch, packet);
> > @@ -186,7 +186,7 @@ pcap_batch_execute_conntrack(struct conntrack *ct_,
> >
> >      if (!dp_packet_batch_is_empty(&new_batch)) {
> >          conntrack_execute(ct_, &new_batch, dl_type, false, true, 0,
> NULL, NULL,
> > -                          0, 0, NULL, NULL, now, 0);
> > +                          0, 0, NULL, NULL, now, 0, NULL);
> >      }
> >
> >  }
>
> _______________________________________________
> dev mailing list
> d...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>


-- 
hepeng
_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to