Re: [ovs-dev] [PATCH ovn] controller: ofctrl: Use index for meter lookups.

2024-02-26 Thread Han Zhou
On Mon, Feb 26, 2024 at 9:44 AM Ilya Maximets  wrote:
>
> Currently, ovn-controller attempts to sync all the meters on each
> ofctrl_put() call.  And the complexity of this logic is quadratic
> because for each desired meter we perform a full scan of all the
> rows in the Southbound Meter table in order to lookup a matching
> meter.  This is very inefficient.  In a setup with 25K meters this
> operation takes anywhere from 30 to 60 seconds to perform.  All
> that time ovn-controller is blocked and doesn't process any updates.
> So, addition of new ports in such a setup becomes very slow.
>
> The meter lookup is performed by name and we have an index for it
> in the database schema.  Might as well use it.
>
> Using the index for lookup reduces complexity to O(n * log n).
> And the time to process port addition on the same setup drops down
> to just 100 - 300 ms.
>
> We are still iterating over all the desired meters while they can
> probably be processed incrementally instead.  But using an index
> is a simpler fix for now.
>
> Fixes: 885655e16e63 ("controller: reconfigure ovs meters for ovn meters")
> Fixes: 999e1adfb572 ("ovn: Support configuring meters through SB Meter
table.")
> Reported-at: https://issues.redhat.com/browse/FDP-399
> Signed-off-by: Ilya Maximets 
> ---
>
> This is a "performance bug", so the decision to backport this or not
> is on maintainers.  But it is severe enough, IMO.
>

Thanks Ilya. The fix looks good to me. And I think it is ok to backport,
since the change is simple enough.

Acked-by: Han Zhou 

Just curious, how would the OVS perform with this large number of meters?

Thanks,
Han
>  controller/ofctrl.c | 37 ++---
>  controller/ofctrl.h |  2 +-
>  controller/ovn-controller.c |  4 +++-
>  3 files changed, 26 insertions(+), 17 deletions(-)
>
> diff --git a/controller/ofctrl.c b/controller/ofctrl.c
> index cb460a2a4..f14cd79a8 100644
> --- a/controller/ofctrl.c
> +++ b/controller/ofctrl.c
> @@ -2257,18 +2257,29 @@ ofctrl_meter_bands_erase(struct
ovn_extend_table_info *entry,
>  }
>  }
>
> +static const struct sbrec_meter *
> +sb_meter_lookup_by_name(struct ovsdb_idl_index *sbrec_meter_by_name,
> +const char *name)
> +{
> +const struct sbrec_meter *sb_meter;
> +struct sbrec_meter *index_row;
> +
> +index_row = sbrec_meter_index_init_row(sbrec_meter_by_name);
> +sbrec_meter_index_set_name(index_row, name);
> +sb_meter = sbrec_meter_index_find(sbrec_meter_by_name, index_row);
> +sbrec_meter_index_destroy_row(index_row);
> +
> +return sb_meter;
> +}
> +
>  static void
>  ofctrl_meter_bands_sync(struct ovn_extend_table_info *m_existing,
> -const struct sbrec_meter_table *meter_table,
> +struct ovsdb_idl_index *sbrec_meter_by_name,
>  struct ovs_list *msgs)
>  {
>  const struct sbrec_meter *sb_meter;
> -SBREC_METER_TABLE_FOR_EACH (sb_meter, meter_table) {
> -if (!strcmp(m_existing->name, sb_meter->name)) {
> -break;
> -}
> -}
>
> +sb_meter = sb_meter_lookup_by_name(sbrec_meter_by_name,
m_existing->name);
>  if (sb_meter) {
>  /* OFPMC13_ADD or OFPMC13_MODIFY */
>  ofctrl_meter_bands_update(sb_meter, m_existing, msgs);
> @@ -2280,16 +2291,12 @@ ofctrl_meter_bands_sync(struct
ovn_extend_table_info *m_existing,
>
>  static void
>  add_meter(struct ovn_extend_table_info *m_desired,
> -  const struct sbrec_meter_table *meter_table,
> +  struct ovsdb_idl_index *sbrec_meter_by_name,
>struct ovs_list *msgs)
>  {
>  const struct sbrec_meter *sb_meter;
> -SBREC_METER_TABLE_FOR_EACH (sb_meter, meter_table) {
> -if (!strcmp(m_desired->name, sb_meter->name)) {
> -break;
> -}
> -}
>
> +sb_meter = sb_meter_lookup_by_name(sbrec_meter_by_name,
m_desired->name);
>  if (!sb_meter) {
>  static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
>  VLOG_ERR_RL(, "could not find meter named \"%s\"",
m_desired->name);
> @@ -2656,7 +2663,7 @@ ofctrl_put(struct ovn_desired_flow_table
*lflow_table,
> struct ovn_desired_flow_table *pflow_table,
> struct shash *pending_ct_zones,
> struct hmap *pending_lb_tuples,
> -   const struct sbrec_meter_table *meter_table,
> +   struct ovsdb_idl_index *sbrec_meter_by_name,
> uint64_t req_cfg,
> bool lflows_changed,
> bool pflows_changed)
> @@ -2733,10 +2740,10 @@ ofctrl_put(struct ovn_desired_flow_table
*lflow_table,
>   * describes the meter itself. */
>  add_meter_string(m_desired, );
>  } else {
> -add_meter(m_desired, meter_table, );
> +add_meter(m_desired, sbrec_meter_by_name, );
>  }
>  } else {
> -ofctrl_meter_bands_sync(m_existing, 

[ovs-dev] [PATCH v7 2/2] ofproto-dpif-mirror: Add support for pre-selection filter.

2024-02-26 Thread Mike Pattrick
Currently a bridge mirror will collect all packets and tools like
ovs-tcpdump can apply additional filters after they have already been
duplicated by vswitchd. This can result in inefficient collection.

This patch adds support to apply pre-selection to bridge mirrors, which
can limit which packets are mirrored based on flow metadata. This
significantly improves overall vswitchd performance during mirroring if
only a subset of traffic is required.

Signed-off-by: Mike Pattrick 
---
v7:
 - Make sure filter mask is added to masks of non-matching flows.
 - Added additional tests.
---
 Documentation/ref/ovs-tcpdump.8.rst |   8 +-
 NEWS|   3 +
 lib/flow.c  |  21 +++-
 lib/flow.h  |  12 +++
 ofproto/ofproto-dpif-mirror.c   |  78 ++-
 ofproto/ofproto-dpif-mirror.h   |  12 ++-
 ofproto/ofproto-dpif-xlate.c|  26 -
 ofproto/ofproto-dpif.c  |   9 +-
 ofproto/ofproto-dpif.h  |   6 ++
 ofproto/ofproto.c   |   4 +-
 ofproto/ofproto.h   |   3 +
 tests/ofproto-dpif.at   | 142 
 utilities/ovs-tcpdump.in|  13 ++-
 vswitchd/bridge.c   |  13 ++-
 vswitchd/vswitch.ovsschema  |   5 +-
 vswitchd/vswitch.xml|  13 +++
 16 files changed, 343 insertions(+), 25 deletions(-)

diff --git a/Documentation/ref/ovs-tcpdump.8.rst 
b/Documentation/ref/ovs-tcpdump.8.rst
index b9f8cdf6f..e21e61211 100644
--- a/Documentation/ref/ovs-tcpdump.8.rst
+++ b/Documentation/ref/ovs-tcpdump.8.rst
@@ -61,8 +61,14 @@ Options
 
   If specified, mirror all ports (optional).
 
+* ``--filter ``
+
+  If specified, only mirror flows that match the provided OpenFlow filter.
+  The available fields are documented in ``ovs-fields(7)``.
+
 See Also
 
 
 ``ovs-appctl(8)``, ``ovs-vswitchd(8)``, ``ovs-pcap(1)``,
-``ovs-tcpundump(1)``, ``tcpdump(8)``, ``wireshark(8)``.
+``ovs-fields(7)``, ``ovs-tcpundump(1)``, ``tcpdump(8)``,
+``wireshark(8)``.
diff --git a/NEWS b/NEWS
index c9e4064e6..35f7eb0c7 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,9 @@ Post-v3.3.0
  * Conntrack now supports 'random' flag for selecting ports in a range
while natting and 'persistent' flag for selection of the IP address
from a range.
+   - OVSDB:
+ * Added a new filter column in the Mirror table which can be used to
+   apply filters to mirror ports.
 
 
 v3.3.0 - 16 Feb 2024
diff --git a/lib/flow.c b/lib/flow.c
index 8e3402388..a088bdc86 100644
--- a/lib/flow.c
+++ b/lib/flow.c
@@ -3569,7 +3569,7 @@ miniflow_equal_in_minimask(const struct miniflow *a, 
const struct miniflow *b,
 return true;
 }
 
-/* Returns true if 'a' and 'b' are equal at the places where there are 1-bits
+/* Returns true if 'a' and 'b' are equal at the places where there are 0-bits
  * in 'mask', false if they differ. */
 bool
 miniflow_equal_flow_in_minimask(const struct miniflow *a, const struct flow *b,
@@ -3587,6 +3587,25 @@ miniflow_equal_flow_in_minimask(const struct miniflow 
*a, const struct flow *b,
 return true;
 }
 
+/* Returns false if 'a' and 'b' differ in places where there are 1-bits in
+ * 'wc', true otherwise. */
+bool
+miniflow_equal_flow_in_flow_wc(const struct miniflow *a, const struct flow *b,
+   const struct flow_wildcards *wc)
+{
+const struct flow *wc_masks = >masks;
+size_t idx;
+
+FLOWMAP_FOR_EACH_INDEX (idx, a->map) {
+if ((miniflow_get(a, idx) ^ flow_u64_value(b, idx)) &
+flow_u64_value(wc_masks, idx)) {
+return false;
+}
+}
+
+return true;
+}
+
 
 void
 minimask_init(struct minimask *mask, const struct flow_wildcards *wc)
diff --git a/lib/flow.h b/lib/flow.h
index 75a9be3c1..a644be39d 100644
--- a/lib/flow.h
+++ b/lib/flow.h
@@ -748,6 +748,9 @@ bool miniflow_equal_in_minimask(const struct miniflow *a,
 bool miniflow_equal_flow_in_minimask(const struct miniflow *a,
  const struct flow *b,
  const struct minimask *);
+bool miniflow_equal_flow_in_flow_wc(const struct miniflow *a,
+const struct flow *b,
+const struct flow_wildcards *);
 uint32_t miniflow_hash_5tuple(const struct miniflow *flow, uint32_t basis);
 
 
@@ -939,6 +942,15 @@ flow_union_with_miniflow(struct flow *dst, const struct 
miniflow *src)
 flow_union_with_miniflow_subset(dst, src, src->map);
 }
 
+/* Perform a bitwise OR of minimask 'src' mask data with the equivalent
+ * fields in 'dst', storing the result in 'dst'. */
+static inline void
+flow_wildcards_union_with_minimask(struct flow_wildcards *dst,
+   const struct minimask *src)
+{
+flow_union_with_miniflow_subset(>masks, >masks, src->masks.map);
+}
+
 static inline bool is_ct_valid(const struct flow *flow,
   

[ovs-dev] [PATCH v7 1/2] ofproto-dpif-mirror: Reduce number of function parameters.

2024-02-26 Thread Mike Pattrick
Previously the mirror_set() and mirror_get() functions took a large
number of parameters, which was inefficient and difficult to read and
extend. This patch moves most of the parameters into a struct.

Signed-off-by: Mike Pattrick 
Acked-by: Simon Horman 
Acked-by: Eelco Chaudron 
---
 ofproto/ofproto-dpif-mirror.c | 61 ++-
 ofproto/ofproto-dpif-mirror.h | 42 +++-
 ofproto/ofproto-dpif-xlate.c  | 29 -
 ofproto/ofproto-dpif.c| 23 ++---
 4 files changed, 91 insertions(+), 64 deletions(-)

diff --git a/ofproto/ofproto-dpif-mirror.c b/ofproto/ofproto-dpif-mirror.c
index 343b75f0e..a84c843b3 100644
--- a/ofproto/ofproto-dpif-mirror.c
+++ b/ofproto/ofproto-dpif-mirror.c
@@ -207,19 +207,23 @@ mirror_bundle_dst(struct mbridge *mbridge, struct 
ofbundle *ofbundle)
 }
 
 int
-mirror_set(struct mbridge *mbridge, void *aux, const char *name,
-   struct ofbundle **srcs, size_t n_srcs,
-   struct ofbundle **dsts, size_t n_dsts,
-   unsigned long *src_vlans, struct ofbundle *out_bundle,
-   uint16_t snaplen,
-   uint16_t out_vlan)
+mirror_set(struct mbridge *mbridge, void *aux,
+   const struct ofproto_mirror_settings *ms,
+   const struct mirror_bundles *mb)
+
 {
 struct mbundle *mbundle, *out;
 mirror_mask_t mirror_bit;
 struct mirror *mirror;
 struct hmapx srcs_map;  /* Contains "struct ofbundle *"s. */
 struct hmapx dsts_map;  /* Contains "struct ofbundle *"s. */
+uint16_t out_vlan;
+
+if (!ms || !mbridge) {
+return EINVAL;
+}
 
+out_vlan = ms->out_vlan;
 mirror = mirror_lookup(mbridge, aux);
 if (!mirror) {
 int idx;
@@ -227,7 +231,7 @@ mirror_set(struct mbridge *mbridge, void *aux, const char 
*name,
 idx = mirror_scan(mbridge);
 if (idx < 0) {
 VLOG_WARN("maximum of %d port mirrors reached, cannot create %s",
-  MAX_MIRRORS, name);
+  MAX_MIRRORS, ms->name);
 return EFBIG;
 }
 
@@ -242,8 +246,8 @@ mirror_set(struct mbridge *mbridge, void *aux, const char 
*name,
 unsigned long *vlans = ovsrcu_get(unsigned long *, >vlans);
 
 /* Get the new configuration. */
-if (out_bundle) {
-out = mbundle_lookup(mbridge, out_bundle);
+if (mb->out_bundle) {
+out = mbundle_lookup(mbridge, mb->out_bundle);
 if (!out) {
 mirror_destroy(mbridge, mirror->aux);
 return EINVAL;
@@ -252,16 +256,16 @@ mirror_set(struct mbridge *mbridge, void *aux, const char 
*name,
 } else {
 out = NULL;
 }
-mbundle_lookup_multiple(mbridge, srcs, n_srcs, _map);
-mbundle_lookup_multiple(mbridge, dsts, n_dsts, _map);
+mbundle_lookup_multiple(mbridge, mb->srcs, mb->n_srcs, _map);
+mbundle_lookup_multiple(mbridge, mb->dsts, mb->n_dsts, _map);
 
 /* If the configuration has not changed, do nothing. */
 if (hmapx_equals(_map, >srcs)
 && hmapx_equals(_map, >dsts)
-&& vlan_bitmap_equal(vlans, src_vlans)
+&& vlan_bitmap_equal(vlans, ms->src_vlans)
 && mirror->out == out
 && mirror->out_vlan == out_vlan
-&& mirror->snaplen == snaplen)
+&& mirror->snaplen == ms->snaplen)
 {
 hmapx_destroy(_map);
 hmapx_destroy(_map);
@@ -275,15 +279,15 @@ mirror_set(struct mbridge *mbridge, void *aux, const char 
*name,
 hmapx_swap(_map, >dsts);
 hmapx_destroy(_map);
 
-if (vlans || src_vlans) {
+if (vlans || ms->src_vlans) {
 ovsrcu_postpone(free, vlans);
-vlans = vlan_bitmap_clone(src_vlans);
+vlans = vlan_bitmap_clone(ms->src_vlans);
 ovsrcu_set(>vlans, vlans);
 }
 
 mirror->out = out;
 mirror->out_vlan = out_vlan;
-mirror->snaplen = snaplen;
+mirror->snaplen = ms->snaplen;
 
 /* Update mbundles. */
 mirror_bit = MIRROR_MASK_C(1) << mirror->idx;
@@ -406,23 +410,22 @@ mirror_update_stats(struct mbridge *mbridge, 
mirror_mask_t mirrors,
 /* Retrieves the mirror numbered 'index' in 'mbridge'.  Returns true if such a
  * mirror exists, false otherwise.
  *
- * If successful, '*vlans' receives the mirror's VLAN membership information,
+ * If successful 'mc->vlans' receives the mirror's VLAN membership information,
  * either a null pointer if the mirror includes all VLANs or a 4096-bit bitmap
  * in which a 1-bit indicates that the mirror includes a particular VLAN,
- * '*dup_mirrors' receives a bitmap of mirrors whose output duplicates mirror
- * 'index', '*out' receives the output ofbundle (if any), and '*out_vlan'
- * receives the output VLAN (if any).
+ * 'mc->dup_mirrors' receives a bitmap of mirrors whose output duplicates
+ * mirror 'index', 'mc->out' receives the output ofbundle (if any),
+ * and 'mc->out_vlan' receives the output VLAN (if any).
  *
  * Everything returned here is assumed to be RCU protected.
  */
 bool

Re: [ovs-dev] [PATCH ovn] controller: ofctrl: Use index for meter lookups.

2024-02-26 Thread Ilya Maximets
On 2/26/24 18:44, Ilya Maximets wrote:
> Currently, ovn-controller attempts to sync all the meters on each
> ofctrl_put() call.  And the complexity of this logic is quadratic
> because for each desired meter we perform a full scan of all the
> rows in the Southbound Meter table in order to lookup a matching
> meter.  This is very inefficient.  In a setup with 25K meters this
> operation takes anywhere from 30 to 60 seconds to perform.  All
> that time ovn-controller is blocked and doesn't process any updates.
> So, addition of new ports in such a setup becomes very slow.
> 
> The meter lookup is performed by name and we have an index for it
> in the database schema.  Might as well use it.
> 
> Using the index for lookup reduces complexity to O(n * log n).
> And the time to process port addition on the same setup drops down
> to just 100 - 300 ms.
> 
> We are still iterating over all the desired meters while they can
> probably be processed incrementally instead.  But using an index
> is a simpler fix for now.
> 
> Fixes: 885655e16e63 ("controller: reconfigure ovs meters for ovn meters")
> Fixes: 999e1adfb572 ("ovn: Support configuring meters through SB Meter 
> table.")
> Reported-at: https://issues.redhat.com/browse/FDP-399
> Signed-off-by: Ilya Maximets 
> ---
> 
> This is a "performance bug", so the decision to backport this or not
> is on maintainers.  But it is severe enough, IMO.
> 
>  controller/ofctrl.c | 37 ++---
>  controller/ofctrl.h |  2 +-
>  controller/ovn-controller.c |  4 +++-
>  3 files changed, 26 insertions(+), 17 deletions(-)


Recheck-request: github-robot-_Build_and_Test
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH ovn] controller: ofctrl: Use index for meter lookups.

2024-02-26 Thread Ilya Maximets
Currently, ovn-controller attempts to sync all the meters on each
ofctrl_put() call.  And the complexity of this logic is quadratic
because for each desired meter we perform a full scan of all the
rows in the Southbound Meter table in order to lookup a matching
meter.  This is very inefficient.  In a setup with 25K meters this
operation takes anywhere from 30 to 60 seconds to perform.  All
that time ovn-controller is blocked and doesn't process any updates.
So, addition of new ports in such a setup becomes very slow.

The meter lookup is performed by name and we have an index for it
in the database schema.  Might as well use it.

Using the index for lookup reduces complexity to O(n * log n).
And the time to process port addition on the same setup drops down
to just 100 - 300 ms.

We are still iterating over all the desired meters while they can
probably be processed incrementally instead.  But using an index
is a simpler fix for now.

Fixes: 885655e16e63 ("controller: reconfigure ovs meters for ovn meters")
Fixes: 999e1adfb572 ("ovn: Support configuring meters through SB Meter table.")
Reported-at: https://issues.redhat.com/browse/FDP-399
Signed-off-by: Ilya Maximets 
---

This is a "performance bug", so the decision to backport this or not
is on maintainers.  But it is severe enough, IMO.

 controller/ofctrl.c | 37 ++---
 controller/ofctrl.h |  2 +-
 controller/ovn-controller.c |  4 +++-
 3 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/controller/ofctrl.c b/controller/ofctrl.c
index cb460a2a4..f14cd79a8 100644
--- a/controller/ofctrl.c
+++ b/controller/ofctrl.c
@@ -2257,18 +2257,29 @@ ofctrl_meter_bands_erase(struct ovn_extend_table_info 
*entry,
 }
 }
 
+static const struct sbrec_meter *
+sb_meter_lookup_by_name(struct ovsdb_idl_index *sbrec_meter_by_name,
+const char *name)
+{
+const struct sbrec_meter *sb_meter;
+struct sbrec_meter *index_row;
+
+index_row = sbrec_meter_index_init_row(sbrec_meter_by_name);
+sbrec_meter_index_set_name(index_row, name);
+sb_meter = sbrec_meter_index_find(sbrec_meter_by_name, index_row);
+sbrec_meter_index_destroy_row(index_row);
+
+return sb_meter;
+}
+
 static void
 ofctrl_meter_bands_sync(struct ovn_extend_table_info *m_existing,
-const struct sbrec_meter_table *meter_table,
+struct ovsdb_idl_index *sbrec_meter_by_name,
 struct ovs_list *msgs)
 {
 const struct sbrec_meter *sb_meter;
-SBREC_METER_TABLE_FOR_EACH (sb_meter, meter_table) {
-if (!strcmp(m_existing->name, sb_meter->name)) {
-break;
-}
-}
 
+sb_meter = sb_meter_lookup_by_name(sbrec_meter_by_name, m_existing->name);
 if (sb_meter) {
 /* OFPMC13_ADD or OFPMC13_MODIFY */
 ofctrl_meter_bands_update(sb_meter, m_existing, msgs);
@@ -2280,16 +2291,12 @@ ofctrl_meter_bands_sync(struct ovn_extend_table_info 
*m_existing,
 
 static void
 add_meter(struct ovn_extend_table_info *m_desired,
-  const struct sbrec_meter_table *meter_table,
+  struct ovsdb_idl_index *sbrec_meter_by_name,
   struct ovs_list *msgs)
 {
 const struct sbrec_meter *sb_meter;
-SBREC_METER_TABLE_FOR_EACH (sb_meter, meter_table) {
-if (!strcmp(m_desired->name, sb_meter->name)) {
-break;
-}
-}
 
+sb_meter = sb_meter_lookup_by_name(sbrec_meter_by_name, m_desired->name);
 if (!sb_meter) {
 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
 VLOG_ERR_RL(, "could not find meter named \"%s\"", m_desired->name);
@@ -2656,7 +2663,7 @@ ofctrl_put(struct ovn_desired_flow_table *lflow_table,
struct ovn_desired_flow_table *pflow_table,
struct shash *pending_ct_zones,
struct hmap *pending_lb_tuples,
-   const struct sbrec_meter_table *meter_table,
+   struct ovsdb_idl_index *sbrec_meter_by_name,
uint64_t req_cfg,
bool lflows_changed,
bool pflows_changed)
@@ -2733,10 +2740,10 @@ ofctrl_put(struct ovn_desired_flow_table *lflow_table,
  * describes the meter itself. */
 add_meter_string(m_desired, );
 } else {
-add_meter(m_desired, meter_table, );
+add_meter(m_desired, sbrec_meter_by_name, );
 }
 } else {
-ofctrl_meter_bands_sync(m_existing, meter_table, );
+ofctrl_meter_bands_sync(m_existing, sbrec_meter_by_name, );
 }
 }
 
diff --git a/controller/ofctrl.h b/controller/ofctrl.h
index 46bfccd85..502c73da6 100644
--- a/controller/ofctrl.h
+++ b/controller/ofctrl.h
@@ -58,7 +58,7 @@ void ofctrl_put(struct ovn_desired_flow_table *lflow_table,
 struct ovn_desired_flow_table *pflow_table,
 struct shash *pending_ct_zones,
 struct hmap *pending_lb_tuples,
-   

Re: [ovs-dev] [PATCH OVN] DHCP Relay Agent support for overlay subnets

2024-02-26 Thread Naveen Yerramneni


> On 24-Jan-2024, at 6:30 PM, Naveen Yerramneni  
> wrote:
> 
> 
> 
>> On 24-Jan-2024, at 8:59 AM, Numan Siddique  wrote:
>> 
>> On Tue, Jan 23, 2024 at 8:02 PM Naveen Yerramneni
>>  wrote:
>>> 
>>> 
>>> 
 On 16-Jan-2024, at 2:30 AM, Numan Siddique  wrote:
 
 On Tue, Dec 12, 2023 at 1:05 PM Naveen Yerramneni
  wrote:
> 
>  This patch contains changes to enable DHCP Relay Agent support for 
> overlay subnets.
> 
>  USE CASE:
>  --
>- Enable IP address assignment for overlay subnets from the 
> centralized DHCP server present in the underlay network.
> 
>  PREREQUISITES
>  --
>- Logical Router Port IP should be assigned (statically) from the same 
> overlay subnet which is managed by DHCP server.
>- LRP IP is used for GIADRR field when relaying the DHCP packets and 
> also same IP needs to be configured as default gateway for the overlay 
> subnet.
>- Overlay subnets managed by external DHCP server are expected to be 
> directly reachable from the underlay network.
> 
>  EXPECTED PACKET FLOW:
>  --
>  Following is the expected packet flow inorder to support DHCP rleay 
> functionality in OVN.
>1. DHCP client originates DHCP discovery (broadcast).
>2. DHCP relay (running on the OVN) receives the broadcast and forwards 
> the packet to the DHCP server by converting it to unicast. While 
> forwarding the packet, it updates the GIADDR in DHCP header to its
>   interface IP on which DHCP packet is received.
>3. DHCP server uses GIADDR field to decide the IP address pool from 
> which IP has to be assigned and DHCP offer is sent to the same IP 
> (GIADDR).
>4. DHCP relay agent forwards the offer to the client, it resets the 
> GIADDR field when forwarding the offer to the client.
>5. DHCP client sends DHCP request (broadcast) packet.
>6. DHCP relay (running on the OVN) receives the broadcast and forwards 
> the packet to the DHCP server by converting it to unicast. While 
> forwarding the packet, it updates the GIADDR in DHCP header to its
>   interface IP on which DHCP packet is received.
>7. DHCP Server sends the ACK packet.
>8. DHCP relay agent forwards the ACK packet to the client, it resets 
> the GIADDR field when forwarding the ACK to the client.
>9. All the future renew/release packets are directly exchanged between 
> DHCP client and DHCP server.
> 
>  OVN DHCP RELAY PACKET FLOW:
>  
>  To add DHCP Relay support on OVN, we need to replicate all the behavior 
> described above using distributed logical switch and logical router.
>  At, highlevel packet flow is distributed among Logical Switch and 
> Logical Router on source node (where VM is deployed) and redirect 
> chassis(RC) node.
>1. Request packet gets processed on the source node where VM is 
> deployed and relays the packet to DHCP server.
>2. Response packet is first processed on RC node (which first recieves 
> the packet from underlay network). RC node forwards the packet to the 
> right node by filling in the dest MAC and IP.
> 
>  OVN Packet flow with DHCP relay is explained below.
>1. DHCP client (VM) sends the DHCP discover packet (broadcast).
>2. Logical switch converts the packet to L2 unicast by setting the 
> destination MAC to LRP's MAC
>3. Logical Router receives the packet and redirects it to the OVN 
> controller.
>4. OVN controller updates the required information(GIADDR) in the DHCP 
> payload after doing the required checks. If any check fails, packet is 
> dropped.
>5. Logical Router converts the packet to L3 unicast and forwards it to 
> the server. This packets gets routed like any other packet (via RC node).
>6. Server replies with DHCP offer.
>7. RC node processes the DHCP offer and forwards it to the OVN 
> controller.
>8. OVN controller does sanity checks and  updates the destination MAC 
> (available in DHCP header), destination IP (available in DHCP header), 
> resets GIADDR  and reinjects the packet to datapath.
>   If any check fails, packet is dropped.
>9. Logical router updates the source IP and port and forwards the 
> packet to logical switch.
>10. Logical switch delivers the packet to the DHCP client.
>11. Similar steps are performed for Request and Ack packets.
>12. All the future renew/release packets are directly exchanged 
> between DHCP client and DHCP server
> 
>  NEW OVN ACTIONS
>  ---
> 
>1. dhcp_relay_req(, )
>- This action executes on the source node on which the DHCP 
> request originated.
>- This action relays the DHCP 

Re: [ovs-dev] [PATCH ovn v2] ovn-controller: Stop dropping bind_vport requests immediately after handling.

2024-02-26 Thread 0-day Robot
Bleep bloop.  Greetings Mohammad Heib, I am a robot and I have tried out your 
patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
WARNING: The subject, ': ', is over 70 characters, i.e., 77.
Subject: ovn-controller: Stop dropping bind_vport requests immediately after 
handling.
Lines checked: 140, Warnings: 1, Errors: 0


Please check this out.  If you feel there has been an error, please email 
acon...@redhat.com

Thanks,
0-day Robot
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovn] ovn-controller: Stop dropping bind_vport requests immediately after handling.

2024-02-26 Thread Mohammad Heib
Hi Ales,
Thank you for the review
i addressed all your comments in v2.
regarding the test actually, it's a bit hard to test that using a unit test
because we need to let ovn-sb ignore the first binding request which is not
applicable using a unit test, i was testing that manually by dropping the
connection to the SB that will drop the first bind request but it a bit
hard
to do that i an unit test.


On Fri, Feb 2, 2024 at 1:21 PM Ales Musil  wrote:

>
>
> On Fri, Feb 2, 2024 at 12:19 PM Ales Musil  wrote:
>
>>
>>
>> On Tue, Jan 30, 2024 at 2:59 PM Mohammad Heib  wrote:
>>
>>> ovn-controller immediately removes the vport_bindings requests that were
>>> generated by VIFs after handling them locally, this approach is intended
>>> to avoid binding the vport to one VIF only and allocate the vport
>>> between the different VIFs that exist in the vport:virtual-parents.
>>>
>>> Although the behavior mentioned above is correct, in some cases when the
>>> SB Database is busy the transaction that binds this vport to the desired
>>> VIF/chassis can fail and the controller will not re-try to bind the
>>> vport again because we deleted the bind_vport request in the previous
>>> loop/TXN.
>>>
>>> This patch aims to change the above behavior by storing the bind_vport
>>> requests for a bit longer time and this is done by the following:
>>> 1. add relevancy_time for each new bind_vport request and
>>>mark this request as new.
>>>
>>> 2. loop0: ovn-controller will try to handle this bind_vport request
>>>for the first time as usual (no change).
>>>
>>>3. loop0: ovn-controller will try to delete the already handled
>>> bind_vport
>>>   request as usual but first, it will check if this request is
>>> marked as new and
>>>   if the relevancy_time is still valid if so the controller will
>>> mark this
>>>   request as an old request and keep it, otherwise remove it.
>>>
>>>4.loop1: ovn-controller will try to commit the same change again for
>>>  the old request, if the previous commit in loop0 succeeded the
>>>  change will not have any effect on SB, otherwise we will try to
>>>  commit the same vport_bind request again.
>>>
>>>   5. loop1: delete the old bind_vport request.
>>>
>>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1954659
>>> Signed-off-by: Mohammad Heib 
>>> ---
>>>
>>
>> Hi  Mohammad,
>>
>> overall the change makes sense, I have a couple of comments see down
>> below.
>>
>>  controller/pinctrl.c | 58 +++-
>>>  1 file changed, 52 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/controller/pinctrl.c b/controller/pinctrl.c
>>> index bd3bd3d81..152962448 100644
>>> --- a/controller/pinctrl.c
>>> +++ b/controller/pinctrl.c
>>> @@ -6519,10 +6519,52 @@ struct put_vport_binding {
>>>  uint32_t vport_key;
>>>
>>>  uint32_t vport_parent_key;
>>> +
>>> +/* This vport record Only relevant if "relevancy_time"
>>> + * is earlier than the current_time, "new_record" is true.
>>> + */
>>> +long long int relevancy_time;
>>>
>>
>> The intention of the variable should be probably clearer e.g.
>> relevant_until_ms.
>>
>> Also reading through the rest of the code it doesn't seem possible that
>> the binding wouldn't be deleted, hence I think there isn't any need for the
>> relevancy time, it should be enough to have a flag that will be flipped. In
>> any case we will try to commit twice. I would leave out the whole relevancy
>> and keep the flag flipping it on first commit WDYT?
>>
>>
>>> +bool new_record;
>>>  };
>>>
>>>  /* Contains "struct put_vport_binding"s. */
>>>  static struct hmap put_vport_bindings;
>>> +/* the relevance time in ms of vport record before deleteing. */
>>> +#define VPORT_RELEVANCE_TIME 1500
>>> +
>>> +/*
>>> + * Validate if the vport_binding record that was added
>>> + * by the pinctrl thread is still relevant and needs
>>> + * to be updated in the SBDB or not.
>>> + *
>>> + * vport_binding record is only relevant and needs to be updated in SB
>>> if:
>>> + *   1. The put_vport_binding:relevancy_time still valid.
>>> + *   2. The put_vport_binding:new_record is true:
>>> + *   The new_record will be set to "true" when this vport record is
>>> created
>>> + *   by function "pinctrl_handle_bind_vport".
>>> + *
>>> + *   After the first attempt to bind this vport to the chassis and
>>> + *   virtual_parent by function "run_put_vport_bindings" we will
>>> set the
>>> + *   value of vpb:new_record to "false" and keep it in
>>> "put_vport_bindings"
>>> + *
>>> + *   After the second attempt of binding the vpb it will be removed
>>> by
>>> + *   this function.
>>> + *
>>> + *   The above guarantees that we will try to bind the vport twice
>>> in
>>> + *   a certain amount of time.
>>> + *
>>> +*/
>>> +static bool
>>> +is_vport_binding_relevant(struct put_vport_binding *vpb)
>>> +{
>>> +long long int cur_time = time_msec();
>>> +
>>> +if 

[ovs-dev] [PATCH ovn v2] ovn-controller: Stop dropping bind_vport requests immediately after handling.

2024-02-26 Thread Mohammad Heib
ovn-controller immediately removes the vport_bindings requests that were
generated by VIFs after handling them locally, this approach is intended
to avoid binding the vport to one VIF only and allocate the vport
between the different VIFs that exist in the vport:virtual-parents.

Although the behavior mentioned above is correct, in some cases when the
SB Database is busy the transaction that binds this vport to the desired
VIF/chassis can fail and the controller will not re-try to bind the
vport again because we deleted the bind_vport request in the previous
loop/TXN.

This patch aims to change the above behavior by storing the bind_vport
requests for a bit longer time and this is done by the following:
1. mark each new bind_vport request as new.

2. loop0: ovn-controller will try to handle this bind_vport request
   for the first time as usual (no change).

3. loop0: ovn-controller will try to delete the already handled bind_vport
   request as usual but first, it will check if this request is marked as 
new and
   if so the controller will mark this request as an old request and keep 
it,
   otherwise remove it.

4. loop1: ovn-controller will try to commit the same change again for
   the old request, if the previous commit in loop0 succeeded the
   change will not have any effect on SB, otherwise we will try to
   commit the same vport_bind request again.

5. loop1: delete the old bind_vport request.

Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1954659
Signed-off-by: Mohammad Heib 
---
V2:
  Address comments from Ales in v1.
---
 controller/pinctrl.c | 50 ++--
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/controller/pinctrl.c b/controller/pinctrl.c
index 98b29de9f..e2f86f299 100644
--- a/controller/pinctrl.c
+++ b/controller/pinctrl.c
@@ -6529,11 +6529,46 @@ struct put_vport_binding {
 uint32_t vport_key;
 
 uint32_t vport_parent_key;
+
+/* This vport record Only relevant if "new_record" is true. */
+bool new_record;
 };
 
 /* Contains "struct put_vport_binding"s. */
 static struct hmap put_vport_bindings;
 
+/*
+ * Validate if the vport_binding record that was added
+ * by the pinctrl thread is still relevant and needs
+ * to be updated in the SBDB or not.
+ *
+ * vport_binding record is only relevant and needs to be updated in SB if:
+ *   2. The put_vport_binding:new_record is true:
+ *   The new_record will be set to "true" when this vport record is created
+ *   by function "pinctrl_handle_bind_vport".
+ *
+ *   After the first attempt to bind this vport to the chassis and
+ *   virtual_parent by function "run_put_vport_bindings" we will set the
+ *   value of vpb:new_record to "false" and keep it in "put_vport_bindings"
+ *
+ *   After the second attempt of binding the vpb it will be removed by
+ *   this function.
+ *
+ *   The above guarantees that we will try to bind the vport twice in
+ *   a certain amount of time.
+ *
+*/
+static bool
+is_vport_binding_relevant(struct put_vport_binding *vpb)
+{
+
+if (vpb->new_record) {
+vpb->new_record = false;
+return true;
+}
+return false;
+}
+
 static void
 init_put_vport_bindings(void)
 {
@@ -6541,18 +6576,21 @@ init_put_vport_bindings(void)
 }
 
 static void
-flush_put_vport_bindings(void)
+flush_put_vport_bindings(bool force_flush)
 {
 struct put_vport_binding *vport_b;
-HMAP_FOR_EACH_POP (vport_b, hmap_node, _vport_bindings) {
-free(vport_b);
+HMAP_FOR_EACH_SAFE (vport_b, hmap_node, _vport_bindings) {
+if (!is_vport_binding_relevant(vport_b) || force_flush) {
+hmap_remove(_vport_bindings, _b->hmap_node);
+free(vport_b);
+}
 }
 }
 
 static void
 destroy_put_vport_bindings(void)
 {
-flush_put_vport_bindings();
+flush_put_vport_bindings(true);
 hmap_destroy(_vport_bindings);
 }
 
@@ -6630,7 +6668,7 @@ run_put_vport_bindings(struct ovsdb_idl_txn 
*ovnsb_idl_txn,
   sbrec_port_binding_by_key, chassis, vpb);
 }
 
-flush_put_vport_bindings();
+flush_put_vport_bindings(false);
 }
 
 /* Called with in the pinctrl_handler thread context. */
@@ -6668,7 +6706,7 @@ pinctrl_handle_bind_vport(
 vpb->dp_key = dp_key;
 vpb->vport_key = vport_key;
 vpb->vport_parent_key = vport_parent_key;
-
+vpb->new_record = true;
 notify_pinctrl_main();
 }
 
-- 
2.34.3

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v2] dp-packet: Don't offload inner csum if outer isn't supported.

2024-02-26 Thread Mike Pattrick
Some network cards support inner checksum offloading but not outer
checksum offloading. Currently OVS will resolve that outer checksum but
allows the network card to resolve the inner checksum, invalidating the
outer checksum in the process.

Now if we can't offload outer checksums, we don't offload inner either.

Reported-at: https://issues.redhat.com/browse/FDP-363
Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.")
Signed-off-by: Mike Pattrick 
---
nb: I also tested a more complex patch that only resolved the inner
checksum and offloaded the UDP layer. This didn't noticably improve
performance.
v2: Added IPv4 flag
---
 lib/dp-packet.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/lib/dp-packet.c b/lib/dp-packet.c
index 305822293..df7bf8e6b 100644
--- a/lib/dp-packet.c
+++ b/lib/dp-packet.c
@@ -592,6 +592,18 @@ dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t 
flags)
 if (dp_packet_hwol_is_tunnel_geneve(p) ||
 dp_packet_hwol_is_tunnel_vxlan(p)) {
 tnl_inner = true;
+
+/* If the TX interface doesn't support UDP tunnel offload but does
+ * support inner checksum offload and an outer UDP checksum is
+ * required, then we can't offload inner checksum either. As that would
+ * invalidate the outer checksum. */
+if (!(flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM) &&
+dp_packet_hwol_is_outer_udp_cksum(p)) {
+flags &= ~(NETDEV_TX_OFFLOAD_TCP_CKSUM |
+   NETDEV_TX_OFFLOAD_UDP_CKSUM |
+   NETDEV_TX_OFFLOAD_SCTP_CKSUM |
+   NETDEV_TX_OFFLOAD_IPV4_CKSUM);
+}
 }
 
 if (dp_packet_hwol_tx_ip_csum(p)) {
-- 
2.39.3

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH ovn v3] OVN-SB: Exposes igmp group protocol version through IGMP table.

2024-02-26 Thread Mohammad Heib
Expose the igmp/mld group protocol version through the
IGMP_GROUP table in SBDB.

This patch can be used by ovn consumer for debuggability purposes, user
now can  match between the protocol version used in the OVN logical
switches and the uplink ports.

Rreported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2160476
Signed-off-by: Mohammad Heib 
---
v3:
  Address Ales comments in v2 and rebase over main.
---
 NEWS  |  2 ++
 controller/ip-mcast.c | 11 +--
 controller/ip-mcast.h | 11 ++-
 controller/pinctrl.c  | 16 ++--
 northd/ovn-northd.c   |  2 +-
 ovn-sb.ovsschema  |  5 +++--
 ovn-sb.xml|  4 
 tests/ovn.at  |  3 +++
 8 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/NEWS b/NEWS
index c0131ceee..784fedfa0 100644
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,8 @@ Post v24.03.0
 cloned to all unknown ports connected to the same Logical Switch.
   - Added a new logical switch port option "disable_arp_nd_rsp" to
 disable adding the ARP responder flows if set to true.
+  - IGMP_Group has new "protocol" column that displays the the group
+protocol version.
 
 OVN v24.03.0 - xx xxx 
 --
diff --git a/controller/ip-mcast.c b/controller/ip-mcast.c
index b457c7e69..0969cc07f 100644
--- a/controller/ip-mcast.c
+++ b/controller/ip-mcast.c
@@ -111,11 +111,12 @@ igmp_mrouter_create(struct ovsdb_idl_txn *idl_txn,
 }
 
 void
-igmp_group_update_ports(const struct sbrec_igmp_group *g,
+igmp_group_update(const struct sbrec_igmp_group *g,
 struct ovsdb_idl_index *datapaths,
 struct ovsdb_idl_index *port_bindings,
 const struct mcast_snooping *ms OVS_UNUSED,
-const struct mcast_group *mc_group)
+const struct mcast_group *mc_group,
+const bool igmp_support_protocol)
 OVS_REQ_RDLOCK(ms->rwlock)
 {
 struct igmp_group_port *old_ports_storage =
@@ -155,6 +156,12 @@ igmp_group_update_ports(const struct sbrec_igmp_group *g,
 sbrec_igmp_group_update_ports_delvalue(g, igmp_port->port);
 }
 
+/* set Group protocol */
+if (igmp_support_protocol) {
+ sbrec_igmp_group_set_protocol(g, mcast_snooping_group_protocol_str(
+   mc_group->protocol_version));
+}
+
 free(old_ports_storage);
 hmap_destroy(_ports);
 }
diff --git a/controller/ip-mcast.h b/controller/ip-mcast.h
index eebada968..026503139 100644
--- a/controller/ip-mcast.h
+++ b/controller/ip-mcast.h
@@ -47,11 +47,12 @@ struct sbrec_igmp_group *igmp_mrouter_create(
 const struct sbrec_chassis *chassis,
 bool igmp_group_has_chassis_name);
 
-void igmp_group_update_ports(const struct sbrec_igmp_group *g,
- struct ovsdb_idl_index *datapaths,
- struct ovsdb_idl_index *port_bindings,
- const struct mcast_snooping *ms,
- const struct mcast_group *mc_group)
+void igmp_group_update(const struct sbrec_igmp_group *g,
+   struct ovsdb_idl_index *datapaths,
+   struct ovsdb_idl_index *port_bindings,
+   const struct mcast_snooping *ms,
+   const struct mcast_group *mc_group,
+   const bool igmp_support_protocol)
 OVS_REQ_RDLOCK(ms->rwlock);
 void
 igmp_mrouter_update_ports(const struct sbrec_igmp_group *g,
diff --git a/controller/pinctrl.c b/controller/pinctrl.c
index 98b29de9f..66e390e80 100644
--- a/controller/pinctrl.c
+++ b/controller/pinctrl.c
@@ -181,6 +181,7 @@ struct pinctrl {
 bool fdb_can_timestamp;
 bool dns_supports_ovn_owned;
 bool igmp_group_has_chassis_name;
+bool igmp_support_protocol;
 };
 
 static struct pinctrl pinctrl;
@@ -3599,6 +3600,17 @@ pinctrl_update(const struct ovsdb_idl *idl, const char 
*br_int_name)
 notify_pinctrl_handler();
 }
 
+bool igmp_support_proto =
+sbrec_server_has_igmp_group_table_col_protocol(idl);
+if (igmp_support_proto != pinctrl.igmp_support_protocol) {
+pinctrl.igmp_support_protocol = igmp_support_proto;
+
+/* Notify pinctrl_handler that igmp protocol column
+ * availability has changed. */
+notify_pinctrl_handler();
+}
+
+
 ovs_mutex_unlock(_mutex);
 }
 
@@ -5409,9 +5421,9 @@ ip_mcast_sync(struct ovsdb_idl_txn *ovnsb_idl_txn,
 chassis, pinctrl.igmp_group_has_chassis_name);
 }
 
-igmp_group_update_ports(sbrec_igmp, sbrec_datapath_binding_by_key,
+igmp_group_update(sbrec_igmp, sbrec_datapath_binding_by_key,
 sbrec_port_binding_by_key, ip_ms->ms,
-mc_group);
+mc_group, pinctrl.igmp_support_protocol);
 }
 
 /* Mrouters. */
diff --git 

Re: [ovs-dev] [PATCH] ofproto-dpif-trace: Fix infinite recirculation tracing.

2024-02-26 Thread Simon Horman
On Thu, Feb 22, 2024 at 04:06:32PM +0100, Ilya Maximets wrote:
> Trace attempts to process all the recirculations.  However, if there
> is a recirculation loop, i.e. if every recirculation generates another
> recirculation, this process will never stop.  It will grind until the
> trace fills the system memory.
> 
> A simple reproducer:
> 
>   make sandbox
>   ovs-vsctl add-br br0
>   ovs-vsctl add-port br0 p1
>   ovs-ofctl add-flow br0 "table=0,in_port=p1,ip,actions=ct(table=0)"
>   ovs-appctl ofproto/trace br0 in_port=p1,ip
> 
> Limit the number of recirculations trace is processing with a fairly
> arbitrary number - 4096 (loosely based on the resubmit limit, but
> they are not actually related).
> 
> Not adding a test for this since it's only for a trace, but also
> because the test may lead to OOM event in a system if the test fails,
> which is not nice.
> 
> Fixes: e6bc8e749381 ("ofproto/trace: Add support for tracing conntrack 
> recirculation")
> Reported-by: Jaime Caamaño Ruiz 
> Signed-off-by: Ilya Maximets 

Acked-by: Simon Horman 

FWIIW, 4096 strikes me as an excessively generous limit.
But I have no reason to argue for a smaller value.

...
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH ovn] northd: Don't skip the unSNAT stage for traffic towards VIPs.

2024-02-26 Thread Dumitru Ceara
Otherwise, in case there's also a SNAT rule that uses the VIP as
external IP, we break sessions initiated from behind the VIP.

This partially reverts 832893bdbb42 ("ovn-northd: Skip unsnat flows for
load balancer vips in router ingress pipeline").  That's OK because
commit 384a7c6237da ("northd: Refactor Logical Flows for routers with
DNAT/Load Balancers") addressed the original issue in a better way:

In the reply direction, the order of traversal of the tables
"lr_in_defrag", "lr_in_unsnat" and "lr_in_dnat" adds incorrect
datapath flows that check ct_state in the wrong conntrack zone.
This is illustrated below where reply trafic enters the physical host
port (6) and traverses DNAT zone (14), SNAT zone (default), back to the
DNAT zone and then on to Logical Switch Port zone (22). The third
flow is incorrectly checking the state from the SNAT zone instead
of the DNAT zone.

We also add a system test to ensure traffic initiated from behind a VIP
+ SNAT is not broken.

Another nice side effect is that the northd I-P is slightly simplified
because we don't need to track NAT external IPs anymore.

Fixes: 832893bdbb42 ("ovn-northd: Skip unsnat flows for load balancer vips in 
router ingress pipeline")
Reported-at: https://issues.redhat.com/browse/FDP-291
Signed-off-by: Dumitru Ceara 
---
 northd/en-lflow.c   |  3 +-
 northd/en-lr-nat.c  |  5 ---
 northd/en-lr-nat.h  |  3 --
 northd/en-lr-stateful.c | 51 --
 northd/en-lr-stateful.h |  9 +
 northd/northd.c | 33 +
 tests/ovn-northd.at | 24 +
 tests/system-ovn.at | 80 +
 8 files changed, 91 insertions(+), 117 deletions(-)

diff --git a/northd/en-lflow.c b/northd/en-lflow.c
index f1a83839df..c4b927fb8c 100644
--- a/northd/en-lflow.c
+++ b/northd/en-lflow.c
@@ -174,8 +174,7 @@ lflow_lr_stateful_handler(struct engine_node *node, void 
*data)
 struct ed_type_lr_stateful *lr_sful_data =
 engine_get_input_data("lr_stateful", node);
 
-if (!lr_stateful_has_tracked_data(_sful_data->trk_data)
-|| lr_sful_data->trk_data.vip_nats_changed) {
+if (!lr_stateful_has_tracked_data(_sful_data->trk_data)) {
 return false;
 }
 
diff --git a/northd/en-lr-nat.c b/northd/en-lr-nat.c
index ad11025c69..215d924e42 100644
--- a/northd/en-lr-nat.c
+++ b/northd/en-lr-nat.c
@@ -219,10 +219,6 @@ lr_nat_record_init(struct lr_nat_record *lrnat_rec,
 lrnat_rec->nbr_uuid = od->nbr->header_.uuid;
 
 shash_init(_rec->snat_ips);
-sset_init(_rec->external_ips);
-for (size_t i = 0; i < od->nbr->n_nat; i++) {
-sset_add(_rec->external_ips, od->nbr->nat[i]->external_ip);
-}
 
 sset_init(_rec->external_macs);
 lrnat_rec->has_distributed_nat = false;
@@ -343,7 +339,6 @@ lr_nat_record_clear(struct lr_nat_record *lrnat_rec)
 }
 
 free(lrnat_rec->nat_entries);
-sset_destroy(_rec->external_ips);
 sset_destroy(_rec->external_macs);
 }
 
diff --git a/northd/en-lr-nat.h b/northd/en-lr-nat.h
index 71d3a1782d..81a7b0abd7 100644
--- a/northd/en-lr-nat.h
+++ b/northd/en-lr-nat.h
@@ -64,9 +64,6 @@ struct lr_nat_record {
 
 bool has_distributed_nat;
 
-/* Set of nat external ips on the router. */
-struct sset external_ips;
-
 /* Set of nat external macs on the router. */
 struct sset external_macs;
 
diff --git a/northd/en-lr-stateful.c b/northd/en-lr-stateful.c
index 6d0192487c..baf1bd2f89 100644
--- a/northd/en-lr-stateful.c
+++ b/northd/en-lr-stateful.c
@@ -82,7 +82,6 @@ static void remove_lrouter_lb_reachable_ips(struct 
lr_stateful_record *,
 enum lb_neighbor_responder_mode,
 const struct sset *lb_ips_v4,
 const struct sset *lb_ips_v6);
-static bool lr_stateful_rebuild_vip_nats(struct lr_stateful_record *);
 
 /* 'lr_stateful' engine node manages the NB logical router LB data.
  */
@@ -110,7 +109,6 @@ en_lr_stateful_clear_tracked_data(void *data_)
 struct ed_type_lr_stateful *data = data_;
 
 hmapx_clear(>trk_data.crupdated);
-data->trk_data.vip_nats_changed = false;
 }
 
 void
@@ -198,10 +196,6 @@ lr_stateful_lb_data_handler(struct engine_node *node, void 
*data_)
 
 /* Add the lr_stateful_rec rec to the tracking data. */
 hmapx_add(>trk_data.crupdated, lr_stateful_rec);
-
-if (!sset_is_empty(_stateful_rec->vip_nats)) {
-data->trk_data.vip_nats_changed = true;
-}
 continue;
 }
 
@@ -319,9 +313,6 @@ lr_stateful_lb_data_handler(struct engine_node *node, void 
*data_)
 
 HMAPX_FOR_EACH (hmapx_node, >trk_data.crupdated) {
 struct lr_stateful_record *lr_stateful_rec = hmapx_node->data;
-if (lr_stateful_rebuild_vip_nats(lr_stateful_rec)) {
-data->trk_data.vip_nats_changed = true;
-  

[ovs-dev] [PATCH v2] conntrack: Fix flush not flushing all elements.

2024-02-26 Thread Xavier Simonart
On netdev datapath, when a ct element was cleaned, the cmap
could be shrinked, potentially causing some elements to be skipped
in the flush iteration.

Fixes: 967bb5c5cd90 ("conntrack: Add rcu support.")
Signed-off-by: Xavier Simonart 
---
v2: - Updated commit message.
- Use compose-packet instead of hex packet content.
- Use dnl for comments.
- Remove unnecessary errors in OVS_TRAFFIC_VSWITCHD_STOP.
- Rebased on origin/master.
---
 lib/conntrack.c | 14 
 lib/conntrack.h |  1 +
 tests/system-traffic.at | 47 +
 3 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 8a7056bac..5786424f6 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -2651,25 +2651,19 @@ conntrack_dump_start(struct conntrack *ct, struct 
conntrack_dump *dump,
 
 dump->ct = ct;
 *ptot_bkts = 1; /* Need to clean up the callers. */
+dump->cursor = cmap_cursor_start(>conns);
 return 0;
 }
 
 int
 conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
 {
-struct conntrack *ct = dump->ct;
 long long now = time_msec();
 
-for (;;) {
-struct cmap_node *cm_node = cmap_next_position(>conns,
-   >cm_pos);
-if (!cm_node) {
-break;
-}
-struct conn_key_node *keyn;
-struct conn *conn;
+struct conn_key_node *keyn;
+struct conn *conn;
 
-INIT_CONTAINER(keyn, cm_node, cm_node);
+CMAP_CURSOR_FOR_EACH_CONTINUE (keyn, cm_node, >cursor) {
 if (keyn->dir != CT_DIR_FWD) {
 continue;
 }
diff --git a/lib/conntrack.h b/lib/conntrack.h
index ee7da099e..aa12a1847 100644
--- a/lib/conntrack.h
+++ b/lib/conntrack.h
@@ -109,6 +109,7 @@ struct conntrack_dump {
 union {
 struct cmap_position cm_pos;
 struct hmap_position hmap_pos;
+struct cmap_cursor cursor;
 };
 bool filter_zone;
 uint16_t zone;
diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index 98e494abf..34f93b2e5 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -8389,6 +8389,53 @@ AT_CHECK([ovs-pcap client.pcap | grep 
20102000], [0], [dnl
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
+AT_SETUP([conntrack - Flush many conntrack entries by port])
+CHECK_CONNTRACK()
+OVS_TRAFFIC_VSWITCHD_START()
+
+ADD_NAMESPACES(at_ns0, at_ns1)
+
+ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24")
+ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
+
+AT_DATA([flows.txt], [dnl
+priority=100,in_port=1,udp,action=ct(zone=1,commit),2
+])
+
+AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
+
+dnl 20 packets from port 1 and 1 packet from port 2.
+flow_l3="\
+eth_src=50:54:00:00:00:09,eth_dst=50:54:00:00:00:0a,dl_type=0x0800,\
+nw_src=10.1.1.1,nw_dst=10.1.1.2,nw_proto=17,nw_ttl=64,nw_frag=no"
+
+for i in $(seq 1 20); do
+frame=$(ovs-ofctl compose-packet --bare "$flow_l3, udp_src=1,udp_dst=$i")
+AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=$frame 
actions=resubmit(,0)"])
+done
+frame=$(ovs-ofctl compose-packet --bare "$flow_l3, udp_src=2,udp_dst=1")
+AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=$frame 
actions=resubmit(,0)"])
+
+: > conntrack
+
+for i in $(seq 1 20); do
+echo 
"udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=${i}),reply=(src=10.1.1.2,dst=10.1.1.1,sport=${i},dport=1),zone=1"
 >> conntrack
+done
+echo 
"udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=2,dport=1),reply=(src=10.1.1.2,dst=10.1.1.1,sport=1,dport=2),zone=1"
 >> conntrack
+
+sort conntrack > expout
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep -F "src=10.1.1.1," | sort ], 
[0], [expout])
+
+dnl Check that flushing conntrack by port 1 flush all ct for port 1 but keeps 
ct for port 2.
+AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_proto=17,ct_tp_src=1'])
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep -F "src=10.1.1.1," | sort ], 
[0], [dnl
+udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=2,dport=1),reply=(src=10.1.1.2,dst=10.1.1.1,sport=1,dport=2),zone=1
+])
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP
+
 AT_BANNER([IGMP])
 
 AT_SETUP([IGMP - flood under normal action])
-- 
2.41.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v5 1/2] util: Support checking for kernel versions.

2024-02-26 Thread Felix Huettner via dev
Extract checking for a given kernel version to a separate function.
It will be used also in the next patch.

Signed-off-by: Felix Huettner 
---
v4->v5:
- fix wrong ifdef that broke on macos
- fix ovs_kernel_is_version_or_newer working in reverse than desired
- ovs_kernel_is_version_or_newer now always returns false if uname
  errors (Thanks Eelco)
v4:
- extract function to check kernel version
 lib/netdev-linux.c | 14 +++---
 lib/util.c | 27 +++
 lib/util.h |  4 
 3 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index bf91ef462..51bd71ae3 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -6427,18 +6427,10 @@ getqdisc_is_safe(void)
 static bool safe = false;
 
 if (ovsthread_once_start()) {
-struct utsname utsname;
-int major, minor;
-
-if (uname() == -1) {
-VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
-} else if (!ovs_scan(utsname.release, "%d.%d", , )) {
-VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
-} else if (major < 2 || (major == 2 && minor < 35)) {
-VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
-  utsname.release);
-} else {
+if (ovs_kernel_is_version_or_newer(2, 35)) {
 safe = true;
+} else {
+VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel");
 }
 ovsthread_once_done();
 }
diff --git a/lib/util.c b/lib/util.c
index 3fb3a4b40..f5b2da095 100644
--- a/lib/util.c
+++ b/lib/util.c
@@ -27,6 +27,7 @@
 #include 
 #ifdef __linux__
 #include 
+#include 
 #endif
 #include 
 #include 
@@ -2500,3 +2501,29 @@ OVS_CONSTRUCTOR(winsock_start) {
}
 }
 #endif
+
+#ifdef __linux__
+bool
+ovs_kernel_is_version_or_newer(int target_major, int target_minor)
+{
+static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
+static int current_major, current_minor = -1;
+
+if (ovsthread_once_start()) {
+struct utsname utsname;
+
+if (uname() == -1) {
+VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
+} else if (!ovs_scan(utsname.release, "%d.%d",
+_major, _minor)) {
+VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
+}
+ovsthread_once_done();
+}
+if (current_major == -1 || current_minor == -1) {
+return false;
+}
+return current_major > target_major || (
+current_major == target_major && current_minor > target_minor);
+}
+#endif
diff --git a/lib/util.h b/lib/util.h
index f2d45bcac..55718fd87 100644
--- a/lib/util.h
+++ b/lib/util.h
@@ -611,4 +611,8 @@ int ftruncate(int fd, off_t length);
 }
 #endif
 
+#ifdef __linux__
+bool ovs_kernel_is_version_or_newer(int target_major, int target_minor);
+#endif
+
 #endif /* util.h */

base-commit: 166ee41d282c506d100bc2185d60af277121b55b
-- 
2.43.2

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v5 2/2] netlink-conntrack: Optimize flushing ct zone.

2024-02-26 Thread Felix Huettner via dev
Previously the kernel did not provide a netlink interface to flush/list
only conntrack entries matching a specific zone. With [1] and [2] it is now
possible to flush and list conntrack entries filtered by zone. Older
kernels not yet supporting this feature will ignore the filter.
For the list request that means just returning all entries (which we can
then filter in userspace as before).
For the flush request that means deleting all conntrack entries.

The implementation is now identical to the windows one, so we combine
them.

These significantly improves the performance of flushing conntrack zones
when the conntrack table is large. Since flushing a conntrack zone is
normally triggered via an openflow command it blocks the main ovs thread
and thereby also blocks new flows from being applied. Using this new
feature we can reduce the flushing time for zones by around 93%.

In combination with OVN the creation of a Logical_Router (which causes
the flushing of a ct zone) could block other operations, e.g. the
failover of Logical_Routers (as they cause new flows to be created).
This is visible from a user perspective as a ovn-controller that is idle
(as it waits for vswitchd) and vswitchd reporting:
"blocked 1000 ms waiting for main to quiesce" (potentially with ever
increasing times).

The following performance tests where run in a qemu vm with 500.000
conntrack entries distributed evenly over 500 ct zones using `ovstest
test-netlink-conntrack flush zone=`.

  |  flush zone with 1000 entries  |   flush zone with no entry |
  +-+--+-+--|
  |   with the patch| without  |   with the patch| without  |
  +--+--+--+--+--+--|
  | v6.8-rc4 |  v6.7.1  | v6.8-rc4 | v6.8-rc4 |  v6.7.1  | v6.8-rc4 |
+-+--+--+--+--+--+--|
| Min |  0.260   |  3.946   |  3.497   |  0.228   |  3.462   |  3.212   |
| Median  |  0.319   |  4.237   |  4.349   |  0.298   |  4.460   |  4.010   |
| 90%ile  |  0.335   |  4.367   |  4.522   |  0.325   |  4.662   |  4.572   |
| 99%ile  |  0.348   |  4.495   |  4.773   |  0.340   |  4.931   |  6.003   |
| Max |  0.362   |  4.543   |  5.054   |  0.348   |  5.390   |  6.396   |
| Mean|  0.320   |  4.236   |  4.331   |  0.296   |  4.430   |  4.071   |
| Total   |  80.02   |  1058|  1082|  73.93   |  1107|  1017|

[1]: 
https://github.com/torvalds/linux/commit/eff3c558bb7e61c41b53e4c8130e514a5a4df9ba
[2]: 
https://github.com/torvalds/linux/commit/fa173a1b4e3fd1ab5451cbc57de6fc624c824b0a

Co-Authored-By: Luca Czesla 
Signed-off-by: Luca Czesla 
Co-Authored-By: Max Lamprecht 
Signed-off-by: Max Lamprecht 
Signed-off-by: Felix Huettner 
---
v4->v5: none
v3->v4:
- combine the flush logic with windows implementation
v2->v3:
- update description to include upstream fix (Thanks to Ilya for finding
  that issue)
v1->v2:
- fixed wrong signed-off-by
 lib/netlink-conntrack.c | 52 -
 tests/system-traffic.at |  8 +++
 2 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/lib/netlink-conntrack.c b/lib/netlink-conntrack.c
index 492bfcffb..d9015d9f0 100644
--- a/lib/netlink-conntrack.c
+++ b/lib/netlink-conntrack.c
@@ -141,6 +141,9 @@ nl_ct_dump_start(struct nl_ct_dump_state **statep, const 
uint16_t *zone,
 
 nl_msg_put_nfgenmsg(>buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK,
 IPCTNL_MSG_CT_GET, NLM_F_REQUEST);
+if (zone) {
+nl_msg_put_be16(>buf, CTA_ZONE, htons(*zone));
+}
 nl_dump_start(>dump, NETLINK_NETFILTER, >buf);
 ofpbuf_clear(>buf);
 
@@ -263,11 +266,9 @@ out:
 return err;
 }
 
-#ifdef _WIN32
-int
-nl_ct_flush_zone(uint16_t flush_zone)
+static int
+nl_ct_flush_zone_with_cta_zone(uint16_t flush_zone)
 {
-/* Windows can flush a specific zone */
 struct ofpbuf buf;
 int err;
 
@@ -282,24 +283,63 @@ nl_ct_flush_zone(uint16_t flush_zone)
 
 return err;
 }
+
+#ifdef _WIN32
+int
+nl_ct_flush_zone(uint16_t flush_zone)
+{
+return nl_ct_flush_zone_with_cta_zone(flush_zone);
+}
 #else
+
+static bool
+netlink_flush_supports_zone(void)
+{
+static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
+static bool supported = false;
+
+if (ovsthread_once_start()) {
+if (ovs_kernel_is_version_or_newer(6, 8)) {
+supported = true;
+} else {
+VLOG_INFO("disabling conntrack flush by zone. "
+  "Not supported in Linux kernel");
+}
+ovsthread_once_done();
+}
+return supported;
+}
+
 int
 nl_ct_flush_zone(uint16_t flush_zone)
 {
-/* Apparently, there's no netlink interface to flush a specific zone.
+/* In older kernels, there was no netlink interface to flush a specific
+ * conntrack zone.
  * This code dumps every connection, checks the zone and eventually