Re: [ovs-dev] [PATCH v2 2/2] ipf: Handle common case of ipf defragmentation.

2024-06-02 Thread Paolo Valerio
Mike Pattrick  writes:

> When conntrack is reassembling packet fragments, the same reassembly
> context can be shared across multiple threads handling different packets
> simultaneously. Once a full packet is assembled, it is added to a packet
> batch for processing, in the case where there are multiple different pmd
> threads accessing conntrack simultaneously, there is a race condition
> where the reassembled packet may be added to an arbitrary batch even if
> the current batch is available.
>
> When this happens, the packet may be handled incorrectly as it is
> inserted into a random openflow execution pipeline, instead of the
> pipeline for that packet's flow.
>
> This change makes a best effort attempt to try to add the defragmented
> packet to the current batch directly. This should succeed most of the
> time.
>
> Fixes: 4ea96698f667 ("Userspace datapath: Add fragmentation handling.")
> Reported-at: https://issues.redhat.com/browse/FDP-560
> Signed-off-by: Mike Pattrick 
> ---

Acked-by: Paolo Valerio 

>  lib/ipf.c | 27 ---
>  1 file changed, 20 insertions(+), 7 deletions(-)
>
> diff --git a/lib/ipf.c b/lib/ipf.c
> index 3c8960be3..2d715f5e9 100644
> --- a/lib/ipf.c
> +++ b/lib/ipf.c
> @@ -506,13 +506,15 @@ ipf_reassemble_v6_frags(struct ipf_list *ipf_list)
>  }
>  
>  /* Called when a frag list state transitions to another state. This is
> - * triggered by new fragment for the list being received.*/
> -static void
> +* triggered by new fragment for the list being received. Returns a 
> reassembled
> +* packet if this fragment has completed one. */
> +static struct reassembled_pkt *
>  ipf_list_state_transition(struct ipf *ipf, struct ipf_list *ipf_list,
>bool ff, bool lf, bool v6)
>  OVS_REQUIRES(ipf->ipf_lock)
>  {
>  enum ipf_list_state curr_state = ipf_list->state;
> +struct reassembled_pkt *ret = NULL;
>  enum ipf_list_state next_state;
>  switch (curr_state) {
>  case IPF_LIST_STATE_UNUSED:
> @@ -562,12 +564,15 @@ ipf_list_state_transition(struct ipf *ipf, struct 
> ipf_list *ipf_list,
>  ipf_reassembled_list_add(>reassembled_pkt_list, rp);
>  ipf_expiry_list_remove(ipf_list);
>  next_state = IPF_LIST_STATE_COMPLETED;
> +ret = rp;
>  } else {
>  next_state = IPF_LIST_STATE_REASS_FAIL;
>  }
>  }
>  }
>  ipf_list->state = next_state;
> +
> +return ret;
>  }
>  
>  /* Some sanity checks are redundant, but prudent, in case code paths for
> @@ -799,7 +804,8 @@ ipf_is_frag_duped(const struct ipf_frag *frag_list, int 
> last_inuse_idx,
>  static bool
>  ipf_process_frag(struct ipf *ipf, struct ipf_list *ipf_list,
>   struct dp_packet *pkt, uint16_t start_data_byte,
> - uint16_t end_data_byte, bool ff, bool lf, bool v6)
> + uint16_t end_data_byte, bool ff, bool lf, bool v6,
> + struct reassembled_pkt **rp)
>  OVS_REQUIRES(ipf->ipf_lock)
>  {
>  bool duped_frag = ipf_is_frag_duped(ipf_list->frag_list,
> @@ -820,7 +826,7 @@ ipf_process_frag(struct ipf *ipf, struct ipf_list 
> *ipf_list,
>  ipf_list->last_inuse_idx++;
>  atomic_count_inc(>nfrag);
>  ipf_count(ipf, v6, IPF_NFRAGS_ACCEPTED);
> -ipf_list_state_transition(ipf, ipf_list, ff, lf, v6);
> +*rp = ipf_list_state_transition(ipf, ipf_list, ff, lf, v6);
>  } else {
>  OVS_NOT_REACHED();
>  }
> @@ -853,7 +859,8 @@ ipf_list_init(struct ipf_list *ipf_list, struct 
> ipf_list_key *key,
>   * to a list of fragemnts. */
>  static bool
>  ipf_handle_frag(struct ipf *ipf, struct dp_packet *pkt, ovs_be16 dl_type,
> -uint16_t zone, long long now, uint32_t hash_basis)
> +uint16_t zone, long long now, uint32_t hash_basis,
> +struct reassembled_pkt **rp)
>  OVS_REQUIRES(ipf->ipf_lock)
>  {
>  struct ipf_list_key key;
> @@ -922,7 +929,7 @@ ipf_handle_frag(struct ipf *ipf, struct dp_packet *pkt, 
> ovs_be16 dl_type,
>  }
>  
>  return ipf_process_frag(ipf, ipf_list, pkt, start_data_byte,
> -end_data_byte, ff, lf, v6);
> +end_data_byte, ff, lf, v6, rp);
>  }
>  
>  /* Filters out fragments from a batch of fragments and adjust the batch. */
> @@ -941,11 +948,17 @@ ipf_extract_frags_from_batch(struct ipf *ipf, struct 
> dp_packet_batch *pb,
>||
>  

Re: [ovs-dev] [PATCH v2 1/2] ipf: Only add fragments to batch of same dl_type.

2024-06-02 Thread Paolo Valerio
Mike Pattrick  writes:

> When conntrack is reassembling packet fragments, the same reassembly
> context can be shared across multiple threads handling different packets
> simultaneously. Once a full packet is assembled, it is added to a packet
> batch for processing, this is most likely the batch that added it in the
> first place, but that isn't a guarantee.
>
> The packets in these batches should be segregated by network protocol
> version (ipv4 vs ipv6) for conntrack defragmentation to function
> appropriately. However, there are conditions where we would add a
> reassembled packet of one type to a batch of another.
>
> This change introduces checks to make sure that reassembled or expired
> fragments are only added to packet batches of the same type.
>
> Fixes: 4ea96698f667 ("Userspace datapath: Add fragmentation handling.")
> Reported-at: https://issues.redhat.com/browse/FDP-560
> Signed-off-by: Mike Pattrick 
> ---

Acked-by: Paolo Valerio 

>  lib/ipf.c | 12 ++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/lib/ipf.c b/lib/ipf.c
> index 7d74e2c13..3c8960be3 100644
> --- a/lib/ipf.c
> +++ b/lib/ipf.c
> @@ -1063,6 +1063,9 @@ ipf_send_completed_frags(struct ipf *ipf, struct 
> dp_packet_batch *pb,
>  struct ipf_list *ipf_list;
>  
>  LIST_FOR_EACH_SAFE (ipf_list, list_node, >frag_complete_list) {
> +if ((ipf_list->key.dl_type == htons(ETH_TYPE_IPV6)) != v6) {
> +continue;
> +}
>  if (ipf_send_frags_in_list(ipf, ipf_list, pb, 
> IPF_FRAG_COMPLETED_LIST,
> v6, now)) {
>  ipf_completed_list_clean(>frag_lists, ipf_list);
> @@ -1096,6 +1099,9 @@ ipf_send_expired_frags(struct ipf *ipf, struct 
> dp_packet_batch *pb,
>  size_t lists_removed = 0;
>  
>  LIST_FOR_EACH_SAFE (ipf_list, list_node, >frag_exp_list) {
> +if ((ipf_list->key.dl_type == htons(ETH_TYPE_IPV6)) != v6) {
> +continue;
> +}
>  if (now <= ipf_list->expiration ||
>  lists_removed >= IPF_FRAG_LIST_MAX_EXPIRED) {
>  break;
> @@ -1116,7 +1122,8 @@ ipf_send_expired_frags(struct ipf *ipf, struct 
> dp_packet_batch *pb,
>  /* Adds a reassmebled packet to a packet batch to be processed by the caller.
>   */
>  static void
> -ipf_execute_reass_pkts(struct ipf *ipf, struct dp_packet_batch *pb)
> +ipf_execute_reass_pkts(struct ipf *ipf, struct dp_packet_batch *pb,
> +   ovs_be16 dl_type)
>  {
>  if (ovs_list_is_empty(>reassembled_pkt_list)) {
>  return;
> @@ -1127,6 +1134,7 @@ ipf_execute_reass_pkts(struct ipf *ipf, struct 
> dp_packet_batch *pb)
>  
>  LIST_FOR_EACH_SAFE (rp, rp_list_node, >reassembled_pkt_list) {
>  if (!rp->list->reass_execute_ctx &&
> +rp->list->key.dl_type == dl_type &&
>  ipf_dp_packet_batch_add(pb, rp->pkt, false)) {
>  rp->list->reass_execute_ctx = rp->pkt;
>  }
> @@ -1237,7 +1245,7 @@ ipf_preprocess_conntrack(struct ipf *ipf, struct 
> dp_packet_batch *pb,
>  }
>  
>  if (ipf_get_enabled(ipf) || atomic_count_get(>nfrag)) {
> -ipf_execute_reass_pkts(ipf, pb);
> +ipf_execute_reass_pkts(ipf, pb, dl_type);
>  }
>  }
>  
> -- 
> 2.39.3
>
> ___
> dev mailing list
> d...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v3 1/8] netdev-linux: Fix possible int overflow in tc_add_matchall_policer().

2024-05-31 Thread Paolo Valerio
Eelco Chaudron  writes:

> Fix unintentional integer overflow reported by Coverity by adding
> the ULL suffix to the numerical literals used in the multiplications.
>
> Fixes: ed2300cca0d3 ("netdev-linux: Refactor put police action netlink 
> message")
> Acked-by: Mike Pattrick 
> Signed-off-by: Eelco Chaudron 
> ---

Acked-by: Paolo Valerio 

>  lib/netdev-linux.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index 25349c605..eb0c5c624 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -2915,8 +2915,8 @@ tc_add_matchall_policer(struct netdev *netdev, uint64_t 
> kbits_rate,
>  basic_offset = nl_msg_start_nested(, TCA_OPTIONS);
>  action_offset = nl_msg_start_nested(, TCA_MATCHALL_ACT);
>  nl_msg_put_act_police(, 0, kbits_rate, kbits_burst,
> -  kpkts_rate * 1000, kpkts_burst * 1000, 
> TC_ACT_UNSPEC,
> -  false);
> +  kpkts_rate * 1000ULL, kpkts_burst * 1000ULL,
> +  TC_ACT_UNSPEC, false);
>  nl_msg_end_nested(, action_offset);
>  nl_msg_end_nested(, basic_offset);
>  
> -- 
> 2.44.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v3 7/8] db-ctl-base: Initialize the output variable in the ctx structure.

2024-05-31 Thread Paolo Valerio
Eelco Chaudron  writes:

> Coverity flagged that the uninitialized output variable was used
> in the ctl_context_init_command() function. This patch initializes
> the variable.
>
> In addition it also destroys the ds string in ctl_context_done()
> in case it's not cleared properly.
>
> Fixes: 07ff77ccb82a ("db-ctl-base: Make common database command code into 
> library.")
> Signed-off-by: Eelco Chaudron 
> ---

Acked-by: Paolo Valerio 

>  lib/db-ctl-base.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/lib/db-ctl-base.c b/lib/db-ctl-base.c
> index 3a8068b12..b3e9b92d1 100644
> --- a/lib/db-ctl-base.c
> +++ b/lib/db-ctl-base.c
> @@ -2656,6 +2656,7 @@ ctl_context_init(struct ctl_context *ctx, struct 
> ctl_command *command,
>   struct ovsdb_symbol_table *symtab,
>   void (*invalidate_cache_cb)(struct ctl_context *))
>  {
> +ds_init(>output);
>  if (command) {
>  ctl_context_init_command(ctx, command, false);
>  }
> @@ -2688,6 +2689,7 @@ ctl_context_done(struct ctl_context *ctx,
>  ctl_context_done_command(ctx, command);
>  }
>  invalidate_cache(ctx);
> +ds_destroy(>output);
>  }
>  
>  char * OVS_WARN_UNUSED_RESULT
> -- 
> 2.44.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v3 2/8] cfm: Fix possible integer overflow in cfm_process_heartbeat().

2024-05-31 Thread Paolo Valerio
Eelco Chaudron  writes:

> Fix unintentional integer overflow reported by Coverity by adding
> the LL suffix to the numerical literals used in the multiplication.
>
> Fixes: 5767a79a4059 ("cfm: Require ccm received in demand mode.")
> Acked-by: Mike Pattrick 
> Signed-off-by: Eelco Chaudron 
> ---

Acked-by: Paolo Valerio 

>  lib/cfm.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/lib/cfm.c b/lib/cfm.c
> index c3742f3de..7eb080157 100644
> --- a/lib/cfm.c
> +++ b/lib/cfm.c
> @@ -863,7 +863,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct 
> dp_packet *p)
>  rmp->num_health_ccm++;
>  if (cfm->demand) {
>  timer_set_duration(>demand_rx_ccm_t,
> -   100 * cfm->ccm_interval_ms);
> +   100LL * cfm->ccm_interval_ms);
>  }
>  }
>  rmp->recv = true;
> -- 
> 2.44.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v2 6/8] ofproto-dpif: Define age as time_t in ofproto_unixctl_fdb_add().

2024-05-28 Thread Paolo Valerio
Eelco Chaudron  writes:

> Fix the warning from Coverity about potential truncation of the
> time_t value when copying to a local variable by changing the
> local variable's type to time_t.
>
> ccc24fc88d59 ("ofproto-dpif: APIs and CLI option to add/delete static fdb 
> entry.")

It seems "Fixes:" slipped out here.
I guess this could be fixed while applying.
That aside,

Acked-by: Paolo Valerio 

> Signed-off-by: Eelco Chaudron 
> ---
>  ofproto/ofproto-dpif.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
> index 32d037be6..fcd7cd753 100644
> --- a/ofproto/ofproto-dpif.c
> +++ b/ofproto/ofproto-dpif.c
> @@ -6097,7 +6097,7 @@ ofproto_unixctl_fdb_add(struct unixctl_conn *conn, int 
> argc OVS_UNUSED,
>  const char *port_name = argv[2];
>  uint16_t vlan = atoi(argv[3]);
>  struct eth_addr mac;
> -int age;
> +time_t age;
>  
>  ofproto = ofproto_dpif_lookup_by_name(br_name);
>  if (!ofproto) {
> -- 
> 2.44.0
>
> ___
> dev mailing list
> d...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v2 2/2] conntrack: Key connections by zone.

2024-05-13 Thread Paolo Valerio
Hi Peng,

Peng He  writes:

> To separate into N cmaps, why not use hash value divided by N?
>

FWIW, I think it makes sense to discuss the potential benefits of other
approaches as well.
They may even end up not being as performant as this one, but also some
points to consider here are:

- the number of zones used in the common case (or even for the specific
  use case as the expectation is that the fewer the zones involved, the
  smaller the benefit)
- given flush per zone is where most of the gain is, the flush per zone
  for the use case

As a last remark, partitioning per zone also implies a substantial
design change that may potentially result in contrast with other
approaches targeting the overall performance (e.g., [0] is a quick
example that comes to mind with good scalability improvements in cps,
and probably, but this is just a guess, measurable improvements in the
same ct execute test).

[0] 
https://patchwork.ozlabs.org/project/openvswitch/patch/165668250987.1967719.7371616138630033269.st...@fed.void/

> Simon Horman  于2024年5月1日周三 19:06写道:
>
>> On Wed, Apr 24, 2024 at 02:44:54PM +0200, Felix Huettner via dev wrote:
>> > Currently conntrack uses a single large cmap for all connections stored.
>> > This cmap contains all connections for all conntrack zones which are
>> > completely separate from each other. By separating each zone to its own
>> > cmap we can significantly optimize the performance when using multiple
>> > zones.
>> >
>> > The change fixes a similar issue as [1] where slow conntrack zone flush
>> > operations significantly slow down OVN router failover. The difference is
>> > just that this fix is used with dpdk, while [1] was when using the ovs
>> > kernel module.
>> >
>> > As we now need to store more cmaps the memory usage of struct conntrack
>> > increases by 524280 bytes. Additionally we need 65535 cmaps with 128
>> > bytes each. This leads to a total memory increase of around 10MB.
>> >
>> > Running "./ovstest test-conntrack benchmark 4 33554432 32 1" shows no
>> > real difference in the multithreading behaviour against a single zone.
>> >
>> > Running the new "./ovstest test-conntrack benchmark-zones" shows
>> > significant speedups as shown below. The values for "ct execute" are for
>> > acting on the complete zone with all its entries in total (so in the
>> > first case adding 10,000 new conntrack entries). All tests are run 1000
>> > times.
>> >
>> > When running with 1,000 zones with 10,000 entries each we see the
>> > following results (all in microseconds):
>> > "./ovstest test-conntrack benchmark-zones 10000 1000 1000"
>> >
>> >  +--++-+-+
>> >  |  Min |   Max  |  95%ile |   Avg   |
>> > ++--++-+-+
>> > | ct execute (commit)|  || | |
>> > |with commit | 2266 |   3505 | 2707.06 | 2592.06 |
>> > | without commit | 2411 |  12730 | 4432.50 | 2736.78 |
>> > ++--++-+-+
>> > | ct execute (no commit) |  || | |
>> > |with commit |  699 |   1238 |  886.15 |  722.67 |
>> > | without commit |  700 |   3377 | 1934.42 |  803.53 |
>> > ++--++-+-+
>> > | flush full zone|  || | |
>> > |with commit |  619 |   1122 |  901.36 |  679.15 |
>> > | without commit |  618 | 105078 |   64591 | 2886.46 |
>> > ++--++-+-+
>> > | flush empty zone   |  || | |
>> > |with commit |0 |  5 |1.00 |0.64 |
>> > | without commit |   54 |  87469 |   64520 | 2172.25 |
>> > ++--++-+-+
>> >
>> > When running with 10,000 zones with 1,000 entries each we see the
>> > following results (all in microseconds):
>> > "./ovstest test-conntrack benchmark-zones 1000 10000 1000"
>> >
>> >  +--++-+-+
>> >  |  Min |   Max  |  95%ile |   Avg   |
>> > ++--++-+-+
>> > | ct execute (commit)|  || | |
>> > |with commit |  215 |287 |  231.88 |  222.30 |
>> > | without commit |  214 |   1692 |  569.18 |  285.83 |
>> > ++--++-+-+
>> > | ct execute (no commit) |  || | |
>> > |with commit |   68 | 97 |   74.69 |   70.09 |
>> > | without commit |   68 |300 |  158.40 |   82.06 |
>> > ++--++-+-+
>> > | flush full zone|  || | |
>> > |with commit |   47 |211 |   56.34 |   50.34 |
>> > | 

[ovs-dev] [PATCH v2] conntrack: Fully initialize conn struct before insertion.

2024-05-10 Thread Paolo Valerio
From: Mike Pattrick 

In case packets are concurrently received in both directions, there's
a chance that the ones in the reverse direction get received right
after the connection gets added to the connection tracker but before
some of the connection's fields are fully initialized.
This could cause OVS to access potentially invalid memory, as the lookup
may end up retrieving the wrong offsets during CONTAINER_OF(), or
uninitialized memory.

This may happen in case of regular NAT or all-zero SNAT.

Fix it by initializing the connection's fields early.

Fixes: 1116459b3ba8 ("conntrack: Remove nat_conn introducing key 
directionality.")
Reported-at: https://issues.redhat.com/browse/FDP-616
Signed-off-by: Mike Pattrick 
Co-authored-by: Paolo Valerio 
Signed-off-by: Paolo Valerio 
---
 lib/conntrack.c | 24 +++-
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 16e1c8bb5..5fdfe98de 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -947,6 +947,18 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
 nc->parent_key = alg_exp->parent_key;
 }
 
+ovs_mutex_init_adaptive(>lock);
+atomic_flag_clear(>reclaimed);
+fwd_key_node->dir = CT_DIR_FWD;
+rev_key_node->dir = CT_DIR_REV;
+
+if (zl) {
+nc->admit_zone = zl->czl.zone;
+nc->zone_limit_seq = zl->czl.zone_limit_seq;
+} else {
+nc->admit_zone = INVALID_ZONE;
+}
+
 if (nat_action_info) {
 nc->nat_action = nat_action_info->nat_action;
 
@@ -972,22 +984,16 @@ conn_not_found(struct conntrack *ct, struct dp_packet 
*pkt,
 _key_node->cm_node, rev_hash);
 }
 
-ovs_mutex_init_adaptive(>lock);
-atomic_flag_clear(>reclaimed);
-fwd_key_node->dir = CT_DIR_FWD;
-rev_key_node->dir = CT_DIR_REV;
 cmap_insert(>conns[ctx->key.zone],
 _key_node->cm_node, ctx->hash);
 conn_expire_push_front(ct, nc);
 atomic_count_inc(>n_conn);
-ctx->conn = nc; /* For completeness. */
+
 if (zl) {
-nc->admit_zone = zl->czl.zone;
-nc->zone_limit_seq = zl->czl.zone_limit_seq;
 atomic_count_inc(>czl.count);
-} else {
-nc->admit_zone = INVALID_ZONE;
 }
+
+ctx->conn = nc; /* For completeness. */
 }
 
 return nc;
-- 
2.45.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] Subject: conntrack: Fully initialize conn struct before insertion.

2024-05-10 Thread Paolo Valerio
From: Mike Pattrick 

In case packets are concurrently received in both directions, there's
a chance that the ones in the reverse direction get received right
after the connection gets added to the connection tracker but before
some of the connection's fields are fully initialized.
This could cause OVS to access potentially invalid memory, as the lookup
may end up retrieving the wrong offsets during CONTAINER_OF(), or
uninitialized memory.

This may happen in case of regular NAT or all-zero SNAT.

Fix it by initializing the connection's fields early.

Fixes: 1116459b3ba8 ("conntrack: Remove nat_conn introducing key 
directionality.")
Reported-at: https://issues.redhat.com/browse/FDP-616
Signed-off-by: Mike Pattrick 
Co-authored-by: Paolo Valerio 
Signed-off-by: Paolo Valerio 
---
 lib/conntrack.c | 24 +++-
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 16e1c8bb5..5fdfe98de 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -947,6 +947,18 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
 nc->parent_key = alg_exp->parent_key;
 }
 
+ovs_mutex_init_adaptive(>lock);
+atomic_flag_clear(>reclaimed);
+fwd_key_node->dir = CT_DIR_FWD;
+rev_key_node->dir = CT_DIR_REV;
+
+if (zl) {
+nc->admit_zone = zl->czl.zone;
+nc->zone_limit_seq = zl->czl.zone_limit_seq;
+} else {
+nc->admit_zone = INVALID_ZONE;
+}
+
 if (nat_action_info) {
 nc->nat_action = nat_action_info->nat_action;
 
@@ -972,22 +984,16 @@ conn_not_found(struct conntrack *ct, struct dp_packet 
*pkt,
 _key_node->cm_node, rev_hash);
 }
 
-ovs_mutex_init_adaptive(>lock);
-atomic_flag_clear(>reclaimed);
-fwd_key_node->dir = CT_DIR_FWD;
-rev_key_node->dir = CT_DIR_REV;
 cmap_insert(>conns[ctx->key.zone],
 _key_node->cm_node, ctx->hash);
 conn_expire_push_front(ct, nc);
 atomic_count_inc(>n_conn);
-ctx->conn = nc; /* For completeness. */
+
 if (zl) {
-nc->admit_zone = zl->czl.zone;
-nc->zone_limit_seq = zl->czl.zone_limit_seq;
 atomic_count_inc(>czl.count);
-} else {
-nc->admit_zone = INVALID_ZONE;
 }
+
+ctx->conn = nc; /* For completeness. */
 }
 
 return nc;
-- 
2.45.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v2] conntrack: Do not use {0} to initialize unions.

2024-05-09 Thread Paolo Valerio
Xavier Simonart  writes:

> In the following case:
> union ct_addr {
> unsigned int ipv4;
> struct in6_addr ipv6;
> };
> union ct_addr zero_ip = {0};
>
> The ipv6 field might not be properly initialized.
> For instance, clang 18.1.1 does not initialize the ipv6 field.
>
> Reported-at: https://issues.redhat.com/browse/FDP-608
> Signed-off-by: Xavier Simonart 
> ---
> v2: updated based on nit from Paolo.
> ---

Thanks Xavier.

Acked-by: Paolo Valerio 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] conntrack: Do not use {0} to initialize unions.

2024-05-08 Thread Paolo Valerio
Hello Xavier,

just curious: based on your tests, clang 18.1.1 is the only
compiler/version known so far to lead to the problem, right?

Anyways, only a small cosmetic nit below. Other than that:

Acked-by: Paolo Valerio 

Xavier Simonart  writes:

> In the following case:
> union ct_addr {
> unsigned int ipv4;
> struct in6_addr ipv6;
> };
> union ct_addr zero_ip = {0};
>
> The ipv6 field might not be properly initialized.
> For instance, clang 18.1.1 does not initialize the ipv6 field.
>
> Reported-at: https://issues.redhat.com/browse/FDP-608
> Signed-off-by: Xavier Simonart 
> ---
>  lib/conntrack.c | 9 +++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 16e1c8bb5..ff4a17abc 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -2302,7 +2302,8 @@ find_addr(const struct conn_key *key, union ct_addr 
> *min,
>uint32_t hash, bool ipv4,
>const struct nat_action_info_t *nat_info)
>  {
> -const union ct_addr zero_ip = {0};
> +union ct_addr zero_ip;
> +memset(_ip, 0, sizeof zero_ip);
>  
>  /* All-zero case. */
>  if (!memcmp(min, _ip, sizeof *min)) {
> @@ -2394,7 +2395,7 @@ nat_get_unique_tuple(struct conntrack *ct, struct conn 
> *conn,
>  {
>  struct conn_key *fwd_key = >key_node[CT_DIR_FWD].key;
>  struct conn_key *rev_key = >key_node[CT_DIR_REV].key;
> -union ct_addr min_addr = {0}, max_addr = {0}, addr = {0};
> +union ct_addr min_addr, max_addr, addr;

nit: please keep the reverse xmas tree

>  bool pat_proto = fwd_key->nw_proto == IPPROTO_TCP ||
>   fwd_key->nw_proto == IPPROTO_UDP ||
>   fwd_key->nw_proto == IPPROTO_SCTP;
> @@ -2402,6 +2403,10 @@ nat_get_unique_tuple(struct conntrack *ct, struct conn 
> *conn,
>  uint16_t min_sport, max_sport, curr_sport;
>  uint32_t hash, port_off, basis;
>  
> +memset(_addr, 0, sizeof min_addr);
> +memset(_addr, 0, sizeof max_addr);
> +memset(, 0, sizeof addr);
> +
>  basis = (nat_info->nat_flags & NAT_PERSISTENT) ? 0 : ct->hash_basis;
>  hash = nat_range_hash(fwd_key, basis, nat_info);
>  
> -- 
> 2.31.1
>
> ___
> dev mailing list
> d...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] dpctl: fix segfault on ct-{set,del}-limits

2024-04-22 Thread Paolo Valerio
When no parameters other than the datapath are specified a segfault
occurs.

Fix it by checking the argument access is inside the bounds.

Signed-off-by: Paolo Valerio 
---
 lib/dpctl.c | 27 ---
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/lib/dpctl.c b/lib/dpctl.c
index 34ee7d0e2..3c555a559 100644
--- a/lib/dpctl.c
+++ b/lib/dpctl.c
@@ -2168,13 +2168,20 @@ static int
 dpctl_ct_set_limits(int argc, const char *argv[],
 struct dpctl_params *dpctl_p)
 {
-struct dpif *dpif;
-struct ds ds = DS_EMPTY_INITIALIZER;
+struct ovs_list zone_limits = OVS_LIST_INITIALIZER(_limits);
 int i =  dp_arg_exists(argc, argv) ? 2 : 1;
+struct ds ds = DS_EMPTY_INITIALIZER;
+struct dpif *dpif = NULL;
 uint32_t default_limit;
-struct ovs_list zone_limits = OVS_LIST_INITIALIZER(_limits);
+int error;
+
+if (i >= argc) {
+ds_put_cstr(, "too few arguments");
+error = EINVAL;
+goto error;
+}
 
-int error = opt_dpif_open(argc, argv, dpctl_p, INT_MAX, );
+error = opt_dpif_open(argc, argv, dpctl_p, INT_MAX, );
 if (error) {
 return error;
 }
@@ -2261,11 +2268,17 @@ static int
 dpctl_ct_del_limits(int argc, const char *argv[],
 struct dpctl_params *dpctl_p)
 {
-struct dpif *dpif;
+struct ovs_list zone_limits = OVS_LIST_INITIALIZER(_limits);
+int i =  dp_arg_exists(argc, argv) ? 2 : 1;
 struct ds ds = DS_EMPTY_INITIALIZER;
+struct dpif *dpif = NULL;
 int error;
-int i =  dp_arg_exists(argc, argv) ? 2 : 1;
-struct ovs_list zone_limits = OVS_LIST_INITIALIZER(_limits);
+
+if (i >= argc) {
+ds_put_cstr(, "too few arguments");
+error = EINVAL;
+goto error;
+}
 
 error = opt_dpif_open(argc, argv, dpctl_p, 4, );
 if (error) {
-- 
2.44.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-discuss] Urgent Help needed: OVS 3.2.2 Strange TC DROPs

2024-04-18 Thread Paolo Valerio via discuss
Paolo Valerio  writes:

> Adrian Moreno via discuss  writes:
>
>> Hi Gavin
>>
>> On 4/18/24 02:38, Gavin McKee via discuss wrote:
>>> This is an example.
>>> 
>>> Again the TCP 3 handshake completes , but the next packet fails to NAT
>>> and goes out onto the physical network using the private address .  An
>>> example of this is in the packet trace I provided.
>>> 
>>
>> Given you were using retis in your initial troubleshooting, you can use it 
>> with 
>> the additional "ct" collector to see if the kernel datapath is retrieving 
>> the 
>> right conntrack entry and what's its state. If that shows some unexpected 
>> conntrack entry change, it can be confirmed by monitoring "conntrack -E".
>>
>> Additionally, a dp flow dump (ovs-appctl dpctl/dump-flows -m) when the 
>> problem 
>> is happening might be also useful.
>>
>
> one thing to add on top of the above is a check for invalid logs (nf-log
> can be used as well):
>
> # modprobe nf_log_ipv4
> ## set logger for AF_INET (should be already set)
> # sysctl -w net.netfilter.nf_log.2=nf_log_ipv4
> ## enable invalid logs for TCP
> # sysctl net.netfilter.nf_conntrack_log_invalid=6
>

there's a C/P mistake here:

sysctl -w net.netfilter.nf_conntrack_log_invalid=6

and soon after the invalid example message below in the clean up.
Should be:

sysctl -w net.netfilter.nf_conntrack_log_invalid=0

> in the case an invalid packet gets logged, you should see in dmesg
> something like:
>
> [312352.460843] nf_ct_proto_6: invalid new IN=eno1 ...
>
> once done:
>
> # sysctl net.netfilter.nf_conntrack_log_invalid=6
> # rmmod nf_log_syslog
>
> Paolo
>
>> --
>> Adrián
>>
>>
>>> ovs-appctl ofproto/trace br-int
>>> in_port=7753,dl_src=4e:42:14:a1:2a:fb,dl_dst=1a:16:b1:58:e1:cd,tcp,nw_src=172.27.18.244,nw_dst=104.18.3.35,nw_ttl=32,tcp_src=52776,tcp_dst=443,tcp_flags=2
>>> Flow: 
>>> tcp,in_port=7753,vlan_tci=0x0000,dl_src=4e:42:14:a1:2a:fb,dl_dst=1a:16:b1:58:e1:cd,nw_src=172.27.18.244,nw_dst=104.18.3.35,nw_tos=0,nw_ecn=0,nw_ttl=32,nw_frag=no,tp_src=52776,tp_dst=443,tcp_flags=syn
>>> 
>>> bridge("br-int")
>>> 
>>>   0. in_port=7753, priority 100, cookie 0xc33f39c4
>>>  set_field:0x16d->reg13
>>>  set_field:0x155->reg11
>>>  set_field:0x1cf->reg12
>>>  set_field:0x12->metadata
>>>  set_field:0xca->reg14
>>>  resubmit(,8)
>>>   8. metadata=0x12, priority 50, cookie 0x1645d3f2
>>>  set_field:0/0x1000->reg10
>>>  resubmit(,73)
>>>  73. 
>>> ip,reg14=0xca,metadata=0x12,dl_src=4e:42:14:a1:2a:fb,nw_src=172.27.18.244,
>>> priority 90, cookie 0xc33f39c4
>>>  set_field:0/0x1000->reg10
>>>  move:NXM_NX_REG10[12]->NXM_NX_XXREG0[111]
>>>   -> NXM_NX_XXREG0[111] is now 0
>>>  resubmit(,9)
>>>   9. metadata=0x12, priority 0, cookie 0xcc4fd106
>>>  resubmit(,10)
>>> 10. metadata=0x12, priority 0, cookie 0x9e10ad0e
>>>  resubmit(,11)
>>> 11. metadata=0x12, priority 0, cookie 0x557f3249
>>>  resubmit(,12)
>>> 12. ip,metadata=0x12, priority 100, cookie 0x14131a67
>>>  
>>> set_field:0x1/0x1->xxreg0
>>>  resubmit(,13)
>>> 13. metadata=0x12, priority 0, cookie 0x85f9ed4f
>>>  resubmit(,14)
>>> 14. ip,reg0=0x1/0x1,metadata=0x12, priority 100, cookie 0x279651c
>>>  ct(table=15,zone=NXM_NX_REG13[0..15])
>>>  drop
>>>   -> A clone of the packet is forked to recirculate. The forked
>>> pipeline will be resumed at table 15.
>>>   -> Sets the packet to an untracked state, and clears all the
>>> conntrack fields.
>>> 
>>> Final flow: 
>>> tcp,reg0=0x1,reg11=0x155,reg12=0x1cf,reg13=0x16d,reg14=0xca,metadata=0x12,in_port=7753,vlan_tci=0x0000,dl_src=4e:42:14:a1:2a:fb,dl_dst=1a:16:b1:58:e1:cd,nw_src=172.27.18.244,nw_dst=104.18.3.35,nw_tos=0,nw_ecn=0,nw_ttl=32,nw_frag=no,tp_src=52776,tp_dst=443,tcp_flags=syn
>>> Megaflow: 
>>> recirc_id=0,eth,tcp,in_port=7753,dl_src=4e:42:14:a1:2a:fb,dl_dst=1a:16:b1:58:e1:cd,nw_src=172.27.18.244,nw_frag=no
>>> Datapath actions: ct(zone=365),recirc(0x13b5a)
>>> 
>>> ===
>>> recirc(0x13b5a) - resume conntrack with default ct_state=trk|new (use
>>> --ct-next to custom

Re: [ovs-discuss] Urgent Help needed: OVS 3.2.2 Strange TC DROPs

2024-04-18 Thread Paolo Valerio via discuss
Adrian Moreno via discuss  writes:

> Hi Gavin
>
> On 4/18/24 02:38, Gavin McKee via discuss wrote:
>> This is an example.
>> 
>> Again the TCP 3 handshake completes , but the next packet fails to NAT
>> and goes out onto the physical network using the private address .  An
>> example of this is in the packet trace I provided.
>> 
>
> Given you were using retis in your initial troubleshooting, you can use it 
> with 
> the additional "ct" collector to see if the kernel datapath is retrieving the 
> right conntrack entry and what's its state. If that shows some unexpected 
> conntrack entry change, it can be confirmed by monitoring "conntrack -E".
>
> Additionally, a dp flow dump (ovs-appctl dpctl/dump-flows -m) when the 
> problem 
> is happening might be also useful.
>

one thing to add on top of the above is a check for invalid logs (nf-log
can be used as well):

# modprobe nf_log_ipv4
## set logger for AF_INET (should be already set)
# sysctl -w net.netfilter.nf_log.2=nf_log_ipv4
## enable invalid logs for TCP
# sysctl -w net.netfilter.nf_conntrack_log_invalid=6

in the case an invalid packet gets logged, you should see in dmesg
something like:

[312352.460843] nf_ct_proto_6: invalid new IN=eno1 ...

once done:

# sysctl -w net.netfilter.nf_conntrack_log_invalid=0
# rmmod nf_log_syslog

Paolo

> --
> Adrián
>
>
>> ovs-appctl ofproto/trace br-int
>> in_port=7753,dl_src=4e:42:14:a1:2a:fb,dl_dst=1a:16:b1:58:e1:cd,tcp,nw_src=172.27.18.244,nw_dst=104.18.3.35,nw_ttl=32,tcp_src=52776,tcp_dst=443,tcp_flags=2
>> Flow: 
>> tcp,in_port=7753,vlan_tci=0x0000,dl_src=4e:42:14:a1:2a:fb,dl_dst=1a:16:b1:58:e1:cd,nw_src=172.27.18.244,nw_dst=104.18.3.35,nw_tos=0,nw_ecn=0,nw_ttl=32,nw_frag=no,tp_src=52776,tp_dst=443,tcp_flags=syn
>> 
>> bridge("br-int")
>> 
>>   0. in_port=7753, priority 100, cookie 0xc33f39c4
>>  set_field:0x16d->reg13
>>  set_field:0x155->reg11
>>  set_field:0x1cf->reg12
>>  set_field:0x12->metadata
>>  set_field:0xca->reg14
>>  resubmit(,8)
>>   8. metadata=0x12, priority 50, cookie 0x1645d3f2
>>  set_field:0/0x1000->reg10
>>  resubmit(,73)
>>  73. 
>> ip,reg14=0xca,metadata=0x12,dl_src=4e:42:14:a1:2a:fb,nw_src=172.27.18.244,
>> priority 90, cookie 0xc33f39c4
>>  set_field:0/0x1000->reg10
>>  move:NXM_NX_REG10[12]->NXM_NX_XXREG0[111]
>>   -> NXM_NX_XXREG0[111] is now 0
>>  resubmit(,9)
>>   9. metadata=0x12, priority 0, cookie 0xcc4fd106
>>  resubmit(,10)
>> 10. metadata=0x12, priority 0, cookie 0x9e10ad0e
>>  resubmit(,11)
>> 11. metadata=0x12, priority 0, cookie 0x557f3249
>>  resubmit(,12)
>> 12. ip,metadata=0x12, priority 100, cookie 0x14131a67
>>  
>> set_field:0x1/0x1->xxreg0
>>  resubmit(,13)
>> 13. metadata=0x12, priority 0, cookie 0x85f9ed4f
>>  resubmit(,14)
>> 14. ip,reg0=0x1/0x1,metadata=0x12, priority 100, cookie 0x279651c
>>  ct(table=15,zone=NXM_NX_REG13[0..15])
>>  drop
>>   -> A clone of the packet is forked to recirculate. The forked
>> pipeline will be resumed at table 15.
>>   -> Sets the packet to an untracked state, and clears all the
>> conntrack fields.
>> 
>> Final flow: 
>> tcp,reg0=0x1,reg11=0x155,reg12=0x1cf,reg13=0x16d,reg14=0xca,metadata=0x12,in_port=7753,vlan_tci=0x0000,dl_src=4e:42:14:a1:2a:fb,dl_dst=1a:16:b1:58:e1:cd,nw_src=172.27.18.244,nw_dst=104.18.3.35,nw_tos=0,nw_ecn=0,nw_ttl=32,nw_frag=no,tp_src=52776,tp_dst=443,tcp_flags=syn
>> Megaflow: 
>> recirc_id=0,eth,tcp,in_port=7753,dl_src=4e:42:14:a1:2a:fb,dl_dst=1a:16:b1:58:e1:cd,nw_src=172.27.18.244,nw_frag=no
>> Datapath actions: ct(zone=365),recirc(0x13b5a)
>> 
>> ===
>> recirc(0x13b5a) - resume conntrack with default ct_state=trk|new (use
>> --ct-next to customize)
>> ===
>> 
>> Flow: 
>> recirc_id=0x13b5a,ct_state=new|trk,ct_zone=365,eth,tcp,reg0=0x1,reg11=0x155,reg12=0x1cf,reg13=0x16d,reg14=0xca,metadata=0x12,in_port=7753,vlan_tci=0x0000,dl_src=4e:42:14:a1:2a:fb,dl_dst=1a:16:b1:58:e1:cd,nw_src=172.27.18.244,nw_dst=104.18.3.35,nw_tos=0,nw_ecn=0,nw_ttl=32,nw_frag=no,tp_src=52776,tp_dst=443,tcp_flags=syn
>> 
>> bridge("br-int")
>> 
>>  thaw
>>  Resuming from table 15
>> 15. ct_state=+new-est+trk,metadata=0x12, priority 7, cookie 0xa9e0ee6f
>>  
>> set_field:0x80/0x80->xxreg0
>>  
>> set_field:0x200/0x200->xxreg0
>>  resubmit(,16)
>> 16. conj_id=2865573479,tcp,reg0=0x80/0x80,reg14=0xca,metadata=0x12,
>> priority 3000, cookie 0xabdec111
>>  set_field:0x1/0x1->xreg4
>>  
>> set_field:0x2/0x2->xxreg0
>>  resubmit(,17)
>> 17. reg8=0x1/0x1,metadata=0x12, priority 1000, cookie 0x8171c04a
>>  

Re: [ovs-dev] [PATCH] conntrack: Do not use icmp reverse helper for icmpv6.

2024-03-28 Thread Paolo Valerio
Ilya Maximets  writes:

> On 3/12/24 11:02, Paolo Valerio wrote:
>> In the flush tuple code path, while populating the conn_key,
>> reverse_icmp_type() gets called for both icmp and icmpv6 cases,
>> while, depending on the proto, its respective helper should be
>> called, instead.
>
> Thanks for the fix!
>
> Some minor nits below.
>
>> 
>> The above leads to an abort:
>> 
>> [...]
>> 0x7f3d461888ff in __GI_abort () at abort.c:79
>> 0x0064eeb7 in reverse_icmp_type (type=128 '\200') at 
>> lib/conntrack.c:1795
>> 0x00650a63 in tuple_to_conn_key (tuple=0x7ffe0db5c620, zone=0, 
>> key=0x7ffe0db5c520)
>> at lib/conntrack.c:2590
>> 0x006510f7 in conntrack_flush_tuple (ct=0x25715a0, 
>> tuple=0x7ffe0db5c620, zone=0) at lib/conntrack.c:2787
>> 0x004b5988 in dpif_netdev_ct_flush (dpif=0x25e4640, 
>> zone=0x7ffe0db5c6a4, tuple=0x7ffe0db5c620)
>> at lib/dpif-netdev.c:9618
>> 0x0049938a in ct_dpif_flush_tuple (dpif=0x25e4640, zone=0x0, 
>> match=0x7ffe0db5c7e0) at lib/ct-dpif.c:331
>> 0x0049942a in ct_dpif_flush (dpif=0x25e4640, zone=0x0, 
>> match=0x7ffe0db5c7e0) at lib/ct-dpif.c:361
>> 0x00657b9a in dpctl_flush_conntrack (argc=2, argv=0x254ceb0, 
>> dpctl_p=0x7ffe0db5c8a0) at lib/dpctl.c:1797
>> 0x0065af36 in dpctl_unixctl_handler (conn=0x25c48d0, argc=2, 
>> argv=0x254ceb0,
>> [...]
>
> Could you, please, strip out some unnecessary information from
> the trace?  For example, function addresses in hex are not
> actually needed and most of the function arguments are not
> needed as well.  Only a few of the arguments are actually important.
> Removing those will shorten the lines and make the trace more
> clear for the reader.
>
>> 
>> Fix it by calling reverse_icmp6_type() when needed.
>> Furthermore, self tests have been modified in order to exercise and
>> check this behavior.
>> 
>> Fixes: 271e48a0e244 ("conntrack: Support conntrack flush by ct 5-tuple")
>> Reported-at: https://issues.redhat.com/browse/FDP-447
>> Signed-off-by: Paolo Valerio 
>> ---
>>  lib/conntrack.c |  4 +++-
>>  tests/system-traffic.at | 10 +-
>>  2 files changed, 12 insertions(+), 2 deletions(-)
>> 
>> diff --git a/lib/conntrack.c b/lib/conntrack.c
>> index 5786424f6..a62f27d24 100644
>> --- a/lib/conntrack.c
>> +++ b/lib/conntrack.c
>> @@ -2586,7 +2586,9 @@ tuple_to_conn_key(const struct ct_dpif_tuple *tuple, 
>> uint16_t zone,
>>  key->src.icmp_type = tuple->icmp_type;
>>  key->src.icmp_code = tuple->icmp_code;
>>  key->dst.icmp_id = tuple->icmp_id;
>> -key->dst.icmp_type = reverse_icmp_type(tuple->icmp_type);
>> +key->dst.icmp_type = (tuple->ip_proto == IPPROTO_ICMP) ?
>> +reverse_icmp_type(tuple->icmp_type) :
>> +reverse_icmp6_type(tuple->icmp_type);
>
> Please, wrap the lines before ?:, not after.  And align the branches
> of the ternary to the beginning of a condition, i.e.:
>
> +key->dst.icmp_type = (tuple->ip_proto == IPPROTO_ICMP)
> + ? reverse_icmp_type(tuple->icmp_type)
> + : reverse_icmp6_type(tuple->icmp_type);
>

Thank you Ilya.
I sent a v2 with your suggestions:

https://patchwork.ozlabs.org/project/openvswitch/patch/20240328165608.273344-1-pvale...@redhat.com/

> Best regards, Ilya Maximets.

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v2] conntrack: Do not use icmp reverse helper for icmpv6.

2024-03-28 Thread Paolo Valerio
In the flush tuple code path, while populating the conn_key,
reverse_icmp_type() gets called for both icmp and icmpv6 cases,
while, depending on the proto, its respective helper should be
called, instead.

The above leads to an abort:

[...]
__GI_abort () at abort.c:79
reverse_icmp_type (type=128 '\200') at lib/conntrack.c:1795
tuple_to_conn_key (...) at lib/conntrack.c:2590
in conntrack_flush_tuple (...) at lib/conntrack.c:2787
in dpif_netdev_ct_flush (...) at lib/dpif-netdev.c:9618
ct_dpif_flush_tuple (...) at lib/ct-dpif.c:331
ct_dpif_flush (...) at lib/ct-dpif.c:361
dpctl_flush_conntrack (...) at lib/dpctl.c:1797
[...]

Fix it by calling reverse_icmp6_type() when needed.
Furthermore, self tests have been modified in order to exercise and
check this behavior.

Fixes: 271e48a0e244 ("conntrack: Support conntrack flush by ct 5-tuple")
Reported-at: https://issues.redhat.com/browse/FDP-447
Signed-off-by: Paolo Valerio 
---
v2 (Ilya):
- stripped down backtrace
- aligned ternary
---
 lib/conntrack.c |  4 +++-
 tests/system-traffic.at | 10 +-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 5786424f6..7e3ed0ee0 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -2586,7 +2586,9 @@ tuple_to_conn_key(const struct ct_dpif_tuple *tuple, 
uint16_t zone,
 key->src.icmp_type = tuple->icmp_type;
 key->src.icmp_code = tuple->icmp_code;
 key->dst.icmp_id = tuple->icmp_id;
-key->dst.icmp_type = reverse_icmp_type(tuple->icmp_type);
+key->dst.icmp_type = (tuple->ip_proto == IPPROTO_ICMP)
+ ? reverse_icmp_type(tuple->icmp_type)
+ : reverse_icmp6_type(tuple->icmp_type);
 key->dst.icmp_code = tuple->icmp_code;
 } else {
 key->src.port = tuple->src_port;
diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index 2d12d558e..87de0692a 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -3103,7 +3103,10 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | 
FORMAT_CT(10.1.1.2)], [0], [dnl
 
icmp,orig=(src=10.1.1.1,dst=10.1.1.2,id=<cleared>,type=8,code=0),reply=(src=10.1.1.2,dst=10.1.1.1,id=<cleared>,type=0,code=0)
 ])
 
-AT_CHECK([ovs-appctl dpctl/flush-conntrack])
+AT_CHECK([ovs-appctl dpctl/flush-conntrack 
'ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2'])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl
+])
 
 dnl Pings from ns1->ns0 should fail.
 NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.1.1.1 | FORMAT_PING], 
[0], [dnl
@@ -3244,6 +3247,11 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | 
FORMAT_CT(fc00::2)], [0], [dnl
 
icmpv6,orig=(src=fc00::1,dst=fc00::2,id=<cleared>,type=128,code=0),reply=(src=fc00::2,dst=fc00::1,id=<cleared>,type=129,code=0)
 ])
 
+AT_CHECK([ovs-appctl dpctl/flush-conntrack 
'ct_ipv6_src=fc00::1,ct_ipv6_dst=fc00::2'])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fc00::2)], [0], [dnl
+])
+
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
-- 
2.44.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v2] conntrack: Fix SNAT with exhaustion system test.

2024-03-28 Thread Paolo Valerio
Recent kernels introduced a mechanism that allows to evict colliding
entries in a closing state whereas they were previously considered as
parts of a non-recoverable clash.
This new behavior makes "conntrack - SNAT with port range with
exhaustion test" fail, as it relies on the previous assumptions.

Fix it by creating and not advancing the first entry in SYN_SENT to
avoid early eviction.

Suggested-by: Ilya Maximets 
Reported-at: https://issues.redhat.com/browse/FDP-486
Signed-off-by: Paolo Valerio 
---
v2:
- replaced open-coded bytes with
  'ovs-ofctl compose-packet --bare' (Ilya)
---
 tests/system-traffic.at | 24 +---
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index 2d12d558e..20b011b7e 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -6388,7 +6388,6 @@ OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
 AT_SETUP([conntrack - SNAT with port range with exhaustion])
-OVS_CHECK_GITHUB_ACTION()
 CHECK_CONNTRACK()
 CHECK_CONNTRACK_NAT()
 OVS_TRAFFIC_VSWITCHD_START()
@@ -6398,11 +6397,11 @@ ADD_NAMESPACES(at_ns0, at_ns1)
 ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24")
 NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address 80:88:88:88:88:88])
 ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
+NS_CHECK_EXEC([at_ns1], [ip link set dev p1 address 80:89:89:89:89:89])
 
 dnl Allow any traffic from ns0->ns1. Only allow nd, return traffic from 
ns1->ns0.
 AT_DATA([flows.txt], [dnl
-in_port=1,tcp,action=ct(commit,zone=1,nat(src=10.1.1.240:34568,random)),2
-in_port=2,ct_state=-trk,tcp,tp_dst=34567,action=ct(table=0,zone=1,nat)
+in_port=1,tcp,action=ct(commit,zone=1,nat(src=10.1.1.240:34568)),2
 in_port=2,ct_state=-trk,tcp,tp_dst=34568,action=ct(table=0,zone=1,nat)
 in_port=2,ct_state=+trk,ct_zone=1,tcp,action=1
 dnl
@@ -6426,17 +6425,28 @@ AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
 
 dnl HTTP requests from p0->p1 should work fine.
 OVS_START_L7([at_ns1], [http])
-NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 1 -T 1 --retry-connrefused -v -o 
wget0.log])
+
+dnl Send a valid SYN to make conntrack pick it up.
+dnl The source port used is 123 to prevent unwanted reuse in the next HTTP 
request.
+syn_pkt=$(ovs-ofctl compose-packet --bare 
"eth_src=80:88:88:88:88:88,eth_dst=80:89:89:89:89:89,\
+  
dl_type=0x0800,nw_src=10.1.1.1,nw_dst=10.1.1.2,nw_proto=6,nw_ttl=64,nw_frag=no,tcp_flags=syn,\
+  tcp_src=123,tcp_dst=80")
+AT_CHECK([ovs-ofctl packet-out br0 "packet=${syn_pkt} 
actions=ct(commit,zone=1,nat(src=10.1.1.240:34568))"])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | uniq], [0], 
[dnl
+tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=<cleared>,dport=<cleared>),reply=(src=10.1.1.2,dst=10.1.1.240,sport=<cleared>,dport=<cleared>),zone=1,protoinfo=(state=<cleared>)
+])
 
 NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 1 -T 1 --retry-connrefused -v -o 
wget0.log], [4])
 
-AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | sed -e 
's/dst=10.1.1.2[[45]][[0-9]]/dst=10.1.1.2XX/' | uniq], [0], [dnl
-tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=<cleared>,dport=<cleared>),reply=(src=10.1.1.2,dst=10.1.1.2XX,sport=<cleared>,dport=<cleared>),zone=1,protoinfo=(state=<cleared>)
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | uniq], [0], 
[dnl
+tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=<cleared>,dport=<cleared>),reply=(src=10.1.1.2,dst=10.1.1.240,sport=<cleared>,dport=<cleared>),zone=1,protoinfo=(state=<cleared>)
 ])
 
 OVS_TRAFFIC_VSWITCHD_STOP(["dnl
 /Unable to NAT due to tuple space exhaustion - if DoS attack, use firewalling 
and\/or zone partitioning./d
-/Dropped .* log messages in last .* seconds \(most recently, .* seconds ago\) 
due to excessive rate/d"])
+/Dropped .* log messages in last .* seconds \(most recently, .* seconds ago\) 
due to excessive rate/d
+/|WARN|.* execute ct.* failed/d"])
 AT_CLEANUP
 
 AT_SETUP([conntrack - more complex SNAT])
-- 
2.44.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] conntrack: Fix SNAT with exhaustion system test.

2024-03-28 Thread Paolo Valerio
Ilya Maximets  writes:

> On 3/13/24 12:08, Paolo Valerio wrote:
>> Recent kernels introduced a mechanism that allows to evict colliding
>> entries in a closing state whereas they were previously considered as
>> parts of a non-recoverable clash.
>> This new behavior makes "conntrack - SNAT with port range with
>> exhaustion test" fail, as it relies on the previous assumptions.
>> 
>> Fix it by creating and not advancing the first entry in SYN_SENT to
>> avoid early eviction.
>> 
>> Suggested-by: Ilya Maximets 
>> Reported-at: https://issues.redhat.com/browse/FDP-486
>> Signed-off-by: Paolo Valerio 
>> ---
>
> Hi, Paolo.  Thanks for the fix!
>

Hi Ilya,

Thanks for the feedback!

> Some small comments inline.
>
>>  tests/system-traffic.at | 21 ++---
>>  1 file changed, 14 insertions(+), 7 deletions(-)
>> 
>> diff --git a/tests/system-traffic.at b/tests/system-traffic.at
>> index 2d12d558e..04559f5e8 100644
>> --- a/tests/system-traffic.at
>> +++ b/tests/system-traffic.at
>> @@ -6388,7 +6388,6 @@ OVS_TRAFFIC_VSWITCHD_STOP
>>  AT_CLEANUP
>>  
>>  AT_SETUP([conntrack - SNAT with port range with exhaustion])
>> -OVS_CHECK_GITHUB_ACTION()
>>  CHECK_CONNTRACK()
>>  CHECK_CONNTRACK_NAT()
>>  OVS_TRAFFIC_VSWITCHD_START()
>> @@ -6398,11 +6397,11 @@ ADD_NAMESPACES(at_ns0, at_ns1)
>>  ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24")
>>  NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address 80:88:88:88:88:88])
>>  ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
>> +NS_CHECK_EXEC([at_ns1], [ip link set dev p1 address 80:89:89:89:89:89])
>>  
>>  dnl Allow any traffic from ns0->ns1. Only allow nd, return traffic from 
>> ns1->ns0.
>>  AT_DATA([flows.txt], [dnl
>> -in_port=1,tcp,action=ct(commit,zone=1,nat(src=10.1.1.240:34568,random)),2
>> -in_port=2,ct_state=-trk,tcp,tp_dst=34567,action=ct(table=0,zone=1,nat)
>
> Do you know why this flow was there in the first place?
>

AFAICT, this seemed to me part of C/P ("conntrack - SNAT with port
range").
While at it, I preferred to clean it a bit as this (along with a couple
of minor things) was not required.

>> +in_port=1,tcp,action=ct(commit,zone=1,nat(src=10.1.1.240:34568)),2
>>  in_port=2,ct_state=-trk,tcp,tp_dst=34568,action=ct(table=0,zone=1,nat)
>>  in_port=2,ct_state=+trk,ct_zone=1,tcp,action=1
>>  dnl
>> @@ -6426,17 +6425,25 @@ AT_CHECK([ovs-ofctl --bundle add-flows br0 
>> flows.txt])
>>  
>>  dnl HTTP requests from p0->p1 should work fine.
>>  OVS_START_L7([at_ns1], [http])
>> -NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 1 -T 1 --retry-connrefused -v -o 
>> wget0.log])
>> +
>> +dnl Send a valid SYN to make conntrack pick it up.
>> +dnl The source port used is 123 to prevent unwanted reuse in the next HTTP 
>> request.
>> +AT_CHECK([ovs-ofctl packet-out br0 
>> "packet=80898989898980888888888808004500002800010000400664cb0a0101010a010102007b005000000000000000005002200079130000
>>  actions=ct(commit,zone=1,nat(src=10.1.1.240:34568))"])
>
> Can we use 'ovs-ofctl compose-packet --bare' instead of open-coding bytes?
>

sure, I'll send a v2.

> Best regards, Ilya Maximets.

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] conntrack: Fix SNAT with exhaustion system test.

2024-03-13 Thread Paolo Valerio
Recent kernels introduced a mechanism that allows to evict colliding
entries in a closing state whereas they were previously considered as
parts of a non-recoverable clash.
This new behavior makes "conntrack - SNAT with port range with
exhaustion test" fail, as it relies on the previous assumptions.

Fix it by creating and not advancing the first entry in SYN_SENT to
avoid early eviction.

Suggested-by: Ilya Maximets 
Reported-at: https://issues.redhat.com/browse/FDP-486
Signed-off-by: Paolo Valerio 
---
 tests/system-traffic.at | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index 2d12d558e..04559f5e8 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -6388,7 +6388,6 @@ OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
 AT_SETUP([conntrack - SNAT with port range with exhaustion])
-OVS_CHECK_GITHUB_ACTION()
 CHECK_CONNTRACK()
 CHECK_CONNTRACK_NAT()
 OVS_TRAFFIC_VSWITCHD_START()
@@ -6398,11 +6397,11 @@ ADD_NAMESPACES(at_ns0, at_ns1)
 ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24")
 NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address 80:88:88:88:88:88])
 ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
+NS_CHECK_EXEC([at_ns1], [ip link set dev p1 address 80:89:89:89:89:89])
 
 dnl Allow any traffic from ns0->ns1. Only allow nd, return traffic from 
ns1->ns0.
 AT_DATA([flows.txt], [dnl
-in_port=1,tcp,action=ct(commit,zone=1,nat(src=10.1.1.240:34568,random)),2
-in_port=2,ct_state=-trk,tcp,tp_dst=34567,action=ct(table=0,zone=1,nat)
+in_port=1,tcp,action=ct(commit,zone=1,nat(src=10.1.1.240:34568)),2
 in_port=2,ct_state=-trk,tcp,tp_dst=34568,action=ct(table=0,zone=1,nat)
 in_port=2,ct_state=+trk,ct_zone=1,tcp,action=1
 dnl
@@ -6426,17 +6425,25 @@ AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
 
 dnl HTTP requests from p0->p1 should work fine.
 OVS_START_L7([at_ns1], [http])
-NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 1 -T 1 --retry-connrefused -v -o 
wget0.log])
+
+dnl Send a valid SYN to make conntrack pick it up.
+dnl The source port used is 123 to prevent unwanted reuse in the next HTTP 
request.
+AT_CHECK([ovs-ofctl packet-out br0 
"packet=80898989898980888888888808004500002800010000400664cb0a0101010a010102007b005000000000000000005002200079130000
 actions=ct(commit,zone=1,nat(src=10.1.1.240:34568))"])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | uniq], [0], 
[dnl
+tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=<cleared>,dport=<cleared>),reply=(src=10.1.1.2,dst=10.1.1.240,sport=<cleared>,dport=<cleared>),zone=1,protoinfo=(state=<cleared>)
+])
 
 NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 1 -T 1 --retry-connrefused -v -o 
wget0.log], [4])
 
-AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | sed -e 
's/dst=10.1.1.2[[45]][[0-9]]/dst=10.1.1.2XX/' | uniq], [0], [dnl
-tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=<cleared>,dport=<cleared>),reply=(src=10.1.1.2,dst=10.1.1.2XX,sport=<cleared>,dport=<cleared>),zone=1,protoinfo=(state=<cleared>)
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | uniq], [0], 
[dnl
+tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=<cleared>,dport=<cleared>),reply=(src=10.1.1.2,dst=10.1.1.240,sport=<cleared>,dport=<cleared>),zone=1,protoinfo=(state=<cleared>)
 ])
 
 OVS_TRAFFIC_VSWITCHD_STOP(["dnl
 /Unable to NAT due to tuple space exhaustion - if DoS attack, use firewalling 
and\/or zone partitioning./d
-/Dropped .* log messages in last .* seconds \(most recently, .* seconds ago\) 
due to excessive rate/d"])
+/Dropped .* log messages in last .* seconds \(most recently, .* seconds ago\) 
due to excessive rate/d
+/|WARN|.* execute ct.* failed/d"])
 AT_CLEANUP
 
 AT_SETUP([conntrack - more complex SNAT])
-- 
2.44.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] conntrack: Do not use icmp reverse helper for icmpv6.

2024-03-12 Thread Paolo Valerio
In the flush tuple code path, while populating the conn_key,
reverse_icmp_type() gets called for both icmp and icmpv6 cases,
while, depending on the proto, its respective helper should be
called, instead.

The above leads to an abort:

[...]
0x7f3d461888ff in __GI_abort () at abort.c:79
0x0064eeb7 in reverse_icmp_type (type=128 '\200') at 
lib/conntrack.c:1795
0x00650a63 in tuple_to_conn_key (tuple=0x7ffe0db5c620, zone=0, 
key=0x7ffe0db5c520)
at lib/conntrack.c:2590
0x006510f7 in conntrack_flush_tuple (ct=0x25715a0, 
tuple=0x7ffe0db5c620, zone=0) at lib/conntrack.c:2787
0x004b5988 in dpif_netdev_ct_flush (dpif=0x25e4640, 
zone=0x7ffe0db5c6a4, tuple=0x7ffe0db5c620)
at lib/dpif-netdev.c:9618
0x0049938a in ct_dpif_flush_tuple (dpif=0x25e4640, zone=0x0, 
match=0x7ffe0db5c7e0) at lib/ct-dpif.c:331
0x0049942a in ct_dpif_flush (dpif=0x25e4640, zone=0x0, 
match=0x7ffe0db5c7e0) at lib/ct-dpif.c:361
0x00657b9a in dpctl_flush_conntrack (argc=2, argv=0x254ceb0, 
dpctl_p=0x7ffe0db5c8a0) at lib/dpctl.c:1797
0x0065af36 in dpctl_unixctl_handler (conn=0x25c48d0, argc=2, 
argv=0x254ceb0,
[...]

Fix it by calling reverse_icmp6_type() when needed.
Furthermore, self tests have been modified in order to exercise and
check this behavior.

Fixes: 271e48a0e244 ("conntrack: Support conntrack flush by ct 5-tuple")
Reported-at: https://issues.redhat.com/browse/FDP-447
Signed-off-by: Paolo Valerio 
---
 lib/conntrack.c |  4 +++-
 tests/system-traffic.at | 10 +-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 5786424f6..a62f27d24 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -2586,7 +2586,9 @@ tuple_to_conn_key(const struct ct_dpif_tuple *tuple, 
uint16_t zone,
 key->src.icmp_type = tuple->icmp_type;
 key->src.icmp_code = tuple->icmp_code;
 key->dst.icmp_id = tuple->icmp_id;
-key->dst.icmp_type = reverse_icmp_type(tuple->icmp_type);
+key->dst.icmp_type = (tuple->ip_proto == IPPROTO_ICMP) ?
+reverse_icmp_type(tuple->icmp_type) :
+reverse_icmp6_type(tuple->icmp_type);
 key->dst.icmp_code = tuple->icmp_code;
 } else {
 key->src.port = tuple->src_port;
diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index 2d12d558e..87de0692a 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -3103,7 +3103,10 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | 
FORMAT_CT(10.1.1.2)], [0], [dnl
 
icmp,orig=(src=10.1.1.1,dst=10.1.1.2,id=<cleared>,type=8,code=0),reply=(src=10.1.1.2,dst=10.1.1.1,id=<cleared>,type=0,code=0)
 ])
 
-AT_CHECK([ovs-appctl dpctl/flush-conntrack])
+AT_CHECK([ovs-appctl dpctl/flush-conntrack 
'ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2'])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl
+])
 
 dnl Pings from ns1->ns0 should fail.
 NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.1.1.1 | FORMAT_PING], 
[0], [dnl
@@ -3244,6 +3247,11 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | 
FORMAT_CT(fc00::2)], [0], [dnl
 
icmpv6,orig=(src=fc00::1,dst=fc00::2,id=<cleared>,type=128,code=0),reply=(src=fc00::2,dst=fc00::1,id=<cleared>,type=129,code=0)
 ])
 
+AT_CHECK([ovs-appctl dpctl/flush-conntrack 
'ct_ipv6_src=fc00::1,ct_ipv6_dst=fc00::2'])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fc00::2)], [0], [dnl
+])
+
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
-- 
2.44.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] github: Temporarily disable SNAT with exhaustion system test.

2024-03-01 Thread Paolo Valerio
Ilya Maximets  writes:

> With a new runner update, GitHub Actions had a kernel update.
> And it seems like something changed between kernels 6.2 and 6.5
> so this test now fails very frequently.
>
> I can reproduce the same issue on RHEL 9, and I can't reproduce
> it on Ubuntu 23.04 (kernel 6.2).
>
> The test is creating a NAT with a single address+port pair in
> an attempt to simulate an address space exhaustion.  It is
> expected that a first connection with wget leaves a conntrack
> entry in a TIME_WAIT state and the second wget should fail
> as long as this entry remains, because the only available
> address+port pair is already taken.
>
> However, for some reason, very frequently (not always!) the
> second connection replaces the first conntrack entry with a
> new one and connection succeeds.  There is still only one
> connection in the conntrack at any single moment in time, so
> there is seemingly no issue with the NAT, but the behavior
> is unexpected and the test fails.
>
> Disable the test in CI until we figure out how to fix the
> kernel (if it is a kernel bug) or the test.
>
> Signed-off-by: Ilya Maximets 
> ---

Acked-by: Paolo Valerio 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v2 2/2] conntrack: Handle persistent selection for IP addresses.

2024-02-16 Thread Paolo Valerio
Simon Horman  writes:

> On Wed, Feb 07, 2024 at 06:38:08PM +0100, Paolo Valerio wrote:
>> The patch, when 'persistent' flag is specified, makes the IP selection
>> in a range persistent across reboots.
>> 
>> Signed-off-by: Paolo Valerio 
>
> Hi Paolo,
>
> I have some minor nits below - which you can feel free to take or leave.
> But overall this looks good to me.
>
> Acked-by: Simon Horman 
>
> ...
>
>> diff --git a/lib/conntrack.c b/lib/conntrack.c
>
> ...
>
>> @@ -2386,12 +2390,23 @@ nat_get_unique_tuple(struct conntrack *ct, struct 
>> conn *conn,
>>  bool pat_proto = fwd_key->nw_proto == IPPROTO_TCP ||
>>   fwd_key->nw_proto == IPPROTO_UDP ||
>>   fwd_key->nw_proto == IPPROTO_SCTP;
>> +uint32_t hash, port_off, basis = ct->hash_basis;
>>  uint16_t min_dport, max_dport, curr_dport;
>>  uint16_t min_sport, max_sport, curr_sport;
>> -uint32_t hash, port_off;
>>  
>> -hash = nat_range_hash(fwd_key, ct->hash_basis, nat_info);
>> -port_off = nat_info->nat_flags & NAT_RANGE_RANDOM ? random_uint32() : 
>> hash;
>> +if (nat_info->nat_flags & NAT_PERSISTENT) {
>> +basis = 0;
>> +}
>
> nit: maybe it is nicer to set basis only once.
>
> basis = (nat_info->nat_flags & NAT_PERSISTENT) ? 0 : ct->hash_basis;
>
>> +
>> +hash = nat_range_hash(fwd_key, basis, nat_info);
>> +
>> +if (nat_info->nat_flags & NAT_RANGE_RANDOM) {
>> +port_off = random_uint32();
>> +} else {
>> +port_off =
>> +basis ? hash : nat_range_hash(fwd_key, ct->hash_basis, 
>> nat_info);
>> +}
>> +
>
> nit: maybe this is a little easier on the eyes (completely untested!)?
>
> if (nat_info->nat_flags & NAT_RANGE_RANDOM) {
> port_off = random_uint32();
> } else if (basis) {
> port_off = hash;
> } else {
> port_off = nat_range_hash(fwd_key, ct->hash_basis, nat_info);
> }
>

thanks Simon for taking a look.
Agreed, looks easier on the eyes. I included your suggestions and your
acks in v3.

I guess the above solve Aaron's suggestions as well.

>>  min_addr = nat_info->min_addr;
>>  max_addr = nat_info->max_addr;
>>  
>
> ...

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v3 2/2] conntrack: Handle persistent selection for IP addresses.

2024-02-16 Thread Paolo Valerio
The patch, when 'persistent' flag is specified, makes the IP selection
in a range persistent across reboots.

Signed-off-by: Paolo Valerio 
Acked-by: Simon Horman 
---
v3:
- rearranged branches in nat_get_unique_tuple() (Simon)
---
 NEWS  |  3 ++-
 lib/conntrack.c   | 25 +++--
 lib/conntrack.h   |  1 +
 lib/dpif-netdev.c |  2 ++
 4 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/NEWS b/NEWS
index 93046b963..0c86bba81 100644
--- a/NEWS
+++ b/NEWS
@@ -2,7 +2,8 @@ Post-v3.3.0
 
- Userspace datapath:
  * Conntrack now supports 'random' flag for selecting ports in a range
-   while natting.
+   while natting and 'persistent' flag for selection of the IP address
+   from a range.
 
 
 v3.3.0 - xx xxx xxxx
diff --git a/lib/conntrack.c b/lib/conntrack.c
index e09ecdf33..8a7056bac 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -2202,17 +2202,21 @@ nat_range_hash(const struct conn_key *key, uint32_t 
basis,
 {
 uint32_t hash = basis;
 
+if (!basis) {
+hash = ct_addr_hash_add(hash, &key->src.addr);
+} else {
+hash = ct_endpoint_hash_add(hash, &key->src);
+hash = ct_endpoint_hash_add(hash, &key->dst);
+}
+
 hash = ct_addr_hash_add(hash, &nat_info->min_addr);
 hash = ct_addr_hash_add(hash, &nat_info->max_addr);
 hash = hash_add(hash,
 ((uint32_t) nat_info->max_port << 16)
 | nat_info->min_port);
-hash = ct_endpoint_hash_add(hash, &key->src);
-hash = ct_endpoint_hash_add(hash, &key->dst);
 hash = hash_add(hash, (OVS_FORCE uint32_t) key->dl_type);
 hash = hash_add(hash, key->nw_proto);
 hash = hash_add(hash, key->zone);
-
 /* The purpose of the second parameter is to distinguish hashes of data of
  * different length; our data always has the same length so there is no
  * value in counting. */
@@ -2388,10 +2392,19 @@ nat_get_unique_tuple(struct conntrack *ct, struct conn 
*conn,
  fwd_key->nw_proto == IPPROTO_SCTP;
 uint16_t min_dport, max_dport, curr_dport;
 uint16_t min_sport, max_sport, curr_sport;
-uint32_t hash, port_off;
+uint32_t hash, port_off, basis;
+
+basis = (nat_info->nat_flags & NAT_PERSISTENT) ? 0 : ct->hash_basis;
+hash = nat_range_hash(fwd_key, basis, nat_info);
+
+if (nat_info->nat_flags & NAT_RANGE_RANDOM) {
+port_off = random_uint32();
+} else if (basis) {
+port_off = hash;
+} else {
+port_off = nat_range_hash(fwd_key, ct->hash_basis, nat_info);
+}
 
-hash = nat_range_hash(fwd_key, ct->hash_basis, nat_info);
-port_off = nat_info->nat_flags & NAT_RANGE_RANDOM ? random_uint32() : hash;
 min_addr = nat_info->min_addr;
 max_addr = nat_info->max_addr;
 
diff --git a/lib/conntrack.h b/lib/conntrack.h
index 9b0c6aa88..ee7da099e 100644
--- a/lib/conntrack.h
+++ b/lib/conntrack.h
@@ -79,6 +79,7 @@ enum nat_action_e {
 
 enum nat_flags_e {
 NAT_RANGE_RANDOM = 1 << 0,
+NAT_PERSISTENT = 1 << 1,
 };
 
 struct nat_action_info_t {
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index c3334c667..fbf7ccabd 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -9413,6 +9413,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch 
*packets_,
 nat_action_info.nat_flags |= NAT_RANGE_RANDOM;
 break;
 case OVS_NAT_ATTR_PERSISTENT:
+nat_action_info.nat_flags |= NAT_PERSISTENT;
+break;
 case OVS_NAT_ATTR_PROTO_HASH:
 break;
 case OVS_NAT_ATTR_UNSPEC:
-- 
2.43.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v3 1/2] conntrack: Handle random selection for port ranges.

2024-02-16 Thread Paolo Valerio
The userspace conntrack only supported hash for port selection.
With the patch, both userspace and kernel datapath support the random
flag.

The default behavior remains the same, that is, if no flags are
specified, hash is selected.

Signed-off-by: Paolo Valerio 
Acked-by: Simon Horman 
---
 Documentation/ref/ovs-actions.7.rst |  3 +--
 NEWS|  3 +++
 lib/conntrack.c | 15 ---
 lib/conntrack.h |  5 +
 lib/dpif-netdev.c   |  4 +++-
 5 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/Documentation/ref/ovs-actions.7.rst 
b/Documentation/ref/ovs-actions.7.rst
index 36adcc5db..80acd9070 100644
--- a/Documentation/ref/ovs-actions.7.rst
+++ b/Documentation/ref/ovs-actions.7.rst
@@ -1551,8 +1551,7 @@ following arguments:
 should be selected. When a port range is specified, fallback to
 ephemeral ports does not happen, else, it will.  The port number
 selection can be informed by the optional ``random`` and ``hash`` flags
-described below.  The userspace datapath only supports the ``hash``
-behavior.
+described below.
 
 The optional *flags* are:
 
diff --git a/NEWS b/NEWS
index a6617546c..93046b963 100644
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,8 @@
 Post-v3.3.0
 
+   - Userspace datapath:
+ * Conntrack now supports 'random' flag for selecting ports in a range
+   while natting.
 
 
 v3.3.0 - xx xxx xxxx
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 013709bd6..e09ecdf33 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -,7 +,7 @@ nat_range_hash(const struct conn_key *key, uint32_t basis,
 /* Ports are stored in host byte order for convenience. */
 static void
 set_sport_range(const struct nat_action_info_t *ni, const struct conn_key *k,
-uint32_t hash, uint16_t *curr, uint16_t *min,
+uint32_t off, uint16_t *curr, uint16_t *min,
 uint16_t *max)
 {
 if (((ni->nat_action & NAT_ACTION_SNAT_ALL) == NAT_ACTION_SRC) ||
@@ -2241,19 +2241,19 @@ set_sport_range(const struct nat_action_info_t *ni, 
const struct conn_key *k,
 } else {
 *min = ni->min_port;
 *max = ni->max_port;
-*curr = *min + (hash % ((*max - *min) + 1));
+*curr =  *min + (off % ((*max - *min) + 1));
 }
 }
 
 static void
 set_dport_range(const struct nat_action_info_t *ni, const struct conn_key *k,
-uint32_t hash, uint16_t *curr, uint16_t *min,
+uint32_t off, uint16_t *curr, uint16_t *min,
 uint16_t *max)
 {
 if (ni->nat_action & NAT_ACTION_DST_PORT) {
 *min = ni->min_port;
 *max = ni->max_port;
-*curr = *min + (hash % ((*max - *min) + 1));
+*curr = *min + (off % ((*max - *min) + 1));
 } else {
 *curr = ntohs(k->dst.port);
 *min = *max = *curr;
@@ -2388,18 +2388,19 @@ nat_get_unique_tuple(struct conntrack *ct, struct conn 
*conn,
  fwd_key->nw_proto == IPPROTO_SCTP;
 uint16_t min_dport, max_dport, curr_dport;
 uint16_t min_sport, max_sport, curr_sport;
-uint32_t hash;
+uint32_t hash, port_off;
 
 hash = nat_range_hash(fwd_key, ct->hash_basis, nat_info);
+port_off = nat_info->nat_flags & NAT_RANGE_RANDOM ? random_uint32() : hash;
 min_addr = nat_info->min_addr;
 max_addr = nat_info->max_addr;
 
 find_addr(fwd_key, &min_addr, &max_addr, &addr, hash,
   (fwd_key->dl_type == htons(ETH_TYPE_IP)), nat_info);
 
-set_sport_range(nat_info, fwd_key, hash, &curr_sport,
+set_sport_range(nat_info, fwd_key, port_off, &curr_sport,
 &min_sport, &max_sport);
-set_dport_range(nat_info, fwd_key, hash, &curr_dport,
+set_dport_range(nat_info, fwd_key, port_off, &curr_dport,
 &min_dport, &max_dport);
 
 if (pat_proto) {
diff --git a/lib/conntrack.h b/lib/conntrack.h
index 0a888be45..9b0c6aa88 100644
--- a/lib/conntrack.h
+++ b/lib/conntrack.h
@@ -77,12 +77,17 @@ enum nat_action_e {
 NAT_ACTION_DST_PORT = 1 << 3,
 };
 
+enum nat_flags_e {
+NAT_RANGE_RANDOM = 1 << 0,
+};
+
 struct nat_action_info_t {
 union ct_addr min_addr;
 union ct_addr max_addr;
 uint16_t min_port;
 uint16_t max_port;
 uint16_t nat_action;
+uint16_t nat_flags;
 };
 
 struct conntrack *conntrack_init(void);
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index c1981137f..c3334c667 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -9409,9 +9409,11 @@ dp_execute_cb(void *aux_, struct dp_packet_batch 
*packets_,
 nl_attr_get_u16(b_nest);
 proto_num_max_specified = true;
 break;
+case OVS_NAT_ATTR_PROTO_RANDOM:
+nat_action_info.nat_flags |= NAT_RANGE_RANDOM;
+break;
 

[ovs-dev] [PATCH v2 2/2] conntrack: Handle persistent selection for IP addresses.

2024-02-07 Thread Paolo Valerio
The patch, when 'persistent' flag is specified, makes the IP selection
in a range persistent across reboots.

Signed-off-by: Paolo Valerio 
---
 NEWS  |  3 ++-
 lib/conntrack.c   | 27 +--
 lib/conntrack.h   |  1 +
 lib/dpif-netdev.c |  2 ++
 4 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/NEWS b/NEWS
index 93046b963..0c86bba81 100644
--- a/NEWS
+++ b/NEWS
@@ -2,7 +2,8 @@ Post-v3.3.0
 
- Userspace datapath:
  * Conntrack now supports 'random' flag for selecting ports in a range
-   while natting.
+   while natting and 'persistent' flag for selection of the IP address
+   from a range.
 
 
 v3.3.0 - xx xxx 
diff --git a/lib/conntrack.c b/lib/conntrack.c
index e09ecdf33..7868a67f7 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -2202,17 +2202,21 @@ nat_range_hash(const struct conn_key *key, uint32_t 
basis,
 {
 uint32_t hash = basis;
 
+if (!basis) {
+hash = ct_addr_hash_add(hash, &key->src.addr);
+} else {
+hash = ct_endpoint_hash_add(hash, &key->src);
+hash = ct_endpoint_hash_add(hash, &key->dst);
+}
+
 hash = ct_addr_hash_add(hash, &nat_info->min_addr);
 hash = ct_addr_hash_add(hash, &nat_info->max_addr);
 hash = hash_add(hash,
 ((uint32_t) nat_info->max_port << 16)
 | nat_info->min_port);
-hash = ct_endpoint_hash_add(hash, &key->src);
-hash = ct_endpoint_hash_add(hash, &key->dst);
 hash = hash_add(hash, (OVS_FORCE uint32_t) key->dl_type);
 hash = hash_add(hash, key->nw_proto);
 hash = hash_add(hash, key->zone);
-
 /* The purpose of the second parameter is to distinguish hashes of data of
  * different length; our data always has the same length so there is no
  * value in counting. */
@@ -2386,12 +2390,23 @@ nat_get_unique_tuple(struct conntrack *ct, struct conn 
*conn,
 bool pat_proto = fwd_key->nw_proto == IPPROTO_TCP ||
  fwd_key->nw_proto == IPPROTO_UDP ||
  fwd_key->nw_proto == IPPROTO_SCTP;
+uint32_t hash, port_off, basis = ct->hash_basis;
 uint16_t min_dport, max_dport, curr_dport;
 uint16_t min_sport, max_sport, curr_sport;
-uint32_t hash, port_off;
 
-hash = nat_range_hash(fwd_key, ct->hash_basis, nat_info);
-port_off = nat_info->nat_flags & NAT_RANGE_RANDOM ? random_uint32() : hash;
+if (nat_info->nat_flags & NAT_PERSISTENT) {
+basis = 0;
+}
+
+hash = nat_range_hash(fwd_key, basis, nat_info);
+
+if (nat_info->nat_flags & NAT_RANGE_RANDOM) {
+port_off = random_uint32();
+} else {
+port_off =
+basis ? hash : nat_range_hash(fwd_key, ct->hash_basis, nat_info);
+}
+
 min_addr = nat_info->min_addr;
 max_addr = nat_info->max_addr;
 
diff --git a/lib/conntrack.h b/lib/conntrack.h
index 9b0c6aa88..ee7da099e 100644
--- a/lib/conntrack.h
+++ b/lib/conntrack.h
@@ -79,6 +79,7 @@ enum nat_action_e {
 
 enum nat_flags_e {
 NAT_RANGE_RANDOM = 1 << 0,
+NAT_PERSISTENT = 1 << 1,
 };
 
 struct nat_action_info_t {
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index c3334c667..fbf7ccabd 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -9413,6 +9413,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch 
*packets_,
 nat_action_info.nat_flags |= NAT_RANGE_RANDOM;
 break;
 case OVS_NAT_ATTR_PERSISTENT:
+nat_action_info.nat_flags |= NAT_PERSISTENT;
+break;
 case OVS_NAT_ATTR_PROTO_HASH:
 break;
 case OVS_NAT_ATTR_UNSPEC:
-- 
2.43.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v2 1/2] conntrack: Handle random selection for port ranges.

2024-02-07 Thread Paolo Valerio
The userspace conntrack only supported hash for port selection.
With the patch, both userspace and kernel datapath support the random
flag.

The default behavior remains the same, that is, if no flags are
specified, hash is selected.

Signed-off-by: Paolo Valerio 
---
 Documentation/ref/ovs-actions.7.rst |  3 +--
 NEWS|  3 +++
 lib/conntrack.c | 15 ---
 lib/conntrack.h |  5 +
 lib/dpif-netdev.c   |  4 +++-
 5 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/Documentation/ref/ovs-actions.7.rst 
b/Documentation/ref/ovs-actions.7.rst
index 36adcc5db..80acd9070 100644
--- a/Documentation/ref/ovs-actions.7.rst
+++ b/Documentation/ref/ovs-actions.7.rst
@@ -1551,8 +1551,7 @@ following arguments:
 should be selected. When a port range is specified, fallback to
 ephemeral ports does not happen, else, it will.  The port number
 selection can be informed by the optional ``random`` and ``hash`` flags
-described below.  The userspace datapath only supports the ``hash``
-behavior.
+described below.
 
 The optional *flags* are:
 
diff --git a/NEWS b/NEWS
index a6617546c..93046b963 100644
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,8 @@
 Post-v3.3.0
 
+   - Userspace datapath:
+ * Conntrack now supports 'random' flag for selecting ports in a range
+   while natting.
 
 
 v3.3.0 - xx xxx 
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 013709bd6..e09ecdf33 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -,7 +,7 @@ nat_range_hash(const struct conn_key *key, uint32_t basis,
 /* Ports are stored in host byte order for convenience. */
 static void
 set_sport_range(const struct nat_action_info_t *ni, const struct conn_key *k,
-uint32_t hash, uint16_t *curr, uint16_t *min,
+uint32_t off, uint16_t *curr, uint16_t *min,
 uint16_t *max)
 {
 if (((ni->nat_action & NAT_ACTION_SNAT_ALL) == NAT_ACTION_SRC) ||
@@ -2241,19 +2241,19 @@ set_sport_range(const struct nat_action_info_t *ni, 
const struct conn_key *k,
 } else {
 *min = ni->min_port;
 *max = ni->max_port;
-*curr = *min + (hash % ((*max - *min) + 1));
+*curr =  *min + (off % ((*max - *min) + 1));
 }
 }
 
 static void
 set_dport_range(const struct nat_action_info_t *ni, const struct conn_key *k,
-uint32_t hash, uint16_t *curr, uint16_t *min,
+uint32_t off, uint16_t *curr, uint16_t *min,
 uint16_t *max)
 {
 if (ni->nat_action & NAT_ACTION_DST_PORT) {
 *min = ni->min_port;
 *max = ni->max_port;
-*curr = *min + (hash % ((*max - *min) + 1));
+*curr = *min + (off % ((*max - *min) + 1));
 } else {
 *curr = ntohs(k->dst.port);
 *min = *max = *curr;
@@ -2388,18 +2388,19 @@ nat_get_unique_tuple(struct conntrack *ct, struct conn 
*conn,
  fwd_key->nw_proto == IPPROTO_SCTP;
 uint16_t min_dport, max_dport, curr_dport;
 uint16_t min_sport, max_sport, curr_sport;
-uint32_t hash;
+uint32_t hash, port_off;
 
 hash = nat_range_hash(fwd_key, ct->hash_basis, nat_info);
+port_off = nat_info->nat_flags & NAT_RANGE_RANDOM ? random_uint32() : hash;
 min_addr = nat_info->min_addr;
 max_addr = nat_info->max_addr;
 
 find_addr(fwd_key, &min_addr, &max_addr, &addr, hash,
   (fwd_key->dl_type == htons(ETH_TYPE_IP)), nat_info);
 
-set_sport_range(nat_info, fwd_key, hash, &curr_sport,
+set_sport_range(nat_info, fwd_key, port_off, &curr_sport,
 &min_sport, &max_sport);
-set_dport_range(nat_info, fwd_key, hash, &curr_dport,
+set_dport_range(nat_info, fwd_key, port_off, &curr_dport,
 &min_dport, &max_dport);
 
 if (pat_proto) {
diff --git a/lib/conntrack.h b/lib/conntrack.h
index 0a888be45..9b0c6aa88 100644
--- a/lib/conntrack.h
+++ b/lib/conntrack.h
@@ -77,12 +77,17 @@ enum nat_action_e {
 NAT_ACTION_DST_PORT = 1 << 3,
 };
 
+enum nat_flags_e {
+NAT_RANGE_RANDOM = 1 << 0,
+};
+
 struct nat_action_info_t {
 union ct_addr min_addr;
 union ct_addr max_addr;
 uint16_t min_port;
 uint16_t max_port;
 uint16_t nat_action;
+uint16_t nat_flags;
 };
 
 struct conntrack *conntrack_init(void);
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index c1981137f..c3334c667 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -9409,9 +9409,11 @@ dp_execute_cb(void *aux_, struct dp_packet_batch 
*packets_,
 nl_attr_get_u16(b_nest);
 proto_num_max_specified = true;
 break;
+case OVS_NAT_ATTR_PROTO_RANDOM:
+nat_action_info.nat_flags |= NAT_RANGE_RANDOM;
+break;
 case OVS_NAT_ATT

Re: [ovs-dev] [PATCH 2/2] conntrack: Handle persistent selection for IP addresses.

2024-02-07 Thread Paolo Valerio
Paolo Valerio  writes:

> The patch, when 'persistent' flag is specified, makes the IP selection
> in a range persistent across reboots.
>
> Signed-off-by: Paolo Valerio 
> ---
>  NEWS  |  3 ++-
>  lib/conntrack.c   | 26 ++
>  lib/conntrack.h   |  1 +
>  lib/dpif-netdev.c |  2 ++
>  4 files changed, 27 insertions(+), 5 deletions(-)
>
> diff --git a/NEWS b/NEWS
> index 93046b963..0c86bba81 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -2,7 +2,8 @@ Post-v3.3.0
>  

The patch needs a respin because of a leftover that slipped in during a
rebase.
Will send a v2.

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 2/2] conntrack: Handle persistent selection for IP addresses.

2024-02-07 Thread Paolo Valerio
The patch, when 'persistent' flag is specified, makes the IP selection
in a range persistent across reboots.

Signed-off-by: Paolo Valerio 
---
 NEWS  |  3 ++-
 lib/conntrack.c   | 26 ++
 lib/conntrack.h   |  1 +
 lib/dpif-netdev.c |  2 ++
 4 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/NEWS b/NEWS
index 93046b963..0c86bba81 100644
--- a/NEWS
+++ b/NEWS
@@ -2,7 +2,8 @@ Post-v3.3.0
 
- Userspace datapath:
  * Conntrack now supports 'random' flag for selecting ports in a range
-   while natting.
+   while natting and 'persistent' flag for selection of the IP address
+   from a range.
 
 
 v3.3.0 - xx xxx 
diff --git a/lib/conntrack.c b/lib/conntrack.c
index e09ecdf33..e085ddee9 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -2202,17 +2202,21 @@ nat_range_hash(const struct conn_key *key, uint32_t 
basis,
 {
 uint32_t hash = basis;
 
+if (!basis) {
+hash = ct_addr_hash_add(hash, &key->src.addr);
+} else {
+hash = ct_endpoint_hash_add(hash, &key->src);
+hash = ct_endpoint_hash_add(hash, &key->dst);
+}
+
 hash = ct_addr_hash_add(hash, &nat_info->min_addr);
 hash = ct_addr_hash_add(hash, &nat_info->max_addr);
 hash = hash_add(hash,
 ((uint32_t) nat_info->max_port << 16)
 | nat_info->min_port);
-hash = ct_endpoint_hash_add(hash, &key->src);
-hash = ct_endpoint_hash_add(hash, &key->dst);
 hash = hash_add(hash, (OVS_FORCE uint32_t) key->dl_type);
 hash = hash_add(hash, key->nw_proto);
 hash = hash_add(hash, key->zone);
-
 /* The purpose of the second parameter is to distinguish hashes of data of
  * different length; our data always has the same length so there is no
  * value in counting. */
@@ -2386,12 +2390,26 @@ nat_get_unique_tuple(struct conntrack *ct, struct conn 
*conn,
 bool pat_proto = fwd_key->nw_proto == IPPROTO_TCP ||
  fwd_key->nw_proto == IPPROTO_UDP ||
  fwd_key->nw_proto == IPPROTO_SCTP;
+uint32_t hash, port_off, basis = ct->hash_basis;
 uint16_t min_dport, max_dport, curr_dport;
 uint16_t min_sport, max_sport, curr_sport;
-uint32_t hash, port_off;
 
 hash = nat_range_hash(fwd_key, ct->hash_basis, nat_info);
 port_off = nat_info->nat_flags & NAT_RANGE_RANDOM ? random_uint32() : hash;
+
+if (nat_info->nat_flags & NAT_PERSISTENT) {
+basis = 0;
+}
+
+hash = nat_range_hash(fwd_key, basis, nat_info);
+
+if (nat_info->nat_flags & NAT_RANGE_RANDOM) {
+port_off = random_uint16();
+} else {
+port_off =
+basis ? hash : nat_range_hash(fwd_key, ct->hash_basis, nat_info);
+}
+
 min_addr = nat_info->min_addr;
 max_addr = nat_info->max_addr;
 
diff --git a/lib/conntrack.h b/lib/conntrack.h
index 9b0c6aa88..ee7da099e 100644
--- a/lib/conntrack.h
+++ b/lib/conntrack.h
@@ -79,6 +79,7 @@ enum nat_action_e {
 
 enum nat_flags_e {
 NAT_RANGE_RANDOM = 1 << 0,
+NAT_PERSISTENT = 1 << 1,
 };
 
 struct nat_action_info_t {
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index c3334c667..fbf7ccabd 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -9413,6 +9413,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch 
*packets_,
 nat_action_info.nat_flags |= NAT_RANGE_RANDOM;
 break;
 case OVS_NAT_ATTR_PERSISTENT:
+nat_action_info.nat_flags |= NAT_PERSISTENT;
+break;
 case OVS_NAT_ATTR_PROTO_HASH:
 break;
 case OVS_NAT_ATTR_UNSPEC:
-- 
2.43.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 1/2] conntrack: Handle random selection for port ranges.

2024-02-07 Thread Paolo Valerio
The userspace conntrack only supported hash for port selection.
With the patch, both userspace and kernel datapath support the random
flag.

The default behavior remains the same, that is, if no flags are
specified, hash is selected.

Signed-off-by: Paolo Valerio 
---
 Documentation/ref/ovs-actions.7.rst |  3 +--
 NEWS|  3 +++
 lib/conntrack.c | 15 ---
 lib/conntrack.h |  5 +
 lib/dpif-netdev.c   |  4 +++-
 5 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/Documentation/ref/ovs-actions.7.rst 
b/Documentation/ref/ovs-actions.7.rst
index 36adcc5db..80acd9070 100644
--- a/Documentation/ref/ovs-actions.7.rst
+++ b/Documentation/ref/ovs-actions.7.rst
@@ -1551,8 +1551,7 @@ following arguments:
 should be selected. When a port range is specified, fallback to
 ephemeral ports does not happen, else, it will.  The port number
 selection can be informed by the optional ``random`` and ``hash`` flags
-described below.  The userspace datapath only supports the ``hash``
-behavior.
+described below.
 
 The optional *flags* are:
 
diff --git a/NEWS b/NEWS
index a6617546c..93046b963 100644
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,8 @@
 Post-v3.3.0
 
+   - Userspace datapath:
+ * Conntrack now supports 'random' flag for selecting ports in a range
+   while natting.
 
 
 v3.3.0 - xx xxx 
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 013709bd6..e09ecdf33 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -,7 +,7 @@ nat_range_hash(const struct conn_key *key, uint32_t basis,
 /* Ports are stored in host byte order for convenience. */
 static void
 set_sport_range(const struct nat_action_info_t *ni, const struct conn_key *k,
-uint32_t hash, uint16_t *curr, uint16_t *min,
+uint32_t off, uint16_t *curr, uint16_t *min,
 uint16_t *max)
 {
 if (((ni->nat_action & NAT_ACTION_SNAT_ALL) == NAT_ACTION_SRC) ||
@@ -2241,19 +2241,19 @@ set_sport_range(const struct nat_action_info_t *ni, 
const struct conn_key *k,
 } else {
 *min = ni->min_port;
 *max = ni->max_port;
-*curr = *min + (hash % ((*max - *min) + 1));
+*curr =  *min + (off % ((*max - *min) + 1));
 }
 }
 
 static void
 set_dport_range(const struct nat_action_info_t *ni, const struct conn_key *k,
-uint32_t hash, uint16_t *curr, uint16_t *min,
+uint32_t off, uint16_t *curr, uint16_t *min,
 uint16_t *max)
 {
 if (ni->nat_action & NAT_ACTION_DST_PORT) {
 *min = ni->min_port;
 *max = ni->max_port;
-*curr = *min + (hash % ((*max - *min) + 1));
+*curr = *min + (off % ((*max - *min) + 1));
 } else {
 *curr = ntohs(k->dst.port);
 *min = *max = *curr;
@@ -2388,18 +2388,19 @@ nat_get_unique_tuple(struct conntrack *ct, struct conn 
*conn,
  fwd_key->nw_proto == IPPROTO_SCTP;
 uint16_t min_dport, max_dport, curr_dport;
 uint16_t min_sport, max_sport, curr_sport;
-uint32_t hash;
+uint32_t hash, port_off;
 
 hash = nat_range_hash(fwd_key, ct->hash_basis, nat_info);
+port_off = nat_info->nat_flags & NAT_RANGE_RANDOM ? random_uint32() : hash;
 min_addr = nat_info->min_addr;
 max_addr = nat_info->max_addr;
 
 find_addr(fwd_key, &min_addr, &max_addr, &addr, hash,
   (fwd_key->dl_type == htons(ETH_TYPE_IP)), nat_info);
 
-set_sport_range(nat_info, fwd_key, hash, &curr_sport,
+set_sport_range(nat_info, fwd_key, port_off, &curr_sport,
 &min_sport, &max_sport);
-set_dport_range(nat_info, fwd_key, hash, &curr_dport,
+set_dport_range(nat_info, fwd_key, port_off, &curr_dport,
 &min_dport, &max_dport);
 
 if (pat_proto) {
diff --git a/lib/conntrack.h b/lib/conntrack.h
index 0a888be45..9b0c6aa88 100644
--- a/lib/conntrack.h
+++ b/lib/conntrack.h
@@ -77,12 +77,17 @@ enum nat_action_e {
 NAT_ACTION_DST_PORT = 1 << 3,
 };
 
+enum nat_flags_e {
+NAT_RANGE_RANDOM = 1 << 0,
+};
+
 struct nat_action_info_t {
 union ct_addr min_addr;
 union ct_addr max_addr;
 uint16_t min_port;
 uint16_t max_port;
 uint16_t nat_action;
+uint16_t nat_flags;
 };
 
 struct conntrack *conntrack_init(void);
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index c1981137f..c3334c667 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -9409,9 +9409,11 @@ dp_execute_cb(void *aux_, struct dp_packet_batch 
*packets_,
 nl_attr_get_u16(b_nest);
 proto_num_max_specified = true;
 break;
+case OVS_NAT_ATTR_PROTO_RANDOM:
+nat_action_info.nat_flags |= NAT_RANGE_RANDOM;
+break;
 case OVS_NAT_ATT

Re: [ovs-dev] [PATCH v3 3/3] mcast-snooping: Fix comments format.

2023-11-21 Thread Paolo Valerio
David Marchand  writes:

> Capitalize comments and end them with a . when needed.
>
> Signed-off-by: David Marchand 
> ---
>  tests/mcast-snooping.at | 16 
>  1 file changed, 8 insertions(+), 8 deletions(-)
>

Acked-by: Paolo Valerio 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v2 2/2] mcast-snooping: Flush flood and report ports when deleting interfaces.

2023-11-15 Thread Paolo Valerio
David Marchand  writes:

> When a configuration change triggers an interface destruction/creation
> (like for example, setting ofport_request), a port object may still be
> referenced as a fport or a rport in the mdb.
>
> Before the fix, when flooding multicast traffic:
> bridge("br0")
> -
>  0. priority 32768
> NORMAL
>  -> forwarding to mcast group port
>  >> mcast flood port is unknown, dropping
>  -> mcast flood port is input port, dropping
>  -> forwarding to mcast flood port
>
> Before the fix, when flooding igmp report traffic:
> bridge("br0")
> -
>  0. priority 32768
> NORMAL
>  >> mcast port is unknown, dropping the report
>  -> forwarding report to mcast flagged port
>  -> mcast port is input port, dropping the Report
>  -> forwarding report to mcast flagged port
>
> Add relevant cleanup and update unit tests.
>
> Fixes: 4fbbf8624868 ("mcast-snooping: Flush ports mdb when VLAN configuration 
> changed.")
> Signed-off-by: David Marchand 
> ---
> Changes since v1:
> - updated the test on report flooding,
>

Acked-by: Paolo Valerio 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v2 1/2] mcast-snooping: Test per port explicit flooding.

2023-11-15 Thread Paolo Valerio
David Marchand  writes:

> Various options affect how the mcast snooping module works.
>
> When multicast snooping is enabled and a reporter is known, it is still
> possible to flood associated packets to some other port via the
> mcast-snooping-flood option.
>
> If flooding unregistered traffic is disabled, it is still possible to
> flood multicast traffic too with the mcast-snooping-flood option.
>
> IGMP reports may have to be flooded to some ports explicitly with the
> mcast-snooping-flood-reports option.
>
> Test those parameters.
>
> Signed-off-by: David Marchand 
> ---

Thanks David.
The patch lgtm.

Acked-by: Paolo Valerio 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH 1/2] mcast-snooping: Test per port explicit flooding.

2023-11-10 Thread Paolo Valerio
David Marchand  writes:

> On Thu, Nov 9, 2023 at 4:33 PM Paolo Valerio  wrote:
>>
>> David Marchand  writes:
>>
>> > When multicast snooping is enabled and a reporter is known, it is still
>> > possible to flood associated packets to some other port via the
>> > mcast-snooping-flood option.
>> >
>> > Test this combination.
>> >
>> > Signed-off-by: David Marchand 
>> > ---
>> >  tests/mcast-snooping.at | 88 +
>> >  1 file changed, 88 insertions(+)
>> >
>> > diff --git a/tests/mcast-snooping.at b/tests/mcast-snooping.at
>> > index d5b7c4774c..21c806ef63 100644
>> > --- a/tests/mcast-snooping.at
>> > +++ b/tests/mcast-snooping.at
>> > @@ -105,6 +105,94 @@ AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl
>> >  OVS_VSWITCHD_STOP
>> >  AT_CLEANUP
>> >
>> > +
>> > +AT_SETUP([mcast - check flooding on ports])
>> > +OVS_VSWITCHD_START([])
>> > +
>> > +AT_CHECK([
>> > +ovs-vsctl set bridge br0 \
>> > +datapath_type=dummy \
>> > +mcast_snooping_enable=true \
>> > +other-config:mcast-snooping-disable-flood-unregistered=false
>> > +], [0])
>> > +
>>
>> When flooding of unregistered traffic is disabled, packets are still
>> supposed to be sent to flood ports. While at it, it might also be worth
>> testing that case, as in the quick example at the end, which I used to
>> test it. WDYT?
>
> It sounds reasonable yes.
>
> I was also considering testing reports flooding.
> WDYT?
>

If you mean testing mcast-snooping-flood-reports, that would be nice.
That way, that flag will get some coverage as well.

>
>>
>> > +AT_CHECK([ovs-ofctl add-flow br0 action=normal])
>> > +
>> > +AT_CHECK([
>> > +ovs-vsctl add-port br0 p1 \
>> > +-- set Interface p1 type=dummy other-config:hwaddr=aa:55:aa:55:00:01 
>> > ofport_request=1 \
>> > +-- add-port br0 p2 \
>> > +-- set Interface p2 type=dummy other-config:hwaddr=aa:55:aa:55:00:02 
>> > ofport_request=2 \
>> > +-- add-port br0 p3 \
>> > +-- set Interface p3 type=dummy other-config:hwaddr=aa:55:aa:55:00:03 
>> > ofport_request=3 \
>> > +], [0])
>> > +
>> > +ovs-appctl time/stop
>> > +
>> > +# send report packets
>> > +AT_CHECK([
>> > +ovs-appctl netdev-dummy/receive p1  \
>> > +
>> > '01005E010101000C29A027A10800451C00014002CBAEAC10221EE001010112140CE9E0010101'
>> > +], [0])
>> > +
>> > +AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl
>> > + port  VLAN  GROUPAge
>> > +1 0  224.1.1.1   0
>> > +])
>> > +
>> > +AT_CHECK([ovs-appctl ofproto/trace 
>> > "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:5e:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"],
>> >  [0], [dnl
>> > +Flow: 
>> > udp,in_port=3,vlan_tci=0x,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:5e:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000
>> > +
>>
>> I think the mac for 224.1.1.1 maps to 01:00:5e:01:01:01.
>
> Argh.. indeed, wrong copy/paste.
> Thanks for the review!
>

thank you for working on this!

>>
>> > +bridge("br0")
>> > +-
>> > + 0. priority 32768
>> > +NORMAL
>> > + -> forwarding to mcast group port
>
>
> -- 
> David Marchand

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH 1/2] mcast-snooping: Test per port explicit flooding.

2023-11-09 Thread Paolo Valerio
David Marchand  writes:

> When multicast snooping is enabled and a reporter is known, it is still
> possible to flood associated packets to some other port via the
> mcast-snooping-flood option.
>
> Test this combination.
>
> Signed-off-by: David Marchand 
> ---
>  tests/mcast-snooping.at | 88 +
>  1 file changed, 88 insertions(+)
>
> diff --git a/tests/mcast-snooping.at b/tests/mcast-snooping.at
> index d5b7c4774c..21c806ef63 100644
> --- a/tests/mcast-snooping.at
> +++ b/tests/mcast-snooping.at
> @@ -105,6 +105,94 @@ AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl
>  OVS_VSWITCHD_STOP
>  AT_CLEANUP
>  
> +
> +AT_SETUP([mcast - check flooding on ports])
> +OVS_VSWITCHD_START([])
> +
> +AT_CHECK([
> +ovs-vsctl set bridge br0 \
> +datapath_type=dummy \
> +mcast_snooping_enable=true \
> +other-config:mcast-snooping-disable-flood-unregistered=false
> +], [0])
> +

When flooding of unregistered traffic is disabled, packets are still
supposed to be sent to flood ports. While at it, it might also be worth
testing that case, as in the quick example at the end, which I used to
test it. WDYT?

> +AT_CHECK([ovs-ofctl add-flow br0 action=normal])
> +
> +AT_CHECK([
> +ovs-vsctl add-port br0 p1 \
> +-- set Interface p1 type=dummy other-config:hwaddr=aa:55:aa:55:00:01 
> ofport_request=1 \
> +-- add-port br0 p2 \
> +-- set Interface p2 type=dummy other-config:hwaddr=aa:55:aa:55:00:02 
> ofport_request=2 \
> +-- add-port br0 p3 \
> +-- set Interface p3 type=dummy other-config:hwaddr=aa:55:aa:55:00:03 
> ofport_request=3 \
> +], [0])
> +
> +ovs-appctl time/stop
> +
> +# send report packets
> +AT_CHECK([
> +ovs-appctl netdev-dummy/receive p1  \
> +
> '01005E010101000C29A027A10800451C00014002CBAEAC10221EE001010112140CE9E0010101'
> +], [0])
> +
> +AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl
> + port  VLAN  GROUPAge
> +1 0  224.1.1.1   0
> +])
> +
> +AT_CHECK([ovs-appctl ofproto/trace 
> "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:5e:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"],
>  [0], [dnl
> +Flow: 
> udp,in_port=3,vlan_tci=0x,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:5e:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000
> +

I think the MAC address for 224.1.1.1 maps to 01:00:5e:01:01:01.

> +bridge("br0")
> +-
> + 0. priority 32768
> +NORMAL
> + -> forwarding to mcast group port
> +
> +Final flow: unchanged
> +Megaflow: 
> recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:5e:01:01,nw_dst=224.1.1.1,nw_frag=no
> +Datapath actions: 1
> +])
> +
> +AT_CHECK([ovs-vsctl set port p2 other_config:mcast-snooping-flood=true])
> +
> +AT_CHECK([ovs-appctl ofproto/trace 
> "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:5e:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"],
>  [0], [dnl
> +Flow: 
> udp,in_port=3,vlan_tci=0x,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:5e:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000
> +
> +bridge("br0")
> +-
> + 0. priority 32768
> +NORMAL
> + -> forwarding to mcast group port
> + -> forwarding to mcast flood port
> +
> +Final flow: unchanged
> +Megaflow: 
> recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:5e:01:01,nw_dst=224.1.1.1,nw_frag=no
> +Datapath actions: 1,2
> +])
> +
> +AT_CHECK([ovs-vsctl set port p3 other_config:mcast-snooping-flood=true])
> +
> +AT_CHECK([ovs-appctl ofproto/trace 
> "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:5e:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"],
>  [0], [dnl
> +Flow: 
> udp,in_port=3,vlan_tci=0x,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:5e:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000
> +
> +bridge("br0")
> +-
> + 0. priority 32768
> +NORMAL
> + -> forwarding to mcast group port
> + -> forwarding to mcast flood port
> + -> mcast flood port is input port, dropping
> +
> +Final flow: unchanged
> +Megaflow: 
> recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:5e:01:01,nw_dst=224.1.1.1,nw_frag=no
> +Datapath actions: 1,2
> +])
> +
> +OVS_VSWITCHD_STOP
> +AT_CLEANUP
> +
> +
>  AT_SETUP([mcast - delete the port mdb when vlan configuration changed])
>  OVS_VSWITCHD_START([])
>  
> -- 
> 2.41.0
>
> ___
> dev mailing list
> d...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev


diff --git a/tests/mcast-snooping.at b/tests/mcast-snooping.at
index 21c806ef6..787b09570 100644
--- a/tests/mcast-snooping.at
+++ b/tests/mcast-snooping.at
@@ -189,6 +189,41 @@ Megaflow: 

Re: [ovs-dev] [PATCH v3 branch-2.17 1/2] conntrack: simplify cleanup path

2023-10-12 Thread Paolo Valerio
Frode Nordahl  writes:

> On Tue, Oct 3, 2023 at 9:06 PM Aaron Conole  wrote:
>>
>> The conntrack cleanup and allocation code is spread across multiple
>> list invocations.  This was changed in mainline code when the timeout
>> expiration lists were refactored, but backporting those fixes would
>> be a rather large effort.  Instead, take only the changes we need
>> to backport "conntrack: Remove nat_conn introducing key directionality"
>> into branch-2.17.
>
> Thanks a lot for your help in backporting this patch.
>
> We have a managed customer environment where circumstances make the
> issue trigger with a rate of 70% when performing a certain action. Up
> until now they have been running with a temporary package containing
> the patches from
> https://patchwork.ozlabs.org/project/openvswitch/list/?series=351579&state=*
>
> To test this series, they have first re-confirmed that they see the
> issue with a packaged version of OVS 2.17.7, and then switched to a
> packaged version of OVS 2.17.7 with these patches and confirmed that
> the issue is no longer occurring. The same package has been in
> production use for the past week, being exposed to real world traffic.
> No side effects or incidents to report.
>
> Tested-by: Frode Nordahl 
>

Thanks Frode, Aaron and Simon.

On my side, I don't see any issues with the series, both patches look
good to me.

> -- 
> Frode Nordahl
>
>> Signed-off-by: Aaron Conole 
>> Co-authored-by: Paolo Valerio 
>> Signed-off-by: Paolo Valerio 
>> ---
>>  lib/conntrack.c | 60 +++--
>>  1 file changed, 18 insertions(+), 42 deletions(-)
>>
>> diff --git a/lib/conntrack.c b/lib/conntrack.c
>> index fff8e77db1..83a73995d6 100644
>> --- a/lib/conntrack.c
>> +++ b/lib/conntrack.c
>> @@ -94,9 +94,8 @@ static bool valid_new(struct dp_packet *pkt, struct 
>> conn_key *);
>>  static struct conn *new_conn(struct conntrack *ct, struct dp_packet *pkt,
>>   struct conn_key *, long long now,
>>   uint32_t tp_id);
>> -static void delete_conn_cmn(struct conn *);
>> +static void delete_conn__(struct conn *);
>>  static void delete_conn(struct conn *);
>> -static void delete_conn_one(struct conn *conn);
>>  static enum ct_update_res conn_update(struct conntrack *ct, struct conn 
>> *conn,
>>struct dp_packet *pkt,
>>struct conn_lookup_ctx *ctx,
>> @@ -444,9 +443,11 @@ zone_limit_delete(struct conntrack *ct, uint16_t zone)
>>  }
>>
>>  static void
>> -conn_clean_cmn(struct conntrack *ct, struct conn *conn)
>> +conn_clean(struct conntrack *ct, struct conn *conn)
>>  OVS_REQUIRES(ct->ct_lock)
>>  {
>> +ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
>> +
>>  if (conn->alg) {
>>  expectation_clean(ct, >key);
>>  }
>> @@ -458,19 +459,9 @@ conn_clean_cmn(struct conntrack *ct, struct conn *conn)
>>  if (zl && zl->czl.zone_limit_seq == conn->zone_limit_seq) {
>>  zl->czl.count--;
>>  }
>> -}
>>
>> -/* Must be called with 'conn' of 'conn_type' CT_CONN_TYPE_DEFAULT.  Also
>> - * removes the associated nat 'conn' from the lookup datastructures. */
>> -static void
>> -conn_clean(struct conntrack *ct, struct conn *conn)
>> -OVS_REQUIRES(ct->ct_lock)
>> -{
>> -ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
>> -
>> -conn_clean_cmn(ct, conn);
>>  if (conn->nat_conn) {
>> -uint32_t hash = conn_key_hash(>nat_conn->key, ct->hash_basis);
>> +hash = conn_key_hash(>nat_conn->key, ct->hash_basis);
>>  cmap_remove(>conns, >nat_conn->cm_node, hash);
>>  }
>>  ovs_list_remove(>exp_node);
>> @@ -479,19 +470,6 @@ conn_clean(struct conntrack *ct, struct conn *conn)
>>  atomic_count_dec(>n_conn);
>>  }
>>
>> -static void
>> -conn_clean_one(struct conntrack *ct, struct conn *conn)
>> -OVS_REQUIRES(ct->ct_lock)
>> -{
>> -conn_clean_cmn(ct, conn);
>> -if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
>> -ovs_list_remove(>exp_node);
>> -conn->cleaned = true;
>> -atomic_count_dec(>n_conn);
>> -}
>> -ovsrcu_postpone(delete_conn_one, conn);
>> -}
>> -
>>  /* Destroys the connection tracker 'ct' and frees all the allocated memory.
>>   * The caller of this function mu

[ovs-dev] [PATCH v2] ofproto-dpif-xlate: Fix recirculation with patch port and controller.

2023-09-05 Thread Paolo Valerio
If a packet originating from the controller recirculates after going
through a patch port, it gets dropped with the following message:

ofproto_dpif_upcall(handler8)|INFO|received packet on unassociated
  datapath port 4294967295

This happens because there's no xport_uuid in the recirculation node
and at the same time in_port refers to the patch port.

The patch, in the case of zeroed uuid, checks that in_port belongs to
the bridge and returns the related ofproto.

Signed-off-by: Paolo Valerio 
---
 ofproto/ofproto-dpif-xlate.c |   12 +++-
 tests/ofproto-dpif.at|   34 ++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index 47ea0f47e..fcd547645 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -1615,7 +1615,8 @@ xlate_lookup_ofproto_(const struct dpif_backer *backer,
 }
 
 ofp_port_t in_port = recirc_id_node->state.metadata.in_port;
-if (in_port != OFPP_NONE && in_port != OFPP_CONTROLLER) {
+if (in_port != OFPP_NONE && in_port != OFPP_CONTROLLER &&
+!uuid_is_zero(_id_node->state.xport_uuid)) {
 struct uuid xport_uuid = recirc_id_node->state.xport_uuid;
 xport = xport_lookup_by_uuid(xcfg, _uuid);
 if (xport && xport->xbridge && xport->xbridge->ofproto) {
@@ -1626,11 +1627,19 @@ xlate_lookup_ofproto_(const struct dpif_backer *backer,
  * that the packet originated from the controller via an OpenFlow
  * "packet-out".  The right thing to do is to find just the
  * ofproto.  There is no xport, which is OK.
+ * Also a zeroed xport_uuid with a valid in_port, means that
+ * the packet originated from OFPP_CONTROLLER passed
+ * through a patch port.
  *
  * OFPP_NONE can also indicate that a bond caused recirculation. */
 struct uuid uuid = recirc_id_node->state.ofproto_uuid;
 const struct xbridge *bridge = xbridge_lookup_by_uuid(xcfg, );
+
 if (bridge && bridge->ofproto) {
+if (in_port != OFPP_CONTROLLER && in_port != OFPP_NONE &&
+!get_ofp_port(bridge, in_port)) {
+goto xport_lookup;
+}
 if (errorp) {
 *errorp = NULL;
 }
@@ -1643,6 +1652,7 @@ xlate_lookup_ofproto_(const struct dpif_backer *backer,
 }
 }
 
+xport_lookup:
 xport = xport_lookup(xcfg, tnl_port_should_receive(flow)
  ? tnl_port_receive(flow)
  : odp_port_to_ofport(backer, flow->in_port.odp_port));
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index f242f77f3..a0a4aaf5d 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -5854,6 +5854,40 @@ OVS_WAIT_UNTIL([check_flows], [ovs-ofctl dump-flows br0])
 OVS_VSWITCHD_STOP
 AT_CLEANUP
 
+# Checks for regression against a bug in which OVS dropped packets
+# originating from the the controller passing through a patch port
+AT_SETUP([ofproto-dpif - packet-out recirculation OFPP_CONTROLLER and patch 
port])
+OVS_VSWITCHD_START(
+[add-port br0 patch-br1 -- \
+ set interface patch-br1 type=patch options:peer=patch-br0 -- \
+ add-br br1 -- set bridge br1 datapath-type=dummy fail-mode=secure -- \
+ add-port br1 patch-br0 -- set interface patch-br0 type=patch 
options:peer=patch-br1
+])
+
+add_of_ports --pcap br1 1
+
+AT_DATA([flows-br0.txt], [dnl
+table=0 icmp actions=output:patch-br1
+])
+AT_CHECK([ovs-ofctl add-flows br0 flows-br0.txt])
+
+AT_DATA([flows-br1.txt], [dnl
+table=0, icmp actions=ct(table=1,zone=1)
+table=1, ct_state=+trk, icmp actions=p1
+])
+AT_CHECK([ovs-ofctl add-flows br1 flows-br1.txt])
+
+packet=50540007505400050800455c8001b94dc0a80001c0a80002080013fc000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+AT_CHECK([ovs-ofctl packet-out br0 "in_port=CONTROLLER packet=$packet 
actions=table"])
+
+OVS_WAIT_UNTIL_EQUAL([ovs-ofctl dump-flows -m br1 | grep "ct_state" | 
ofctl_strip], [dnl
+ table=1, n_packets=1, n_bytes=106, ct_state=+trk,icmp actions=output:2])
+
+OVS_WAIT_UNTIL([ovs-pcap p1-tx.pcap | grep -q "$packet"])
+
+OVS_VSWITCHD_STOP
+AT_CLEANUP
+
 AT_SETUP([ofproto-dpif - debug_slow action])
 OVS_VSWITCHD_START
 add_of_ports br0 1 2 3

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v3] conntrack: Remove nat_conn introducing key directionality.

2023-08-30 Thread Paolo Valerio
From: hepeng 

The patch avoids the extra allocation for nat_conn.
Currently, when doing NAT, the userspace conntrack will use an extra
conn for the two directions in a flow. However, each conn has actually
the two keys for both orig and rev directions. This patch introduces a
key_node[CT_DIRS] member as per Aaron's suggestion in the conn which
consists of a key, direction, and a cmap_node for hash lookup so
addressing the feedback received by the original patch [0].

[0] 
https://patchwork.ozlabs.org/project/openvswitch/patch/20201129033255.64647-2-hepeng.0...@bytedance.com/

Signed-off-by: Peng He 
Co-authored-by: Paolo Valerio 
Signed-off-by: Paolo Valerio 
---
v3:
  - resolved a potentially UB with offsetof() and integer constant
expression (Ilya)
  - int to bool assignment (Ilya)
  - check the direction early in conntrack_dump_next() to avoid
unneeded operations (Ilya)
  - unrelated change added that turns the branch:
if (!conn_lookup()) { return true; } else { return false; }
into return !conn_lookup() (Ilya)
  - cosmetic/coding style changes (Ilya)

v2:
  - use enum value instead of bool (Aaron).
  - s/conn_for_expectation/conn_for_exp/ in process_ftp_ctl_v6()
to avoid long line.
  - removed CT_CONN_TYPE_* reference in two comments.
---
 lib/conntrack-private.h |   19 +-
 lib/conntrack-tp.c  |6 +
 lib/conntrack.c |  366 +++
 3 files changed, 164 insertions(+), 227 deletions(-)

diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
index bb326868e..3fd5fccd3 100644
--- a/lib/conntrack-private.h
+++ b/lib/conntrack-private.h
@@ -49,6 +49,12 @@ struct ct_endpoint {
  * hashing in ct_endpoint_hash_add(). */
 BUILD_ASSERT_DECL(sizeof(struct ct_endpoint) == sizeof(union ct_addr) + 4);
 
+enum key_dir {
+CT_DIR_FWD = 0,
+CT_DIR_REV,
+CT_DIRS,
+};
+
 /* Changes to this structure need to be reflected in conn_key_hash()
  * and conn_key_cmp(). */
 struct conn_key {
@@ -112,20 +118,18 @@ enum ct_timeout {
 
 #define N_EXP_LISTS 100
 
-enum OVS_PACKED_ENUM ct_conn_type {
-CT_CONN_TYPE_DEFAULT,
-CT_CONN_TYPE_UN_NAT,
+struct conn_key_node {
+enum key_dir dir;
+struct conn_key key;
+struct cmap_node cm_node;
 };
 
 struct conn {
 /* Immutable data. */
-struct conn_key key;
-struct conn_key rev_key;
+struct conn_key_node key_node[CT_DIRS];
 struct conn_key parent_key; /* Only used for orig_tuple support. */
-struct cmap_node cm_node;
 uint16_t nat_action;
 char *alg;
-struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */
 atomic_flag reclaimed; /* False during the lifetime of the connection,
 * True as soon as a thread has started freeing
 * its memory. */
@@ -150,7 +154,6 @@ struct conn {
 
 /* Immutable data. */
 bool alg_related; /* True if alg data connection. */
-enum ct_conn_type conn_type;
 
 uint32_t tp_id; /* Timeout policy ID. */
 };
diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
index 89cb2704a..2149fdc73 100644
--- a/lib/conntrack-tp.c
+++ b/lib/conntrack-tp.c
@@ -253,7 +253,8 @@ conn_update_expiration(struct conntrack *ct, struct conn 
*conn,
 }
 VLOG_DBG_RL(, "Update timeout %s zone=%u with policy id=%d "
 "val=%u sec.",
-ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
+ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone,
+conn->tp_id, val);
 
 atomic_store_relaxed(>expiration, now + val * 1000);
 }
@@ -273,7 +274,8 @@ conn_init_expiration(struct conntrack *ct, struct conn 
*conn,
 }
 
 VLOG_DBG_RL(, "Init timeout %s zone=%u with policy id=%d val=%u sec.",
-ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
+ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone,
+conn->tp_id, val);
 
 conn->expiration = now + val * 1000;
 }
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 5f1176d33..47a443fba 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -103,7 +103,7 @@ static enum ct_update_res conn_update(struct conntrack *ct, 
struct conn *conn,
   struct conn_lookup_ctx *ctx,
   long long now);
 static long long int conn_expiration(const struct conn *);
-static bool conn_expired(struct conn *, long long now);
+static bool conn_expired(const struct conn *, long long now);
 static void conn_expire_push_front(struct conntrack *ct, struct conn *conn);
 static void set_mark(struct dp_packet *, struct conn *,
  uint32_t val, uint32_t mask);
@@ -113,8 +113,7 @@ static void set_label(struct dp_packet *, struct conn *,
 static void *clean_thread_main(void *f_);
 
 static bool
-nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
-

Re: [ovs-dev] [PATCH v2] conntrack: Remove nat_conn introducing key directionality.

2023-08-30 Thread Paolo Valerio
Ilya Maximets  writes:

> On 8/23/23 14:53, Paolo Valerio wrote:
>> From: hepeng 
>> 
>> The patch avoids the extra allocation for nat_conn.
>> Currently, when doing NAT, the userspace conntrack will use an extra
>> conn for the two directions in a flow. However, each conn has actually
>> the two keys for both orig and rev directions. This patch introduces a
>> key_node[CT_DIRS] member as per Aaron's suggestion in the conn which
>> consists of a key, direction, and a cmap_node for hash lookup so
>> addressing the feedback received by the original patch [0].
>> 
>> [0] 
>> https://patchwork.ozlabs.org/project/openvswitch/patch/20201129033255.64647-2-hepeng.0...@bytedance.com/
>> 
>> Signed-off-by: Peng He 
>> Co-authored-by: Paolo Valerio 
>> Signed-off-by: Paolo Valerio 
>> ---
>> v2:
>>   - use enum value instead of bool (Aaron).
>>   - s/conn_for_expectation/conn_for_exp/ in process_ftp_ctl_v6()
>> to avoid long line.
>>   - removed CT_CONN_TYPE_* reference in two comments.
>> ---
>>  lib/conntrack-private.h |   19 +--
>>  lib/conntrack-tp.c  |6 +
>>  lib/conntrack.c |  350 
>> +++
>>  3 files changed, 155 insertions(+), 220 deletions(-)
>> 
>> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
>> index bb326868e..3fd5fccd3 100644
>> --- a/lib/conntrack-private.h
>> +++ b/lib/conntrack-private.h
>> @@ -49,6 +49,12 @@ struct ct_endpoint {
>>   * hashing in ct_endpoint_hash_add(). */
>>  BUILD_ASSERT_DECL(sizeof(struct ct_endpoint) == sizeof(union ct_addr) + 4);
>>  
>> +enum key_dir {
>> +CT_DIR_FWD = 0,
>> +CT_DIR_REV,
>> +CT_DIRS,
>> +};
>> +
>>  /* Changes to this structure need to be reflected in conn_key_hash()
>>   * and conn_key_cmp(). */
>>  struct conn_key {
>> @@ -112,20 +118,18 @@ enum ct_timeout {
>>  
>>  #define N_EXP_LISTS 100
>>  
>> -enum OVS_PACKED_ENUM ct_conn_type {
>> -CT_CONN_TYPE_DEFAULT,
>> -CT_CONN_TYPE_UN_NAT,
>> +struct conn_key_node {
>> +enum key_dir dir;
>> +struct conn_key key;
>> +struct cmap_node cm_node;
>>  };
>
> This structure and the whole business of adding the connection
> to cmap twice with different hashes is bothering me, but I really
> don't have a better solution for this, so let it be...  :)
>

this happens for the nat case. So far two connections were added to
represent one connection, with one being of type CT_CONN_TYPE_UN_NAT
(with the assumption of working with CT_CONN_TYPE_DEFAULT for most
operations).

> Just to refresh my memory, we do that because original and reply
> tuples can be completely different due to NAT, so the hashing
> being symmetric doesn't help in this case, right?
>

Yes, nat plays a role here, and it is the only case where we have two
references to the same conn in the cmap.

If we consider the reason this has been revived (the bug it solves),
mostly the problem happens when packets go through nat, but without
actually changing the packet (all-zero with no clash). In such a case two
connections get [allocated and copied] added to the cmap and if a lookup
ends up retrieving a conn of type CT_CONN_TYPE_UN_NAT with its
respective DEFAULT expired, the assertion kicks in once the second
lookup happens when attempting to get the default conn.
In general, NAT (with or without packet mangling), expired conn and hash
collision should be theoretically enough to hit the issue.

>>  
>>  struct conn {
>>  /* Immutable data. */
>> -struct conn_key key;
>> -struct conn_key rev_key;
>> +struct conn_key_node key_node[CT_DIRS];
>>  struct conn_key parent_key; /* Only used for orig_tuple support. */
>> -struct cmap_node cm_node;
>>  uint16_t nat_action;
>>  char *alg;
>> -struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */
>>  atomic_flag reclaimed; /* False during the lifetime of the connection,
>>  * True as soon as a thread has started freeing
>>  * its memory. */
>> @@ -150,7 +154,6 @@ struct conn {
>>  
>>  /* Immutable data. */
>>  bool alg_related; /* True if alg data connection. */
>> -enum ct_conn_type conn_type;
>>  
>>  uint32_t tp_id; /* Timeout policy ID. */
>>  };
>> diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
>> index 89cb2704a..2149fdc73 100644
>> --- a/lib/conntrack-tp.c
>> +++ b/lib/conntrack-tp.c
>> @@ -253,7 +253,8 @@ conn_update_expiration(struct conntrack *ct,

[ovs-dev] [PATCH v2] conntrack: Remove nat_conn introducing key directionality.

2023-08-23 Thread Paolo Valerio
From: hepeng 

The patch avoids the extra allocation for nat_conn.
Currently, when doing NAT, the userspace conntrack will use an extra
conn for the two directions in a flow. However, each conn has actually
the two keys for both orig and rev directions. This patch introduces a
key_node[CT_DIRS] member as per Aaron's suggestion in the conn which
consists of a key, direction, and a cmap_node for hash lookup so
addressing the feedback received by the original patch [0].

[0] 
https://patchwork.ozlabs.org/project/openvswitch/patch/20201129033255.64647-2-hepeng.0...@bytedance.com/

Signed-off-by: Peng He 
Co-authored-by: Paolo Valerio 
Signed-off-by: Paolo Valerio 
---
v2:
  - use enum value instead of bool (Aaron).
  - s/conn_for_expectation/conn_for_exp/ in process_ftp_ctl_v6()
to avoid long line.
  - removed CT_CONN_TYPE_* reference in two comments.
---
 lib/conntrack-private.h |   19 +--
 lib/conntrack-tp.c  |6 +
 lib/conntrack.c |  350 +++
 3 files changed, 155 insertions(+), 220 deletions(-)

diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
index bb326868e..3fd5fccd3 100644
--- a/lib/conntrack-private.h
+++ b/lib/conntrack-private.h
@@ -49,6 +49,12 @@ struct ct_endpoint {
  * hashing in ct_endpoint_hash_add(). */
 BUILD_ASSERT_DECL(sizeof(struct ct_endpoint) == sizeof(union ct_addr) + 4);
 
+enum key_dir {
+CT_DIR_FWD = 0,
+CT_DIR_REV,
+CT_DIRS,
+};
+
 /* Changes to this structure need to be reflected in conn_key_hash()
  * and conn_key_cmp(). */
 struct conn_key {
@@ -112,20 +118,18 @@ enum ct_timeout {
 
 #define N_EXP_LISTS 100
 
-enum OVS_PACKED_ENUM ct_conn_type {
-CT_CONN_TYPE_DEFAULT,
-CT_CONN_TYPE_UN_NAT,
+struct conn_key_node {
+enum key_dir dir;
+struct conn_key key;
+struct cmap_node cm_node;
 };
 
 struct conn {
 /* Immutable data. */
-struct conn_key key;
-struct conn_key rev_key;
+struct conn_key_node key_node[CT_DIRS];
 struct conn_key parent_key; /* Only used for orig_tuple support. */
-struct cmap_node cm_node;
 uint16_t nat_action;
 char *alg;
-struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */
 atomic_flag reclaimed; /* False during the lifetime of the connection,
 * True as soon as a thread has started freeing
 * its memory. */
@@ -150,7 +154,6 @@ struct conn {
 
 /* Immutable data. */
 bool alg_related; /* True if alg data connection. */
-enum ct_conn_type conn_type;
 
 uint32_t tp_id; /* Timeout policy ID. */
 };
diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
index 89cb2704a..2149fdc73 100644
--- a/lib/conntrack-tp.c
+++ b/lib/conntrack-tp.c
@@ -253,7 +253,8 @@ conn_update_expiration(struct conntrack *ct, struct conn 
*conn,
 }
 VLOG_DBG_RL(, "Update timeout %s zone=%u with policy id=%d "
 "val=%u sec.",
-ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
+ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone,
+conn->tp_id, val);
 
 atomic_store_relaxed(>expiration, now + val * 1000);
 }
@@ -273,7 +274,8 @@ conn_init_expiration(struct conntrack *ct, struct conn 
*conn,
 }
 
 VLOG_DBG_RL(, "Init timeout %s zone=%u with policy id=%d val=%u sec.",
-ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
+ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone,
+conn->tp_id, val);
 
 conn->expiration = now + val * 1000;
 }
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 5f1176d33..f75f9a8f1 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -113,8 +113,7 @@ static void set_label(struct dp_packet *, struct conn *,
 static void *clean_thread_main(void *f_);
 
 static bool
-nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
- struct conn *nat_conn,
+nat_get_unique_tuple(struct conntrack *ct, struct conn *conn,
  const struct nat_action_info_t *nat_info);
 
 static uint8_t
@@ -208,7 +207,7 @@ static alg_helper alg_helpers[] = {
 #define ALG_WC_SRC_PORT 0
 
 /* If the total number of connections goes above this value, no new connections
- * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */
+ * are accepted. */
 #define DEFAULT_N_CONN_LIMIT 300
 
 /* Does a member by member comparison of two conn_keys; this
@@ -234,61 +233,6 @@ conn_key_cmp(const struct conn_key *key1, const struct 
conn_key *key2)
 return 1;
 }
 
-static void
-ct_print_conn_info(const struct conn *c, const char *log_msg,
-   enum vlog_level vll, bool force, bool rl_on)
-{
-#define CT_VLOG(RL_ON, LEVEL, ...)  \
-do {\
-if (RL_ON) {

[ovs-dev] [PATCH RFC] conntrack: Remove nat_conn introducing key directionality.

2023-08-14 Thread Paolo Valerio
From: hepeng 

The patch avoids the extra allocation for nat_conn.
Currently, when doing NAT, the userspace conntrack will use an extra
conn for the two directions in a flow. However, each conn has actually
the two keys for both orig and rev directions. This patch introduces a
key_node[CT_DIRS] member in the conn which consists of a key, direction,
and a cmap_node for hash lookup so addressing the feedback received by
the original patch [0].

The patch is an alternative approach to [1].
The patch has the advantage of solving the issue in a clean way, but,
unlike [1], it has the disadvantage of requiring some changes to the
connection clean up for older branches (down to 2.17) and all the
related operations. To give an idea, [0] contains most of the changes
required.

[0] 
https://patchwork.ozlabs.org/project/openvswitch/patch/20201129033255.64647-2-hepeng.0...@bytedance.com/
[1] https://patchwork.ozlabs.org/project/openvswitch/list/?series=351579&state=*

Signed-off-by: Peng He 
Co-authored-by: Paolo Valerio 
Signed-off-by: Paolo Valerio 
---
 lib/conntrack-private.h |   19 ++-
 lib/conntrack-tp.c  |6 +
 lib/conntrack.c |  339 +++
 3 files changed, 149 insertions(+), 215 deletions(-)

diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
index bb326868e..3fd5fccd3 100644
--- a/lib/conntrack-private.h
+++ b/lib/conntrack-private.h
@@ -49,6 +49,12 @@ struct ct_endpoint {
  * hashing in ct_endpoint_hash_add(). */
 BUILD_ASSERT_DECL(sizeof(struct ct_endpoint) == sizeof(union ct_addr) + 4);
 
+enum key_dir {
+CT_DIR_FWD = 0,
+CT_DIR_REV,
+CT_DIRS,
+};
+
 /* Changes to this structure need to be reflected in conn_key_hash()
  * and conn_key_cmp(). */
 struct conn_key {
@@ -112,20 +118,18 @@ enum ct_timeout {
 
 #define N_EXP_LISTS 100
 
-enum OVS_PACKED_ENUM ct_conn_type {
-CT_CONN_TYPE_DEFAULT,
-CT_CONN_TYPE_UN_NAT,
+struct conn_key_node {
+enum key_dir dir;
+struct conn_key key;
+struct cmap_node cm_node;
 };
 
 struct conn {
 /* Immutable data. */
-struct conn_key key;
-struct conn_key rev_key;
+struct conn_key_node key_node[CT_DIRS];
 struct conn_key parent_key; /* Only used for orig_tuple support. */
-struct cmap_node cm_node;
 uint16_t nat_action;
 char *alg;
-struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */
 atomic_flag reclaimed; /* False during the lifetime of the connection,
 * True as soon as a thread has started freeing
 * its memory. */
@@ -150,7 +154,6 @@ struct conn {
 
 /* Immutable data. */
 bool alg_related; /* True if alg data connection. */
-enum ct_conn_type conn_type;
 
 uint32_t tp_id; /* Timeout policy ID. */
 };
diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
index 89cb2704a..2149fdc73 100644
--- a/lib/conntrack-tp.c
+++ b/lib/conntrack-tp.c
@@ -253,7 +253,8 @@ conn_update_expiration(struct conntrack *ct, struct conn 
*conn,
 }
 VLOG_DBG_RL(, "Update timeout %s zone=%u with policy id=%d "
 "val=%u sec.",
-ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
+ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone,
+conn->tp_id, val);
 
 atomic_store_relaxed(>expiration, now + val * 1000);
 }
@@ -273,7 +274,8 @@ conn_init_expiration(struct conntrack *ct, struct conn 
*conn,
 }
 
 VLOG_DBG_RL(, "Init timeout %s zone=%u with policy id=%d val=%u sec.",
-ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
+ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone,
+conn->tp_id, val);
 
 conn->expiration = now + val * 1000;
 }
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 5f1176d33..6f219eb9e 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -113,8 +113,7 @@ static void set_label(struct dp_packet *, struct conn *,
 static void *clean_thread_main(void *f_);
 
 static bool
-nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn,
- struct conn *nat_conn,
+nat_get_unique_tuple(struct conntrack *ct, struct conn *conn,
  const struct nat_action_info_t *nat_info);
 
 static uint8_t
@@ -234,61 +233,6 @@ conn_key_cmp(const struct conn_key *key1, const struct 
conn_key *key2)
 return 1;
 }
 
-static void
-ct_print_conn_info(const struct conn *c, const char *log_msg,
-   enum vlog_level vll, bool force, bool rl_on)
-{
-#define CT_VLOG(RL_ON, LEVEL, ...)  \
-do {\
-if (RL_ON) {\
-static struct vlog_rate_limit rl_ = VLOG_RATE_LIMIT_INIT(5, 5); \
-vlog_rate_lim

Re: [ovs-discuss] Question regarding the behavior of Conntrack netlink event

2023-07-16 Thread Paolo Valerio via discuss
YunTang Hsu via discuss  writes:

> Hi,
>
> I have a kind cluster with Antrea installed. Since I want to use the conntrack
> event listener to track the creation/termination of connections, I installed
> conntrack CLI in one of Antrea-agent pods.
> When I used command “conntrack -E” to listen to events for iperf3 traffic, I
> found that not all of the UpdateEvent for tcp state changes can be monitored.
> However, I can see the tcp state is changed to “Time_wait” in the conntrack
> table (using “conntrack -L”). I can see all the UpdateEvents for a connection
> if I use “wget”. Not sure if it is the expected behavior?
> The only difference I can notice is that the connection of the iperf3 flow has
> a zone=65520, which is the zone used by Antrea.
>

Hello,

I guess, assuming your datapath implements OVS_CT_ATTR_EVENTMASK, those
events are filtered out.
This is from ovs-vswitchd.conf.db(5), see ct_eventmask:

"True if the datapath’s OVS_ACTION_ATTR_CT action implements the
OVS_CT_ATTR_EVENTMASK attribute. When this is true, Open vSwitch uses
the event mask feature to limit the kinds of events reported to
conntrack update listeners. When Open vSwitch doesn’t limit the event
mask, listeners receive reports of numerous usually unimportant events,
such as TCP state machine changes, which can waste CPU time."

to make them pass, OVS_CT_EVENTMASK_DEFAULT (used for
OVS_CT_ATTR_EVENTMASK) should include OVS_CT_EVENT_PROTOINFO, which is
not the case.

Paolo

> I also created an issue for this question in ovs-issues (https://github.com/
> openvswitch/ovs-issues/issues/282). Any comment or suggestion is appreciated.
> Thanks.
>
> Best,
> Yun-Tang
> ___
> discuss mailing list
> disc...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-discuss

___
discuss mailing list
disc...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-discuss


[ovs-dev] [PATCH v4] conntrack: Extract l4 information for SCTP.

2023-07-12 Thread Paolo Valerio
Since a27d70a89 ("conntrack: add generic IP protocol support") all
the unrecognized IP protocols get handled using ct_proto_other ops
and are managed as L3 using 3 tuples.

This patch stores L4 information for SCTP in the conn_key so that
multiple conn instances, instead of one with ports zeroed, will be
created when there are multiple SCTP connections between two hosts.
It also performs crc32c check when not offloaded, and adds SCTP to
pat_enabled.

With this patch, given two SCTP associations between two hosts,
tracking the connection will result in:

sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=55884,dport=5201),reply=(src=10.1.1.1,dst=10.1.1.2,sport=5201,dport=12345),zone=1
sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=59874,dport=5202),reply=(src=10.1.1.1,dst=10.1.1.2,sport=5202,dport=12346),zone=1

instead of:

sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=0,dport=0),reply=(src=10.1.1.1,dst=10.1.1.2,sport=0,dport=0),zone=1

Signed-off-by: Paolo Valerio 
---
v4
- rebased on top of current master
- test: turned graceful termination into ABORT.
  The graceful shutdown sequence could lead to failures because of a
  very small default timeout set for SHUTDOWN_SENT state.
  The proto state transition sequence for the kerneldp is now:
protoinfo=(state=CLOSED,vtag_orig=0,vtag_reply=3431784816)
protoinfo=(state=COOKIE_WAIT,vtag_orig=4204641061,vtag_reply=3431784816)
protoinfo=(state=COOKIE_ECHOED,vtag_orig=4204641061,vtag_reply=3431784816)
protoinfo=(state=ESTABLISHED,vtag_orig=4204641061,vtag_reply=3431784816)
protoinfo=(state=ESTABLISHED,vtag_orig=4204641061,vtag_reply=3431784816)
protoinfo=(state=ESTABLISHED,vtag_orig=4204641061,vtag_reply=3431784816)
protoinfo=(state=CLOSED,vtag_orig=4204641061,vtag_reply=3431784816)


v3:
- rebased on top of current master
- minor adjustments: commit message, comments

v2:
- ordered includes
- while at it, slightly modified the commit subject (capital letter
  and period)
---
 lib/conntrack.c  |   86 ++
 lib/packets.h|   11 +
 tests/system-kmod-macros.at  |   11 +
 tests/system-traffic.at  |   73 
 tests/system-userspace-macros.at |7 +++
 5 files changed, 187 insertions(+), 1 deletion(-)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 4375c03e2..786531e21 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -27,6 +27,7 @@
 #include "conntrack-private.h"
 #include "conntrack-tp.h"
 #include "coverage.h"
+#include "crc32c.h"
 #include "csum.h"
 #include "ct-dpif.h"
 #include "dp-packet.h"
@@ -41,6 +42,7 @@
 #include "random.h"
 #include "rculist.h"
 #include "timeval.h"
+#include "unaligned.h"
 
 VLOG_DEFINE_THIS_MODULE(conntrack);
 
@@ -771,6 +773,8 @@ pat_packet(struct dp_packet *pkt, const struct conn_key 
*key)
 packet_set_tcp_port(pkt, key->dst.port, key->src.port);
 } else if (key->nw_proto == IPPROTO_UDP) {
 packet_set_udp_port(pkt, key->dst.port, key->src.port);
+} else if (key->nw_proto == IPPROTO_SCTP) {
+packet_set_sctp_port(pkt, key->dst.port, key->src.port);
 }
 }
 
@@ -1675,6 +1679,26 @@ checksum_valid(const struct conn_key *key, const void 
*data, size_t size,
 return valid;
 }
 
+static inline bool
+sctp_checksum_valid(const void *data, size_t size)
+{
+struct sctp_header *sctp = (struct sctp_header *) data;
+ovs_be32 rcvd_csum, csum;
+bool ret;
+
+rcvd_csum = get_16aligned_be32(>sctp_csum);
+put_16aligned_be32(>sctp_csum, 0);
+csum = crc32c(data, size);
+put_16aligned_be32(>sctp_csum, rcvd_csum);
+
+ret = (rcvd_csum == csum);
+if (!ret) {
+COVERAGE_INC(conntrack_l4csum_err);
+}
+
+return ret;
+}
+
 static inline bool
 check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
  const void *l3, bool validate_checksum)
@@ -1711,6 +1735,47 @@ check_l4_udp(const struct conn_key *key, const void 
*data, size_t size,
|| (validate_checksum ? checksum_valid(key, data, size, l3) : true);
 }
 
+static inline bool
+sctp_check_len(const struct sctp_header *sh, size_t size)
+{
+const struct sctp_chunk_header *sch;
+size_t next;
+
+if (size < SCTP_HEADER_LEN) {
+return false;
+}
+
+/* rfc4960: Chunks (including Type, Length, and Value fields) are padded
+ * out by the sender with all zero bytes to be a multiple of 4 bytes long.
+ */
+for (next = sizeof(struct sctp_header),
+ sch = SCTP_NEXT_CHUNK(sh, next);
+ next < size;
+ next += ROUND_UP(ntohs(sch->length), 4),
+ sch = SCTP_NEXT_CHUNK(sh, next)) {
+/* rfc4960: This value represents the size of the chunk in bytes,
+ * including the Chunk Type, Chunk Flags, Chunk Length, and Chunk Value
+ 

[ovs-dev] [PATCH] conntrack: Allow to dump userspace conntrack expectations.

2023-06-23 Thread Paolo Valerio
The patch introduces a new command, ovs-appctl dpctl/dump-conntrack-exp,
that allows dumping the existing expectations for the userspace ct.

Signed-off-by: Paolo Valerio 
---
 NEWS |2 +
 lib/conntrack.c  |   66 +
 lib/conntrack.h  |   10 
 lib/ct-dpif.c|   87 ++
 lib/ct-dpif.h|   15 +++
 lib/dpctl.c  |   49 +
 lib/dpctl.man|6 +++
 lib/dpif-netdev.c|   50 ++
 lib/dpif-netlink.c   |3 +
 lib/dpif-provider.h  |   11 +
 tests/system-kmod-macros.at  |9 
 tests/system-traffic.at  |   44 +++
 tests/system-userspace-macros.at |6 +++
 13 files changed, 357 insertions(+), 1 deletion(-)

diff --git a/NEWS b/NEWS
index 66d5a4ea3..16cdb6933 100644
--- a/NEWS
+++ b/NEWS
@@ -24,6 +24,8 @@ Post-v3.1.0
  * New commands "dpctl/{ct-get-sweep-interval,ct-set-sweep-interval}" that
allow to get and set, for the userspace datapath, the sweep interval
for the conntrack garbage collector.
+ * New commands "dpctl/dump-conntrack-exp" that allows to dump
+   conntrack's expectations for the userspace datapath.
- ovs-ctl:
  * Added new options --[ovsdb-server|ovs-vswitchd]-umask=MODE to set umask
value when starting OVS daemons.  E.g., use --ovsdb-server-umask=0002
diff --git a/lib/conntrack.c b/lib/conntrack.c
index f5ebfa05b..4375c03e2 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -2670,6 +2670,72 @@ conntrack_dump_done(struct conntrack_dump *dump 
OVS_UNUSED)
 return 0;
 }
 
+static void
+exp_node_to_ct_dpif_exp(const struct alg_exp_node *exp,
+struct ct_dpif_exp *entry)
+{
+memset(entry, 0, sizeof *entry);
+
+conn_key_to_tuple(&exp->key, &entry->tuple_orig);
+conn_key_to_tuple(&exp->parent_key, &entry->tuple_parent);
+entry->zone = exp->key.zone;
+entry->mark = exp->parent_mark;
+memcpy(&entry->labels, &exp->parent_label, sizeof entry->labels);
+entry->protoinfo.proto = exp->key.nw_proto;
+}
+
+int
+conntrack_exp_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
+ const uint16_t *pzone)
+{
+memset(dump, 0, sizeof(*dump));
+
+if (pzone) {
+dump->zone = *pzone;
+dump->filter_zone = true;
+}
+
+dump->ct = ct;
+
+return 0;
+}
+
+int
+conntrack_exp_dump_next(struct conntrack_dump *dump, struct ct_dpif_exp *entry)
+{
+struct conntrack *ct = dump->ct;
+struct alg_exp_node *enode;
+int ret = EOF;
+
+ovs_rwlock_rdlock(&ct->resources_lock);
+
+for (;;) {
+struct hmap_node *node = hmap_at_position(&ct->alg_expectations,
+  &dump->hmap_pos);
+if (!node) {
+break;
+}
+
+enode = CONTAINER_OF(node, struct alg_exp_node, node);
+
+if (!dump->filter_zone || enode->key.zone == dump->zone) {
+ret = 0;
+exp_node_to_ct_dpif_exp(enode, entry);
+break;
+}
+}
+
+ovs_rwlock_unlock(&ct->resources_lock);
+
+return ret;
+}
+
+int
+conntrack_exp_dump_done(struct conntrack_dump *dump OVS_UNUSED)
+{
+return 0;
+}
+
 int
 conntrack_flush(struct conntrack *ct, const uint16_t *zone)
 {
diff --git a/lib/conntrack.h b/lib/conntrack.h
index 524ec0acb..57d5159b6 100644
--- a/lib/conntrack.h
+++ b/lib/conntrack.h
@@ -100,7 +100,10 @@ void conntrack_clear(struct dp_packet *packet);
 struct conntrack_dump {
 struct conntrack *ct;
 unsigned bucket;
-struct cmap_position cm_pos;
+union {
+struct cmap_position cm_pos;
+struct hmap_position hmap_pos;
+};
 bool filter_zone;
 uint16_t zone;
 };
@@ -132,6 +135,11 @@ int conntrack_dump_start(struct conntrack *, struct 
conntrack_dump *,
 int conntrack_dump_next(struct conntrack_dump *, struct ct_dpif_entry *);
 int conntrack_dump_done(struct conntrack_dump *);
 
+int conntrack_exp_dump_start(struct conntrack *, struct conntrack_dump *,
+ const uint16_t *);
+int conntrack_exp_dump_next(struct conntrack_dump *, struct ct_dpif_exp *);
+int conntrack_exp_dump_done(struct conntrack_dump *);
+
 int conntrack_flush(struct conntrack *, const uint16_t *zone);
 int conntrack_flush_tuple(struct conntrack *, const struct ct_dpif_tuple *,
   uint16_t zone);
diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
index 0c4b2964f..f59c6e560 100644
--- a/lib/ct-dpif.c
+++ b/lib/ct-dpif.c
@@ -101,6 +101,65 @@ ct_dpif_dump_done(struct ct_dpif_dump_state *dump)
 ? dpif->dpif_class->ct_dump_done(dpif, dump)
 : EOPNOTSUPP);
 }
+
+/* Start dumping the expectations from the connection tracker.
+ *
+ * 

Re: [ovs-dev] [PATCH v3] conntrack: Extract l4 information for SCTP.

2023-06-16 Thread Paolo Valerio
Ilya Maximets  writes:

> On 6/16/23 14:56, Aaron Conole wrote:
>> Ilya Maximets  writes:
>> 
>>> On 6/15/23 19:49, Paolo Valerio wrote:
>>>> Ilya Maximets  writes:
>>>>
>>>>> On 6/14/23 21:08, Ilya Maximets wrote:
>>>>>> On 6/14/23 20:11, Paolo Valerio wrote:
>>>>>>> Ilya Maximets  writes:
>>>>>>>
>>>>>>>> On 6/12/23 16:57, Aaron Conole wrote:
>>>>>>>>> Paolo Valerio  writes:
>>>>>>>>>
>>>>>>>>>> since a27d70a89 ("conntrack: add generic IP protocol support") all
>>>>>>>>>> the unrecognized IP protocols get handled using ct_proto_other ops
>>>>>>>>>> and are managed as L3 using 3 tuples.
>>>>>>>>>>
>>>>>>>>>> This patch stores L4 information for SCTP in the conn_key so that
>>>>>>>>>> multiple conn instances, instead of one with ports zeroed, will be
>>>>>>>>>> created when there are multiple SCTP connections between two hosts.
>>>>>>>>>> It also performs crc32c check when not offloaded, and adds SCTP to
>>>>>>>>>> pat_enabled.
>>>>>>>>>>
>>>>>>>>>> With this patch, given two SCTP association between two hosts,
>>>>>>>>>> tracking the connection will result in:
>>>>>>>>>>
>>>>>>>>>> sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=55884,dport=5201),reply=(src=10.1.1.1,dst=10.1.1.2,sport=5201,dport=12345),zone=1
>>>>>>>>>> sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=59874,dport=5202),reply=(src=10.1.1.1,dst=10.1.1.2,sport=5202,dport=12346),zone=1
>>>>>>>>>>
>>>>>>>>>> instead of:
>>>>>>>>>>
>>>>>>>>>> sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=0,dport=0),reply=(src=10.1.1.1,dst=10.1.1.2,sport=0,dport=0),zone=1
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Paolo Valerio 
>>>>>>>>>> ---
>>>>>>>>>
>>>>>>>>> Thanks for this work - I think it looks good.
>>>>>>>>>
>>>>>>>>> Perhaps it should have a NEWS item mentioned that the userspace
>>>>>>>>> conntrack now supports matching SCTP l4 data.
>>>>>>>>>
>>>>>>>>> If you do spin a v4 with that change, you can keep my:
>>>>>>>>>
>>>>>>>>> Acked-by: Aaron Conole 
>>>>>>>>
>>>>>>>> Hi, Paolo and Aaron.
>>>>>>>>
>>>>>>>> I'm getting a consistent test failure while running check-kernel
>>>>>>>> on Ubuntu 22.10 with 5.19 kernel:
>>>>>>>>
>>>>>>>>
>>>>>>>> ./system-traffic.at:4754: cat ofctl_monitor.log
>>>>>>>> --- -   2023-06-14 11:26:41.958591125 +
>>>>>>>> +++
>>>>>>>> /root/ovs/tests/system-kmod-testsuite.dir/at-groups/105/stdout
>>>>>>>> 2023-06-14 11:26:41.95200 +
>>>>>>>> @@ -12,8 +12,6 @@
>>>>>>>>  
>>>>>>>> sctp,vlan_tci=0x,dl_src=e6:66:c1:22:22:22,dl_dst=e6:66:c1:11:11:11,nw_src=10.1.1.2,nw_dst=10.1.1.1,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=12345,tp_dst=54969
>>>>>>>> sctp_csum:9b67e853
>>>>>>>>  NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=54 in_port=1
>>>>>>>> (via action) data_len=54 (unbuffered)
>>>>>>>>  
>>>>>>>> sctp,vlan_tci=0x,dl_src=e6:66:c1:11:11:11,dl_dst=e6:66:c1:22:22:22,nw_src=10.1.1.240,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=34567,tp_dst=12345
>>>>>>>> sctp_csum:bc0e5463
>>>>>>>> -NXT_PACKET_IN2 (xid=0x0): table_id=1 cookie=0x0 total_len=50
>>>>>>>> ct_state=est|rpl|trk|dnat,ct_zone=1,ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=132,ct_tp_src=54969,ct_tp_dst=12345,ip,in_port=2
>>>>>>>> (via action) data_len=50 (unbuffered)
>>>>>>>> -sctp,vlan_tci=0x,dl_src=e6:66:c1:22:22:22,dl_dst=e6:66:c1:11:11:11,nw_src=10.1.1.2,nw

Re: [ovs-dev] [PATCH v3] conntrack: Extract l4 information for SCTP.

2023-06-15 Thread Paolo Valerio
Ilya Maximets  writes:

> On 6/14/23 21:08, Ilya Maximets wrote:
>> On 6/14/23 20:11, Paolo Valerio wrote:
>>> Ilya Maximets  writes:
>>>
>>>> On 6/12/23 16:57, Aaron Conole wrote:
>>>>> Paolo Valerio  writes:
>>>>>
>>>>>> since a27d70a89 ("conntrack: add generic IP protocol support") all
>>>>>> the unrecognized IP protocols get handled using ct_proto_other ops
>>>>>> and are managed as L3 using 3 tuples.
>>>>>>
>>>>>> This patch stores L4 information for SCTP in the conn_key so that
>>>>>> multiple conn instances, instead of one with ports zeroed, will be
>>>>>> created when there are multiple SCTP connections between two hosts.
>>>>>> It also performs crc32c check when not offloaded, and adds SCTP to
>>>>>> pat_enabled.
>>>>>>
>>>>>> With this patch, given two SCTP association between two hosts,
>>>>>> tracking the connection will result in:
>>>>>>
>>>>>> sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=55884,dport=5201),reply=(src=10.1.1.1,dst=10.1.1.2,sport=5201,dport=12345),zone=1
>>>>>> sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=59874,dport=5202),reply=(src=10.1.1.1,dst=10.1.1.2,sport=5202,dport=12346),zone=1
>>>>>>
>>>>>> instead of:
>>>>>>
>>>>>> sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=0,dport=0),reply=(src=10.1.1.1,dst=10.1.1.2,sport=0,dport=0),zone=1
>>>>>>
>>>>>> Signed-off-by: Paolo Valerio 
>>>>>> ---
>>>>>
>>>>> Thanks for this work - I think it looks good.
>>>>>
>>>>> Perhaps it should have a NEWS item mentioned that the userspace
>>>>> conntrack now supports matching SCTP l4 data.
>>>>>
>>>>> If you do spin a v4 with that change, you can keep my:
>>>>>
>>>>> Acked-by: Aaron Conole 
>>>>
>>>> Hi, Paolo and Aaron.
>>>>
>>>> I'm getting a consistent test failure while running check-kernel
>>>> on Ubuntu 22.10 with 5.19 kernel:
>>>>
>>>>
>>>> ./system-traffic.at:4754: cat ofctl_monitor.log
>>>> --- -   2023-06-14 11:26:41.958591125 +
>>>> +++ /root/ovs/tests/system-kmod-testsuite.dir/at-groups/105/stdout  
>>>> 2023-06-14 11:26:41.95200 +
>>>> @@ -12,8 +12,6 @@
>>>>  
>>>> sctp,vlan_tci=0x,dl_src=e6:66:c1:22:22:22,dl_dst=e6:66:c1:11:11:11,nw_src=10.1.1.2,nw_dst=10.1.1.1,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=12345,tp_dst=54969
>>>>  sctp_csum:9b67e853
>>>>  NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=54 in_port=1 (via action) 
>>>> data_len=54 (unbuffered)
>>>>  
>>>> sctp,vlan_tci=0x,dl_src=e6:66:c1:11:11:11,dl_dst=e6:66:c1:22:22:22,nw_src=10.1.1.240,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=34567,tp_dst=12345
>>>>  sctp_csum:bc0e5463
>>>> -NXT_PACKET_IN2 (xid=0x0): table_id=1 cookie=0x0 total_len=50 
>>>> ct_state=est|rpl|trk|dnat,ct_zone=1,ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=132,ct_tp_src=54969,ct_tp_dst=12345,ip,in_port=2
>>>>  (via action) data_len=50 (unbuffered)
>>>> -sctp,vlan_tci=0x,dl_src=e6:66:c1:22:22:22,dl_dst=e6:66:c1:11:11:11,nw_src=10.1.1.2,nw_dst=10.1.1.1,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=12345,tp_dst=54969
>>>>  sctp_csum:d6ce6b9e
>>>>  NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=50 in_port=1 (via action) 
>>>> data_len=50 (unbuffered)
>>>> -sctp,vlan_tci=0x,dl_src=e6:66:c1:11:11:11,dl_dst=e6:66:c1:22:22:22,nw_src=10.1.1.240,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=34567,tp_dst=12345
>>>>  sctp_csum:add7db93
>>>> +sctp,vlan_tci=0x,dl_src=e6:66:c1:11:11:11,dl_dst=e6:66:c1:22:22:22,nw_src=10.1.1.1,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=54969,tp_dst=12345
>>>>  sctp_csum:5db68ce
>>>>
>>>>
>>>> Do you know what can be a problem here?
>>>>
>>>> Test is passing on Fedora 38 with 6.3 kernel and on rhel 9.2.
>>>>
>>>
>>> Hi Ilya,
>>>
>>> Uhm, it seems there's a problem with the shutdown sequence.
>>> I just ran the on a VM:
>>>
>>> vagrant@ubuntu2210:~/ovs$ grep CONFIG_NF_CT_PROTO_SCTP 
>>> /boot/config-5.19.0-38-generic 
>>> CONFIG_N

Re: [ovs-dev] [PATCH v3] conntrack: Extract l4 information for SCTP.

2023-06-14 Thread Paolo Valerio
Ilya Maximets  writes:

> On 6/12/23 16:57, Aaron Conole wrote:
>> Paolo Valerio  writes:
>> 
>>> since a27d70a89 ("conntrack: add generic IP protocol support") all
>>> the unrecognized IP protocols get handled using ct_proto_other ops
>>> and are managed as L3 using 3 tuples.
>>>
>>> This patch stores L4 information for SCTP in the conn_key so that
>>> multiple conn instances, instead of one with ports zeroed, will be
>>> created when there are multiple SCTP connections between two hosts.
>>> It also performs crc32c check when not offloaded, and adds SCTP to
>>> pat_enabled.
>>>
>>> With this patch, given two SCTP association between two hosts,
>>> tracking the connection will result in:
>>>
>>> sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=55884,dport=5201),reply=(src=10.1.1.1,dst=10.1.1.2,sport=5201,dport=12345),zone=1
>>> sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=59874,dport=5202),reply=(src=10.1.1.1,dst=10.1.1.2,sport=5202,dport=12346),zone=1
>>>
>>> instead of:
>>>
>>> sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=0,dport=0),reply=(src=10.1.1.1,dst=10.1.1.2,sport=0,dport=0),zone=1
>>>
>>> Signed-off-by: Paolo Valerio 
>>> ---
>> 
>> Thanks for this work - I think it looks good.
>> 
>> Perhaps it should have a NEWS item mentioned that the userspace
>> conntrack now supports matching SCTP l4 data.
>> 
>> If you do spin a v4 with that change, you can keep my:
>> 
>> Acked-by: Aaron Conole 
>
> Hi, Paolo and Aaron.
>
> I'm getting a consistent test failure while running check-kernel
> on Ubuntu 22.10 with 5.19 kernel:
>
>
> ./system-traffic.at:4754: cat ofctl_monitor.log
> --- -   2023-06-14 11:26:41.958591125 +
> +++ /root/ovs/tests/system-kmod-testsuite.dir/at-groups/105/stdout  
> 2023-06-14 11:26:41.95200 +
> @@ -12,8 +12,6 @@
>  
> sctp,vlan_tci=0x,dl_src=e6:66:c1:22:22:22,dl_dst=e6:66:c1:11:11:11,nw_src=10.1.1.2,nw_dst=10.1.1.1,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=12345,tp_dst=54969
>  sctp_csum:9b67e853
>  NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=54 in_port=1 (via action) 
> data_len=54 (unbuffered)
>  
> sctp,vlan_tci=0x,dl_src=e6:66:c1:11:11:11,dl_dst=e6:66:c1:22:22:22,nw_src=10.1.1.240,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=34567,tp_dst=12345
>  sctp_csum:bc0e5463
> -NXT_PACKET_IN2 (xid=0x0): table_id=1 cookie=0x0 total_len=50 
> ct_state=est|rpl|trk|dnat,ct_zone=1,ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=132,ct_tp_src=54969,ct_tp_dst=12345,ip,in_port=2
>  (via action) data_len=50 (unbuffered)
> -sctp,vlan_tci=0x,dl_src=e6:66:c1:22:22:22,dl_dst=e6:66:c1:11:11:11,nw_src=10.1.1.2,nw_dst=10.1.1.1,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=12345,tp_dst=54969
>  sctp_csum:d6ce6b9e
>  NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=50 in_port=1 (via action) 
> data_len=50 (unbuffered)
> -sctp,vlan_tci=0x,dl_src=e6:66:c1:11:11:11,dl_dst=e6:66:c1:22:22:22,nw_src=10.1.1.240,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=34567,tp_dst=12345
>  sctp_csum:add7db93
> +sctp,vlan_tci=0x,dl_src=e6:66:c1:11:11:11,dl_dst=e6:66:c1:22:22:22,nw_src=10.1.1.1,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=54969,tp_dst=12345
>  sctp_csum:5db68ce
>
>
> Do you know what can be a problem here?
>
> Test is passing on Fedora 38 with 6.3 kernel and on rhel 9.2.
>

Hi Ilya,

Uhm, it seems there's a problem with the shutdown sequence.
I just ran the test on a VM:

vagrant@ubuntu2210:~/ovs$ grep CONFIG_NF_CT_PROTO_SCTP 
/boot/config-5.19.0-38-generic 
CONFIG_NF_CT_PROTO_SCTP=y

vagrant@ubuntu2210:~/ovs$ grep VERSION /etc/os-release 
VERSION_ID="22.10"
VERSION="22.10 (Kinetic Kudu)"
VERSION_CODENAME=kinetic

vagrant@ubuntu2210:~/ovs$ uname -r
5.19.0-38-generic

but I can't see the failure.
Any chance to see if they are marked for some reason as invalid?

> Best regards, Ilya Maximets.

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH 2/2] conntrack: Release nat_conn in case both keys have the same hash.

2023-06-08 Thread Paolo Valerio
Brian Haley  writes:

> Hi Paolo,
>
> On 4/19/23 2:40 PM, Paolo Valerio wrote:
>> During the creation of a new connection, there's a chance both key and
>> rev_key end up having the same hash. This is more common in the case
>> of all-zero snat with no collisions. In that case, once the
>> connection is expired, but not cleaned up, if a new packet with the
>> same 5-tuple is received, an assertion failure gets triggered in
>> conn_update_state() because of a previous failure of retrieving a
>> CT_CONN_TYPE_DEFAULT connection.
>> 
>> Fix it by releasing the nat_conn during the connection creation in the
>> case of same hash for both key and rev_key.
>
> Sorry for reviving a two month-old thread, but we recently started 
> seeing this issue which seemed to also be related to [0], but I can't 
> find it in patchworks or the tree. Was there a plan to update it?
>

Hi Brian,

It transitioned to "Changes Requested" [0].

At the moment the idea is to upstream a patch initially proposed by
Peng. I'm pretty busy at the moment, and I can't look at it right away,
but yes, the plan is to update it.

[0] https://patchwork.ozlabs.org/project/openvswitch/list/?series=351579&state=*

> Thanks,
>
> -Brian
>
> [0] https://www.mail-archive.com/ovs-discuss@openvswitch.org/msg08945.html
>
>> 
>> Reported-by: Michael Plato 
>> Fixes: 61e48c2d1db2 ("conntrack: Handle SNAT with all-zero IP address.")
>> Signed-off-by: Paolo Valerio 
>> ---
>> In this thread [0] there are some more details. A similar
>> approach here could be to avoid to add the nat_conn to the cmap and
>> letting the sweeper release the memory for nat_conn once the whole
>> connection gets freed.
>> That approach could still be ok, but the drawback is that it could
>> require a different patch for older branches that don't include
>> 3d9c1b855a5f ("conntrack: Replace timeout based expiration lists with
>> rculists."). It's still worth considering.
>> 
>> [0] https://mail.openvswitch.org/pipermail/ovs-discuss/2023-April/052339.html
>> ---
>>   lib/conntrack.c |   21 +
>>   1 file changed, 13 insertions(+), 8 deletions(-)
>> 
>> diff --git a/lib/conntrack.c b/lib/conntrack.c
>> index 7e1fc4b1f..d2ee127d9 100644
>> --- a/lib/conntrack.c
>> +++ b/lib/conntrack.c
>> @@ -1007,14 +1007,19 @@ conn_not_found(struct conntrack *ct, struct 
>> dp_packet *pkt,
>>   }
>>   
>>   nat_packet(pkt, nc, false, ctx->icmp_related);
>> -memcpy(&nat_conn->key, &nc->rev_key, sizeof nat_conn->key);
>> -memcpy(&nat_conn->rev_key, &nc->key, sizeof nat_conn->rev_key);
>> -nat_conn->conn_type = CT_CONN_TYPE_UN_NAT;
>> -nat_conn->nat_action = 0;
>> -nat_conn->alg = NULL;
>> -nat_conn->nat_conn = NULL;
>> -uint32_t nat_hash = conn_key_hash(&nat_conn->key, 
>> ct->hash_basis);
>> -cmap_insert(&ct->conns, &nat_conn->cm_node, nat_hash);
>> +uint32_t nat_hash = conn_key_hash(&nc->rev_key, ct->hash_basis);
>> +if (nat_hash != ctx->hash) {
>> +memcpy(&nat_conn->key, &nc->rev_key, sizeof nat_conn->key);
>> +memcpy(&nat_conn->rev_key, &nc->key, sizeof 
>> nat_conn->rev_key);
>> +nat_conn->conn_type = CT_CONN_TYPE_UN_NAT;
>> +nat_conn->nat_action = 0;
>> +nat_conn->alg = NULL;
>> +nat_conn->nat_conn = NULL;
>> +cmap_insert(&ct->conns, &nat_conn->cm_node, nat_hash);
>> +} else {
>> +free(nat_conn);
>> +nat_conn = NULL;
>> +}
>>   }
>>   
>>   nc->nat_conn = nat_conn;
>> 
>> ___
>> dev mailing list
>> d...@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v3] conntrack: Extract l4 information for SCTP.

2023-06-01 Thread Paolo Valerio
since a27d70a89 ("conntrack: add generic IP protocol support") all
the unrecognized IP protocols get handled using ct_proto_other ops
and are managed as L3 using 3 tuples.

This patch stores L4 information for SCTP in the conn_key so that
multiple conn instances, instead of one with ports zeroed, will be
created when there are multiple SCTP connections between two hosts.
It also performs crc32c check when not offloaded, and adds SCTP to
pat_enabled.

With this patch, given two SCTP association between two hosts,
tracking the connection will result in:

sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=55884,dport=5201),reply=(src=10.1.1.1,dst=10.1.1.2,sport=5201,dport=12345),zone=1
sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=59874,dport=5202),reply=(src=10.1.1.1,dst=10.1.1.2,sport=5202,dport=12346),zone=1

instead of:

sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=0,dport=0),reply=(src=10.1.1.1,dst=10.1.1.2,sport=0,dport=0),zone=1

Signed-off-by: Paolo Valerio 
---
v3:
- rebased on top of current master
- minor adjustments: commit message, comments

v2:
- ordered includes
- while at it, slightly modified the commit subject (capital letter
  and period)
---
 lib/conntrack.c  |   86 ++
 lib/packets.h|   11 +
 tests/system-kmod-macros.at  |   11 +
 tests/system-traffic.at  |   80 +++
 tests/system-userspace-macros.at |7 +++
 5 files changed, 194 insertions(+), 1 deletion(-)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index ce8a63de5..6f2e6ef74 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -27,6 +27,7 @@
 #include "conntrack-private.h"
 #include "conntrack-tp.h"
 #include "coverage.h"
+#include "crc32c.h"
 #include "csum.h"
 #include "ct-dpif.h"
 #include "dp-packet.h"
@@ -41,6 +42,7 @@
 #include "random.h"
 #include "rculist.h"
 #include "timeval.h"
+#include "unaligned.h"
 
 VLOG_DEFINE_THIS_MODULE(conntrack);
 
@@ -771,6 +773,8 @@ pat_packet(struct dp_packet *pkt, const struct conn_key 
*key)
 packet_set_tcp_port(pkt, key->dst.port, key->src.port);
 } else if (key->nw_proto == IPPROTO_UDP) {
 packet_set_udp_port(pkt, key->dst.port, key->src.port);
+} else if (key->nw_proto == IPPROTO_SCTP) {
+packet_set_sctp_port(pkt, key->dst.port, key->src.port);
 }
 }
 
@@ -1675,6 +1679,26 @@ checksum_valid(const struct conn_key *key, const void 
*data, size_t size,
 return valid;
 }
 
+static inline bool
+sctp_checksum_valid(const void *data, size_t size)
+{
+struct sctp_header *sctp = (struct sctp_header *) data;
+ovs_be32 rcvd_csum, csum;
+bool ret;
+
+rcvd_csum = get_16aligned_be32(&sctp->sctp_csum);
+put_16aligned_be32(&sctp->sctp_csum, 0);
+csum = crc32c(data, size);
+put_16aligned_be32(&sctp->sctp_csum, rcvd_csum);
+
+ret = (rcvd_csum == csum);
+if (!ret) {
+COVERAGE_INC(conntrack_l4csum_err);
+}
+
+return ret;
+}
+
 static inline bool
 check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
  const void *l3, bool validate_checksum)
@@ -1711,6 +1735,47 @@ check_l4_udp(const struct conn_key *key, const void 
*data, size_t size,
|| (validate_checksum ? checksum_valid(key, data, size, l3) : true);
 }
 
+static inline bool
+sctp_check_len(const struct sctp_header *sh, size_t size)
+{
+const struct sctp_chunk_header *sch;
+size_t next;
+
+if (size < SCTP_HEADER_LEN) {
+return false;
+}
+
+/* rfc4960: Chunks (including Type, Length, and Value fields) are padded
+ * out by the sender with all zero bytes to be a multiple of 4 bytes long.
+ */
+for (next = sizeof(struct sctp_header),
+ sch = SCTP_NEXT_CHUNK(sh, next);
+ next < size;
+ next += ROUND_UP(ntohs(sch->length), 4),
+ sch = SCTP_NEXT_CHUNK(sh, next)) {
+/* rfc4960: This value represents the size of the chunk in bytes,
+ * including the Chunk Type, Chunk Flags, Chunk Length, and Chunk Value
+ * fields.
+ * Therefore, if the Chunk Value field is zero-length, the Length
+ * field will be set to 4. */
+if (ntohs(sch->length) < sizeof(*sch)) {
+return false;
+}
+}
+
+return (next == size);
+}
+
+static inline bool
+check_l4_sctp(const void *data, size_t size, bool validate_checksum)
+{
+if (OVS_UNLIKELY(!sctp_check_len(data, size))) {
+return false;
+}
+
+return validate_checksum ? sctp_checksum_valid(data, size) : true;
+}
+
 static inline bool
 check_l4_icmp(const void *data, size_t size, bool validate_checksum)
 {
@@ -1761,6 +1826,21 @@ extract_l4_udp(struct conn_key *key, const void *data, 
size_t size,
 return key->src.port && key->dst.port;
 }
 
+static inl

Re: [ovs-dev] [PATCH] ofproto-dpif-xlate: Fix recirculation with patch port and controller.

2023-05-22 Thread Paolo Valerio
Ilya Maximets  writes:

> On 5/15/23 17:22, Paolo Valerio wrote:
>> If a packet originating from the controller recirculates after going
>> through a patch port, it gets dropped with the following message:
>> 
>> ofproto_dpif_upcall(handler8)|INFO|received packet on unassociated
>>   datapath port 4294967295
>> 
>> This happens because there's no xport_uuid in the recirculation node
>> and at the same time in_port refers to the patch port.
>> 
>> The patch, in the case of zeroed uuid, retrieves the xport starting
>> from the ofproto_uuid stored in the recirc node.
>> 
>> Signed-off-by: Paolo Valerio 
>> ---
>>  ofproto/ofproto-dpif-xlate.c |   11 +--
>>  tests/ofproto-dpif.at|   34 ++
>>  2 files changed, 43 insertions(+), 2 deletions(-)
>> 
>> diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
>> index c01177718..3509cc73c 100644
>> --- a/ofproto/ofproto-dpif-xlate.c
>> +++ b/ofproto/ofproto-dpif-xlate.c
>> @@ -1533,8 +1533,15 @@ xlate_lookup_ofproto_(const struct dpif_backer 
>> *backer,
>>  
>>  ofp_port_t in_port = recirc_id_node->state.metadata.in_port;
>>  if (in_port != OFPP_NONE && in_port != OFPP_CONTROLLER) {
>> -struct uuid xport_uuid = recirc_id_node->state.xport_uuid;
>> -xport = xport_lookup_by_uuid(xcfg, &xport_uuid);
>> +if (uuid_is_zero(&recirc_id_node->state.xport_uuid)) {
>> +const struct xbridge *bridge =
>> +xbridge_lookup_by_uuid(xcfg, 
>> &recirc_id_node->state.ofproto_uuid);
>> +xport = bridge ? get_ofp_port(bridge, in_port) : NULL;
>
> IIUC, xport_uuid is designed to not be uuid of the patch port.
> But the in_port here is a patch port, right?  So, we will find
> a different xport, right?
>
> Shouldn't we just fall into the else condition that handles
> NONE and CONTROLLER and not look for xport?
>

I guess it's ok to fall in the else in this case.
The only problem is that we'd return the ofproto even if the in_port is
invalid.
This would make in turn fail "conntrack - fragment reassembly with L3 L4
protocol information". This test was fixed in the past after it already
broke once 323ae1e808e6 ("ofproto-dpif-xlate: Fix recirculation when
in_port is OFPP_CONTROLLER.") fixed the use case involving packet-out
and recirculation.

One possibility is to just retrieve the xport for that case in order to
verify the in_port belongs to the bridge, without returning it (so
honoring the xport_uuid logic). Maybe this could be done in the else
branch so to make clear we're handling the special case related to
OFPP_{NONE,CONTROLLER}.

WDYT?

> Best regards, Ilya Maximets.

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] ofproto-dpif-xlate: Fix recirculation with patch port and controller.

2023-05-15 Thread Paolo Valerio
If a packet originating from the controller recirculates after going
through a patch port, it gets dropped with the following message:

ofproto_dpif_upcall(handler8)|INFO|received packet on unassociated
  datapath port 4294967295

This happens because there's no xport_uuid in the recirculation node
and at the same time in_port refers to the patch port.

The patch, in the case of zeroed uuid, retrieves the xport starting
from the ofproto_uuid stored in the recirc node.

Signed-off-by: Paolo Valerio 
---
 ofproto/ofproto-dpif-xlate.c |   11 +--
 tests/ofproto-dpif.at|   34 ++
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index c01177718..3509cc73c 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -1533,8 +1533,15 @@ xlate_lookup_ofproto_(const struct dpif_backer *backer,
 
 ofp_port_t in_port = recirc_id_node->state.metadata.in_port;
 if (in_port != OFPP_NONE && in_port != OFPP_CONTROLLER) {
-struct uuid xport_uuid = recirc_id_node->state.xport_uuid;
-xport = xport_lookup_by_uuid(xcfg, &xport_uuid);
+if (uuid_is_zero(&recirc_id_node->state.xport_uuid)) {
+const struct xbridge *bridge =
+xbridge_lookup_by_uuid(xcfg, 
&recirc_id_node->state.ofproto_uuid);
+xport = bridge ? get_ofp_port(bridge, in_port) : NULL;
+} else {
+struct uuid xport_uuid = recirc_id_node->state.xport_uuid;
+xport = xport_lookup_by_uuid(xcfg, &xport_uuid);
+}
+
 if (xport && xport->xbridge && xport->xbridge->ofproto) {
 goto out;
 }
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index 6824ce0bb..8b9447c74 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -5854,6 +5854,40 @@ OVS_WAIT_UNTIL([check_flows], [ovs-ofctl dump-flows br0])
 OVS_VSWITCHD_STOP
 AT_CLEANUP
 
+# Checks for regression against a bug in which OVS dropped packets
+# originating from the controller passing through a patch port
+AT_SETUP([ofproto-dpif - packet-out recirculation OFPP_CONTROLLER and patch 
port])
+OVS_VSWITCHD_START(
+[add-port br0 patch-br1 -- \
+ set interface patch-br1 type=patch options:peer=patch-br0 -- \
+ add-br br1 -- set bridge br1 datapath-type=dummy fail-mode=secure -- \
+ add-port br1 patch-br0 -- set interface patch-br0 type=patch 
options:peer=patch-br1
+])
+
+add_of_ports --pcap br1 1
+
+AT_DATA([flows-br0.txt], [dnl
+table=0 icmp actions=output:patch-br1
+])
+AT_CHECK([ovs-ofctl add-flows br0 flows-br0.txt])
+
+AT_DATA([flows-br1.txt], [dnl
+table=0, icmp actions=ct(table=1,zone=1)
+table=1, ct_state=+trk, icmp actions=p1
+])
+AT_CHECK([ovs-ofctl add-flows br1 flows-br1.txt])
+
+packet=50540007505400050800455c8001b94dc0a80001c0a80002080013fc000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+AT_CHECK([ovs-ofctl packet-out br0 "in_port=controller packet=$packet 
actions=table"])
+
+OVS_WAIT_UNTIL_EQUAL([ovs-ofctl dump-flows -m br1 | grep "ct_state" | 
ofctl_strip], [dnl
+ table=1, n_packets=1, n_bytes=106, ct_state=+trk,icmp actions=output:2])
+
+OVS_WAIT_UNTIL([ovs-pcap p1-tx.pcap | grep -q "$packet"])
+
+OVS_VSWITCHD_STOP
+AT_CLEANUP
+
 AT_SETUP([ofproto-dpif - debug_slow action])
 OVS_VSWITCHD_START
 add_of_ports br0 1 2 3

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] ofproto-dpif-xlate: Fix recirculation with patch port and controller.

2023-05-15 Thread Paolo Valerio
If a packet originating from the controller recirculates after going
through a patch port, it gets dropped with the following message:

ofproto_dpif_upcall(handler8)|INFO|received packet on unassociated
  datapath port 4294967295

This happens because there's no xport_uuid in the recirculation node
and at the same time in_port refers to the patch port.

The patch, in the case of zeroed uuid, retrieves the xport starting
from the ofproto_uuid stored in the recirc node.

Signed-off-by: Paolo Valerio 
---
 ofproto/ofproto-dpif-xlate.c |   11 +--
 tests/ofproto-dpif.at|   34 ++
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index c01177718..3509cc73c 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -1533,8 +1533,15 @@ xlate_lookup_ofproto_(const struct dpif_backer *backer,
 
 ofp_port_t in_port = recirc_id_node->state.metadata.in_port;
 if (in_port != OFPP_NONE && in_port != OFPP_CONTROLLER) {
-struct uuid xport_uuid = recirc_id_node->state.xport_uuid;
-xport = xport_lookup_by_uuid(xcfg, &xport_uuid);
+if (uuid_is_zero(&recirc_id_node->state.xport_uuid)) {
+const struct xbridge *bridge =
+xbridge_lookup_by_uuid(xcfg, 
&recirc_id_node->state.ofproto_uuid);
+xport = bridge ? get_ofp_port(bridge, in_port) : NULL;
+} else {
+struct uuid xport_uuid = recirc_id_node->state.xport_uuid;
+xport = xport_lookup_by_uuid(xcfg, &xport_uuid);
+}
+
 if (xport && xport->xbridge && xport->xbridge->ofproto) {
 goto out;
 }
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index 6824ce0bb..8b9447c74 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -5854,6 +5854,40 @@ OVS_WAIT_UNTIL([check_flows], [ovs-ofctl dump-flows br0])
 OVS_VSWITCHD_STOP
 AT_CLEANUP
 
+# Checks for regression against a bug in which OVS dropped packets
+# originating from the controller passing through a patch port
+AT_SETUP([ofproto-dpif - packet-out recirculation OFPP_CONTROLLER and patch 
port])
+OVS_VSWITCHD_START(
+[add-port br0 patch-br1 -- \
+ set interface patch-br1 type=patch options:peer=patch-br0 -- \
+ add-br br1 -- set bridge br1 datapath-type=dummy fail-mode=secure -- \
+ add-port br1 patch-br0 -- set interface patch-br0 type=patch 
options:peer=patch-br1
+])
+
+add_of_ports --pcap br1 1
+
+AT_DATA([flows-br0.txt], [dnl
+table=0 icmp actions=output:patch-br1
+])
+AT_CHECK([ovs-ofctl add-flows br0 flows-br0.txt])
+
+AT_DATA([flows-br1.txt], [dnl
+table=0, icmp actions=ct(table=1,zone=1)
+table=1, ct_state=+trk, icmp actions=p1
+])
+AT_CHECK([ovs-ofctl add-flows br1 flows-br1.txt])
+
+packet=50540007505400050800455c8001b94dc0a80001c0a80002080013fc000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+AT_CHECK([ovs-ofctl packet-out br0 "in_port=controller packet=$packet 
actions=table"])
+
+OVS_WAIT_UNTIL_EQUAL([ovs-ofctl dump-flows -m br1 | grep "ct_state" | 
ofctl_strip], [dnl
+ table=1, n_packets=1, n_bytes=106, ct_state=+trk,icmp actions=output:2])
+
+OVS_WAIT_UNTIL([ovs-pcap p1-tx.pcap | grep -q "$packet"])
+
+OVS_VSWITCHD_STOP
+AT_CLEANUP
+
 AT_SETUP([ofproto-dpif - debug_slow action])
 OVS_VSWITCHD_START
 add_of_ports br0 1 2 3

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH 2/2] conntrack: Release nat_conn in case both keys have the same hash.

2023-05-15 Thread Paolo Valerio
Ilya Maximets  writes:

> On 5/4/23 19:21, Paolo Valerio wrote:
>> Ilya Maximets  writes:
>> 
>>> On 4/19/23 20:40, Paolo Valerio wrote:
>>>> During the creation of a new connection, there's a chance both key and
>>>> rev_key end up having the same hash. This is more common in the case
>>>> of all-zero snat with no collisions. In that case, once the
>>>> connection is expired, but not cleaned up, if a new packet with the
>>>> same 5-tuple is received, an assertion failure gets triggered in
>>>> conn_update_state() because of a previous failure of retrieving a
>>>> CT_CONN_TYPE_DEFAULT connection.
>>>>
>>>> Fix it by releasing the nat_conn during the connection creation in the
>>>> case of same hash for both key and rev_key.
>>>
>>> This sounds a bit odd.  Shouldn't we treat hash collision as a normal case?
>>>
>>> Looking at the code, I'm assuming that the issue comes from the following
>>> part in process_one():
>>>
>>> if (OVS_LIKELY(conn)) {
>>> if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {
>>> ...
>>> conn_key_lookup(ct, >key, hash, now, , >reply);
>>>
>>> And here we get the same connection again, because the default one is 
>>> already
>>> expired.  Is that correct?
>>>
>>> If so, maybe we should add an extra condition to conn_key_lookup() to
>>> only look for DEFAULT connections instead, just for this case?  Since
>>> we really don't want to get the UN_NAT one here.
>>>
>> 
>> Hello Ilya,
>> 
>> It's a fair point.
>> I initially thought about the approach you're suggesting, but I had some
>> concerns about it that I'll try to summarize below.
>> 
>> For sure it would fix the issue (it could require the first patch to be
>> applied as well for the branches with rcu exp lists).
>> 
>> Based on the current logic, new packets matching that expired connection
>> but not evicted will be marked as +inv and further packets will be
>> marked so for the whole sweep interval unless an exception like this get
>> added:
>> 
>> uint32_t hash = conn_key_hash(>rev_key, ct->hash_basis);
>> /* the last flag indicates CT_CONN_TYPE_DEFAULT only */
>> conn_key_lookup_(ct, >key, hash, now, , >reply, true);
>> /* special case where there's hash collision */
>> if (!conn && ctx->hash != hash) {
>> pkt->md.ct_state |= CS_INVALID;
>> write_ct_md(pkt, zone, NULL, NULL, NULL);
>> ...
>> return;
>> }
>> 
>> This would further require that subsequent lookup in the create_new_conn
>> path are restricted to CT_CONN_TYPE_DEFAULT, e.g.:
>> 
>> uint32_t hash = conn_key_hash(>key, ct->hash_basis);
>> /* Only check for CT_CONN_TYPE_DEFAULT */
>> if (!conn_key_lookup_(ct, >key, hash, now, NULL, NULL, true)) {
>> conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
>>   helper, alg_exp, ct_alg_ctl, tp_id);
>> }
>> 
>> otherwise we could incur in a false positive which prevent to create a
>> new connection.
>
> I'm not really sure if what described above is more correct way of doing
> things or not...  Aaron, do you have opinion on this?
>
> Another thought: Can we expire the CT_CONN_TYPE_UN_NAT connection the
> moment DEFAULT counterpart of it expires?  Or that will that be against
> some logic / not possible to do?
>

As far as I can tell, this might not be straightforward, as simply
marking it as expired would not be reliable (e.g. doing it from the
sweeper), and I guess that managing the expiration time field for the
nat_conn as well would require updating the nat_conn every time the
default one gets updated, probably making it a bit impractical.

Another approach would be removing the nat_conn [1] altogether.
The problem in this case is backporting. Some adjustments that would add
to the patch might be needed for older branches.

[1] 
https://patchwork.ozlabs.org/project/openvswitch/patch/20201129033255.64647-2-hepeng.0...@bytedance.com/

>
>> 
>>> Best regards, Ilya Maximets.
>>>
>>>>
>>>> Reported-by: Michael Plato 
>>>> Fixes: 61e48c2d1db2 ("conntrack: Handle SNAT with all-zero IP address.")
>>>> Signed-off-by: Paolo Valerio 
>>>> ---
>>>> In this thread [0] there are some more details. A similar
>>>> approach here could be to avoid to add the nat_conn to the cmap and
>>>> letting the sweeper release

Re: [ovs-discuss] ovs-vswitchd crashes serveral times a day

2023-05-04 Thread Paolo Valerio via discuss
Lazuardi Nasution  writes:

> Hi Paolo,
>
> Should we combine this patch too?
>
> https://patchwork.ozlabs.org/project/openvswitch/patch/
> 168192964823.4031872.3228556334798413886.st...@fed.void/
>

Hi,

no, it basically does the same thing in a slightly different way
reducing the need for modification in the case of backporting to
previous versions.

> Best regards.
>
> On Wed, Apr 5, 2023 at 2:51 AM Paolo Valerio  wrote:
>
> Hello,
>
> thanks for reporting this.
> I had a look at it, and, although this needs to be confirmed, I suspect
> it's related to nat (CT_CONN_TYPE_UN_NAT) and expired connections (but
> not yet reclaimed).
>
> The nat part does not necessarily perform any actual translation, but
> could still be triggered by ct(nat(src)...) which is the all-zero binding
> to avoid collisions, if any.
>
> Is there any chance to test the following patch (targeted for ovs 2.17)?
> This should help to confirm.
>
> -- >8 --
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 08da4ddf7..ba334afb0 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -94,9 +94,8 @@ static bool valid_new(struct dp_packet *pkt, struct
> conn_key *);
>  static struct conn *new_conn(struct conntrack *ct, struct dp_packet *pkt,
>                               struct conn_key *, long long now,
>                               uint32_t tp_id);
> -static void delete_conn_cmn(struct conn *);
> +static void delete_conn__(struct conn *);
>  static void delete_conn(struct conn *);
> -static void delete_conn_one(struct conn *conn);
>  static enum ct_update_res conn_update(struct conntrack *ct, struct conn
> *conn,
>                                        struct dp_packet *pkt,
>                                        struct conn_lookup_ctx *ctx,
> @@ -444,14 +443,13 @@ zone_limit_delete(struct conntrack *ct, uint16_t
> zone)
>  }
>
>  static void
> -conn_clean_cmn(struct conntrack *ct, struct conn *conn)
> +conn_clean_cmn(struct conntrack *ct, struct conn *conn, uint32_t hash)
>      OVS_REQUIRES(ct->ct_lock)
>  {
>      if (conn->alg) {
>          expectation_clean(ct, >key);
>      }
>
> -    uint32_t hash = conn_key_hash(>key, ct->hash_basis);
>      cmap_remove(>conns, >cm_node, hash);
>
>      struct zone_limit *zl = zone_limit_lookup(ct, conn->admit_zone);
> @@ -467,11 +465,14 @@ conn_clean(struct conntrack *ct, struct conn *conn)
>      OVS_REQUIRES(ct->ct_lock)
>  {
>      ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
> +    uint32_t conn_hash = conn_key_hash(>key, ct->hash_basis);
>
> -    conn_clean_cmn(ct, conn);
> +    conn_clean_cmn(ct, conn, conn_hash);
>      if (conn->nat_conn) {
>          uint32_t hash = conn_key_hash(>nat_conn->key, ct->
> hash_basis);
> -        cmap_remove(>conns, >nat_conn->cm_node, hash);
> +        if (conn_hash != hash) {
> +            cmap_remove(>conns, >nat_conn->cm_node, hash);
> +        }
>      }
>      ovs_list_remove(>exp_node);
>      conn->cleaned = true;
> @@ -479,19 +480,6 @@ conn_clean(struct conntrack *ct, struct conn *conn)
>      atomic_count_dec(>n_conn);
>  }
>
> -static void
> -conn_clean_one(struct conntrack *ct, struct conn *conn)
> -    OVS_REQUIRES(ct->ct_lock)
> -{
> -    conn_clean_cmn(ct, conn);
> -    if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
> -        ovs_list_remove(>exp_node);
> -        conn->cleaned = true;
> -        atomic_count_dec(>n_conn);
> -    }
> -    ovsrcu_postpone(delete_conn_one, conn);
> -}
> -
>  /* Destroys the connection tracker 'ct' and frees all the allocated
> memory.
>   * The caller of this function must already have shut down packet input
>   * and PMD threads (which would have been quiesced).  */
> @@ -505,7 +493,10 @@ conntrack_destroy(struct conntrack *ct)
>
>      ovs_mutex_lock(>ct_lock);
>      CMAP_FOR_EACH (conn, cm_node, >conns) {
> -        conn_clean_one(ct, conn);
> +        if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {
> +            continue;
> +        }
> +        conn_clean(ct, conn);
>      }
>      cmap_destroy(>conns);
>
> @@ -1052,7 +1043,10 @@ conn_not_found(struct conntrack *ct, struct
> dp_packet *pkt,
>              nat_conn->alg = NULL;
>              nat_conn->nat_co

Re: [ovs-dev] [PATCH 2/2] conntrack: Release nat_conn in case both keys have the same hash.

2023-05-04 Thread Paolo Valerio
Ilya Maximets  writes:

> On 4/19/23 20:40, Paolo Valerio wrote:
>> During the creation of a new connection, there's a chance both key and
>> rev_key end up having the same hash. This is more common in the case
>> of all-zero snat with no collisions. In that case, once the
>> connection is expired, but not cleaned up, if a new packet with the
>> same 5-tuple is received, an assertion failure gets triggered in
>> conn_update_state() because of a previous failure of retrieving a
>> CT_CONN_TYPE_DEFAULT connection.
>> 
>> Fix it by releasing the nat_conn during the connection creation in the
>> case of same hash for both key and rev_key.
>
> This sounds a bit odd.  Shouldn't we treat hash collision as a normal case?
>
> Looking at the code, I'm assuming that the issue comes from the following
> part in process_one():
>
> if (OVS_LIKELY(conn)) {
> if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {
> ...
> conn_key_lookup(ct, >key, hash, now, , >reply);
>
> And here we get the same connection again, because the default one is already
> expired.  Is that correct?
>
> If so, maybe we should add an extra condition to conn_key_lookup() to
> only look for DEFAULT connections instead, just for this case?  Since
> we really don't want to get the UN_NAT one here.
>

Hello Ilya,

It's a fair point.
I initially thought about the approach you're suggesting, but I had some
concerns about it that I'll try to summarize below.

For sure it would fix the issue (it could require the first patch to be
applied as well for the branches with rcu exp lists).

Based on the current logic, new packets matching that expired, but not
yet evicted, connection will be marked as +inv, and further packets will
be marked so for the whole sweep interval unless an exception like this
gets added:

uint32_t hash = conn_key_hash(>rev_key, ct->hash_basis);
/* the last flag indicates CT_CONN_TYPE_DEFAULT only */
conn_key_lookup_(ct, >key, hash, now, , >reply, true);
/* special case where there's hash collision */
if (!conn && ctx->hash != hash) {
pkt->md.ct_state |= CS_INVALID;
write_ct_md(pkt, zone, NULL, NULL, NULL);
...
return;
}

This would further require that subsequent lookups in the create_new_conn
path are restricted to CT_CONN_TYPE_DEFAULT, e.g.:

uint32_t hash = conn_key_hash(>key, ct->hash_basis);
/* Only check for CT_CONN_TYPE_DEFAULT */
if (!conn_key_lookup_(ct, >key, hash, now, NULL, NULL, true)) {
conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
  helper, alg_exp, ct_alg_ctl, tp_id);
}

otherwise we could incur a false positive which prevents the creation
of a new connection.

> Best regards, Ilya Maximets.
>
>> 
>> Reported-by: Michael Plato 
>> Fixes: 61e48c2d1db2 ("conntrack: Handle SNAT with all-zero IP address.")
>> Signed-off-by: Paolo Valerio 
>> ---
>> In this thread [0] there are some more details. A similar
>> approach here could be to avoid to add the nat_conn to the cmap and
>> letting the sweeper release the memory for nat_conn once the whole
>> connection gets freed.
>> That approach could still be ok, but the drawback is that it could
>> require a different patch for older branches that don't include
>> 3d9c1b855a5f ("conntrack: Replace timeout based expiration lists with
>> rculists."). It still worth to be considered.
>> 
>> [0] https://mail.openvswitch.org/pipermail/ovs-discuss/2023-April/052339.html
>> ---
>>  lib/conntrack.c |   21 +
>>  1 file changed, 13 insertions(+), 8 deletions(-)
>> 
>> diff --git a/lib/conntrack.c b/lib/conntrack.c
>> index 7e1fc4b1f..d2ee127d9 100644
>> --- a/lib/conntrack.c
>> +++ b/lib/conntrack.c
>> @@ -1007,14 +1007,19 @@ conn_not_found(struct conntrack *ct, struct 
>> dp_packet *pkt,
>>  }
>>  
>>  nat_packet(pkt, nc, false, ctx->icmp_related);
>> -memcpy(_conn->key, >rev_key, sizeof nat_conn->key);
>> -memcpy(_conn->rev_key, >key, sizeof nat_conn->rev_key);
>> -nat_conn->conn_type = CT_CONN_TYPE_UN_NAT;
>> -nat_conn->nat_action = 0;
>> -nat_conn->alg = NULL;
>> -nat_conn->nat_conn = NULL;
>> -uint32_t nat_hash = conn_key_hash(_conn->key, 
>> ct->hash_basis);
>> -cmap_insert(>conns, _conn->cm_node, nat_hash);
>> +uint32_t nat_hash = conn_key_hash(>rev_key, ct->hash_basis);
>> +if (nat_hash != ctx->hash) {
>> +memcpy(_conn

Re: [ovs-dev] [PATCH 1/2] conntrack: Do not defer connection clean up.

2023-04-20 Thread Paolo Valerio
Aaron Conole  writes:

> Paolo Valerio  writes:
>
>> Connections that need to be removed, e.g. while forcing a direction,
>> were invalidated forcing them to be expired.
>> This is not actually needed, as it's typically a one-time
>> operation.
>> The patch replaces a call to conn_force_expire() with a call to
>> conn_clean().
>>
>> Signed-off-by: Paolo Valerio 
>> ---
>
> Is there a possible contention issue now where the conn update can also
> take the ct lock?  IE: before, we would rely on the expiration timer
> processing, but now we directly release which requires the ct lock.
>
> Maybe since it is a rare enough event, this isn't as big a deal?
>

That's a fair point and mostly the reason I opted to split this one from
the next. Assuming as common the scenario where, e.g. many connections
are in TIME_WAIT and new connections with the same 5-tuple are
initiated while the sweeper is actually deleting, yes. The advantage
with this patch is that nconns is lowered earlier instead of waiting for
the next sweep interval, and, assuming it is an actual upside, the load
on the sweeper thread is reduced for those deletions.

The reason I included it is that forcing the expiration makes the
reported issue theoretically possible for those use cases, but this
patch doesn't solve it for all the cases as the second patch should.

I guess it's fine to drop this, at least for the time being.

>>  lib/conntrack.c |   10 ++
>>  1 file changed, 2 insertions(+), 8 deletions(-)
>>
>> diff --git a/lib/conntrack.c b/lib/conntrack.c
>> index ce8a63de5..7e1fc4b1f 100644
>> --- a/lib/conntrack.c
>> +++ b/lib/conntrack.c
>> @@ -514,12 +514,6 @@ conn_clean(struct conntrack *ct, struct conn *conn)
>>  atomic_count_dec(>n_conn);
>>  }
>>  
>> -static void
>> -conn_force_expire(struct conn *conn)
>> -{
>> -atomic_store_relaxed(>expiration, 0);
>> -}
>> -
>>  /* Destroys the connection tracker 'ct' and frees all the allocated memory.
>>   * The caller of this function must already have shut down packet input
>>   * and PMD threads (which would have been quiesced).  */
>> @@ -1089,7 +1083,7 @@ conn_update_state(struct conntrack *ct, struct 
>> dp_packet *pkt,
>>  break;
>>  case CT_UPDATE_NEW:
>>  if (conn_lookup(ct, >key, now, NULL, NULL)) {
>> -conn_force_expire(conn);
>> +conn_clean(ct, conn);
>>  }
>>  create_new_conn = true;
>>  break;
>> @@ -1299,7 +1293,7 @@ process_one(struct conntrack *ct, struct dp_packet 
>> *pkt,
>>  /* Delete found entry if in wrong direction. 'force' implies commit. */
>>  if (OVS_UNLIKELY(force && ctx->reply && conn)) {
>>  if (conn_lookup(ct, >key, now, NULL, NULL)) {
>> -conn_force_expire(conn);
>> +conn_clean(ct, conn);
>>  }
>>  conn = NULL;
>>  }

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 2/2] conntrack: Release nat_conn in case both keys have the same hash.

2023-04-19 Thread Paolo Valerio
During the creation of a new connection, there's a chance both key and
rev_key end up having the same hash. This is more common in the case
of all-zero snat with no collisions. In that case, once the
connection is expired, but not cleaned up, if a new packet with the
same 5-tuple is received, an assertion failure gets triggered in
conn_update_state() because of a previous failure of retrieving a
CT_CONN_TYPE_DEFAULT connection.

Fix it by releasing the nat_conn during the connection creation in the
case of same hash for both key and rev_key.

Reported-by: Michael Plato 
Fixes: 61e48c2d1db2 ("conntrack: Handle SNAT with all-zero IP address.")
Signed-off-by: Paolo Valerio 
---
In this thread [0] there are some more details. A similar
approach here could be to avoid adding the nat_conn to the cmap and
let the sweeper release the memory for nat_conn once the whole
connection gets freed.
That approach could still be ok, but the drawback is that it could
require a different patch for older branches that don't include
3d9c1b855a5f ("conntrack: Replace timeout based expiration lists with
rculists."). It is still worth considering.

[0] https://mail.openvswitch.org/pipermail/ovs-discuss/2023-April/052339.html
---
 lib/conntrack.c |   21 +
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 7e1fc4b1f..d2ee127d9 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -1007,14 +1007,19 @@ conn_not_found(struct conntrack *ct, struct dp_packet 
*pkt,
 }
 
 nat_packet(pkt, nc, false, ctx->icmp_related);
-memcpy(_conn->key, >rev_key, sizeof nat_conn->key);
-memcpy(_conn->rev_key, >key, sizeof nat_conn->rev_key);
-nat_conn->conn_type = CT_CONN_TYPE_UN_NAT;
-nat_conn->nat_action = 0;
-nat_conn->alg = NULL;
-nat_conn->nat_conn = NULL;
-uint32_t nat_hash = conn_key_hash(_conn->key, ct->hash_basis);
-cmap_insert(>conns, _conn->cm_node, nat_hash);
+uint32_t nat_hash = conn_key_hash(>rev_key, ct->hash_basis);
+if (nat_hash != ctx->hash) {
+memcpy(_conn->key, >rev_key, sizeof nat_conn->key);
+memcpy(_conn->rev_key, >key, sizeof nat_conn->rev_key);
+nat_conn->conn_type = CT_CONN_TYPE_UN_NAT;
+nat_conn->nat_action = 0;
+nat_conn->alg = NULL;
+nat_conn->nat_conn = NULL;
+cmap_insert(>conns, _conn->cm_node, nat_hash);
+} else {
+free(nat_conn);
+nat_conn = NULL;
+}
 }
 
 nc->nat_conn = nat_conn;

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 1/2] conntrack: Do not defer connection clean up.

2023-04-19 Thread Paolo Valerio
Connections that need to be removed, e.g. while forcing a direction,
were invalidated forcing them to be expired.
This is not actually needed, as it's typically a one-time
operation.
The patch replaces a call to conn_force_expire() with a call to
conn_clean().

Signed-off-by: Paolo Valerio 
---
 lib/conntrack.c |   10 ++
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index ce8a63de5..7e1fc4b1f 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -514,12 +514,6 @@ conn_clean(struct conntrack *ct, struct conn *conn)
 atomic_count_dec(>n_conn);
 }
 
-static void
-conn_force_expire(struct conn *conn)
-{
-atomic_store_relaxed(>expiration, 0);
-}
-
 /* Destroys the connection tracker 'ct' and frees all the allocated memory.
  * The caller of this function must already have shut down packet input
  * and PMD threads (which would have been quiesced).  */
@@ -1089,7 +1083,7 @@ conn_update_state(struct conntrack *ct, struct dp_packet 
*pkt,
 break;
 case CT_UPDATE_NEW:
 if (conn_lookup(ct, >key, now, NULL, NULL)) {
-conn_force_expire(conn);
+conn_clean(ct, conn);
 }
 create_new_conn = true;
 break;
@@ -1299,7 +1293,7 @@ process_one(struct conntrack *ct, struct dp_packet *pkt,
 /* Delete found entry if in wrong direction. 'force' implies commit. */
 if (OVS_UNLIKELY(force && ctx->reply && conn)) {
 if (conn_lookup(ct, >key, now, NULL, NULL)) {
-conn_force_expire(conn);
+conn_clean(ct, conn);
 }
 conn = NULL;
 }

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 0/2] conntrack: Fix failed assertion in conn_update_state().

2023-04-19 Thread Paolo Valerio
The series addresses the issue reported here [0] by Michael Plato and
confirmed by Lazuardi Nasution.

More details in the patch descriptions.

The first patch is mostly a clean up and not necessarily required,
whereas the second one contains the actual fix.

[0] https://mail.openvswitch.org/pipermail/ovs-discuss/2023-April/052328.html

Paolo Valerio (2):
  conntrack: Do not defer connection clean up.
  conntrack: Release nat_conn in case both keys have the same hash.


 lib/conntrack.c | 31 +++
 1 file changed, 15 insertions(+), 16 deletions(-)

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-discuss] ovs-vswitchd crashes serveral times a day

2023-04-17 Thread Paolo Valerio via discuss
"Plato, Michael"  writes:

> Hi Paolo,
> I installed the patch for 2.17 on april 6th in our test environment and can 
> confirm that it works. We haven't had any crashes since then. Many thanks for 
> the quick solution!
>

Hi Michael,

Nice! That's helpful. Thanks for testing it.

Paolo

> Best regards
>
> Michael
>
> -Ursprüngliche Nachricht-
> Von: Paolo Valerio  
> Gesendet: Montag, 17. April 2023 10:36
> An: Lazuardi Nasution 
> Cc: ovs-discuss@openvswitch.org; Plato, Michael 
> Betreff: Re: Re: [ovs-discuss] ovs-vswitchd crashes serveral times a day
>
> Lazuardi Nasution  writes:
>
>> Hi Paolo,
>>
>> I'm interested in your statement of "expired connections (but not yet 
>> reclaimed)". Do you think that shortening conntrack timeout policy will help?
>> Or, should we make it larger so there will be fewer conntrack table 
>> update and flush attempts?
>>
>
> it's hard to say as it depends on the specific use case.
> Probably making it larger for the specific case could help, but in general, I 
> would not rely on that.
> Of course, an actual fix is needed. It would be great if the patch sent could 
> tested, but in any case, I'll work on a formal patch.
>
>> Best regards.
>>
>> On Wed, Apr 5, 2023 at 2:51 AM Paolo Valerio  wrote:
>>
>> Hello,
>>
>> thanks for reporting this.
>> I had a look at it, and, although this needs to be confirmed, I suspect
>> it's related to nat (CT_CONN_TYPE_UN_NAT) and expired connections (but
>> not yet reclaimed).
>>
>> The nat part does not necessarily perform any actual translation, but
>> could still be triggered by ct(nat(src)...) which is the all-zero binding
>> to avoid collisions, if any.
>>
>> Is there any chance to test the following patch (targeted for ovs 2.17)?
>> This should help to confirm.
>>
>> -- >8 --
>> diff --git a/lib/conntrack.c b/lib/conntrack.c
>> index 08da4ddf7..ba334afb0 100644
>> --- a/lib/conntrack.c
>> +++ b/lib/conntrack.c
>> @@ -94,9 +94,8 @@ static bool valid_new(struct dp_packet *pkt, struct
>> conn_key *);
>>  static struct conn *new_conn(struct conntrack *ct, struct dp_packet 
>> *pkt,
>>                               struct conn_key *, long long now,
>>                               uint32_t tp_id);
>> -static void delete_conn_cmn(struct conn *);
>> +static void delete_conn__(struct conn *);
>>  static void delete_conn(struct conn *);
>> -static void delete_conn_one(struct conn *conn);
>>  static enum ct_update_res conn_update(struct conntrack *ct, struct conn
>> *conn,
>>                                        struct dp_packet *pkt,
>>                                        struct conn_lookup_ctx *ctx,
>> @@ -444,14 +443,13 @@ zone_limit_delete(struct conntrack *ct, uint16_t
>> zone)
>>  }
>>
>>  static void
>> -conn_clean_cmn(struct conntrack *ct, struct conn *conn)
>> +conn_clean_cmn(struct conntrack *ct, struct conn *conn, uint32_t hash)
>>      OVS_REQUIRES(ct->ct_lock)
>>  {
>>      if (conn->alg) {
>>          expectation_clean(ct, >key);
>>      }
>>
>> -    uint32_t hash = conn_key_hash(>key, ct->hash_basis);
>>      cmap_remove(>conns, >cm_node, hash);
>>
>>      struct zone_limit *zl = zone_limit_lookup(ct, conn->admit_zone);
>> @@ -467,11 +465,14 @@ conn_clean(struct conntrack *ct, struct conn *conn)
>>      OVS_REQUIRES(ct->ct_lock)
>>  {
>>      ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
>> +    uint32_t conn_hash = conn_key_hash(>key, 
>> ct->hash_basis);
>>
>> -    conn_clean_cmn(ct, conn);
>> +    conn_clean_cmn(ct, conn, conn_hash);
>>      if (conn->nat_conn) {
>>          uint32_t hash = conn_key_hash(>nat_conn->key, ct->
>> hash_basis);
>> -        cmap_remove(>conns, >nat_conn->cm_node, hash);
>> +        if (conn_hash != hash) {
>> +            cmap_remove(>conns, >nat_conn->cm_node, hash);
>> +        }
>>      }
>>      ovs_list_remove(>exp_node);
>>      conn->cleaned = true;
>> @@ -479,19 +480,6 @@ conn_clean(struct conntrack *ct, struct conn *conn)
>>      atomic_count_dec(>n_conn);
>>  }
>>
>> -static void
>> -conn_clean_one(struct co

Re: [ovs-discuss] ovs-vswitchd crashes serveral times a day

2023-04-17 Thread Paolo Valerio via discuss
Lazuardi Nasution  writes:

> Hi Paolo,
>
> I'm interested in your statement of "expired connections (but not yet
> reclaimed)". Do you think that shortening conntrack timeout policy will help?
> Or, should we make it larger so there will be fewer conntrack table update and
> flush attempts?
>

it's hard to say as it depends on the specific use case.
Probably making it larger for the specific case could help, but in
general, I would not rely on that.
Of course, an actual fix is needed. It would be great if the patch sent
could be tested, but in any case, I'll work on a formal patch.

> Best regards.
>
> On Wed, Apr 5, 2023 at 2:51 AM Paolo Valerio  wrote:
>
> Hello,
>
> thanks for reporting this.
> I had a look at it, and, although this needs to be confirmed, I suspect
> it's related to nat (CT_CONN_TYPE_UN_NAT) and expired connections (but
> not yet reclaimed).
>
> The nat part does not necessarily perform any actual translation, but
> could still be triggered by ct(nat(src)...) which is the all-zero binding
> to avoid collisions, if any.
>
> Is there any chance to test the following patch (targeted for ovs 2.17)?
> This should help to confirm.
>
> -- >8 --
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 08da4ddf7..ba334afb0 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -94,9 +94,8 @@ static bool valid_new(struct dp_packet *pkt, struct
> conn_key *);
>  static struct conn *new_conn(struct conntrack *ct, struct dp_packet *pkt,
>                               struct conn_key *, long long now,
>                               uint32_t tp_id);
> -static void delete_conn_cmn(struct conn *);
> +static void delete_conn__(struct conn *);
>  static void delete_conn(struct conn *);
> -static void delete_conn_one(struct conn *conn);
>  static enum ct_update_res conn_update(struct conntrack *ct, struct conn
> *conn,
>                                        struct dp_packet *pkt,
>                                        struct conn_lookup_ctx *ctx,
> @@ -444,14 +443,13 @@ zone_limit_delete(struct conntrack *ct, uint16_t
> zone)
>  }
>
>  static void
> -conn_clean_cmn(struct conntrack *ct, struct conn *conn)
> +conn_clean_cmn(struct conntrack *ct, struct conn *conn, uint32_t hash)
>      OVS_REQUIRES(ct->ct_lock)
>  {
>      if (conn->alg) {
>          expectation_clean(ct, >key);
>      }
>
> -    uint32_t hash = conn_key_hash(>key, ct->hash_basis);
>      cmap_remove(>conns, >cm_node, hash);
>
>      struct zone_limit *zl = zone_limit_lookup(ct, conn->admit_zone);
> @@ -467,11 +465,14 @@ conn_clean(struct conntrack *ct, struct conn *conn)
>      OVS_REQUIRES(ct->ct_lock)
>  {
>      ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
> +    uint32_t conn_hash = conn_key_hash(>key, ct->hash_basis);
>
> -    conn_clean_cmn(ct, conn);
> +    conn_clean_cmn(ct, conn, conn_hash);
>      if (conn->nat_conn) {
>          uint32_t hash = conn_key_hash(>nat_conn->key, ct->
> hash_basis);
> -        cmap_remove(>conns, >nat_conn->cm_node, hash);
> +        if (conn_hash != hash) {
> +            cmap_remove(>conns, >nat_conn->cm_node, hash);
> +        }
>      }
>      ovs_list_remove(>exp_node);
>      conn->cleaned = true;
> @@ -479,19 +480,6 @@ conn_clean(struct conntrack *ct, struct conn *conn)
>      atomic_count_dec(>n_conn);
>  }
>
> -static void
> -conn_clean_one(struct conntrack *ct, struct conn *conn)
> -    OVS_REQUIRES(ct->ct_lock)
> -{
> -    conn_clean_cmn(ct, conn);
> -    if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
> -        ovs_list_remove(>exp_node);
> -        conn->cleaned = true;
> -        atomic_count_dec(>n_conn);
> -    }
> -    ovsrcu_postpone(delete_conn_one, conn);
> -}
> -
>  /* Destroys the connection tracker 'ct' and frees all the allocated
> memory.
>   * The caller of this function must already have shut down packet input
>   * and PMD threads (which would have been quiesced).  */
> @@ -505,7 +493,10 @@ conntrack_destroy(struct conntrack *ct)
>
>      ovs_mutex_lock(>ct_lock);
>      CMAP_FOR_EACH (conn, cm_node, >conns) {
> -        conn_clean_one(ct, conn);
> +        if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {
> +            continue;
> +        }
> +        conn_clean(ct, conn);
&

[ovs-dev] [PATCH v3] ovs-dpctl: Add new command dpctl/ct-[sg]et-sweep-interval.

2023-04-06 Thread Paolo Valerio
Since 3d9c1b855a5f ("conntrack: Replace timeout based expiration lists
with rculists.") the sweep interval changed as well as the constraints
related to the sweeper.
Being able to change the default reschedule time may be convenient in
some conditions, like debugging.
This patch introduces new commands that allow getting and setting the
sweep interval in ms.

Signed-off-by: Paolo Valerio 
---
v3:
- rebased on top of the current master
- renamed commands to dpctl/ct-[sg]et-sweep-interval (Ilya)
- added simple get/set test in ofproto-dpif.at (Ilya)

v2:
- resolved conflict in NEWS
- added missing comment
- added missing '\' in dpctl.man
---
 NEWS|3 ++
 lib/conntrack-private.h |1 +
 lib/conntrack.c |   18 +-
 lib/conntrack.h |2 ++
 lib/ct-dpif.c   |   14 +++
 lib/ct-dpif.h   |1 +
 lib/dpctl.c |   61 +++
 lib/dpctl.man   |9 +++
 lib/dpif-netdev.c   |   17 +
 lib/dpif-netlink.c  |2 ++
 lib/dpif-provider.h |4 +++
 tests/ofproto-dpif.at   |   22 +
 12 files changed, 153 insertions(+), 1 deletion(-)

diff --git a/NEWS b/NEWS
index b6418c36e..1155bfbb1 100644
--- a/NEWS
+++ b/NEWS
@@ -11,6 +11,9 @@ Post-v3.1.0
- ovs-appctl:
  * Add support for selecting the source address with the
'ovs-appctl ovs/route/add' command.
+ * New commands "dpctl/{ct-get-sweep-interval,ct-set-sweep-interval}" that
+   allow to get and set, for the userspace datapath, the sweep interval
+   for the conntrack garbage collector.
- ovs-ctl:
  * Added new options --[ovsdb-server|ovs-vswitchd]-umask=MODE to set umask
value when starting OVS daemons.  E.g., use --ovsdb-server-umask=0002
diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
index fae8b3a9b..bb326868e 100644
--- a/lib/conntrack-private.h
+++ b/lib/conntrack-private.h
@@ -224,6 +224,7 @@ struct conntrack {
 struct ipf *ipf; /* Fragmentation handling context. */
 uint32_t zone_limit_seq; /* Used to disambiguate zone limit counts. */
 atomic_bool tcp_seq_chk; /* Check TCP sequence numbers. */
+atomic_uint32_t sweep_ms; /* Next sweep interval. */
 };
 
 /* Lock acquisition order:
diff --git a/lib/conntrack.c b/lib/conntrack.c
index f86fa26f4..ce8a63de5 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -320,6 +320,7 @@ conntrack_init(void)
 atomic_count_init(>n_conn, 0);
 atomic_init(>n_conn_limit, DEFAULT_N_CONN_LIMIT);
 atomic_init(>tcp_seq_chk, true);
+atomic_init(>sweep_ms, 2);
 latch_init(>clean_thread_exit);
 ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
 ct->ipf = ipf_init();
@@ -1480,6 +1481,21 @@ set_label(struct dp_packet *pkt, struct conn *conn,
 }
 
 
+int
+conntrack_set_sweep_interval(struct conntrack *ct, uint32_t ms)
+{
+atomic_store_relaxed(>sweep_ms, ms);
+return 0;
+}
+
+uint32_t
+conntrack_get_sweep_interval(struct conntrack *ct)
+{
+uint32_t ms;
+atomic_read_relaxed(>sweep_ms, );
+return ms;
+}
+
 static size_t
 ct_sweep(struct conntrack *ct, struct rculist *list, long long now)
 OVS_NO_THREAD_SAFETY_ANALYSIS
@@ -1504,7 +1520,7 @@ ct_sweep(struct conntrack *ct, struct rculist *list, long 
long now)
 static long long
 conntrack_clean(struct conntrack *ct, long long now)
 {
-long long next_wakeup = now + 20 * 1000;
+long long next_wakeup = now + conntrack_get_sweep_interval(ct);
 unsigned int n_conn_limit, i;
 size_t clean_end, count = 0;
 
diff --git a/lib/conntrack.h b/lib/conntrack.h
index b064abc9f..524ec0acb 100644
--- a/lib/conntrack.h
+++ b/lib/conntrack.h
@@ -139,6 +139,8 @@ int conntrack_set_maxconns(struct conntrack *ct, uint32_t 
maxconns);
 int conntrack_get_maxconns(struct conntrack *ct, uint32_t *maxconns);
 int conntrack_get_nconns(struct conntrack *ct, uint32_t *nconns);
 int conntrack_set_tcp_seq_chk(struct conntrack *ct, bool enabled);
+int conntrack_set_sweep_interval(struct conntrack *ct, uint32_t ms);
+uint32_t conntrack_get_sweep_interval(struct conntrack *ct);
 bool conntrack_get_tcp_seq_chk(struct conntrack *ct);
 struct ipf *conntrack_ipf_ctx(struct conntrack *ct);
 struct conntrack_zone_limit zone_limit_get(struct conntrack *ct,
diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
index d3b2783ce..0c4b2964f 100644
--- a/lib/ct-dpif.c
+++ b/lib/ct-dpif.c
@@ -368,6 +368,20 @@ ct_dpif_del_limits(struct dpif *dpif, const struct 
ovs_list *zone_limits)
 : EOPNOTSUPP);
 }
 
+int
+ct_dpif_sweep(struct dpif *dpif, uint32_t *ms)
+{
+if (*ms) {
+return (dpif->dpif_class->ct_set_sweep_interval
+? dpif->dpif_class->ct_set_sweep_interval(dpif, *ms)
+: EOPNOTSUPP);
+} else {
+return (dpif->dpif_class->ct_get_sweep_interval
+? dpif

Re: [ovs-discuss] ovs-vswitchd crashes serveral times a day

2023-04-05 Thread Paolo Valerio via discuss
Lazuardi Nasution  writes:

> Hi Paolo,
>
> Would you mind to explain this to me? Currently, I'm still looking for
> compiling options of installed OVS-DPDK from Ubuntu repo. After that, I'll try
> your patch and compile it with same options.
>

The idea is to avoid including two keys with the same hash belonging to
the same connection, even in the NAT case.

Considering a flow like this:

tcp,in_port="ovs-p0" actions=ct(commit,nat(src)),output:"ovs-p1"

and a TCP SYN matching this rule, an entry in ct is created. Normally,
if no other packets refresh the entry or move the state, this entry
times out in 30s.
You can see that with:

ovs-appctl dpctl/dump-conntrack -s

tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=47838,dport=8080),reply=(src=10.1.1.2,dst=10.1.1.1,sport=8080,dport=47838),timeout=30,protoinfo=(state=SYN_SENT)

There's a timespan between the expiration and the actual clean-up of the
connection. If a new connection with the same 5-tuple (or even a
retransmission) is received in that timespan, the issue should occur.

In ovs 3.x the patch (intended for testing only) should be slightly
different as some things changed there.
This should be enough for a quick test:

-- >8 --
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 13c5ab628..7f6f1c2a8 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -481,8 +481,10 @@ conn_clean__(struct conntrack *ct, struct conn *conn)
 cmap_remove(>conns, >cm_node, hash);
 
 if (conn->nat_conn) {
-hash = conn_key_hash(>nat_conn->key, ct->hash_basis);
-cmap_remove(>conns, >nat_conn->cm_node, hash);
+uint32_t nc_hash = conn_key_hash(>nat_conn->key, ct->hash_basis);
+if (hash != nc_hash) {
+cmap_remove(>conns, >nat_conn->cm_node, nc_hash);
+}
 }
 
 rculist_remove(>node);
@@ -1090,7 +1092,9 @@ conn_not_found(struct conntrack *ct, struct dp_packet 
*pkt,
 nat_conn->alg = NULL;
 nat_conn->nat_conn = NULL;
 uint32_t nat_hash = conn_key_hash(_conn->key, ct->hash_basis);
-cmap_insert(>conns, _conn->cm_node, nat_hash);
+if (nat_hash != ctx->hash) {
+cmap_insert(>conns, _conn->cm_node, nat_hash);
+        }
 }
 
 nc->nat_conn = nat_conn;


> Best regards.
>
> On Wed, Apr 5, 2023, 2:51 AM Paolo Valerio  wrote:
>
> Hello,
>
> thanks for reporting this.
> I had a look at it, and, although this needs to be confirmed, I suspect
> it's related to nat (CT_CONN_TYPE_UN_NAT) and expired connections (but
> not yet reclaimed).
>
> The nat part does not necessarily perform any actual translation, but
> could still be triggered by ct(nat(src)...) which is the all-zero binding
> to avoid collisions, if any.
>
> Is there any chance to test the following patch (targeted for ovs 2.17)?
> This should help to confirm.
>
> -- >8 --
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 08da4ddf7..ba334afb0 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -94,9 +94,8 @@ static bool valid_new(struct dp_packet *pkt, struct
> conn_key *);
>  static struct conn *new_conn(struct conntrack *ct, struct dp_packet *pkt,
>                               struct conn_key *, long long now,
>                               uint32_t tp_id);
> -static void delete_conn_cmn(struct conn *);
> +static void delete_conn__(struct conn *);
>  static void delete_conn(struct conn *);
> -static void delete_conn_one(struct conn *conn);
>  static enum ct_update_res conn_update(struct conntrack *ct, struct conn
> *conn,
>                                        struct dp_packet *pkt,
>                                        struct conn_lookup_ctx *ctx,
> @@ -444,14 +443,13 @@ zone_limit_delete(struct conntrack *ct, uint16_t
> zone)
>  }
>
>  static void
> -conn_clean_cmn(struct conntrack *ct, struct conn *conn)
> +conn_clean_cmn(struct conntrack *ct, struct conn *conn, uint32_t hash)
>      OVS_REQUIRES(ct->ct_lock)
>  {
>      if (conn->alg) {
>          expectation_clean(ct, >key);
>      }
>
> -    uint32_t hash = conn_key_hash(>key, ct->hash_basis);
>      cmap_remove(>conns, >cm_node, hash);
>
>      struct zone_limit *zl = zone_limit_lookup(ct, conn->admit_zone);
> @@ -467,11 +465,14 @@ conn_clean(struct conntrack *ct, struct conn *conn)
>      OVS_REQUIRES(ct->ct_lock)
>  {
>      ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
> +    uint32_t conn_hash = conn_key_hash(>key, ct->hash_basis);
>
> -    conn_clean_cmn(ct, 

Re: [ovs-discuss] ovs-vswitchd crashes serveral times a day

2023-04-04 Thread Paolo Valerio via discuss
Hello,

thanks for reporting this.
I had a look at it, and, although this needs to be confirmed, I suspect
it's related to nat (CT_CONN_TYPE_UN_NAT) and expired connections (but
not yet reclaimed).

The nat part does not necessarily perform any actual translation, but
could still be triggered by ct(nat(src)...) which is the all-zero binding
to avoid collisions, if any.

Is there any chance to test the following patch (targeted for ovs 2.17)?
This should help to confirm.

-- >8 --
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 08da4ddf7..ba334afb0 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -94,9 +94,8 @@ static bool valid_new(struct dp_packet *pkt, struct conn_key 
*);
 static struct conn *new_conn(struct conntrack *ct, struct dp_packet *pkt,
  struct conn_key *, long long now,
  uint32_t tp_id);
-static void delete_conn_cmn(struct conn *);
+static void delete_conn__(struct conn *);
 static void delete_conn(struct conn *);
-static void delete_conn_one(struct conn *conn);
 static enum ct_update_res conn_update(struct conntrack *ct, struct conn *conn,
   struct dp_packet *pkt,
   struct conn_lookup_ctx *ctx,
@@ -444,14 +443,13 @@ zone_limit_delete(struct conntrack *ct, uint16_t zone)
 }

 static void
-conn_clean_cmn(struct conntrack *ct, struct conn *conn)
+conn_clean_cmn(struct conntrack *ct, struct conn *conn, uint32_t hash)
 OVS_REQUIRES(ct->ct_lock)
 {
 if (conn->alg) {
 expectation_clean(ct, >key);
 }

-uint32_t hash = conn_key_hash(>key, ct->hash_basis);
 cmap_remove(>conns, >cm_node, hash);

 struct zone_limit *zl = zone_limit_lookup(ct, conn->admit_zone);
@@ -467,11 +465,14 @@ conn_clean(struct conntrack *ct, struct conn *conn)
 OVS_REQUIRES(ct->ct_lock)
 {
 ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
+uint32_t conn_hash = conn_key_hash(>key, ct->hash_basis);

-conn_clean_cmn(ct, conn);
+conn_clean_cmn(ct, conn, conn_hash);
 if (conn->nat_conn) {
 uint32_t hash = conn_key_hash(>nat_conn->key, ct->hash_basis);
-cmap_remove(>conns, >nat_conn->cm_node, hash);
+if (conn_hash != hash) {
+cmap_remove(>conns, >nat_conn->cm_node, hash);
+}
 }
 ovs_list_remove(>exp_node);
 conn->cleaned = true;
@@ -479,19 +480,6 @@ conn_clean(struct conntrack *ct, struct conn *conn)
 atomic_count_dec(>n_conn);
 }

-static void
-conn_clean_one(struct conntrack *ct, struct conn *conn)
-OVS_REQUIRES(ct->ct_lock)
-{
-conn_clean_cmn(ct, conn);
-if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
-ovs_list_remove(>exp_node);
-conn->cleaned = true;
-atomic_count_dec(>n_conn);
-}
-ovsrcu_postpone(delete_conn_one, conn);
-}
-
 /* Destroys the connection tracker 'ct' and frees all the allocated memory.
  * The caller of this function must already have shut down packet input
  * and PMD threads (which would have been quiesced).  */
@@ -505,7 +493,10 @@ conntrack_destroy(struct conntrack *ct)

 ovs_mutex_lock(>ct_lock);
 CMAP_FOR_EACH (conn, cm_node, >conns) {
-conn_clean_one(ct, conn);
+if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {
+continue;
+}
+conn_clean(ct, conn);
 }
 cmap_destroy(>conns);

@@ -1052,7 +1043,10 @@ conn_not_found(struct conntrack *ct, struct dp_packet 
*pkt,
 nat_conn->alg = NULL;
 nat_conn->nat_conn = NULL;
 uint32_t nat_hash = conn_key_hash(_conn->key, ct->hash_basis);
-cmap_insert(>conns, _conn->cm_node, nat_hash);
+
+if (nat_hash != ctx->hash) {
+cmap_insert(>conns, _conn->cm_node, nat_hash);
+}
 }

 nc->nat_conn = nat_conn;
@@ -1080,7 +1074,7 @@ conn_not_found(struct conntrack *ct, struct dp_packet 
*pkt,
 nat_res_exhaustion:
 free(nat_conn);
 ovs_list_remove(>exp_node);
-delete_conn_cmn(nc);
+delete_conn__(nc);
 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
 VLOG_WARN_RL(, "Unable to NAT due to tuple space exhaustion - "
  "if DoS attack, use firewalling and/or zone partitioning.");
@@ -2549,7 +2543,7 @@ new_conn(struct conntrack *ct, struct dp_packet *pkt, 
struct conn_key *key,
 }

 static void
-delete_conn_cmn(struct conn *conn)
+delete_conn__(struct conn *conn)
 {
 free(conn->alg);
 free(conn);
@@ -2561,17 +2555,7 @@ delete_conn(struct conn *conn)
 ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
 ovs_mutex_destroy(>lock);
 free(conn->nat_conn);
-delete_conn_cmn(conn);
-}
-
-/* Only used by conn_clean_one(). */
-static void
-delete_conn_one(struct conn *conn)
-{
-if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
-ovs_mutex_destroy(>lock);
-}
-delete_conn_cmn(conn);
+delete_conn__(conn);
 }

 /* Convert a conntrack address 'a' into an IP 

Re: [ovs-dev] [PATCH v2] ovs-dpctl: Add new command dpctl/ct-sweep-next-run.

2023-03-31 Thread Paolo Valerio
Ilya Maximets  writes:

> On 2/27/23 13:30, Paolo Valerio wrote:
>> Since 3d9c1b855a5f ("conntrack: Replace timeout based expiration lists
>> with rculists.") the sweep interval changed as well as the constraints
>> related to the sweeper.
>> Being able to change the default reschedule time may be convenient in
>> some conditions, like debugging.
>> This patch introduces new commands allowing to get and set the sweep
>> next run in ms.
>> 
>> Signed-off-by: Paolo Valerio 
>> ---
>> v2:
>> - resolved conflict in NEWS
>> - added missing comment
>> - added missing '\' in dpctl.man
>> ---
>>  NEWS|4 +++
>>  lib/conntrack-private.h |1 +
>>  lib/conntrack.c |   18 +-
>>  lib/conntrack.h |2 ++
>>  lib/ct-dpif.c   |   14 +++
>>  lib/ct-dpif.h   |1 +
>>  lib/dpctl.c |   61 
>> +++
>>  lib/dpctl.man   |8 ++
>>  lib/dpif-netdev.c   |   17 +
>>  lib/dpif-netlink.c  |2 ++
>>  lib/dpif-provider.h |4 +++
>>  11 files changed, 131 insertions(+), 1 deletion(-)
>> 
>> diff --git a/NEWS b/NEWS
>> index 85b349621..4c4ef4b2b 100644
>> --- a/NEWS
>> +++ b/NEWS
>> @@ -10,6 +10,10 @@ Post-v3.1.0
>> in order to create OVSDB sockets with access mode of 0770.
>> - QoS:
>>   * Added new configuration option 'jitter' for a linux-netem QoS type.
>> +   - ovs-appctl:
>> + * New commands "dpctl/{ct-get-sweep-next-run,ct-set-sweep-next-run}" 
>> that
>> +   allow to get and set, for the userspace datapath, the next run 
>> interval
>> +   for the conntrack garbage collector.
>
> Hi, Paolo.  Thanks for the patch!
>
> It looks good to me in general, but the command name seems a bit
> strange.  It sounds like it is a one-shot configuration that only
> applies to the next run and will be dropped to default afterwards.
> But that doesn't seem to be the case in the code.  It's a permanent
> configuration for the sweep interval.  So, maybe we should call it
> ct-[gs]et-sweep-interval, or something like that ?
>
> What do you think?
>

Agreed, ct-[gs]et-sweep-interval seems a better name.

> Also, some small unit test, even a basic set+get check, would be
> nice to have.  We have some similar tests in tests/ofproto-dpif.at.
>

sure, I'll add it.

Thank you.

> Best regards, Ilya Maximets.

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] system-traffic.at: Add icmp error tests while dnatting address and port.

2023-02-27 Thread Paolo Valerio
Ilya Maximets  writes:

> On 2/27/23 12:08, Paolo Valerio wrote:
>> The two tests verify, for both icmp and icmpv6, that the correct port
>> translation happen in the inner packet in the case an error is
>> received in the reply direction.
>> 
>> Signed-off-by: Paolo Valerio 
>> ---
>>  tests/system-traffic.at |   72 
>> +++
>>  1 file changed, 72 insertions(+)
>> 
>> diff --git a/tests/system-traffic.at b/tests/system-traffic.at
>> index 3a15b88a2..02fd0ee1b 100644
>> --- a/tests/system-traffic.at
>> +++ b/tests/system-traffic.at
>> @@ -3561,6 +3561,42 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | 
>> FORMAT_CT(172.16.0.3)], [0], [dnl
>>  OVS_TRAFFIC_VSWITCHD_STOP
>>  AT_CLEANUP
>>  
>> +AT_SETUP([conntrack - ICMP related NAT with single port])
>> +AT_SKIP_IF([test $HAVE_NC = no])
>> +AT_SKIP_IF([test $HAVE_TCPDUMP = no])
>> +CHECK_CONNTRACK()
>> +CHECK_CONNTRACK_NAT()
>> +OVS_TRAFFIC_VSWITCHD_START()
>> +
>> +ADD_NAMESPACES(at_ns0, at_ns1)
>> +
>> +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24", "f0:00:00:01:01:01")
>> +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24", "f0:00:00:01:01:02")
>> +
>> +NS_CHECK_EXEC([at_ns0], [ip neigh add 10.1.1.240 lladdr f0:00:00:01:01:02 
>> dev p0])
>> +NS_CHECK_EXEC([at_ns1], [ip neigh add 10.1.1.1 lladdr f0:00:00:01:01:01 dev 
>> p1])
>> +
>> +AT_DATA([flows.txt], [dnl
>> +table=0,ip,ct_state=-trk,actions=ct(table=0,nat)
>> +table=0,in_port=ovs-p0,udp,ct_state=+trk+new,actions=ct(commit,nat(dst=10.1.1.2:8080)),ovs-p1
>> +table=0,in_port=ovs-p1,ct_state=+trk+rel+rpl,icmp,actions=ovs-p0
>> +])
>> +
>> +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
>> +
>> +rm p0.pcap
>> +NETNS_DAEMONIZE([at_ns0], [tcpdump -l -U -i p0 -w p0.pcap 2>tcpdump0_err], 
>> [tcpdump0.pid])
>> +NS_CHECK_EXEC([at_ns0], [bash -c "echo dest_unreach | nc $NC_EOF_OPT -p 
>> 1234 -u 10.1.1.240 80"])
>> +
>> +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.1," | 
>> sort], [0], [dnl
>> +udp,orig=(src=10.1.1.1,dst=10.1.1.240,sport=1234,dport=80),reply=(src=10.1.1.2,dst=10.1.1.1,sport=8080,dport=1234)
>> +])
>> +
>> +OVS_WAIT_UNTIL([ovs-pcap p0.pcap | grep -Eq 
>> "f0010101f0010102080045c00045[[[:xdigit:]]]{4}4001[[[:xdigit:]]]{4}0a0101f00a010101030314164529[[[:xdigit:]]]{4}40004011[[[:xdigit:]]]{4}0a0101010a0101f004d2005000156b24646573745f756e72656163680a"])
>> +
>> +OVS_TRAFFIC_VSWITCHD_STOP
>> +AT_CLEANUP
>> +
>>  AT_SETUP([conntrack - IPv4 fragmentation])
>>  CHECK_CONNTRACK()
>>  OVS_TRAFFIC_VSWITCHD_START()
>> @@ -6555,6 +6591,42 @@ 
>> udp,orig=(src=fc00::1,dst=fc00::2,sport=,dport=),reply=(src=fc
>>  OVS_TRAFFIC_VSWITCHD_STOP
>>  AT_CLEANUP
>>  
>> +AT_SETUP([conntrack - ICMPv6 related NAT with single port])
>
> Looks like this test is failing Intel CI.
> Could you, please, check?
>

thanks, I sent a v2. It should fix the problem.

> Best regards, Ilya Maximets.
>
>> +AT_SKIP_IF([test $HAVE_NC = no])
>> +AT_SKIP_IF([test $HAVE_TCPDUMP = no])
>> +CHECK_CONNTRACK()
>> +CHECK_CONNTRACK_NAT()
>> +OVS_TRAFFIC_VSWITCHD_START()
>> +
>> +ADD_NAMESPACES(at_ns0, at_ns1)
>> +
>> +ADD_VETH(p0, at_ns0, br0, "fc00::1/96", "f0:00:00:01:01:01", [], "nodad")
>> +ADD_VETH(p1, at_ns1, br0, "fc00::2/96", "f0:00:00:01:01:02", [], "nodad")
>> +
>> +NS_CHECK_EXEC([at_ns0], [ip -6 neigh add fc00::240 lladdr f0:00:00:01:01:02 
>> dev p0])
>> +NS_CHECK_EXEC([at_ns1], [ip -6 neigh add fc00::1 lladdr f0:00:00:01:01:01 
>> dev p1])
>> +
>> +AT_DATA([flows.txt], [dnl
>> +table=0,ipv6,ct_state=-trk,actions=ct(table=0,nat)
>> +table=0,in_port=ovs-p0,udp6,ct_state=+trk+new,actions=ct(commit,nat(dst=[[fc00::2]]:8080)),ovs-p1
>> +table=0,in_port=ovs-p1,ct_state=+trk+rel+rpl,icmp6,actions=ovs-p0
>> +])
>> +
>> +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
>> +
>> +rm p0.pcap
>> +NETNS_DAEMONIZE([at_ns0], [tcpdump -l -U -i p0 -w p0.pcap 2>tcpdump0_err], 
>> [tcpdump0.pid])
>> +NS_CHECK_EXEC([at_ns0], [bash -c "echo dest_unreach | nc -6 $NC_EOF_OPT -p 
>> 1234 -u fc00::240 80"])
>> +
>> +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=fc00::1," | 
>> sort], [0], [dnl
>> +udp,orig=(src=fc00::1,dst=fc00::240,sport=1234,dport=80),reply=(src=fc00::2,dst=fc00::1,sport=8080,dport=1234)
>> +])
>> +
>> +OVS_WAIT_UNTIL([ovs-pcap p0.pcap | grep -Eq 
>> "f0010101f001010286dd60[[[:xdigit:]]]{6}00453a40fc000240fc010104[[[:xdigit:]]]{4}60[[[:xdigit:]]]{6}00151140fc01fc00024004d20050001587d4646573745f756e72656163680a"])
>> +
>> +OVS_TRAFFIC_VSWITCHD_STOP
>> +AT_CLEANUP
>> +
>>  AT_SETUP([conntrack - IPv6 FTP with SNAT])
>>  AT_SKIP_IF([test $HAVE_FTP = no])
>>  CHECK_CONNTRACK()
>> 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v2] system-traffic.at: Add icmp error tests while dnatting address and port.

2023-02-27 Thread Paolo Valerio
The two tests verify, for both icmp and icmpv6, that the correct port
translation happens in the inner packet in the case an error is
received in the reply direction.

Signed-off-by: Paolo Valerio 
---
v2:
- added missing OVS_WAIT_UNTIL for tcpdump
- removed nc dependency and replaced with packet-out
---
 tests/system-traffic.at |   74 +++
 1 file changed, 74 insertions(+)

diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index 3a15b88a2..380372430 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -3561,6 +3561,43 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | 
FORMAT_CT(172.16.0.3)], [0], [dnl
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
+AT_SETUP([conntrack - ICMP related NAT with single port])
+AT_SKIP_IF([test $HAVE_TCPDUMP = no])
+CHECK_CONNTRACK()
+CHECK_CONNTRACK_NAT()
+OVS_TRAFFIC_VSWITCHD_START()
+
+ADD_NAMESPACES(at_ns0, at_ns1)
+
+ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24", "f0:00:00:01:01:01")
+ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24", "f0:00:00:01:01:02")
+
+AT_DATA([flows.txt], [dnl
+table=0,ip,ct_state=-trk,actions=ct(table=0,nat)
+table=0,in_port=ovs-p0,ct_state=+trk+new,udp,actions=ct(commit,nat(dst=10.1.1.2:8080)),ovs-p1
+table=0,in_port=ovs-p1,ct_state=+trk+rel+rpl,icmp,actions=ovs-p0
+])
+
+AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
+
+rm p0.pcap
+OVS_DAEMONIZE([tcpdump -l -U -i ovs-p0 -w p0.pcap 2> tcpdump0_err], 
[tcpdump0.pid])
+OVS_WAIT_UNTIL([grep "listening" tcpdump0_err])
+
+dnl Send UDP packet from 10.1.1.1:1234 to 10.1.1.240:80
+AT_CHECK([ovs-ofctl packet-out br0 
"in_port=ovs-p0,packet=f0010102f00101010800452944c140004011df100a0101010a0101f004d2005000156b24646573745f756e72656163680a,actions=resubmit(,0)"])
+dnl Send "destination unreachable" response
+AT_CHECK([ovs-ofctl packet-out br0 
"in_port=ovs-p1,packet=f0010101f0010102080045c000456a374001f9bc0a0101020a01010103031328452944c140004011dffe0a0101010a01010204d21f9000154cd2646573745f756e72656163680a,actions=resubmit(,0)"])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.1," | 
sort], [0], [dnl
+udp,orig=(src=10.1.1.1,dst=10.1.1.240,sport=1234,dport=80),reply=(src=10.1.1.2,dst=10.1.1.1,sport=8080,dport=1234)
+])
+
+OVS_WAIT_UNTIL([ovs-pcap p0.pcap | grep -q 
"f0010101f0010102080045c000456a374001f8ce0a0101f00a01010103031416452944c140004011df100a0101010a0101f004d2005000156b24646573745f756e72656163680a"])
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP
+
 AT_SETUP([conntrack - IPv4 fragmentation])
 CHECK_CONNTRACK()
 OVS_TRAFFIC_VSWITCHD_START()
@@ -6555,6 +6592,43 @@ 
udp,orig=(src=fc00::1,dst=fc00::2,sport=,dport=),reply=(src=fc
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
+AT_SETUP([conntrack - ICMPv6 related NAT with single port])
+AT_SKIP_IF([test $HAVE_TCPDUMP = no])
+CHECK_CONNTRACK()
+CHECK_CONNTRACK_NAT()
+OVS_TRAFFIC_VSWITCHD_START()
+
+ADD_NAMESPACES(at_ns0, at_ns1)
+
+ADD_VETH(p0, at_ns0, br0, "fc00::1/96", "f0:00:00:01:01:01", [], "nodad")
+ADD_VETH(p1, at_ns1, br0, "fc00::2/96", "f0:00:00:01:01:02", [], "nodad")
+
+AT_DATA([flows.txt], [dnl
+table=0,ipv6,ct_state=-trk,actions=ct(table=0,nat)
+table=0,in_port=ovs-p0,ct_state=+trk+new,udp6,actions=ct(commit,nat(dst=[[fc00::2]]:8080)),ovs-p1
+table=0,in_port=ovs-p1,ct_state=+trk+rel+rpl,icmp6,actions=ovs-p0
+])
+
+AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
+
+rm p0.pcap
+OVS_DAEMONIZE([tcpdump -l -U -i ovs-p0 -w p0.pcap 2> tcpdump0_err], 
[tcpdump0.pid])
+OVS_WAIT_UNTIL([grep "listening" tcpdump0_err])
+
+dnl Send UDP packet from [[fc00::1]]:1234 to [[fc00::240]]:80
+AT_CHECK([ovs-ofctl packet-out br0 
"in_port=ovs-p0,packet=f0010102f001010186dd60066ced00151140fc01fc00024004d20050001587d4646573745f756e72656163680a,actions=resubmit(,0)"])
+dnl Send "destination unreachable" response
+AT_CHECK([ovs-ofctl packet-out br0 
"in_port=ovs-p1,packet=f0010101f001010286dd600733ed00453a40fc02fc010104285560066ced00151140fc01fc0204d21f9000156ad2646573745f756e72656163680a,actions=resubmit(,0)"])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=fc00::1," | sort], 
[0], [dnl
+udp,orig=(src=fc00::1,dst=fc00::240,sport=1234,dport=80),reply=(src=fc00::2,dst=fc00::1,sport=8080,dport=1234)
+])
+
+OVS_WAIT_UNTIL([ovs-pcap p0.pcap | grep -q 
"f0010101f001010286dd600733ed00453a40fc000240fc010104261760066ced00151140fc01fc00024004d20050001587d4646573745f756e72656163680a"])
+
+OVS_

[ovs-dev] [PATCH v2] ovs-dpctl: Add new command dpctl/ct-sweep-next-run.

2023-02-27 Thread Paolo Valerio
Since 3d9c1b855a5f ("conntrack: Replace timeout based expiration lists
with rculists.") the sweep interval changed as well as the constraints
related to the sweeper.
Being able to change the default reschedule time may be convenient in
some conditions, like debugging.
This patch introduces new commands that allow getting and setting the
sweep next run in ms.

Signed-off-by: Paolo Valerio 
---
v2:
- resolved conflict in NEWS
- added missing comment
- added missing '\' in dpctl.man
---
 NEWS|4 +++
 lib/conntrack-private.h |1 +
 lib/conntrack.c |   18 +-
 lib/conntrack.h |2 ++
 lib/ct-dpif.c   |   14 +++
 lib/ct-dpif.h   |1 +
 lib/dpctl.c |   61 +++
 lib/dpctl.man   |8 ++
 lib/dpif-netdev.c   |   17 +
 lib/dpif-netlink.c  |2 ++
 lib/dpif-provider.h |4 +++
 11 files changed, 131 insertions(+), 1 deletion(-)

diff --git a/NEWS b/NEWS
index 85b349621..4c4ef4b2b 100644
--- a/NEWS
+++ b/NEWS
@@ -10,6 +10,10 @@ Post-v3.1.0
in order to create OVSDB sockets with access mode of 0770.
- QoS:
  * Added new configuration option 'jitter' for a linux-netem QoS type.
+   - ovs-appctl:
+ * New commands "dpctl/{ct-get-sweep-next-run,ct-set-sweep-next-run}" that
+   allow to get and set, for the userspace datapath, the next run interval
+   for the conntrack garbage collector.
 
 
 v3.1.0 - 16 Feb 2023
diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
index fae8b3a9b..bb326868e 100644
--- a/lib/conntrack-private.h
+++ b/lib/conntrack-private.h
@@ -224,6 +224,7 @@ struct conntrack {
 struct ipf *ipf; /* Fragmentation handling context. */
 uint32_t zone_limit_seq; /* Used to disambiguate zone limit counts. */
 atomic_bool tcp_seq_chk; /* Check TCP sequence numbers. */
+atomic_uint32_t sweep_ms; /* Next sweep interval. */
 };
 
 /* Lock acquisition order:
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 5029b2cda..9356c1282 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -320,6 +320,7 @@ conntrack_init(void)
 atomic_count_init(>n_conn, 0);
 atomic_init(>n_conn_limit, DEFAULT_N_CONN_LIMIT);
 atomic_init(>tcp_seq_chk, true);
+atomic_init(>sweep_ms, 2);
 latch_init(>clean_thread_exit);
 ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
 ct->ipf = ipf_init();
@@ -1480,6 +1481,21 @@ set_label(struct dp_packet *pkt, struct conn *conn,
 }
 
 
+int
+conntrack_set_sweep_next_run(struct conntrack *ct, uint32_t ms)
+{
+atomic_store_relaxed(>sweep_ms, ms);
+return 0;
+}
+
+uint32_t
+conntrack_get_sweep_next_run(struct conntrack *ct)
+{
+uint32_t ms;
+atomic_read_relaxed(>sweep_ms, );
+return ms;
+}
+
 static size_t
 ct_sweep(struct conntrack *ct, struct rculist *list, long long now)
 OVS_NO_THREAD_SAFETY_ANALYSIS
@@ -1504,7 +1520,7 @@ ct_sweep(struct conntrack *ct, struct rculist *list, long 
long now)
 static long long
 conntrack_clean(struct conntrack *ct, long long now)
 {
-long long next_wakeup = now + 20 * 1000;
+long long next_wakeup = now + conntrack_get_sweep_next_run(ct);
 unsigned int n_conn_limit, i;
 size_t clean_end, count = 0;
 
diff --git a/lib/conntrack.h b/lib/conntrack.h
index b064abc9f..2306cf375 100644
--- a/lib/conntrack.h
+++ b/lib/conntrack.h
@@ -139,6 +139,8 @@ int conntrack_set_maxconns(struct conntrack *ct, uint32_t 
maxconns);
 int conntrack_get_maxconns(struct conntrack *ct, uint32_t *maxconns);
 int conntrack_get_nconns(struct conntrack *ct, uint32_t *nconns);
 int conntrack_set_tcp_seq_chk(struct conntrack *ct, bool enabled);
+int conntrack_set_sweep_next_run(struct conntrack *ct, uint32_t ms);
+uint32_t conntrack_get_sweep_next_run(struct conntrack *ct);
 bool conntrack_get_tcp_seq_chk(struct conntrack *ct);
 struct ipf *conntrack_ipf_ctx(struct conntrack *ct);
 struct conntrack_zone_limit zone_limit_get(struct conntrack *ct,
diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
index d3b2783ce..0a08eb11c 100644
--- a/lib/ct-dpif.c
+++ b/lib/ct-dpif.c
@@ -368,6 +368,20 @@ ct_dpif_del_limits(struct dpif *dpif, const struct 
ovs_list *zone_limits)
 : EOPNOTSUPP);
 }
 
+int
+ct_dpif_sweep(struct dpif *dpif, uint32_t *ms)
+{
+if (*ms) {
+return (dpif->dpif_class->ct_set_sweep_next_run
+? dpif->dpif_class->ct_set_sweep_next_run(dpif, *ms)
+: EOPNOTSUPP);
+} else {
+return (dpif->dpif_class->ct_get_sweep_next_run
+? dpif->dpif_class->ct_get_sweep_next_run(dpif, ms)
+: EOPNOTSUPP);
+}
+}
+
 int
 ct_dpif_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
 {
diff --git a/lib/ct-dpif.h b/lib/ct-dpif.h
index 5edbbfd3b..1e265604f 100644
--- a/lib/ct-dpif.h
+++ b/lib/ct-dpif.h
@@ -298,6 +298,7 

Re: [ovs-dev] [PATCH 1/2] cli: add option to display the version from Cargo.toml.

2023-02-27 Thread Paolo Valerio
Sorry for the noise, but this local test got sent unintentionally.

Please, ignore it.

Paolo Valerio  writes:

> Signed-off-by: Paolo Valerio 
> ---
>  src/cli/cli.rs |1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/src/cli/cli.rs b/src/cli/cli.rs
> index a5b08e6..f8593e1 100644
> --- a/src/cli/cli.rs
> +++ b/src/cli/cli.rs
> @@ -73,6 +73,7 @@ impl Debug for dyn SubCommand {
>  ///
>  /// packet-tracer is a tool for capturing networking-related events from the 
> system using ebpf and analyzing them.
>  #[derive(Args, Default, Debug)]
> +#[command(version)]
>  pub(crate) struct MainConfig {}
>  
>  /// ThinCli handles the first (a.k.a "thin") round of Command Line Interface 
> parsing.

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 2/2] WIP

2023-02-27 Thread Paolo Valerio
Signed-off-by: Paolo Valerio 

Signed-off-by: Paolo Valerio 
---
 src/main.rs |1 +
 1 file changed, 1 insertion(+)

diff --git a/src/main.rs b/src/main.rs
index c922fae..c28a07f 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,6 +2,7 @@ use anyhow::Result;
 use log::error;
 use simplelog::{Config, LevelFilter, SimpleLogger};
 
+
 mod cli;
 mod collect;
 mod core;

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 1/2] cli: add option to display the version from Cargo.toml.

2023-02-27 Thread Paolo Valerio
Signed-off-by: Paolo Valerio 
---
 src/cli/cli.rs |1 +
 1 file changed, 1 insertion(+)

diff --git a/src/cli/cli.rs b/src/cli/cli.rs
index a5b08e6..f8593e1 100644
--- a/src/cli/cli.rs
+++ b/src/cli/cli.rs
@@ -73,6 +73,7 @@ impl Debug for dyn SubCommand {
 ///
 /// packet-tracer is a tool for capturing networking-related events from the 
system using ebpf and analyzing them.
 #[derive(Args, Default, Debug)]
+#[command(version)]
 pub(crate) struct MainConfig {}
 
 /// ThinCli handles the first (a.k.a "thin") round of Command Line Interface 
parsing.

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] ovs-dpctl: Add new command dpctl/ct-sweep-next-run.

2023-02-27 Thread Paolo Valerio
Since 3d9c1b855a5f ("conntrack: Replace timeout based expiration lists
with rculists.") the sweep interval changed as well as the constraints
related to the sweeper.
Being able to change the default reschedule time may be convenient in
some conditions, like debugging.
This patch introduces new commands that allow getting and setting the
sweep next run in ms.

Signed-off-by: Paolo Valerio 
---
 NEWS|4 +++
 lib/conntrack-private.h |1 +
 lib/conntrack.c |   18 +-
 lib/conntrack.h |2 ++
 lib/ct-dpif.c   |   14 +++
 lib/ct-dpif.h   |1 +
 lib/dpctl.c |   61 +++
 lib/dpctl.man   |8 ++
 lib/dpif-netdev.c   |   17 +
 lib/dpif-netlink.c  |2 ++
 lib/dpif-provider.h |4 +++
 11 files changed, 131 insertions(+), 1 deletion(-)

diff --git a/NEWS b/NEWS
index 391badd7c..c80f44429 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,10 @@ Post-v3.1.0
  * OVS now collects per-interface upcall statistics that can be obtained
via 'ovs-appctl dpctl/show -s' or the interface's statistics column
in OVSDB.  Available with upstream kernel 6.2+.
+   - ovs-appctl:
+ * New commands "dpctl/{ct-get-sweep-next-run,ct-set-sweep-next-run}" that
+   allow to get and set, for the userspace datapath, the next run interval
+   for the conntrack garbage collector.
 
 
 v3.1.0 - 16 Feb 2023
diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
index fae8b3a9b..3438c3554 100644
--- a/lib/conntrack-private.h
+++ b/lib/conntrack-private.h
@@ -224,6 +224,7 @@ struct conntrack {
 struct ipf *ipf; /* Fragmentation handling context. */
 uint32_t zone_limit_seq; /* Used to disambiguate zone limit counts. */
 atomic_bool tcp_seq_chk; /* Check TCP sequence numbers. */
+atomic_uint32_t sweep_ms;
 };
 
 /* Lock acquisition order:
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 524670e45..e9a37f2c1 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -320,6 +320,7 @@ conntrack_init(void)
 atomic_count_init(&ct->n_conn, 0);
 atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
 atomic_init(&ct->tcp_seq_chk, true);
+atomic_init(&ct->sweep_ms, 20000);
 latch_init(&ct->clean_thread_exit);
 ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
 ct->ipf = ipf_init();
@@ -1480,6 +1481,21 @@ set_label(struct dp_packet *pkt, struct conn *conn,
 }
 
 
+int
+conntrack_set_sweep_next_run(struct conntrack *ct, uint32_t ms)
+{
+atomic_store_relaxed(&ct->sweep_ms, ms);
+return 0;
+}
+
+uint32_t
+conntrack_get_sweep_next_run(struct conntrack *ct)
+{
+uint32_t ms;
+atomic_read_relaxed(&ct->sweep_ms, &ms);
+return ms;
+}
+
 static size_t
 ct_sweep(struct conntrack *ct, struct rculist *list, long long now)
 OVS_NO_THREAD_SAFETY_ANALYSIS
@@ -1504,7 +1520,7 @@ ct_sweep(struct conntrack *ct, struct rculist *list, long 
long now)
 static long long
 conntrack_clean(struct conntrack *ct, long long now)
 {
-long long next_wakeup = now + 20 * 1000;
+long long next_wakeup = now + conntrack_get_sweep_next_run(ct);
 unsigned int n_conn_limit, i;
 size_t clean_end, count = 0;
 
diff --git a/lib/conntrack.h b/lib/conntrack.h
index b064abc9f..2306cf375 100644
--- a/lib/conntrack.h
+++ b/lib/conntrack.h
@@ -139,6 +139,8 @@ int conntrack_set_maxconns(struct conntrack *ct, uint32_t 
maxconns);
 int conntrack_get_maxconns(struct conntrack *ct, uint32_t *maxconns);
 int conntrack_get_nconns(struct conntrack *ct, uint32_t *nconns);
 int conntrack_set_tcp_seq_chk(struct conntrack *ct, bool enabled);
+int conntrack_set_sweep_next_run(struct conntrack *ct, uint32_t ms);
+uint32_t conntrack_get_sweep_next_run(struct conntrack *ct);
 bool conntrack_get_tcp_seq_chk(struct conntrack *ct);
 struct ipf *conntrack_ipf_ctx(struct conntrack *ct);
 struct conntrack_zone_limit zone_limit_get(struct conntrack *ct,
diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
index d3b2783ce..0a08eb11c 100644
--- a/lib/ct-dpif.c
+++ b/lib/ct-dpif.c
@@ -368,6 +368,20 @@ ct_dpif_del_limits(struct dpif *dpif, const struct 
ovs_list *zone_limits)
 : EOPNOTSUPP);
 }
 
+int
+ct_dpif_sweep(struct dpif *dpif, uint32_t *ms)
+{
+if (*ms) {
+return (dpif->dpif_class->ct_set_sweep_next_run
+? dpif->dpif_class->ct_set_sweep_next_run(dpif, *ms)
+: EOPNOTSUPP);
+} else {
+return (dpif->dpif_class->ct_get_sweep_next_run
+? dpif->dpif_class->ct_get_sweep_next_run(dpif, ms)
+: EOPNOTSUPP);
+}
+}
+
 int
 ct_dpif_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
 {
diff --git a/lib/ct-dpif.h b/lib/ct-dpif.h
index 5edbbfd3b..1e265604f 100644
--- a/lib/ct-dpif.h
+++ b/lib/ct-dpif.h
@@ -298,6 +298,7 @@ int ct_dpif_set_limits(struct dpif *dpif, const uint32_t 
*

[ovs-dev] [PATCH] system-traffic.at: Add icmp error tests while dnatting address and port.

2023-02-27 Thread Paolo Valerio
The two tests verify, for both icmp and icmpv6, that the correct port
translation happens in the inner packet when an error is
received in the reply direction.

Signed-off-by: Paolo Valerio 
---
 tests/system-traffic.at |   72 +++
 1 file changed, 72 insertions(+)

diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index 3a15b88a2..02fd0ee1b 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -3561,6 +3561,42 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | 
FORMAT_CT(172.16.0.3)], [0], [dnl
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
+AT_SETUP([conntrack - ICMP related NAT with single port])
+AT_SKIP_IF([test $HAVE_NC = no])
+AT_SKIP_IF([test $HAVE_TCPDUMP = no])
+CHECK_CONNTRACK()
+CHECK_CONNTRACK_NAT()
+OVS_TRAFFIC_VSWITCHD_START()
+
+ADD_NAMESPACES(at_ns0, at_ns1)
+
+ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24", "f0:00:00:01:01:01")
+ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24", "f0:00:00:01:01:02")
+
+NS_CHECK_EXEC([at_ns0], [ip neigh add 10.1.1.240 lladdr f0:00:00:01:01:02 dev 
p0])
+NS_CHECK_EXEC([at_ns1], [ip neigh add 10.1.1.1 lladdr f0:00:00:01:01:01 dev 
p1])
+
+AT_DATA([flows.txt], [dnl
+table=0,ip,ct_state=-trk,actions=ct(table=0,nat)
+table=0,in_port=ovs-p0,udp,ct_state=+trk+new,actions=ct(commit,nat(dst=10.1.1.2:8080)),ovs-p1
+table=0,in_port=ovs-p1,ct_state=+trk+rel+rpl,icmp,actions=ovs-p0
+])
+
+AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
+
+rm p0.pcap
+NETNS_DAEMONIZE([at_ns0], [tcpdump -l -U -i p0 -w p0.pcap 2>tcpdump0_err], 
[tcpdump0.pid])
+NS_CHECK_EXEC([at_ns0], [bash -c "echo dest_unreach | nc $NC_EOF_OPT -p 1234 
-u 10.1.1.240 80"])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.1," | 
sort], [0], [dnl
+udp,orig=(src=10.1.1.1,dst=10.1.1.240,sport=1234,dport=80),reply=(src=10.1.1.2,dst=10.1.1.1,sport=8080,dport=1234)
+])
+
+OVS_WAIT_UNTIL([ovs-pcap p0.pcap | grep -Eq 
"f0010101f0010102080045c00045[[[:xdigit:]]]{4}4001[[[:xdigit:]]]{4}0a0101f00a010101030314164529[[[:xdigit:]]]{4}40004011[[[:xdigit:]]]{4}0a0101010a0101f004d2005000156b24646573745f756e72656163680a"])
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP
+
 AT_SETUP([conntrack - IPv4 fragmentation])
 CHECK_CONNTRACK()
 OVS_TRAFFIC_VSWITCHD_START()
@@ -6555,6 +6591,42 @@ 
udp,orig=(src=fc00::1,dst=fc00::2,sport=,dport=),reply=(src=fc
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
+AT_SETUP([conntrack - ICMPv6 related NAT with single port])
+AT_SKIP_IF([test $HAVE_NC = no])
+AT_SKIP_IF([test $HAVE_TCPDUMP = no])
+CHECK_CONNTRACK()
+CHECK_CONNTRACK_NAT()
+OVS_TRAFFIC_VSWITCHD_START()
+
+ADD_NAMESPACES(at_ns0, at_ns1)
+
+ADD_VETH(p0, at_ns0, br0, "fc00::1/96", "f0:00:00:01:01:01", [], "nodad")
+ADD_VETH(p1, at_ns1, br0, "fc00::2/96", "f0:00:00:01:01:02", [], "nodad")
+
+NS_CHECK_EXEC([at_ns0], [ip -6 neigh add fc00::240 lladdr f0:00:00:01:01:02 
dev p0])
+NS_CHECK_EXEC([at_ns1], [ip -6 neigh add fc00::1 lladdr f0:00:00:01:01:01 dev 
p1])
+
+AT_DATA([flows.txt], [dnl
+table=0,ipv6,ct_state=-trk,actions=ct(table=0,nat)
+table=0,in_port=ovs-p0,udp6,ct_state=+trk+new,actions=ct(commit,nat(dst=[[fc00::2]]:8080)),ovs-p1
+table=0,in_port=ovs-p1,ct_state=+trk+rel+rpl,icmp6,actions=ovs-p0
+])
+
+AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
+
+rm p0.pcap
+NETNS_DAEMONIZE([at_ns0], [tcpdump -l -U -i p0 -w p0.pcap 2>tcpdump0_err], 
[tcpdump0.pid])
+NS_CHECK_EXEC([at_ns0], [bash -c "echo dest_unreach | nc -6 $NC_EOF_OPT -p 
1234 -u fc00::240 80"])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=fc00::1," | sort], 
[0], [dnl
+udp,orig=(src=fc00::1,dst=fc00::240,sport=1234,dport=80),reply=(src=fc00::2,dst=fc00::1,sport=8080,dport=1234)
+])
+
+OVS_WAIT_UNTIL([ovs-pcap p0.pcap | grep -Eq 
"f0010101f001010286dd60[[[:xdigit:]]]{6}00453a40fc000240fc010104[[[:xdigit:]]]{4}60[[[:xdigit:]]]{6}00151140fc01fc00024004d20050001587d4646573745f756e72656163680a"])
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP
+
 AT_SETUP([conntrack - IPv6 FTP with SNAT])
 AT_SKIP_IF([test $HAVE_FTP = no])
 CHECK_CONNTRACK()

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v2] conntrack: fix conntrack_clean may access the same exp_list each time be called

2023-02-21 Thread Paolo Valerio
Liang Mancang  writes:

> when a exp_list contains more than the clean_end's number of nodes,
> and these nodes will not expire immediately. Then, every times we
> call conntrack_clean, it use the same next_sweep to get exp_list.
>
> Actually, we should add i every times after we call ct_sweep.
>
> v2: delete unnecessary line.
>

It's better to place the changelog after the "---" at the bottom of the
commit message. I don't know if it's worth a new version only for this. If
no other respin is needed, maybe it could be removed while applying.

Other than that, the change looks good to me.
Thanks for fixing this:

Acked-by: Paolo Valerio 


> Fixes: 3d9c1b855a5f ("conntrack: Replace timeout based expiration lists with 
> rculists.")
> Signed-off-by: Liang Mancang 
> ---
>  lib/conntrack.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 524670e45..8cf7779c6 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -1512,12 +1512,12 @@ conntrack_clean(struct conntrack *ct, long long now)
>  clean_end = n_conn_limit / 64;
>  
>  for (i = ct->next_sweep; i < N_EXP_LISTS; i++) {
> -count += ct_sweep(ct, &ct->exp_lists[i], now);
> -
>  if (count > clean_end) {
>  next_wakeup = 0;
>  break;
>  }
> +
> +count += ct_sweep(ct, &ct->exp_lists[i], now);
>  }
>  
>  ct->next_sweep = (i < N_EXP_LISTS) ? i : 0;
> -- 
> 2.30.0.windows.2
>
> ___
> dev mailing list
> d...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] conntrack:fix conntrack_clean may access the same exp_list each time be called

2023-02-21 Thread Paolo Valerio
Liang Mancang  writes:

> On Mon, Feb 20, 2023 at 07:38:39PM +0100, Paolo Valerio wrote:
>> Paolo Valerio  writes:
>> 
>> > Hello Liang,
>> >
>> > Liang Mancang  writes:
>> >
>> >> when a exp_list contains more than the clean_end's number of nodes,
>> >> and these nodes will not expire immediately. Then, every times we
>> >> call conntrack_clean, it use the same next_sweep to get exp_list.
>> >>
>> >
>> > Yes, in general, if the previous count exceeds the clean_end, it should
>> > not make the sweeper restart from a list just swept, but it should not
>> > happen that a single list contains more than n_conn_limit / 64.
>> >
>> > Did you observe a single exp_list containing more than n_conn_limit / 64
>> > entries?
> We only select an exp_list for a conntrack entry when creating it, but never
> move the entry when its expiration is updated or when it is deleted. So the
> number of entries in each exp_list becomes unbalanced after running for a
> long time.

of course, if not balanced that could happen.

>> >
>> >> Actually, we should add i every times after we call ct_sweep.
>> >>
>> >> Signed-off-by: Liang Mancang 
>> >> ---
>> >>  lib/conntrack.c | 5 +++--
>> >>  1 file changed, 3 insertions(+), 2 deletions(-)
>> >>
>> >> diff --git a/lib/conntrack.c b/lib/conntrack.c
>> >> index 524670e45..5029b2cda 100644
>> >> --- a/lib/conntrack.c
>> >> +++ b/lib/conntrack.c
>> >> @@ -1512,12 +1512,13 @@ conntrack_clean(struct conntrack *ct, long long 
>> >> now)
>> >>  clean_end = n_conn_limit / 64;
>> >>  
>> >>  for (i = ct->next_sweep; i < N_EXP_LISTS; i++) {
>> >> -count += ct_sweep(ct, &ct->exp_lists[i], now);
>> >> -
>> >>  if (count > clean_end) {
>> >>  next_wakeup = 0;
>> >> +
>> >
>> > This new line is not needed, and a Fixes tag could be added:
>> >
>> > Fixes: 3d9c1b855a5f ("conntrack: Replace timeout based expiration lists 
>> > with rculists.")
>> >
>> > The patch LGTM, 
>> >
>> 
>> Sorry, the last line slipped out. Please consider my question and the
>> other comments. I will explicitly tag the patch once we're done.
>> 
> I sent v2 for this.

Thanks.

>> >>  break;
>> >>  }
>> >> +
>> >> +count += ct_sweep(ct, &ct->exp_lists[i], now);
>> >>  }
>> >>  
>> >>  ct->next_sweep = (i < N_EXP_LISTS) ? i : 0;
>> >> -- 
>> >> 2.30.0.windows.2
>> >>
>> >> ___
>> >> dev mailing list
>> >> d...@openvswitch.org

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] conntrack:fix conntrack_clean may access the same exp_list each time be called

2023-02-20 Thread Paolo Valerio
Paolo Valerio  writes:

> Hello Liang,
>
> Liang Mancang  writes:
>
>> when a exp_list contains more than the clean_end's number of nodes,
>> and these nodes will not expire immediately. Then, every times we
>> call conntrack_clean, it use the same next_sweep to get exp_list.
>>
>
> Yes, in general, if the previous count exceeds the clean_end, it should
> not make the sweeper restart from a list just swept, but it should not
> happen that a single list contains more than n_conn_limit / 64.
>
> Did you observe a single exp_list containing more than n_conn_limit / 64
> entries?
>
>> Actually, we should add i every times after we call ct_sweep.
>>
>> Signed-off-by: Liang Mancang 
>> ---
>>  lib/conntrack.c | 5 +++--
>>  1 file changed, 3 insertions(+), 2 deletions(-)
>>
>> diff --git a/lib/conntrack.c b/lib/conntrack.c
>> index 524670e45..5029b2cda 100644
>> --- a/lib/conntrack.c
>> +++ b/lib/conntrack.c
>> @@ -1512,12 +1512,13 @@ conntrack_clean(struct conntrack *ct, long long now)
>>  clean_end = n_conn_limit / 64;
>>  
>>  for (i = ct->next_sweep; i < N_EXP_LISTS; i++) {
>> -count += ct_sweep(ct, &ct->exp_lists[i], now);
>> -
>>  if (count > clean_end) {
>>  next_wakeup = 0;
>> +
>
> This new line is not needed, and a Fixes tag could be added:
>
> Fixes: 3d9c1b855a5f ("conntrack: Replace timeout based expiration lists with 
> rculists.")
>
> The patch LGTM, 
>

Sorry, the last line slipped out. Please consider my question and the
other comments. I will explicitly tag the patch once we're done.

>>  break;
>>  }
>> +
>> +count += ct_sweep(ct, &ct->exp_lists[i], now);
>>  }
>>  
>>  ct->next_sweep = (i < N_EXP_LISTS) ? i : 0;
>> -- 
>> 2.30.0.windows.2
>>
>> ___
>> dev mailing list
>> d...@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] conntrack:fix conntrack_clean may access the same exp_list each time be called

2023-02-20 Thread Paolo Valerio
Hello Liang,

Liang Mancang  writes:

> when a exp_list contains more than the clean_end's number of nodes,
> and these nodes will not expire immediately. Then, every times we
> call conntrack_clean, it use the same next_sweep to get exp_list.
>

Yes, in general, if the previous count exceeds the clean_end, it should
not make the sweeper restart from a list just swept, but it should not
happen that a single list contains more than n_conn_limit / 64.

Did you observe a single exp_list containing more than n_conn_limit / 64
entries?

> Actually, we should add i every times after we call ct_sweep.
>
> Signed-off-by: Liang Mancang 
> ---
>  lib/conntrack.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 524670e45..5029b2cda 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -1512,12 +1512,13 @@ conntrack_clean(struct conntrack *ct, long long now)
>  clean_end = n_conn_limit / 64;
>  
>  for (i = ct->next_sweep; i < N_EXP_LISTS; i++) {
> -count += ct_sweep(ct, &ct->exp_lists[i], now);
> -
>  if (count > clean_end) {
>  next_wakeup = 0;
> +

This new line is not needed, and a Fixes tag could be added:

Fixes: 3d9c1b855a5f ("conntrack: Replace timeout based expiration lists with 
rculists.")

The patch LGTM, 

>  break;
>  }
> +
> +count += ct_sweep(ct, &ct->exp_lists[i], now);
>  }
>  
>  ct->next_sweep = (i < N_EXP_LISTS) ? i : 0;
> -- 
> 2.30.0.windows.2
>
> ___
> dev mailing list
> d...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v6] conntrack: Properly unNAT inner header of related traffic

2023-02-09 Thread Paolo Valerio
Hi Ales,

I just have two small nits, but other than that the patch LGTM.

Acked-by: Paolo Valerio 

Ales Musil  writes:

> The inner header was not handled properly.
> Simplify the code which allows proper handling
> of the inner headers.
>
> Reported-at: https://bugzilla.redhat.com/2137754
> Signed-off-by: Ales Musil 
> ---
> v6: Rebase on top of current master.
> Address comments from Paolo:
> - Add test case for ICMP related in reply direction.
> - Fix a mistake when the inner header was using
> wrong nat_action.
> v5: Rebase on top of current master.
> Address comments from Dumitru:
> - Use explicit struct sizes for inner_l3 pointer.
> - Use copied conn_key for reverse operation instead
> of double reverse of the original one.
> - Update the test case to use separate zone instead
> of default one.
> v4: Rebase on top of current master.
> Use output of ovs-pcap in tests rather than tcpdump.
> v3: Rebase on top of current master.
> Update the BZ reference.
> Update the test case.
> ---
>  lib/conntrack.c | 254 ++--
>  tests/system-traffic.at | 107 +
>  2 files changed, 198 insertions(+), 163 deletions(-)
>
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 550b2be9b..3162924ca 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -764,109 +764,61 @@ handle_alg_ctl(struct conntrack *ct, const struct 
> conn_lookup_ctx *ctx,
>  }
>  
>  static void
> -pat_packet(struct dp_packet *pkt, const struct conn *conn)
> +pat_packet(struct dp_packet *pkt, const struct conn_key *key)
>  {
> -if (conn->nat_action & NAT_ACTION_SRC) {
> -if (conn->key.nw_proto == IPPROTO_TCP) {
> -struct tcp_header *th = dp_packet_l4(pkt);
> -packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
> -} else if (conn->key.nw_proto == IPPROTO_UDP) {
> -struct udp_header *uh = dp_packet_l4(pkt);
> -packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
> -}
> -} else if (conn->nat_action & NAT_ACTION_DST) {
> -if (conn->key.nw_proto == IPPROTO_TCP) {
> -packet_set_tcp_port(pkt, conn->rev_key.dst.port,
> -conn->rev_key.src.port);
> -} else if (conn->key.nw_proto == IPPROTO_UDP) {
> -packet_set_udp_port(pkt, conn->rev_key.dst.port,
> -conn->rev_key.src.port);
> -}
> +if (key->nw_proto == IPPROTO_TCP) {
> +packet_set_tcp_port(pkt, key->dst.port, key->src.port);
> +} else if (key->nw_proto == IPPROTO_UDP) {
> +packet_set_udp_port(pkt, key->dst.port, key->src.port);
>  }
>  }
>  
> -static void
> -nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related)
> +static uint16_t
> +nat_action_reverse(uint16_t nat_action)
>  {
> -if (conn->nat_action & NAT_ACTION_SRC) {
> -pkt->md.ct_state |= CS_SRC_NAT;
> -if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
> -struct ip_header *nh = dp_packet_l3(pkt);
> -packet_set_ipv4_addr(pkt, &nh->ip_src,
> - conn->rev_key.dst.addr.ipv4);
> -} else {
> -struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
> -packet_set_ipv6_addr(pkt, conn->key.nw_proto,
> - nh6->ip6_src.be32,
> - &conn->rev_key.dst.addr.ipv6, true);
> -}
> -if (!related) {
> -pat_packet(pkt, conn);
> -}
> -} else if (conn->nat_action & NAT_ACTION_DST) {
> -pkt->md.ct_state |= CS_DST_NAT;
> -if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
> -struct ip_header *nh = dp_packet_l3(pkt);
> -packet_set_ipv4_addr(pkt, &nh->ip_dst,
> - conn->rev_key.src.addr.ipv4);
> -} else {
> -struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
> -packet_set_ipv6_addr(pkt, conn->key.nw_proto,
> - nh6->ip6_dst.be32,
> - &conn->rev_key.src.addr.ipv6, true);
> -}
> -if (!related) {
> -pat_packet(pkt, conn);
> -}
> +if (nat_action & NAT_ACTION_SRC) {
> +VLOG_INFO("original SRC");

Not sure this is useful. I'd remove it including the one below.

> +nat_action ^= NAT_ACTION_SRC;
> +nat_action |= NAT_ACTION_DST;
> +} else if (nat_action & NAT_

Re: [ovs-dev] [PATCH v5] conntrack: Properly unNAT inner header of related traffic

2023-02-06 Thread Paolo Valerio
Ales Musil  writes:

> On Sun, Feb 5, 2023 at 7:17 PM Paolo Valerio  wrote:
>
> Ales Musil  writes:
>
> > The inner header was not handled properly.
> > Simplify the code which allows proper handling
> > of the inner headers.
> >
> > Reported-at: https://bugzilla.redhat.com/2137754
> > Signed-off-by: Ales Musil 
> > ---
> > v5: Rebase on top of current master.
> >     Address comments from Dumitru:
> >     - Use explicit struct sizes for inner_l3 pointer.
> >     - Use copied conn_key for reverse operation instead
> >     of double reverse of the original one.
> >     - Update the test case to use separate zone instead
> >     of default one.
> > v4: Rebase on top of current master.
> >     Use output of ovs-pcap in tests rather than tcpdump.
> > v3: Rebase on top of current master.
> >     Update the BZ reference.
> >     Update the test case.
> > ---
>
> Hello Ales,
>
>
> Hi Paolo,
>
> thank you for the review.
>  
>
>
> thanks for the patch.
> One noticeable thing is that the patch doesn't enforce the commit flag
> as it happens for the kernel datapath. This seems what you want
> considering the flows in the test.
>
>
> It wasn't doing it even before, this seems to be out of scope of this patch
> as this tries to fix the problem with inner header translation. However I 
> agree
> that userspace and kernel should behave the same way, if you don't mind it
> could
> be a follow up patch.
>  

in general, this doesn't happen for the kernel datapath as well for
the reply direction, see "conntrack - ICMP related with NAT". This was
the point I wanted to make asking if you happen to test it in the reply
dir without committing.

Keeping it out of this patch sounds good to me.

>
>
> E.g. with this diff on top of your patch:
>
> diff --git a/tests/system-traffic.at b/tests/system-traffic.at
> index 798343877..b309635b9 100644
> --- a/tests/system-traffic.at
> +++ b/tests/system-traffic.at
> @@ -7147,7 +7147,6 @@ dnl Send traffic from client to CT, do DNAT if the
> traffic is new otherwise send
>  AT_DATA([flows.txt], [dnl
>  table=0,ip,actions=ct(table=1,zone=42,nat)
>  table=1,in_port=ovs-client,ip,ct_state=+trk+new,actions=ct(commit,table=
> 2,zone=42,nat(dst(192.168.10.20))
> 
> -table=1,in_port=ovs-client,icmp,ct_state=+trk+rel,actions=ct(commit,table=
> 2,zone=42,nat)
>  table=1,ip,actions=resubmit(,2)
>  table=2,in_port=ovs-client,ip,ct_state=+trk+new,actions=output:ovs-server
>  table=2,in_port=ovs-client,icmp,ct_state=+trk+rel,actions=
> output:ovs-server
> @@ -7176,8 +7175,7 @@ AT_CHECK([ovs-appctl revalidator/purge], [0])
>  AT_CHECK([ovs-ofctl -O OpenFlow15 dump-flows br0 | ofctl_strip | sort ],
> [0], [dnl
>   n_packets=3, n_bytes=154, reset_counts ip 
> actions=ct(table=1,zone=42,nat)
>   table=1, n_packets=1, n_bytes=42, reset_counts ct_state=
> +new+trk,ip,in_port=1 actions=ct(commit,table=2,zone=42,nat(dst=
> 192.168.10.20))
> - table=1, n_packets=1, n_bytes=42, reset_counts ip actions=resubmit(,2)
> - table=1, n_packets=1, n_bytes=70, reset_counts ct_state=
> +rel+trk,icmp,in_port=1 actions=ct(commit,table=2,zone=42,nat)
> + table=1, n_packets=2, n_bytes=112, reset_counts ip actions=resubmit(,2)
>   table=2, n_packets=1, n_bytes=42, reset_counts ct_state=
> +new+trk,ip,in_port=1 actions=output:2
>   table=2, n_packets=1, n_bytes=42, reset_counts ct_state=
> +rpl+trk,ip,in_port=2 actions=output:1
>   table=2, n_packets=1, n_bytes=70, reset_counts ct_state=
> +rel+trk,icmp,in_port=1 actions=output:2
>
> the test passes for the userspace datapath, but fails for the kernel.
>
> I have a question, though, did you happen to test for both datapaths
> what happens if a middlebox sends the icmp error from the reply
> direction instead without your patch?
> I assume things worked (without commit for both datapaths) in that case.
>
>
> Another thing that IMO could be nice to add is a test case for the same
> scenario, but in the reply direction. At least, both directions will be
> covered and verified.
>
>
> I've added a test case for the reply direction. It actually caught small
> mistake I made
> which should be both in v6.
>  
>
>
> Paolo
>
> >  lib/conntrack.c         | 252 ++--
> >  tests/system-traffic.at |  66 +++
> >  2 files changed, 155 insertions(+), 163 deletions(

Re: [ovs-dev] [PATCH v5] conntrack: Properly unNAT inner header of related traffic

2023-02-05 Thread Paolo Valerio
Ales Musil  writes:

> The inner header was not handled properly.
> Simplify the code which allows proper handling
> of the inner headers.
>
> Reported-at: https://bugzilla.redhat.com/2137754
> Signed-off-by: Ales Musil 
> ---
> v5: Rebase on top of current master.
> Address comments from Dumitru:
> - Use explicit struct sizes for inner_l3 pointer.
> - Use copied conn_key for reverse operation instead
> of double reverse of the original one.
> - Update the test case to use separate zone instead
> of default one.
> v4: Rebase on top of current master.
> Use output of ovs-pcap in tests rather than tcpdump.
> v3: Rebase on top of current master.
> Update the BZ reference.
> Update the test case.
> ---

Hello Ales,

thanks for the patch.
One noticeable thing is that the patch doesn't enforce the commit flag
as happens for the kernel datapath. This seems to be what you want,
considering the flows in the test.

E.g. with this diff on top of your patch:

diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index 798343877..b309635b9 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -7147,7 +7147,6 @@ dnl Send traffic from client to CT, do DNAT if the 
traffic is new otherwise send
 AT_DATA([flows.txt], [dnl
 table=0,ip,actions=ct(table=1,zone=42,nat)
 
table=1,in_port=ovs-client,ip,ct_state=+trk+new,actions=ct(commit,table=2,zone=42,nat(dst(192.168.10.20))
-table=1,in_port=ovs-client,icmp,ct_state=+trk+rel,actions=ct(commit,table=2,zone=42,nat)
 table=1,ip,actions=resubmit(,2)
 table=2,in_port=ovs-client,ip,ct_state=+trk+new,actions=output:ovs-server
 table=2,in_port=ovs-client,icmp,ct_state=+trk+rel,actions=output:ovs-server
@@ -7176,8 +7175,7 @@ AT_CHECK([ovs-appctl revalidator/purge], [0])
 AT_CHECK([ovs-ofctl -O OpenFlow15 dump-flows br0 | ofctl_strip | sort ], [0], 
[dnl
  n_packets=3, n_bytes=154, reset_counts ip actions=ct(table=1,zone=42,nat)
  table=1, n_packets=1, n_bytes=42, reset_counts ct_state=+new+trk,ip,in_port=1 
actions=ct(commit,table=2,zone=42,nat(dst=192.168.10.20))
- table=1, n_packets=1, n_bytes=42, reset_counts ip actions=resubmit(,2)
- table=1, n_packets=1, n_bytes=70, reset_counts 
ct_state=+rel+trk,icmp,in_port=1 actions=ct(commit,table=2,zone=42,nat)
+ table=1, n_packets=2, n_bytes=112, reset_counts ip actions=resubmit(,2)
  table=2, n_packets=1, n_bytes=42, reset_counts ct_state=+new+trk,ip,in_port=1 
actions=output:2
  table=2, n_packets=1, n_bytes=42, reset_counts ct_state=+rpl+trk,ip,in_port=2 
actions=output:1
  table=2, n_packets=1, n_bytes=70, reset_counts 
ct_state=+rel+trk,icmp,in_port=1 actions=output:2

the test passes for the userspace datapath, but fails for the kernel.

I have a question, though: did you happen to test, for both datapaths,
what happens without your patch if a middlebox sends the ICMP error
from the reply direction instead?
I assume things worked (without commit for both datapaths) in that case.

Another thing that IMO could be nice to add is a test case for the same
scenario, but in the reply direction. At least, both directions will be
covered and verified.

Paolo

>  lib/conntrack.c | 252 ++--
>  tests/system-traffic.at |  66 +++
>  2 files changed, 155 insertions(+), 163 deletions(-)
>
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 550b2be9b..b207f379d 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -764,109 +764,59 @@ handle_alg_ctl(struct conntrack *ct, const struct 
> conn_lookup_ctx *ctx,
>  }
>  
>  static void
> -pat_packet(struct dp_packet *pkt, const struct conn *conn)
> +pat_packet(struct dp_packet *pkt, const struct conn_key *key)
>  {
> -if (conn->nat_action & NAT_ACTION_SRC) {
> -if (conn->key.nw_proto == IPPROTO_TCP) {
> -struct tcp_header *th = dp_packet_l4(pkt);
> -packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
> -} else if (conn->key.nw_proto == IPPROTO_UDP) {
> -struct udp_header *uh = dp_packet_l4(pkt);
> -packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
> -}
> -} else if (conn->nat_action & NAT_ACTION_DST) {
> -if (conn->key.nw_proto == IPPROTO_TCP) {
> -packet_set_tcp_port(pkt, conn->rev_key.dst.port,
> -conn->rev_key.src.port);
> -} else if (conn->key.nw_proto == IPPROTO_UDP) {
> -packet_set_udp_port(pkt, conn->rev_key.dst.port,
> -conn->rev_key.src.port);
> -}
> +if (key->nw_proto == IPPROTO_TCP) {
> +packet_set_tcp_port(pkt, key->dst.port, key->src.port);
> +} else if (key->nw_proto == IPPROTO_UDP) {
> +packet_set_udp_port(pkt, key->dst.port, key->src.port);
>  }
>  }
>  
> -static void
> -nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related)
> +static uint16_t
> +nat_action_reverse(uint16_t nat_action)
>  {
> -if 

Re: [ovs-dev] [PATCH v5 2/2] openflow: Add extension to flush CT by generic match

2022-12-16 Thread Paolo Valerio
Ales Musil  writes:

> Add extension that allows to flush connections from CT
> by specifying fields that the connections should be
> matched against. This allows to match only some fields
> of the connection e.g. source address for orig direction.
>
> Reported-at: https://bugzilla.redhat.com/2120546
> Signed-off-by: Ales Musil 
> ---
> v5: Add missing usage and man for ovs-ofctl command.
> v4: Allow ovs-ofctl flush/conntrack without any zone/tuple.
> v3: Rebase on top of master.
> v2: Rebase on top of master.
> Use suggestion from Ilya.
> ---

Thanks Ales.

LGTM,

Acked-by: Paolo Valerio 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v4 2/2] openflow: Add extension to flush CT by generic match

2022-12-16 Thread Paolo Valerio
Ales Musil  writes:

> Add extension that allows to flush connections from CT
> by specifying fields that the connections should be
> matched against. This allows to match only some fields
> of the connection e.g. source address for orig direction.
>
> Reported-at: https://bugzilla.redhat.com/2120546
> Signed-off-by: Ales Musil 
> ---
> v4: Allow ovs-ofctl flush/conntrack without any zone/tuple.
> v3: Rebase on top of master.
> v2: Rebase on top of master.
> Use suggestion from Ilya.
> ---
>  NEWS   |   3 +
>  include/openflow/nicira-ext.h  |  30 +++
>  include/openvswitch/ofp-msgs.h |   4 +
>  include/openvswitch/ofp-util.h |   4 +
>  lib/ofp-bundle.c   |   1 +
>  lib/ofp-ct-util.c  | 146 +
>  lib/ofp-ct-util.h  |   9 ++
>  lib/ofp-print.c|  20 +
>  lib/ofp-util.c |  25 ++
>  lib/rconn.c|   1 +
>  ofproto/ofproto-dpif.c |   8 +-
>  ofproto/ofproto-provider.h |   7 +-
>  ofproto/ofproto.c  |  30 ++-
>  tests/ofp-print.at |  93 +
>  tests/ovs-ofctl.at |  26 ++
>  tests/system-traffic.at| 116 ++
>  utilities/ovs-ofctl.c  |  38 +
>  17 files changed, 503 insertions(+), 58 deletions(-)
>
> diff --git a/NEWS b/NEWS
> index ff8904b02..46b8faa41 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -16,6 +16,9 @@ Post-v3.0.0
>   by specifying 'max-rate' or '[r]stp-path-cost' accordingly.
> - ovs-dpctl and related ovs-appctl commands:
>   * "flush-conntrack" is capable of handling partial 5-tuple.
> +   - OpenFlow:
> +  * New OpenFlow extension NXT_CT_FLUSH to flush connections matching
> +the specified fields.
>

I guess we're missing an entry for ovs-ofctl flush-conntrack

>  
>  v3.0.0 - 15 Aug 2022
> diff --git a/include/openflow/nicira-ext.h b/include/openflow/nicira-ext.h
> index b68804991..32ce56d31 100644
> --- a/include/openflow/nicira-ext.h
> +++ b/include/openflow/nicira-ext.h
> @@ -1064,4 +1064,34 @@ struct nx_zone_id {
>  };
>  OFP_ASSERT(sizeof(struct nx_zone_id) == 8);
>  
> +/* CT flush available TLVs. */
> +enum nx_ct_flush_tlv_type {
> +/* Outer types. */
> +NXT_CT_ORIG_DIRECTION,/* CT orig direction outer type. */
> +NXT_CT_REPLY_DIRECTION,   /* CT reply direction outer type. */
> +
> +/* Nested types. */
> +NXT_CT_SRC,   /* CT source IPv6 or mapped IPv4 address. */
> +NXT_CT_DST,   /* CT destination IPv6 or mapped IPv4 address. 
> */
> +NXT_CT_SRC_PORT,  /* CT source port. */
> +NXT_CT_DST_PORT,  /* CT destination port. */
> +NXT_CT_ICMP_ID,   /* CT ICMP id. */
> +NXT_CT_ICMP_TYPE, /* CT ICMP type. */
> +NXT_CT_ICMP_CODE, /* CT ICMP code. */
> +
> +/* Primitive types. */
> +NXT_CT_ZONE_ID,   /* CT zone id. */
> +};
> +
> +/* NXT_CT_FLUSH.
> + *
> + * Flushes the connection tracking specified by 5-tuple.
> + * The struct should be followed by TLVs specifying the matching parameters. 
> */
> +struct nx_ct_flush {
> +uint8_t ip_proto;  /* IP protocol. */
> +uint8_t family;/* L3 address family. */
> +uint8_t zero[6];   /* Must be zero. */
> +};
> +OFP_ASSERT(sizeof(struct nx_ct_flush) == 8);
> +
>  #endif /* openflow/nicira-ext.h */
> diff --git a/include/openvswitch/ofp-msgs.h b/include/openvswitch/ofp-msgs.h
> index 921a937e5..659b0a3e7 100644
> --- a/include/openvswitch/ofp-msgs.h
> +++ b/include/openvswitch/ofp-msgs.h
> @@ -526,6 +526,9 @@ enum ofpraw {
>  
>  /* NXST 1.0+ (4): struct nx_ipfix_stats_reply[]. */
>  OFPRAW_NXST_IPFIX_FLOW_REPLY,
> +
> +/* NXT 1.0+ (32): struct nx_ct_flush, uint8_t[8][]. */
> +OFPRAW_NXT_CT_FLUSH,
>  };
>  
>  /* Decoding messages into OFPRAW_* values. */
> @@ -772,6 +775,7 @@ enum ofptype {
>  OFPTYPE_IPFIX_FLOW_STATS_REQUEST, /* OFPRAW_NXST_IPFIX_FLOW_REQUEST */
>  OFPTYPE_IPFIX_FLOW_STATS_REPLY,   /* OFPRAW_NXST_IPFIX_FLOW_REPLY */
>  OFPTYPE_CT_FLUSH_ZONE,/* OFPRAW_NXT_CT_FLUSH_ZONE. */
> +OFPTYPE_CT_FLUSH,   /* OFPRAW_NXT_CT_FLUSH. */
>  
>  /* Flow monitor extension. */
>  OFPTYPE_FLOW_MONITOR_CANCEL,  /* OFPRAW_NXT_FLOW_MONITOR_CANCEL.
> diff --git a/include/openvswitch/ofp-util.h b/include/openvswitch/ofp-util.h
> index 84937ae26..e10d90b9f 100644
> --- a/include/openvswitch/ofp-util.h
> +++ b/include/openvswitch/ofp-util.h
> @@ -65,6 +65,10 @@ struct ofpbuf *ofputil_encode_echo_reply(const struct 
> ofp_header *);
>  
>  struct ofpbuf *ofputil_encode_barrier_request(enum ofp_version);
>  
> +struct ofpbuf *ofputil_ct_match_encode(const struct ofputil_ct_match *match,
> +   uint16_t *zone_id,
> +   enum ofp_version version);
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git 

Re: [ovs-dev] [PATCH v4 1/2] ofp, dpif: Allow CT flush based on partial match

2022-12-16 Thread Paolo Valerio
Ales Musil  writes:

> Currently, the CT can be flushed by dpctl only by specifying
> the whole 5-tuple. This is not very convenient when there are
> only some fields known to the user of CT flush. Add new struct
> ofputil_ct_match which represents the generic filtering that can
> be done for CT flush. The match is done only on fields that are
> non-zero with exception to the icmp fields.
>
> This allows the filtering just within dpctl, however
> it is a preparation for OpenFlow extension.
>
> Reported-at: https://bugzilla.redhat.com/2120546
> Signed-off-by: Ales Musil 
> ---
> v4: Fix a flush all scenario.
> v3: Rebase on top of master.
> Address the C99 comment and missing dpif_close call.
> v2: Rebase on top of master.
> Address comments from Paolo.
> ---

I stressed some corner cases a bit more, mostly related to ICMP, and after
a quick offline discussion things LGTM

Acked-by: Paolo Valerio 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v3 2/2] openflow: Add extension to flush CT by generic match

2022-12-15 Thread Paolo Valerio
Ales Musil  writes:

> Add extension that allows to flush connections from CT
> by specifying fields that the connections should be
> matched against. This allows to match only some fields
> of the connection e.g. source address for orig direction.
>
> Reported-at: https://bugzilla.redhat.com/2120546
> Signed-off-by: Ales Musil 
> ---
> v3: Rebase on top of master.
> v2: Rebase on top of master.
> Use suggestion from Ilya.
> ---

Although a second opinion would be nice to have here,
the patch LGTM and the tests succeeded.

Acked-by: Paolo Valerio 

>  NEWS   |   3 +
>  include/openflow/nicira-ext.h  |  30 +++
>  include/openvswitch/ofp-msgs.h |   4 +
>  include/openvswitch/ofp-util.h |   4 +
>  lib/ofp-bundle.c   |   1 +
>  lib/ofp-ct-util.c  | 146 +
>  lib/ofp-ct-util.h  |   9 ++
>  lib/ofp-print.c|  20 +
>  lib/ofp-util.c |  25 ++
>  lib/rconn.c|   1 +
>  ofproto/ofproto-dpif.c |   8 +-
>  ofproto/ofproto-provider.h |   7 +-
>  ofproto/ofproto.c  |  30 ++-
>  tests/ofp-print.at |  93 +
>  tests/ovs-ofctl.at |  12 +++
>  tests/system-traffic.at| 116 ++
>  utilities/ovs-ofctl.c  |  38 +
>  17 files changed, 489 insertions(+), 58 deletions(-)
>
> diff --git a/NEWS b/NEWS
> index ff8904b02..46b8faa41 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -16,6 +16,9 @@ Post-v3.0.0
>   by specifying 'max-rate' or '[r]stp-path-cost' accordingly.
> - ovs-dpctl and related ovs-appctl commands:
>   * "flush-conntrack" is capable of handling partial 5-tuple.
> +   - OpenFlow:
> +  * New OpenFlow extension NXT_CT_FLUSH to flush connections matching
> +the specified fields.
>  
>  
>  v3.0.0 - 15 Aug 2022
> diff --git a/include/openflow/nicira-ext.h b/include/openflow/nicira-ext.h
> index b68804991..32ce56d31 100644
> --- a/include/openflow/nicira-ext.h
> +++ b/include/openflow/nicira-ext.h
> @@ -1064,4 +1064,34 @@ struct nx_zone_id {
>  };
>  OFP_ASSERT(sizeof(struct nx_zone_id) == 8);
>  
> +/* CT flush available TLVs. */
> +enum nx_ct_flush_tlv_type {
> +/* Outer types. */
> +NXT_CT_ORIG_DIRECTION,/* CT orig direction outer type. */
> +NXT_CT_REPLY_DIRECTION,   /* CT reply direction outer type. */
> +
> +/* Nested types. */
> +NXT_CT_SRC,   /* CT source IPv6 or mapped IPv4 address. */
> +NXT_CT_DST,   /* CT destination IPv6 or mapped IPv4 address. 
> */
> +NXT_CT_SRC_PORT,  /* CT source port. */
> +NXT_CT_DST_PORT,  /* CT destination port. */
> +NXT_CT_ICMP_ID,   /* CT ICMP id. */
> +NXT_CT_ICMP_TYPE, /* CT ICMP type. */
> +NXT_CT_ICMP_CODE, /* CT ICMP code. */
> +
> +/* Primitive types. */
> +NXT_CT_ZONE_ID,   /* CT zone id. */
> +};
> +
> +/* NXT_CT_FLUSH.
> + *
> + * Flushes the connection tracking specified by 5-tuple.
> + * The struct should be followed by TLVs specifying the matching parameters. 
> */
> +struct nx_ct_flush {
> +uint8_t ip_proto;  /* IP protocol. */
> +uint8_t family;/* L3 address family. */
> +uint8_t zero[6];   /* Must be zero. */
> +};
> +OFP_ASSERT(sizeof(struct nx_ct_flush) == 8);
> +
>  #endif /* openflow/nicira-ext.h */
> diff --git a/include/openvswitch/ofp-msgs.h b/include/openvswitch/ofp-msgs.h
> index 921a937e5..659b0a3e7 100644
> --- a/include/openvswitch/ofp-msgs.h
> +++ b/include/openvswitch/ofp-msgs.h
> @@ -526,6 +526,9 @@ enum ofpraw {
>  
>  /* NXST 1.0+ (4): struct nx_ipfix_stats_reply[]. */
>  OFPRAW_NXST_IPFIX_FLOW_REPLY,
> +
> +/* NXT 1.0+ (32): struct nx_ct_flush, uint8_t[8][]. */
> +OFPRAW_NXT_CT_FLUSH,
>  };
>  
>  /* Decoding messages into OFPRAW_* values. */
> @@ -772,6 +775,7 @@ enum ofptype {
>  OFPTYPE_IPFIX_FLOW_STATS_REQUEST, /* OFPRAW_NXST_IPFIX_FLOW_REQUEST */
>  OFPTYPE_IPFIX_FLOW_STATS_REPLY,   /* OFPRAW_NXST_IPFIX_FLOW_REPLY */
>  OFPTYPE_CT_FLUSH_ZONE,/* OFPRAW_NXT_CT_FLUSH_ZONE. */
> +OFPTYPE_CT_FLUSH,   /* OFPRAW_NXT_CT_FLUSH. */
>  
>  /* Flow monitor extension. */
>  OFPTYPE_FLOW_MONITOR_CANCEL,  /* OFPRAW_NXT_FLOW_MONITOR_CANCEL.
> diff --git a/include/openvswitch/ofp-util.h b/include/openvswitch/ofp-util.h
> index 84937ae26..e10d90b9f 100644
> --- a/include/openvswitch/ofp-util.h
> +++ b/include/openvswitch/ofp-util.h
> @@ -65,6 +65,10 @@ struct ofpbuf *ofputil_encode_echo_reply(const struct 
> of

Re: [ovs-dev] [PATCH v3 1/2] ofp, dpif: Allow CT flush based on partial match

2022-12-15 Thread Paolo Valerio
Ales Musil  writes:

> On Thu, Dec 15, 2022 at 4:28 PM Paolo Valerio  wrote:
>
> Ales Musil  writes:
>
> > Currently, the CT can be flushed by dpctl only by specifying
> > the whole 5-tuple. This is not very convenient when there are
> > only some fields known to the user of CT flush. Add new struct
> > ofputil_ct_match which represents the generic filtering that can
> > be done for CT flush. The match is done only on fields that are
> > non-zero with exception to the icmp fields.
> >
> > This allows the filtering just within dpctl, however
> > it is a preparation for OpenFlow extension.
> >
> > Reported-at: https://bugzilla.redhat.com/2120546
> > Signed-off-by: Ales Musil 
> > ---
> > v3: Rebase on top of master.
> >     Address the C99 comment and missing dpif_close call.
> > v2: Rebase on top of master.
> >     Address comments from Paolo.
> > ---
> >  NEWS                           |   2 +
> >  include/openvswitch/ofp-util.h |  28 +++
> >  lib/automake.mk                |   2 +
> >  lib/ct-dpif.c                  | 201 +
> >  lib/ct-dpif.h                  |   4 +-
> >  lib/dpctl.c                    |  45 +++--
> >  lib/dpctl.man                  |   3 +-
> >  lib/ofp-ct-util.c              | 311 +
> >  lib/ofp-ct-util.h              |  34 
> >  tests/system-traffic.at        |  82 -
> >  10 files changed, 568 insertions(+), 144 deletions(-)
> >  create mode 100644 lib/ofp-ct-util.c
> >  create mode 100644 lib/ofp-ct-util.h
> >
> > diff --git a/NEWS b/NEWS
> > index 265375e1c..ff8904b02 100644
> > --- a/NEWS
> > +++ b/NEWS
> > @@ -14,6 +14,8 @@ Post-v3.0.0
> >       10 Gbps link speed by default in case the actual link speed cannot
> be
> >       determined.  Previously it was 10 Mbps.  Values can still be
> overridden
> >       by specifying 'max-rate' or '[r]stp-path-cost' accordingly.
> > +   - ovs-dpctl and related ovs-appctl commands:
> > +     * "flush-conntrack" is capable of handling partial 5-tuple.
> > 
> > 
> >  v3.0.0 - 15 Aug 2022
> > diff --git a/include/openvswitch/ofp-util.h b/include/openvswitch/
> ofp-util.h
> > index 091a09cad..84937ae26 100644
> > --- a/include/openvswitch/ofp-util.h
> > +++ b/include/openvswitch/ofp-util.h
> > @@ -19,6 +19,9 @@
> > 
> >  #include 
> >  #include 
> > +#include 
> > +#include 
> > +
> >  #include "openvswitch/ofp-protocol.h"
> > 
> >  struct ofp_header;
> > @@ -27,6 +30,31 @@ struct ofp_header;
> >  extern "C" {
> >  #endif
> > 
> > +struct ofputil_ct_tuple {
> > +    struct in6_addr src;
> > +    struct in6_addr dst;
> > +
> > +    union {
> > +        ovs_be16 src_port;
> > +        ovs_be16 icmp_id;
> > +    };
> > +    union {
> > +        ovs_be16 dst_port;
> > +        struct {
> > +            uint8_t icmp_code;
> > +            uint8_t icmp_type;
> > +        };
> > +    };
> > +};
> > +
> > +struct ofputil_ct_match {
> > +    uint8_t ip_proto;
> > +    uint16_t l3_type;
> > +
> > +    struct ofputil_ct_tuple tuple_orig;
> > +    struct ofputil_ct_tuple tuple_reply;
> > +};
> > +
> >  bool ofputil_decode_hello(const struct ofp_header *,
> >                            uint32_t *allowed_versions);
> >  struct ofpbuf *ofputil_encode_hello(uint32_t version_bitmap);
> > diff --git a/lib/automake.mk b/lib/automake.mk
> > index a0fabe38f..37135f118 100644
> > --- a/lib/automake.mk
> > +++ b/lib/automake.mk
> > @@ -227,6 +227,8 @@ lib_libopenvswitch_la_SOURCES = \
> >       lib/ofp-actions.c \
> >       lib/ofp-bundle.c \
> >       lib/ofp-connection.c \
> > +     lib/ofp-ct-util.c \
> > +     lib/ofp-ct-util.h \
> >       lib/ofp-ed-props.c \
> >       lib/ofp-errors.c \
> >       lib/ofp-flow.c \
> > diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
> > index 6f17a26b5..906e827c1 100644
> > --- a/lib/ct-dpif.c
> > +++ b/lib/ct-dpif.c
> &

Re: [ovs-dev] [PATCH v3 1/2] ofp, dpif: Allow CT flush based on partial match

2022-12-15 Thread Paolo Valerio
state);
>  void ct_dpif_format_tcp_stat(struct ds *, int, int);
> -bool ct_dpif_parse_tuple(struct ct_dpif_tuple *, const char *s, struct ds *);
>  void ct_dpif_push_zone_limit(struct ovs_list *, uint16_t zone, uint32_t 
> limit,
>   uint32_t count);
>  void ct_dpif_free_zone_limits(struct ovs_list *);
> diff --git a/lib/dpctl.c b/lib/dpctl.c
> index 29041fa3e..3cdedbe97 100644
> --- a/lib/dpctl.c
> +++ b/lib/dpctl.c
> @@ -40,6 +40,7 @@
>  #include "netdev.h"
>  #include "netlink.h"
>  #include "odp-util.h"
> +#include "ofp-ct-util.h"
>  #include "openvswitch/ofpbuf.h"
>  #include "packets.h"
>  #include "openvswitch/shash.h"
> @@ -1707,47 +1708,41 @@ dpctl_flush_conntrack(int argc, const char *argv[],
>struct dpctl_params *dpctl_p)
>  {
>  struct dpif *dpif = NULL;
> -struct ct_dpif_tuple tuple, *ptuple = NULL;
> -struct ds ds = DS_EMPTY_INITIALIZER;
> -uint16_t zone, *pzone = NULL;
> -int error;
> +struct ofputil_ct_match match = {0};
> +uint16_t zone;
> +bool with_zone = false;
>  int args = argc - 1;
>  
>  /* Parse ct tuple */
> -if (args && ct_dpif_parse_tuple(, argv[args], )) {
> -ptuple = 
> +if (args) {
> +struct ds ds = DS_EMPTY_INITIALIZER;
> +if (!ofputil_ct_match_parse(, argv[args], )) {
> +dpctl_error(dpctl_p, EINVAL, "%s", ds_cstr());
> +ds_destroy();
> +return EINVAL;
> +}
>  args--;
>  }
>  
>  /* Parse zone */
>  if (args && ovs_scan(argv[args], "zone=%"SCNu16, )) {
> -pzone = 
> +with_zone = true;
>  args--;
>  }
>  
> -/* Report error if there are more than one unparsed argument. */
> -if (args > 1) {
> -ds_put_cstr(, "invalid arguments");
> -error = EINVAL;
> -goto error;
> -}
> -
> -error = opt_dpif_open(argc, argv, dpctl_p, 4, );
> +int error = opt_dpif_open(argc, argv, dpctl_p, 4, );
>  if (error) {
> -return error;
> +dpctl_error(dpctl_p, error, "Cannot open dpif");
> +goto end;

just returning error here is fine, right?

>  }
>  
> -error = ct_dpif_flush(dpif, pzone, ptuple);
> -if (!error) {
> -dpif_close(dpif);
> -return 0;
> -} else {
> -ds_put_cstr(, "failed to flush conntrack");
> +error = ct_dpif_flush(dpif, with_zone ?  : NULL, );
> +if (error) {
> +dpctl_error(dpctl_p, error, "Failed to flush conntrack");
> +goto end;

Given the above, the label could be removed, and so could the goto here

Other than that, the patch LGTM:

Acked-by: Paolo Valerio 

>  }
>  
> -error:
> -dpctl_error(dpctl_p, error, "%s", ds_cstr());
> -ds_destroy();
> +end:
>  dpif_close(dpif);
>  return error;
>  }
> diff --git a/lib/dpctl.man b/lib/dpctl.man
> index 87ea8087b..b0cabe05d 100644
> --- a/lib/dpctl.man
> +++ b/lib/dpctl.man
> @@ -312,7 +312,8 @@ If \fBzone\fR=\fIzone\fR is specified, only flushes the 
> connections in
>  If \fIct-tuple\fR is provided, flushes the connection entry specified by
>  \fIct-tuple\fR in \fIzone\fR. The zone defaults to 0 if it is not provided.
>  The userspace connection tracker requires flushing with the original 
> pre-NATed
> -tuple and a warning log will be otherwise generated.
> +tuple and a warning log will be otherwise generated. The tuple can be partial
> +and will remove all connections that are matching on the specified fields.
>  An example of an IPv4 ICMP \fIct-tuple\fR:
>  .IP
>  
> "ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=1,icmp_type=8,icmp_code=0,icmp_id=10"
> diff --git a/lib/ofp-ct-util.c b/lib/ofp-ct-util.c
> new file mode 100644
> index 0..9112305cc
> --- /dev/null
> +++ b/lib/ofp-ct-util.c
> @@ -0,0 +1,311 @@
> +
> +/* Copyright (c) 2022, Red Hat, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under th

Re: [ovs-dev] [PATCH 1/2] ofp, dpif: Allow CT flush based on partial match

2022-11-28 Thread Paolo Valerio
Hi Ales,

the patch LGTM, and works as expected.
There are some nits/remarks below, but other than that, I'm ok with the
change.

Ales Musil  writes:

> Currently, the CT can be flushed by dpctl only by specifying
> the whole 5-tuple. This is not very convenient when there are
> only some fields known to the user of CT flush. Add new struct
> ofputil_ct_match which represents the generic filtering that can
> be done for CT flush. The match is done only on fields that are
> non-zero with exception to the icmp fields.
>
> This allows the filtering just within dpctl, however
> it is a preparation for OpenFlow extension.
>
> Reported-at: https://bugzilla.redhat.com/2120546
> Signed-off-by: Ales Musil 
> ---
>  NEWS   |   2 +
>  include/openvswitch/ofp-util.h |  28 +++
>  lib/automake.mk|   2 +
>  lib/ct-dpif.c  | 201 +
>  lib/ct-dpif.h  |   4 +-
>  lib/dpctl.c|  14 +-
>  lib/dpctl.man  |   3 +-
>  lib/ofp-ct-util.c  | 311 +
>  lib/ofp-ct-util.h  |  34 
>  tests/system-traffic.at|  80 +
>  10 files changed, 557 insertions(+), 122 deletions(-)
>  create mode 100644 lib/ofp-ct-util.c
>  create mode 100644 lib/ofp-ct-util.h
>
> diff --git a/NEWS b/NEWS
> index ff77ee404..81909812e 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -23,6 +23,8 @@ Post-v3.0.0
> bug and CVE fixes addressed since its release.
> If a user wishes to benefit from these fixes it is recommended to use
> DPDK 21.11.2.
> +   - ovs-dpctl and related ovs-appctl commands:
> + * "flush-conntrack" is capable of handling partial 5-tuple.
>  
>  
>  v3.0.0 - 15 Aug 2022
> diff --git a/include/openvswitch/ofp-util.h b/include/openvswitch/ofp-util.h
> index 091a09cad..84937ae26 100644
> --- a/include/openvswitch/ofp-util.h
> +++ b/include/openvswitch/ofp-util.h
> @@ -19,6 +19,9 @@
>  
>  #include 
>  #include 
> +#include 
> +#include 
> +
>  #include "openvswitch/ofp-protocol.h"
>  
>  struct ofp_header;
> @@ -27,6 +30,31 @@ struct ofp_header;
>  extern "C" {
>  #endif
>  
> +struct ofputil_ct_tuple {
> +struct in6_addr src;
> +struct in6_addr dst;
> +
> +union {
> +ovs_be16 src_port;
> +ovs_be16 icmp_id;
> +};
> +union {
> +ovs_be16 dst_port;
> +struct {
> +uint8_t icmp_code;
> +uint8_t icmp_type;
> +};
> +};
> +};
> +
> +struct ofputil_ct_match {
> +uint8_t ip_proto;
> +uint16_t l3_type;
> +
> +struct ofputil_ct_tuple tuple_orig;
> +struct ofputil_ct_tuple tuple_reply;
> +};
> +
>  bool ofputil_decode_hello(const struct ofp_header *,
>uint32_t *allowed_versions);
>  struct ofpbuf *ofputil_encode_hello(uint32_t version_bitmap);
> diff --git a/lib/automake.mk b/lib/automake.mk
> index a0fabe38f..37135f118 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -227,6 +227,8 @@ lib_libopenvswitch_la_SOURCES = \
>   lib/ofp-actions.c \
>   lib/ofp-bundle.c \
>   lib/ofp-connection.c \
> + lib/ofp-ct-util.c \
> + lib/ofp-ct-util.h \
>   lib/ofp-ed-props.c \
>   lib/ofp-errors.c \
>   lib/ofp-flow.c \
> diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
> index cfc2315e3..7fbf2bea6 100644
> --- a/lib/ct-dpif.c
> +++ b/lib/ct-dpif.c
> @@ -20,6 +20,7 @@
>  #include 
>  
>  #include "ct-dpif.h"
> +#include "ofp-ct-util.h"
>  #include "openvswitch/ofp-parse.h"
>  #include "openvswitch/vlog.h"
>  
> @@ -80,6 +81,31 @@ ct_dpif_dump_start(struct dpif *dpif, struct 
> ct_dpif_dump_state **dump,
>  return err;
>  }
>  
> +static void
> +ct_dpif_tuple_from_ofputil_ct_tuple(const struct ofputil_ct_tuple *ofp_tuple,
> +struct ct_dpif_tuple *tuple,
> +uint16_t l3_type, uint8_t ip_proto)
> +{
> +if (l3_type == AF_INET) {
> +tuple->src.ip = in6_addr_get_mapped_ipv4(_tuple->src);
> +tuple->dst.ip = in6_addr_get_mapped_ipv4(_tuple->dst);
> +} else {
> +tuple->src.in6 = ofp_tuple->src;
> +tuple->dst.in6 = ofp_tuple->dst;
> +}
> +
> +tuple->l3_type = l3_type;
> +tuple->ip_proto = ip_proto;
> +tuple->src_port = ofp_tuple->src_port;
> +
> +if (ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6) {
> +tuple->icmp_code = ofp_tuple->icmp_code;
> +tuple->icmp_type = ofp_tuple->icmp_type;
> +} else {
> +tuple->dst_port = ofp_tuple->dst_port;
> +}
> +}
> +
>  /* Dump one connection from a tracker, and put it in 'entry'.
>   *
>   * 'dump' should have been initialized by ct_dpif_dump_start().
> @@ -109,7 +135,62 @@ ct_dpif_dump_done(struct ct_dpif_dump_state *dump)
>  ? dpif->dpif_class->ct_dump_done(dpif, dump)
>  : EOPNOTSUPP);
>  }
> -
> +
> +static int
> +ct_dpif_flush_tuple(struct dpif *dpif, const uint16_t 

[ovs-dev] [PATCH] conntrack: Show parent key if present.

2022-10-31 Thread Paolo Valerio
Similarly to what happens when CTA_TUPLE_MASTER is present in a ct
netlink dump, add the ability to print out the parent key to the
userspace implementation as well.

Signed-off-by: Paolo Valerio 
---
 lib/conntrack.c |4 
 1 file changed, 4 insertions(+)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 13c5ab628..550b2be9b 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -2647,6 +2647,10 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct 
ct_dpif_entry *entry,
 conn_key_to_tuple(&conn->key, &entry->tuple_orig);
 conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);
 
+if (conn->alg_related) {
+conn_key_to_tuple(&conn->parent_key, &entry->tuple_parent);
+}
+
 entry->zone = conn->key.zone;
 
 ovs_mutex_lock(&conn->lock);

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] conntrack: Refactor nat handling functions

2022-10-27 Thread Paolo Valerio
Ales Musil  writes:

> On Thu, Oct 27, 2022 at 11:14 AM Ales Musil  wrote:
>
> In order to support NAT of inner packet
> for ICMP related traffic refactor the nat
> functions. This fixes the issue that the
> NAT was not performed on inner header in orig
> direction and avoids some code duplication.
>
> Reported-at: https://bugzilla.redhat.com/2120546
> Signed-off-by: Ales Musil 
> ---
>  lib/conntrack.c         | 250 ++--
>  tests/system-traffic.at |  67 +++
>  2 files changed, 155 insertions(+), 162 deletions(-)
>
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 13c5ab628..b8b9f9c49 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -764,109 +764,59 @@ handle_alg_ctl(struct conntrack *ct, const struct
> conn_lookup_ctx *ctx,
>  }
>
>  static void
> -pat_packet(struct dp_packet *pkt, const struct conn *conn)
> +pat_packet(struct dp_packet *pkt, const struct conn_key *key)
>  {
> -    if (conn->nat_action & NAT_ACTION_SRC) {
> -        if (conn->key.nw_proto == IPPROTO_TCP) {
> -            struct tcp_header *th = dp_packet_l4(pkt);
> -            packet_set_tcp_port(pkt, conn->rev_key.dst.port, 
> th->tcp_dst);
> -        } else if (conn->key.nw_proto == IPPROTO_UDP) {
> -            struct udp_header *uh = dp_packet_l4(pkt);
> -            packet_set_udp_port(pkt, conn->rev_key.dst.port, 
> uh->udp_dst);
> -        }
> -    } else if (conn->nat_action & NAT_ACTION_DST) {
> -        if (conn->key.nw_proto == IPPROTO_TCP) {
> -            packet_set_tcp_port(pkt, conn->rev_key.dst.port,
> -                                conn->rev_key.src.port);
> -        } else if (conn->key.nw_proto == IPPROTO_UDP) {
> -            packet_set_udp_port(pkt, conn->rev_key.dst.port,
> -                                conn->rev_key.src.port);
> -        }
> +    if (key->nw_proto == IPPROTO_TCP) {
> +        packet_set_tcp_port(pkt, key->dst.port, key->src.port);
> +    } else if (key->nw_proto == IPPROTO_UDP) {
> +        packet_set_udp_port(pkt, key->dst.port, key->src.port);
>      }
>  }
>
> -static void
> -nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related)
> +static uint16_t
> +nat_action_reverse(uint16_t nat_action)
>  {
> -    if (conn->nat_action & NAT_ACTION_SRC) {
> -        pkt->md.ct_state |= CS_SRC_NAT;
> -        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
> -            struct ip_header *nh = dp_packet_l3(pkt);
> -            packet_set_ipv4_addr(pkt, >ip_src,
> -                                 conn->rev_key.dst.addr.ipv4);
> -        } else {
> -            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
> -            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
> -                                 nh6->ip6_src.be32,
> -                                 >rev_key.dst.addr.ipv6, true);
> -        }
> -        if (!related) {
> -            pat_packet(pkt, conn);
> -        }
> -    } else if (conn->nat_action & NAT_ACTION_DST) {
> -        pkt->md.ct_state |= CS_DST_NAT;
> -        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
> -            struct ip_header *nh = dp_packet_l3(pkt);
> -            packet_set_ipv4_addr(pkt, >ip_dst,
> -                                 conn->rev_key.src.addr.ipv4);
> -        } else {
> -            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
> -            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
> -                                 nh6->ip6_dst.be32,
> -                                 >rev_key.src.addr.ipv6, true);
> -        }
> -        if (!related) {
> -            pat_packet(pkt, conn);
> -        }
> +    if (nat_action & NAT_ACTION_SRC) {
> +        nat_action ^= NAT_ACTION_SRC;
> +        nat_action |= NAT_ACTION_DST;
> +    } else if (nat_action & NAT_ACTION_DST) {
> +        nat_action ^= NAT_ACTION_DST;
> +        nat_action |= NAT_ACTION_SRC;
>      }
> +    return nat_action;
>  }
>
>  static void
> -un_pat_packet(struct dp_packet *pkt, const struct conn *conn)
> +nat_packet_ipv4(struct dp_packet *pkt, const struct conn_key *key,
> +                uint16_t nat_action)
>  {
> -    if (conn->nat_action & NAT_ACTION_SRC) {
> -        if (conn->key.nw_proto == IPPROTO_TCP) {
> -            struct tcp_header *th = dp_packet_l4(pkt);
> -            packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
> -        } else if (conn->key.nw_proto == IPPROTO_UDP) {
> -            struct udp_header *uh = dp_packet_l4(pkt);
> -            packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
> -        }
> -    } else if (conn->nat_action & NAT_ACTION_DST) {

Re: [ovs-dev] [PATCH] odp-util: Add missing comma in format_odp_conntrack_action()

2022-10-26 Thread Paolo Valerio
Ilya Maximets  writes:

> On 10/21/22 15:22, Paolo Valerio wrote:
>> If OVS_CT_ATTR_TIMEOUT is included, the resulting output is
>> the following:
>> 
>> actions:ct(commit,timeout=1nat(src=10.1.1.240))
>> 
>> Fix it by trivially adding a trailing ',' to timeout as well.
>> 
>> Signed-off-by: Paolo Valerio 
>> ---
>>  lib/odp-util.c |2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>> 
>> diff --git a/lib/odp-util.c b/lib/odp-util.c
>> index ba5be4bb3..72e076e1c 100644
>> --- a/lib/odp-util.c
>> +++ b/lib/odp-util.c
>> @@ -1004,7 +1004,7 @@ format_odp_conntrack_action(struct ds *ds, const 
>> struct nlattr *attr)
>>  ds_put_format(ds, "helper=%s,", helper);
>>  }
>>  if (timeout) {
>> -ds_put_format(ds, "timeout=%s", timeout);
>> +ds_put_format(ds, "timeout=%s,", timeout);
>>  }
>>  if (nat) {
>>  format_odp_ct_nat(ds, nat);
>> 
>
> Hi.  Thanks for the patch!
> Could you also, please, add a test case to tests/odp.at for this?

Sure, thanks for pointing that out.
Sent v2:

https://patchwork.ozlabs.org/project/openvswitch/patch/166677384931.806968.5359905777279608036.st...@fed.void/

>
> Best regards, Ilya Maximets.

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v2] odp-util: Add missing separator in format_odp_conntrack_action()

2022-10-26 Thread Paolo Valerio
If OVS_CT_ATTR_TIMEOUT is included, the resulting output is
the following:

actions:ct(commit,timeout=1nat(src=10.1.1.240))

Fix it by trivially adding a trailing ',' to timeout as well.

Signed-off-by: Paolo Valerio 
---
v2: added test case in odp.at
---
 lib/odp-util.c |2 +-
 tests/odp.at   |2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/odp-util.c b/lib/odp-util.c
index ba5be4bb3..72e076e1c 100644
--- a/lib/odp-util.c
+++ b/lib/odp-util.c
@@ -1004,7 +1004,7 @@ format_odp_conntrack_action(struct ds *ds, const struct 
nlattr *attr)
 ds_put_format(ds, "helper=%s,", helper);
 }
 if (timeout) {
-ds_put_format(ds, "timeout=%s", timeout);
+ds_put_format(ds, "timeout=%s,", timeout);
 }
 if (nat) {
 format_odp_ct_nat(ds, nat);
diff --git a/tests/odp.at b/tests/odp.at
index 7a1cf3b2c..88b7cfd91 100644
--- a/tests/odp.at
+++ b/tests/odp.at
@@ -348,7 +348,9 @@ ct(commit,helper=tftp)
 ct(commit,timeout=ovs_tp_1_tcp4)
 ct(nat)
 ct(commit,nat(src))
+ct(commit,timeout=ovs_tp_1_tcp4,nat(src))
 ct(commit,nat(dst))
+ct(commit,timeout=ovs_tp_1_tcp4,nat(dst))
 ct(commit,nat(src=10.0.0.240,random))
 ct(commit,nat(src=10.0.0.240:32768-65535,random))
 ct(commit,nat(dst=10.0.0.128-10.0.0.254,hash))

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] odp-util: Add missing comma in format_odp_conntrack_action()

2022-10-21 Thread Paolo Valerio
If OVS_CT_ATTR_TIMEOUT is included, the resulting output is
the following:

actions:ct(commit,timeout=1nat(src=10.1.1.240))

Fix it by trivially adding a trailing ',' to timeout as well.

Signed-off-by: Paolo Valerio 
---
 lib/odp-util.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/odp-util.c b/lib/odp-util.c
index ba5be4bb3..72e076e1c 100644
--- a/lib/odp-util.c
+++ b/lib/odp-util.c
@@ -1004,7 +1004,7 @@ format_odp_conntrack_action(struct ds *ds, const struct 
nlattr *attr)
 ds_put_format(ds, "helper=%s,", helper);
 }
 if (timeout) {
-ds_put_format(ds, "timeout=%s", timeout);
+ds_put_format(ds, "timeout=%s,", timeout);
 }
 if (nat) {
 format_odp_ct_nat(ds, nat);

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [RFC PATCH 1/2] dpif: Add support for CT flush with partial tuple

2022-10-17 Thread Paolo Valerio
Hello Ales,

overall the approach is ok, the only concern is that, unless I'm missing
something, in case of many connections, the exact match deletion could
potentially take a while, whereas in the previous case the cost
was basically a lookup (constant time) and of course the remaining
deletion operation.

It would be nice to avoid the extra cost when the whole 5-tuple is
specified. WDYT?

Ales Musil  writes:

> Currently, in order to flush conntrack entries you need to
> specify the full 5-tuple. Add support for partial matches;
> it still has some limitations, but it is capable of flushing
> all entries that match the specified fields, e.g. source IP address.
>
> Reported-at: https://bugzilla.redhat.com/2120546
> Signed-off-by: Ales Musil 
> ---
>  NEWS|   2 +
>  lib/ct-dpif.c   | 178 +++-
>  lib/dpctl.man   |   3 +-
>  tests/system-traffic.at |  84 ++-
>  4 files changed, 226 insertions(+), 41 deletions(-)
>
> diff --git a/NEWS b/NEWS
> index ff77ee404..81909812e 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -23,6 +23,8 @@ Post-v3.0.0
> bug and CVE fixes addressed since its release.
> If a user wishes to benefit from these fixes it is recommended to use
> DPDK 21.11.2.
> +   - ovs-dpctl and related ovs-appctl commands:
> + * "flush-conntrack" is capable of handling partial 5-tuple.
>  
>  
>  v3.0.0 - 15 Aug 2022
> diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
> index cfc2315e3..57995f5e5 100644
> --- a/lib/ct-dpif.c
> +++ b/lib/ct-dpif.c
> @@ -18,6 +18,8 @@
>  #include "dpif-provider.h"
>  
>  #include 
> +#include 
> +#include 
>  
>  #include "ct-dpif.h"
>  #include "openvswitch/ofp-parse.h"
> @@ -109,7 +111,113 @@ ct_dpif_dump_done(struct ct_dpif_dump_state *dump)
>  ? dpif->dpif_class->ct_dump_done(dpif, dump)
>  : EOPNOTSUPP);
>  }
> -

was this intentional?
Just checking

> +
> +static inline bool
> +ct_dpif_inet_addr_cmp_partial(const union ct_dpif_inet_addr *partial,
> +  const union ct_dpif_inet_addr *addr)
> +{
> +if (!ipv6_is_zero(&partial->in6) &&
> +!ipv6_addr_equals(&partial->in6, &addr->in6)) {
> +return false;
> +}
> +return true;
> +}
> +
> +/* Compares the non-zero members if they match. This is useful for clearing
> + * up all conntracks specified by a partial tuple. */
> +static inline bool
> +ct_dpif_tuple_cmp_partial(const struct ct_dpif_tuple *partial,
> +  const struct ct_dpif_tuple *tuple)
> +{
> +/* There is no point in continuing if both do not use the same eth type. 
> */
> +if (partial->l3_type != tuple->l3_type) {
> +return false;
> +}
> +
> +if (partial->ip_proto && partial->ip_proto != tuple->ip_proto) {
> +return false;
> +}
> +
> +if (!ct_dpif_inet_addr_cmp_partial(&partial->src, &tuple->src)) {
> +return false;
> +}
> +
> +if (!ct_dpif_inet_addr_cmp_partial(&partial->dst, &tuple->dst)) {
> +return false;
> +}
> +
> +if (partial->ip_proto == IPPROTO_TCP || partial->ip_proto == 
> IPPROTO_UDP) {
> +
> +if (partial->src_port && partial->src_port != tuple->src_port) {
> +return false;
> +}
> +
> +if (partial->dst_port && partial->dst_port != tuple->dst_port) {
> +return false;
> +}
> +} else if (partial->ip_proto == IPPROTO_ICMP ||
> +   partial->ip_proto == IPPROTO_ICMPV6) {
> +
> +if (partial->icmp_id != tuple->icmp_id) {
> +return false;
> +}
> +
> +if (partial->icmp_type != tuple->icmp_type) {
> +return false;
> +}
> +
> +if (partial->icmp_code != tuple->icmp_code) {
> +return false;
> +}
> +}
> +
> +return true;
> +}
> +
> +static int
> +ct_dpif_flush_tuple(struct dpif *dpif, const uint16_t *zone,
> +const struct ct_dpif_tuple *tuple) {
> +struct ct_dpif_dump_state *dump;
> +struct ct_dpif_entry cte;
> +int error;
> +int tot_bkts;
> +
> +if (!dpif->dpif_class->ct_flush) {
> +return EOPNOTSUPP;
> +}
> +
> +if (VLOG_IS_DBG_ENABLED()) {
> +struct ds ds = DS_EMPTY_INITIALIZER;
> +ct_dpif_format_tuple(&ds, tuple);
> +VLOG_DBG("%s: ct_flush: %s in zone %d", dpif_name(dpif), 
> ds_cstr(&ds),
> +  zone ? *zone : 0);
> +ds_destroy(&ds);
> +}
> +
> +error = ct_dpif_dump_start(dpif, &dump, zone, &tot_bkts);
> +if (error) {
> +return error;
> +}
> +
> +while (!(error = ct_dpif_dump_next(dump, &cte))) {
> +if (zone && *zone != cte.zone) {
> +continue;
> +}
> +
> +if (ct_dpif_tuple_cmp_partial(tuple, &cte.tuple_orig)) {
> +error = dpif->dpif_class->ct_flush(dpif, &cte.zone,
> +   &cte.tuple_orig);
> +if (error) {
> +break;
> +}
> +}
> +}
> +if (error == EOF) {
> +error = 0;
> +}

[ovs-dev] [PATCH v2] ct-dpif: Replace ct_dpif_format_flags() with format_flags_masked().

2022-10-12 Thread Paolo Valerio
This patch removes ct_dpif_format_flags() in favor of the existing
format_flags_masked().
This has the extra bonus of showing keys with empty values as "key=0",
instead of showing "key=".

E.g., the following:

NEW tcp,orig=([...]),reply=([...]),id=1800618864,
status=CONFIRMED|SRC_NAT_DONE|DST_NAT_DONE,timeout=120,
protoinfo=(state_orig=SYN_SENT,state_reply=SYN_SENT,wscale_orig=7,
   wscale_reply=0,flags_orig=WINDOW_SCALE|SACK_PERM,flags_reply=)

becomes:

NEW tcp,orig=([...]),reply=([...]),id=1800618864,
status=CONFIRMED|SRC_NAT_DONE|DST_NAT_DONE,timeout=120,
protoinfo=(state_orig=SYN_SENT,state_reply=SYN_SENT,wscale_orig=7,
   wscale_reply=0,flags_orig=WINDOW_SCALE|SACK_PERM,flags_reply=0)

Signed-off-by: Paolo Valerio 
---
v2:
 - updated commit message (was "ct-dpif: Do not show flag key if empty.")
 - instead of hiding the key, ct_dpif_format_flags() got replaced by
   format_flags_masked() which will show "key=0" in case of empty flags
---
 lib/ct-dpif.c |   76 +
 lib/ct-dpif.h |4 +++
 2 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
index cfc2315e3..6f17a26b5 100644
--- a/lib/ct-dpif.c
+++ b/lib/ct-dpif.c
@@ -35,20 +35,11 @@ static void ct_dpif_format_counters(struct ds *,
 const struct ct_dpif_counters *);
 static void ct_dpif_format_timestamp(struct ds *,
  const struct ct_dpif_timestamp *);
-static void ct_dpif_format_flags(struct ds *, const char *title,
- uint32_t flags, const struct flags *);
 static void ct_dpif_format_protoinfo(struct ds *, const char *title,
  const struct ct_dpif_protoinfo *,
  bool verbose);
 static void ct_dpif_format_helper(struct ds *, const char *title,
   const struct ct_dpif_helper *);
-
-static const struct flags ct_dpif_status_flags[] = {
-#define CT_DPIF_STATUS_FLAG(FLAG) { CT_DPIF_STATUS_##FLAG, #FLAG },
-CT_DPIF_STATUS_FLAGS
-#undef CT_DPIF_STATUS_FLAG
-{ 0, NULL } /* End marker. */
-};
 
 /* Dumping */
 
@@ -275,6 +266,20 @@ ct_dpif_entry_uninit(struct ct_dpif_entry *entry)
 }
 }
 
+static const char *
+ct_dpif_status_flags(uint32_t flags)
+{
+switch (flags) {
+#define CT_DPIF_STATUS_FLAG(FLAG) \
+case CT_DPIF_STATUS_##FLAG: \
+return #FLAG;
+CT_DPIF_STATUS_FLAGS
+#undef CT_DPIF_STATUS_FLAG
+default:
+return NULL;
+}
+}
+
 void
 ct_dpif_format_entry(const struct ct_dpif_entry *entry, struct ds *ds,
  bool verbose, bool print_stats)
@@ -305,8 +310,9 @@ ct_dpif_format_entry(const struct ct_dpif_entry *entry, 
struct ds *ds,
 ds_put_format(ds, ",zone=%"PRIu16, entry->zone);
 }
 if (verbose) {
-ct_dpif_format_flags(ds, ",status=", entry->status,
- ct_dpif_status_flags);
+format_flags_masked(ds, ",status", ct_dpif_status_flags,
+entry->status, CT_DPIF_STATUS_MASK,
+CT_DPIF_STATUS_MASK);
 }
 if (print_stats) {
 ds_put_format(ds, ",timeout=%"PRIu32, entry->timeout);
@@ -415,28 +421,6 @@ ct_dpif_format_tuple(struct ds *ds, const struct 
ct_dpif_tuple *tuple)
 }
 }
 
-static void
-ct_dpif_format_flags(struct ds *ds, const char *title, uint32_t flags,
- const struct flags *table)
-{
-if (title) {
-ds_put_cstr(ds, title);
-}
-for (; table->name; table++) {
-if (flags & table->flag) {
-ds_put_format(ds, "%s|", table->name);
-}
-}
-ds_chomp(ds, '|');
-}
-
-static const struct flags tcp_flags[] = {
-#define CT_DPIF_TCP_FLAG(FLAG)  { CT_DPIF_TCPF_##FLAG, #FLAG },
-CT_DPIF_TCP_FLAGS
-#undef CT_DPIF_TCP_FLAG
-{ 0, NULL } /* End marker. */
-};
-
 const char *ct_dpif_tcp_state_string[] = {
 #define CT_DPIF_TCP_STATE(STATE) [CT_DPIF_TCPS_##STATE] = #STATE,
 CT_DPIF_TCP_STATES
@@ -498,6 +482,20 @@ ct_dpif_format_protoinfo_tcp(struct ds *ds,
 ct_dpif_format_enum(ds, "state=", tcp_state, ct_dpif_tcp_state_string);
 }
 
+static const char *
+ct_dpif_tcp_flags(uint32_t flags)
+{
+switch (flags) {
+#define CT_DPIF_TCP_FLAG(FLAG) \
+case CT_DPIF_TCPF_##FLAG: \
+return #FLAG;
+CT_DPIF_TCP_FLAGS
+#undef CT_DPIF_TCP_FLAG
+default:
+return NULL;
+}
+}
+
 static void
 ct_dpif_format_protoinfo_tcp_verbose(struct ds *ds,
  const struct ct_dpif_protoinfo *protoinfo)
@@ -512,10 +510,14 @@ ct_dpif_format_protoinfo_tcp_verbose(struct ds *ds,
   protoinfo->tcp.wscale_orig,
   protoinfo->tcp.wscale_reply);
 }
-ct_dpif_format_fla

Re: [ovs-dev] [PATCH v3] ofproto-dpif-xlate: Update tunnel neighbor when receive gratuitous arp.

2022-09-21 Thread Paolo Valerio
Hello Han,

"Han Ding"  writes:

> Commit ba07cf222a added the feature "Handle gratuitous ARP requests and
> replies in tnl_arp_snoop()". But commit 83c2757bd1 only allows ARP packets
> whose destination address matches one of the known xbridge addresses,
> so the modification made by commit ba07cf222a is not effective. When OVS
> receives a gratuitous ARP from the underlay gateway, in which the source and
> destination addresses are both the gateway IP, the tunnel neighbor is not
> updated.
>
>

I think it would be clearer formatting the commits like below:

$ git -P show -s --format="%h (\"%s\")" --abbrev=12 ba07cf222a
ba07cf222a0c ("Handle gratuitous ARP requests and replies in tnl_arp_snoop()")

$ git -P show -s --format="%h (\"%s\")" --abbrev=12 83c2757bd1
83c2757bd16e ("xlate: Move tnl_neigh_snoop() to terminate_native_tunnel()")

I guess that the last commit deserves a Fixes tag as well.

> Signed-off-by: Han Ding 
> ---
>
> Notes:
> v3
> Correct the spell mistake.
>
> v2
> Change author name.  
>
>  ofproto/ofproto-dpif-xlate.c | 10 +++---
>  tests/tunnel-push-pop.at | 20 
>  2 files changed, 27 insertions(+), 3 deletions(-)
>
> diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
> index 8e5d030ac..6c69f981b 100644
> --- a/ofproto/ofproto-dpif-xlate.c
> +++ b/ofproto/ofproto-dpif-xlate.c
> @@ -4126,6 +4126,11 @@ xport_has_ip(const struct xport *xport)
>  return n_in6 ? true : false;
>  }
>
> +#define IS_VALID_NEIGHBOR_REPLY(flow, ctx) \
> +((flow->dl_type == htons(ETH_TYPE_ARP) || \
> +  flow->nw_proto == IPPROTO_ICMPV6) && \
> + is_neighbor_reply_correct(ctx, flow))
> +

Although terminate_native_tunnel() would be the only user, I guess a
static function could be ok here, instead.

>  static bool
>  terminate_native_tunnel(struct xlate_ctx *ctx, const struct xport *xport,
>  struct flow *flow, struct flow_wildcards *wc,
> @@ -4146,9 +4151,8 @@ terminate_native_tunnel(struct xlate_ctx *ctx, const 
> struct xport *xport,
>  /* If no tunnel port was found and it's about an ARP or ICMPv6 
> packet,
>   * do tunnel neighbor snooping. */
>  if (*tnl_port == ODPP_NONE &&
> -(flow->dl_type == htons(ETH_TYPE_ARP) ||
> - flow->nw_proto == IPPROTO_ICMPV6) &&
> - is_neighbor_reply_correct(ctx, flow)) {
> +(IS_VALID_NEIGHBOR_REPLY(flow, ctx) ||
> + is_garp(flow, wc))) {

AFAICT, this seems ok to me and the tests related to tunnel_push_pop
succeed. There's probably some room for improvement in the code down to
tnl_arp_snoop(), but I guess it's a bit out of scope of this patch.

>  tnl_neigh_snoop(flow, wc, ctx->xbridge->name,
>  ctx->xin->allow_side_effects);
>  } else if (*tnl_port != ODPP_NONE &&
> diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at
> index c63344196..0bac362f4 100644
> --- a/tests/tunnel-push-pop.at
> +++ b/tests/tunnel-push-pop.at
> @@ -369,6 +369,26 @@ AT_CHECK([ovs-appctl tnl/neigh/show | grep br | sort], 
> [0], [dnl
>  1.1.2.92  f8:bc:12:44:34:b6   br0
>  ])
>
> +dnl Receiving Gratuitous ARP request with correct VLAN id should alter 
> tunnel neighbor cache
> +AT_CHECK([ovs-appctl netdev-dummy/receive p0 
> 'recirc_id(0),in_port(1),eth(src=f8:bc:12:44:34:c8,dst=ff:ff:ff:ff:ff:ff),eth_type(0x8100),vlan(vid=10,pcp=7),encap(eth_type(0x0806),arp(sip=1.1.2.92,tip=1.1.2.92,op=1,sha=f8:bc:12:44:34:c8,tha=00:00:00:00:00:00))'])
> +
> +ovs-appctl time/warp 1000
> +ovs-appctl time/warp 1000
> +
> +AT_CHECK([ovs-appctl tnl/neigh/show | grep br | sort], [0], [dnl
> +1.1.2.92  f8:bc:12:44:34:c8   br0
> +])
> +
> +dnl Receiving Gratuitous ARP reply with correct VLAN id should alter tunnel 
> neighbor cache
> +AT_CHECK([ovs-appctl netdev-dummy/receive p0 
> 'recirc_id(0),in_port(1),eth(src=f8:bc:12:44:34:b2,dst=ff:ff:ff:ff:ff:ff),eth_type(0x8100),vlan(vid=10,pcp=7),encap(eth_type(0x0806),arp(sip=1.1.2.92,tip=1.1.2.92,op=2,sha=f8:bc:12:44:34:b2,tha=f8:bc:12:44:34:b2))'])
> +
> +ovs-appctl time/warp 1000
> +ovs-appctl time/warp 1000
> +
> +AT_CHECK([ovs-appctl tnl/neigh/show | grep br | sort], [0], [dnl
> +1.1.2.92  f8:bc:12:44:34:b2   br0
> +])
> +
>  dnl Receive ARP reply without VLAN header
>  AT_CHECK([ovs-vsctl set port br0 tag=0])
>  AT_CHECK([ovs-appctl tnl/neigh/flush], [0], [OK
> --
> 2.27.0
>
>
>
>
> ___
> dev mailing list
> d...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH 2/2] ct-dpif: Do not show flag key if empty.

2022-09-09 Thread Paolo Valerio
Ilya Maximets  writes:

> On 8/4/22 18:07, Paolo Valerio wrote:
>> This patch avoids showing the flags_orig/flags_reply keys if they have no value.
>> E.g., the following:
>> 
>> NEW tcp,orig=([...]),reply=([...]),id=1800618864,
>> status=CONFIRMED|SRC_NAT_DONE|DST_NAT_DONE,timeout=120,
>> protoinfo=(state_orig=SYN_SENT,state_reply=SYN_SENT,wscale_orig=7,
>>wscale_reply=0,flags_orig=WINDOW_SCALE|SACK_PERM,flags_reply=)
>> 
>> becomes:
>> 
>> NEW tcp,orig=([...]),reply=([...]),id=1800618864,
>> status=CONFIRMED|SRC_NAT_DONE|DST_NAT_DONE,timeout=120,
>> protoinfo=(state_orig=SYN_SENT,state_reply=SYN_SENT,wscale_orig=7,
>>        wscale_reply=0,flags_orig=WINDOW_SCALE|SACK_PERM)
>> 
>> Signed-off-by: Paolo Valerio 
>> ---
>>  lib/ct-dpif.c |   14 ++
>>  1 file changed, 10 insertions(+), 4 deletions(-)
>> 
>> diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
>> index cfc2315e3..f1a375523 100644
>> --- a/lib/ct-dpif.c
>> +++ b/lib/ct-dpif.c
>> @@ -512,10 +512,16 @@ ct_dpif_format_protoinfo_tcp_verbose(struct ds *ds,
>>protoinfo->tcp.wscale_orig,
>>protoinfo->tcp.wscale_reply);
>>  }
>> -ct_dpif_format_flags(ds, ",flags_orig=", protoinfo->tcp.flags_orig,
>> - tcp_flags);
>> -ct_dpif_format_flags(ds, ",flags_reply=", protoinfo->tcp.flags_reply,
>> - tcp_flags);
>> +
>> +if (protoinfo->tcp.flags_orig) {
>> +ct_dpif_format_flags(ds, ",flags_orig=", protoinfo->tcp.flags_orig,
>> + tcp_flags);
>> +}
>> +
>> +if (protoinfo->tcp.flags_reply) {
>> +ct_dpif_format_flags(ds, ",flags_reply=", 
>> protoinfo->tcp.flags_reply,
>> + tcp_flags);
>> +}
>
> Hmm.  I'm trying to understand why ct_dpif_format_flags() exists at all.
> Shouldn't this be just:
>
>   format_flags_masked(ds, "flags_orig", packet_tcp_flag_to_string,
>   protoinfo->tcp.flags_orig, TCP_FLAGS(OVS_BE16_MAX),
>   TCP_FLAGS(OVS_BE16_MAX));
>
> ?
>
> This will change the appearance of the flags, so maybe tcp_flags[] array
> should be replaced with a simple conversion function.
>

Uhm, I guess you're right. It seems redundant and could be removed.
What about something like this?

diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
index cfc2315e3..6f17a26b5 100644
--- a/lib/ct-dpif.c
+++ b/lib/ct-dpif.c
@@ -35,20 +35,11 @@ static void ct_dpif_format_counters(struct ds *,
 const struct ct_dpif_counters *);
 static void ct_dpif_format_timestamp(struct ds *,
  const struct ct_dpif_timestamp *);
-static void ct_dpif_format_flags(struct ds *, const char *title,
- uint32_t flags, const struct flags *);
 static void ct_dpif_format_protoinfo(struct ds *, const char *title,
  const struct ct_dpif_protoinfo *,
  bool verbose);
 static void ct_dpif_format_helper(struct ds *, const char *title,
   const struct ct_dpif_helper *);
-
-static const struct flags ct_dpif_status_flags[] = {
-#define CT_DPIF_STATUS_FLAG(FLAG) { CT_DPIF_STATUS_##FLAG, #FLAG },
-CT_DPIF_STATUS_FLAGS
-#undef CT_DPIF_STATUS_FLAG
-{ 0, NULL } /* End marker. */
-};
 
 /* Dumping */
 
@@ -275,6 +266,20 @@ ct_dpif_entry_uninit(struct ct_dpif_entry *entry)
 }
 }
 
+static const char *
+ct_dpif_status_flags(uint32_t flags)
+{
+switch (flags) {
+#define CT_DPIF_STATUS_FLAG(FLAG) \
+case CT_DPIF_STATUS_##FLAG: \
+return #FLAG;
+CT_DPIF_STATUS_FLAGS
+#undef CT_DPIF_STATUS_FLAG
+default:
+return NULL;
+}
+}
+
 void
 ct_dpif_format_entry(const struct ct_dpif_entry *entry, struct ds *ds,
  bool verbose, bool print_stats)
@@ -305,8 +310,9 @@ ct_dpif_format_entry(const struct ct_dpif_entry *entry, 
struct ds *ds,
 ds_put_format(ds, ",zone=%"PRIu16, entry->zone);
 }
 if (verbose) {
-ct_dpif_format_flags(ds, ",status=", entry->status,
- ct_dpif_status_flags);
+format_flags_masked(ds, ",status", ct_dpif_status_flags,
+entry->status, CT_DPIF_STATUS_MASK,
+CT_DPIF_STATUS_MASK);
 }
 if (print_stats) {
 ds_put_format(ds, ",timeout=%"PRIu32, entry->timeout);
@@ -415,28 +421,6 @@ ct_dpif_format_tuple(struct ds *ds, const struct 
ct_dpif_tuple *tuple)

Re: [ovs-dev] [PATCH] system-traffic: Fix IPv4 fragmentation test sequence for check-kernel.

2022-08-09 Thread Paolo Valerio
Ilya Maximets  writes:

> On 8/5/22 23:49, Paolo Valerio wrote:
>> Ilya Maximets  writes:
>> 
>>> On 8/5/22 17:08, Paolo Valerio wrote:
>>>> The following test sequence:
>>>>
>>>> conntrack - IPv4 fragmentation incomplete reassembled packet
>>>> conntrack - IPv4 fragmentation with fragments specified
>>>>
>>>> leads to a systematic failure of the latter test on the kernel
>>>> datapath (linux).  Multiple executions of the former may also lead to
>>>> multiple failures.
>>>> This is due to the fact that fragments not yet reassembled are kept in
>>>> a queue for /proc/sys/net/ipv4/ipfrag_time seconds, and if the
>>>> kernel receives a fragment already present in the queue, it returns
>>>> -EINVAL.
>>>
>>> Thanks for the patch!  I've been looking at the issue earlier
>>> this week.  One thing I don't understand is that we're reloading
>>> all the netfilter modules between tests, shouldn't this clear
>>> all the pending queues?  Or this re-assembly is happening outside
>>> of the conntrack?
>>>
>> 
>> That's a fair point.
>> AFAICT, queues and the pending fragments sit in a per netns fragment
>> queue directory. In the case of the kernel dp ovs_dp_get_net(dp). If my
>> reading is correct, IPv4 pending fragments should be removed when the
>> netns is destroyed.
>
> Hmm, ok.  Thanks for the explanation.  I tried to prototype some
> change to run all tests in a separate namespace that gets removed
> after each test, but the integration with autotest doesn't work
> well this way.  I guess, we either need a way to put current shell
> (not the forked one) into a new namespace, for which I didn't find
> any supported APIs, or we'll have to heavily modify all the tests
> and macros, which doesn't sound like a lot of fun.
>

In general, the idea seems a good one to me aside from this specific
issue.
Yes, no APIs spotted, and I agree that all those modifications don't
sound particularly fun :)

> For now, I confirmed that the fix is working on my setup.
> Applied and backported down to 2.13.
>

Thank you Ilya!

> Best regards, Ilya Maximets.
>
>> 
>>>>
>>>> Below the related log message:
>>>> |00058|dpif|WARN|system@ovs-system: execute ct(commit) failed (Invalid 
>>>> argument)
>>>>   on packet 
>>>> udp,vlan_tci=0x,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,
>>>>   
>>>> nw_src=10.1.1.1,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=0,nw_ttl=0,nw_frag=first,tp_src=1,
>>>>   tp_dst=2 udp_csum:0
>>>>
>>>> Fix the sequence by sending the second fragment in "conntrack - IPv4
>>>> fragmentation incomplete reassembled packet", once the checks are
>>>> done.
>>>>
>>>> IPv6 tests are not affected as the defrag kernel code path pretends to
>>>> add the duplicate fragment to the queue returning -EINPROGRESS, when a
>>>> duplicate is detected.
>>>>
>>>> Signed-off-by: Paolo Valerio 
>>>> ---
>>>>  tests/system-traffic.at |5 +
>>>>  1 file changed, 5 insertions(+)
>>>>
>>>> diff --git a/tests/system-traffic.at b/tests/system-traffic.at
>>>> index 1a864057c..8497b4d9e 100644
>>>> --- a/tests/system-traffic.at
>>>> +++ b/tests/system-traffic.at
>>>> @@ -3452,6 +3452,11 @@ AT_CHECK([ovs-ofctl bundle br0 bundle.txt])
>>>>  AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], 
>>>> [dnl
>>>>  ])
>>>>  
>>>> +dnl Send the second fragment in order to avoid keeping the first fragment
>>>> +dnl in the queue until the expiration occurs. Fragments already queued, 
>>>> if resent,
>>>> +dnl may lead to failures on the kernel datapath.
>>>> +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1, 
>>>> packet=5054000a505400090800453100320011a4860a0101010a010102000100020008001020304050607080910203040506070809,
>>>>  actions=ct(commit)"])
>>>> +
>>>>  OVS_TRAFFIC_VSWITCHD_STOP
>>>>  AT_CLEANUP
>>>>  
>>>>
>> 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] system-traffic: Fix IPv4 fragmentation test sequence for check-kernel.

2022-08-05 Thread Paolo Valerio
Ilya Maximets  writes:

> On 8/5/22 17:08, Paolo Valerio wrote:
>> The following test sequence:
>> 
>> conntrack - IPv4 fragmentation incomplete reassembled packet
>> conntrack - IPv4 fragmentation with fragments specified
>> 
>> leads to a systematic failure of the latter test on the kernel
>> datapath (linux).  Multiple executions of the former may also lead to
>> multiple failures.
>> This is due to the fact that fragments not yet reassembled are kept in
>> a queue for /proc/sys/net/ipv4/ipfrag_time seconds, and if the
>> kernel receives a fragment already present in the queue, it returns
>> -EINVAL.
>
> Thanks for the patch!  I've been looking at the issue earlier
> this week.  One thing I don't understand is that we're reloading
> all the netfilter modules between tests, shouldn't this clear
> all the pending queues?  Or this re-assembly is happening outside
> of the conntrack?
>

That's a fair point.
AFAICT, queues and the pending fragments sit in a per netns fragment
queue directory; in the case of the kernel dp, the netns is the one
returned by ovs_dp_get_net(dp). If my
reading is correct, IPv4 pending fragments should be removed when the
netns is destroyed.

>> 
>> Below the related log message:
>> |00058|dpif|WARN|system@ovs-system: execute ct(commit) failed (Invalid 
>> argument)
>>   on packet 
>> udp,vlan_tci=0x,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,
>>   
>> nw_src=10.1.1.1,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=0,nw_ttl=0,nw_frag=first,tp_src=1,
>>   tp_dst=2 udp_csum:0
>> 
>> Fix the sequence by sending the second fragment in "conntrack - IPv4
>> fragmentation incomplete reassembled packet", once the checks are
>> done.
>> 
>> IPv6 tests are not affected as the defrag kernel code path pretends to
>> add the duplicate fragment to the queue returning -EINPROGRESS, when a
>> duplicate is detected.
>> 
>> Signed-off-by: Paolo Valerio 
>> ---
>>  tests/system-traffic.at |5 +
>>  1 file changed, 5 insertions(+)
>> 
>> diff --git a/tests/system-traffic.at b/tests/system-traffic.at
>> index 1a864057c..8497b4d9e 100644
>> --- a/tests/system-traffic.at
>> +++ b/tests/system-traffic.at
>> @@ -3452,6 +3452,11 @@ AT_CHECK([ovs-ofctl bundle br0 bundle.txt])
>>  AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl
>>  ])
>>  
>> +dnl Send the second fragment in order to avoid keeping the first fragment
>> +dnl in the queue until the expiration occurs. Fragments already queued, if 
>> resent,
>> +dnl may lead to failures on the kernel datapath.
>> +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1, 
>> packet=5054000a505400090800453100320011a4860a0101010a010102000100020008001020304050607080910203040506070809,
>>  actions=ct(commit)"])
>> +
>>  OVS_TRAFFIC_VSWITCHD_STOP
>>  AT_CLEANUP
>>  
>> 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] system-traffic: Fix IPv4 fragmentation test sequence for check-kernel.

2022-08-05 Thread Paolo Valerio
The following test sequence:

conntrack - IPv4 fragmentation incomplete reassembled packet
conntrack - IPv4 fragmentation with fragments specified

leads to a systematic failure of the latter test on the kernel
datapath (linux).  Multiple executions of the former may also lead to
multiple failures.
This is due to the fact that fragments not yet reassembled are kept in
a queue for /proc/sys/net/ipv4/ipfrag_time seconds, and if the
kernel receives a fragment already present in the queue, it returns
-EINVAL.

Below the related log message:
|00058|dpif|WARN|system@ovs-system: execute ct(commit) failed (Invalid argument)
  on packet 
udp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,
  
nw_src=10.1.1.1,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=0,nw_ttl=0,nw_frag=first,tp_src=1,
  tp_dst=2 udp_csum:0

Fix the sequence by sending the second fragment in "conntrack - IPv4
fragmentation incomplete reassembled packet", once the checks are
done.

IPv6 tests are not affected as the defrag kernel code path pretends to
add the duplicate fragment to the queue returning -EINPROGRESS, when a
duplicate is detected.

Signed-off-by: Paolo Valerio 
---
 tests/system-traffic.at |5 +
 1 file changed, 5 insertions(+)

diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index 1a864057c..8497b4d9e 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -3452,6 +3452,11 @@ AT_CHECK([ovs-ofctl bundle br0 bundle.txt])
 AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl
 ])
 
+dnl Send the second fragment in order to avoid keeping the first fragment
+dnl in the queue until the expiration occurs. Fragments already queued, if 
resent,
+dnl may lead to failures on the kernel datapath.
+AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1, 
packet=5054000a505400090800453100320011a4860a0101010a010102000100020008001020304050607080910203040506070809,
 actions=ct(commit)"])
+
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 1/2] netlink-conntrack: Do not fail to parse if optional TCP protocol attributes are not found.

2022-08-04 Thread Paolo Valerio
Some of the CTA_PROTOINFO_TCP nested attributes are not always
included in the received message, but the parsing logic considers them
as required, failing in case they are not found.

This was observed while monitoring some connections by reading the
events sent by conntrack:

./ovstest test-netlink-conntrack monitor
[...]
2022-08-04T09:39:02Z|00007|netlink_conntrack|ERR|Could not parse nested TCP protoinfo
  options. Possibly incompatible Linux kernel version.
2022-08-04T09:39:02Z|00008|netlink_notifier|WARN|unexpected netlink message contents
[...]

All the TCP DELETE/DESTROY events fail to parse with the message
above.

Fix it by turning the relevant attributes to optional.

Signed-off-by: Paolo Valerio 
---
- [1] is the related piece of code that skips flags and wscale for the
  destroy events.

[1] 
https://github.com/torvalds/linux/blob/master/net/netfilter/nf_conntrack_proto_tcp.c#L1202
---
 lib/netlink-conntrack.c |   45 +++--
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/lib/netlink-conntrack.c b/lib/netlink-conntrack.c
index 78f1bf60b..4fcde9ba1 100644
--- a/lib/netlink-conntrack.c
+++ b/lib/netlink-conntrack.c
@@ -672,13 +672,13 @@ nl_ct_parse_protoinfo_tcp(struct nlattr *nla,
 static const struct nl_policy policy[] = {
 [CTA_PROTOINFO_TCP_STATE] = { .type = NL_A_U8, .optional = false },
 [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NL_A_U8,
-.optional = false },
+.optional = true },
 [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NL_A_U8,
- .optional = false },
+ .optional = true },
 [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .type = NL_A_U16,
-   .optional = false },
+   .optional = true },
 [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .type = NL_A_U16,
-.optional = false },
+.optional = true },
 };
 struct nlattr *attrs[ARRAY_SIZE(policy)];
 bool parsed;
@@ -695,20 +695,29 @@ nl_ct_parse_protoinfo_tcp(struct nlattr *nla,
  * connection, but our structures store a separate state for
  * each endpoint.  Here we duplicate the state. */
 protoinfo->tcp.state_orig = protoinfo->tcp.state_reply = state;
-protoinfo->tcp.wscale_orig = nl_attr_get_u8(
-attrs[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
-protoinfo->tcp.wscale_reply = nl_attr_get_u8(
-attrs[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
-flags_orig =
-nl_attr_get_unspec(attrs[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL],
-   sizeof *flags_orig);
-protoinfo->tcp.flags_orig =
-ip_ct_tcp_flags_to_dpif(flags_orig->flags);
-flags_reply =
-nl_attr_get_unspec(attrs[CTA_PROTOINFO_TCP_FLAGS_REPLY],
-   sizeof *flags_reply);
-protoinfo->tcp.flags_reply =
-ip_ct_tcp_flags_to_dpif(flags_reply->flags);
+
+if (attrs[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]) {
+protoinfo->tcp.wscale_orig =
+nl_attr_get_u8(attrs[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
+}
+if (attrs[CTA_PROTOINFO_TCP_WSCALE_REPLY]) {
+protoinfo->tcp.wscale_reply =
+nl_attr_get_u8(attrs[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
+}
+if (attrs[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) {
+flags_orig =
+nl_attr_get_unspec(attrs[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL],
+   sizeof *flags_orig);
+protoinfo->tcp.flags_orig =
+ip_ct_tcp_flags_to_dpif(flags_orig->flags);
+}
+if (attrs[CTA_PROTOINFO_TCP_FLAGS_REPLY]) {
+flags_reply =
+nl_attr_get_unspec(attrs[CTA_PROTOINFO_TCP_FLAGS_REPLY],
+   sizeof *flags_reply);
+protoinfo->tcp.flags_reply =
+ip_ct_tcp_flags_to_dpif(flags_reply->flags);
+}
 } else {
 VLOG_ERR_RL(&rl, "Could not parse nested TCP protoinfo options. "
 "Possibly incompatible Linux kernel version.");

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 2/2] ct-dpif: Do not show flag key if empty.

2022-08-04 Thread Paolo Valerio
This patch avoids showing the flags_orig/flags_reply keys if they have no value.
E.g., the following:

NEW tcp,orig=([...]),reply=([...]),id=1800618864,
status=CONFIRMED|SRC_NAT_DONE|DST_NAT_DONE,timeout=120,
protoinfo=(state_orig=SYN_SENT,state_reply=SYN_SENT,wscale_orig=7,
   wscale_reply=0,flags_orig=WINDOW_SCALE|SACK_PERM,flags_reply=)

becomes:

NEW tcp,orig=([...]),reply=([...]),id=1800618864,
status=CONFIRMED|SRC_NAT_DONE|DST_NAT_DONE,timeout=120,
protoinfo=(state_orig=SYN_SENT,state_reply=SYN_SENT,wscale_orig=7,
   wscale_reply=0,flags_orig=WINDOW_SCALE|SACK_PERM)

Signed-off-by: Paolo Valerio 
---
 lib/ct-dpif.c |   14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c
index cfc2315e3..f1a375523 100644
--- a/lib/ct-dpif.c
+++ b/lib/ct-dpif.c
@@ -512,10 +512,16 @@ ct_dpif_format_protoinfo_tcp_verbose(struct ds *ds,
   protoinfo->tcp.wscale_orig,
   protoinfo->tcp.wscale_reply);
 }
-ct_dpif_format_flags(ds, ",flags_orig=", protoinfo->tcp.flags_orig,
- tcp_flags);
-ct_dpif_format_flags(ds, ",flags_reply=", protoinfo->tcp.flags_reply,
- tcp_flags);
+
+if (protoinfo->tcp.flags_orig) {
+ct_dpif_format_flags(ds, ",flags_orig=", protoinfo->tcp.flags_orig,
+ tcp_flags);
+}
+
+if (protoinfo->tcp.flags_reply) {
+ct_dpif_format_flags(ds, ",flags_reply=", protoinfo->tcp.flags_reply,
+ tcp_flags);
+}
 }
 
 static void

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


  1   2   3   4   >