On 2/16/24 07:52, Han Zhou wrote:
> In commit 41eefcb2807d, the format of external_ids:ovn-chassis-id for
> tunnels was modified to include the local encapsulation IP. This change
> can lead to the recreation of tunnels during an upgrade, potentially
> disrupting the dataplane temporarily, especially in large-scale
> environments.
> 
> This patch resolves the issue by recognizing the previous format. Thus,
> if the only modification is to the ID format, the tunnel will not be
> recreated. Instead, the external_ids will be updated directly. This
> approach ensures that the upgrade process is non-disruptive.
> 
> Fixes: 41eefcb2807d ("encaps: Create separate tunnels for multiple local 
> encap IPs.")
> Signed-off-by: Han Zhou <[email protected]>
> ---

Hi Han,

>  controller/encaps.c     | 83 ++++++++++++++++++++++++++++++++---------
>  tests/ovn-controller.at | 44 ++++++++++++++++++++++
>  2 files changed, 110 insertions(+), 17 deletions(-)
> 
> diff --git a/controller/encaps.c b/controller/encaps.c
> index 28237f6191c8..1d0d47523e77 100644
> --- a/controller/encaps.c
> +++ b/controller/encaps.c
> @@ -104,40 +104,62 @@ encaps_tunnel_id_create(const char *chassis_id, const 
> char *remote_encap_ip,
>                       '%', local_encap_ip);
>  }
>  
> +/*
> + * The older version of encaps_tunnel_id_create, which doesn't include
> + * local_encap_ip in the ID. This is used for backward compatibility support.
> + */
> +static char *
> +encaps_tunnel_id_create_old(const char *chassis_id,

Nit: I'd call this encaps_tunnel_id_create_legacy().

> +                            const char *remote_encap_ip)
> +{
> +    return xasprintf("%s%c%s", chassis_id, '@', remote_encap_ip);
> +}
> +
>  /*
>   * Parses a 'tunnel_id' of the form <chassis_name>@<remote IP>%<local IP>.
>   * If the 'chassis_id' argument is not NULL the function will allocate memory
>   * and store the chassis_name part of the tunnel-id at '*chassis_id'.
>   * Same for remote_encap_ip and local_encap_ip.
> + *
> + * The old form <chassis_name>@<remote IP> is also supported for backward
> + * compatibility during upgrade.
>   */
>  bool
>  encaps_tunnel_id_parse(const char *tunnel_id, char **chassis_id,
>                         char **remote_encap_ip, char **local_encap_ip)
>  {
> -    /* Find the @.  Fail if there is no @ or if any part is empty. */
> -    const char *d = strchr(tunnel_id, '@');
> -    if (d == tunnel_id || !d || !d[1]) {
> -        return false;
> +    char *tokstr = xstrdup(tunnel_id);
> +    char *saveptr = NULL;
> +    bool ret = false;
> +
> +    char *token_chassis = strtok_r(tokstr, "@", &saveptr);
> +    if (!token_chassis) {
> +        goto out;
>      }
>  
> -    /* Find the %.  Fail if there is no % or if any part is empty. */
> -    const char *d2 = strchr(d + 1, '%');
> -    if (d2 == d + 1 || !d2 || !d2[1]) {
> -        return false;
> +    char *token_remote_ip = strtok_r(NULL, "%", &saveptr);
> +    if (!token_remote_ip) {
> +        goto out;
>      }
>  
> +    char *token_local_ip = strtok_r(NULL, "", &saveptr);
> +
>      if (chassis_id) {
> -        *chassis_id = xmemdup0(tunnel_id, d - tunnel_id);
> +        *chassis_id = xstrdup(token_chassis);
>      }
> -
>      if (remote_encap_ip) {
> -        *remote_encap_ip = xmemdup0(d + 1, d2 - (d + 1));
> +        *remote_encap_ip = xstrdup(token_remote_ip);
>      }
> -
>      if (local_encap_ip) {
> -        *local_encap_ip = xstrdup(d2 + 1);
> +        /* To support backward compatibility during upgrade, ignore local ip 
> if
> +         * it is not encoded in the tunnel id yet. */
> +        *local_encap_ip = token_local_ip ? xstrdup(token_local_ip) : NULL;

This can be simplified to:

*local_encap_ip = nullable_xstrdup(token_local_ip);

>      }
> -    return true;
> +
> +    ret = true;
> +out:
> +    free(tokstr);
> +    return ret;
>  }

I think I'd use strsep instead, seems a tiny bit cleaner.  I see
encaps_tunnel_id_match() also uses strtok_r() so I'll leave it up
to you to decide whether we should follow the same style or not.

I'll leave the strsep() alternative here just in case:

bool
encaps_tunnel_id_parse(const char *tunnel_id, char **chassis_id,
                       char **remote_encap_ip, char **local_encap_ip)
{
    char *str = xstrdup(tunnel_id);
    char *next = str;
    bool ret = false;

    char *token_chassis = strsep(&next, "@");
    if (!token_chassis) {
        goto out;
    }

    char *token_remote_ip = strsep(&next, "%");
    if (!token_remote_ip) {
        goto out;
    }

    char *token_local_ip = next;
    if (chassis_id) {
        *chassis_id = xstrdup(token_chassis);
    }
    if (remote_encap_ip) {
        *remote_encap_ip = xstrdup(token_remote_ip);
    }
    if (local_encap_ip) {
        /* To support backward compatibility during upgrade, ignore local ip if
         * it is not encoded in the tunnel id yet. */
        *local_encap_ip = nullable_xstrdup(token_local_ip);
    }

    ret = true;
out:
    free(str);
    return ret;
}

>  
>  /*
> @@ -145,6 +167,10 @@ encaps_tunnel_id_parse(const char *tunnel_id, char 
> **chassis_id,
>   *      <chassis_id>@<remote_encap_ip>%<local_encap_ip>
>   * contains 'chassis_id' and, if specified, the given 'remote_encap_ip' and
>   * 'local_encap_ip'. Returns false otherwise.
> + *
> + * The old format <chassis_id>@<remote_encap_ip> is also supported for 
> backward
> + * compatibility during upgrade, and the local_encap_ip matching is ignored 
> in
> + * that case.
>   */
>  bool
>  encaps_tunnel_id_match(const char *tunnel_id, const char *chassis_id,
> @@ -166,8 +192,10 @@ encaps_tunnel_id_match(const char *tunnel_id, const char 
> *chassis_id,
>      }
>  
>      char *token_local_ip = strtok_r(NULL, "", &saveptr);
> -    if (local_encap_ip &&
> -        (!token_local_ip || strcmp(token_local_ip, local_encap_ip))) {
> +    if (!token_local_ip) {
> +        /* It is old format. To support backward compatibility during 
> upgrade,
> +         * just ignore local_ip. */
> +    } else if (local_encap_ip && strcmp(token_local_ip, local_encap_ip)) {
>          goto out;
>      }
>  
> @@ -189,6 +217,7 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
> sbrec_sb_global *sbg,
>      const char *dst_port = smap_get(&encap->options, "dst_port");
>      const char *csum = smap_get(&encap->options, "csum");
>      char *tunnel_entry_id = NULL;
> +    char *tunnel_entry_id_old = NULL;
>  
>      /*
>       * Since a chassis may have multiple encap-ip, we can't just add the
> @@ -198,6 +227,8 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
> sbrec_sb_global *sbg,
>       */
>      tunnel_entry_id = encaps_tunnel_id_create(new_chassis_id, encap->ip,
>                                                local_ip);
> +    tunnel_entry_id_old = encaps_tunnel_id_create_old(new_chassis_id,
> +                                                      encap->ip);
>      if (csum && (!strcmp(csum, "true") || !strcmp(csum, "false"))) {
>          smap_add(&options, "csum", csum);
>      }
> @@ -269,11 +300,28 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
> sbrec_sb_global *sbg,
>       * it). */
>      struct tunnel_node *tunnel = shash_find_data(&tc->tunnel,
>                                                   tunnel_entry_id);
> +    bool old_id_format = false;
> +    if (!tunnel) {
> +        tunnel = shash_find_data(&tc->tunnel, tunnel_entry_id_old);
> +        old_id_format = true;
> +    }
>      if (tunnel
>          && tunnel->port->n_interfaces == 1
>          && !strcmp(tunnel->port->interfaces[0]->type, encap->type)
>          && smap_equal(&tunnel->port->interfaces[0]->options, &options)) {
> -        shash_find_and_delete(&tc->tunnel, tunnel_entry_id);
> +        if (old_id_format) {
> +            /* We must be upgrading from an older version. We can reuse the
> +             * existing tunnel, but needs to update the tunnel's ID to the 
> new
> +             * format. */
> +            ovsrec_port_update_external_ids_setkey(tunnel->port, 
> OVN_TUNNEL_ID,
> +                                                   tunnel_entry_id);
> +            ovsrec_interface_update_external_ids_setkey(
> +                tunnel->port->interfaces[0], OVN_TUNNEL_ID, tunnel_entry_id);
> +
> +            shash_find_and_delete(&tc->tunnel, tunnel_entry_id_old);
> +        } else {
> +            shash_find_and_delete(&tc->tunnel, tunnel_entry_id);
> +        }
>          free(tunnel);
>          goto exit;
>      }
> @@ -306,6 +354,7 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
> sbrec_sb_global *sbg,
>  
>  exit:
>      free(tunnel_entry_id);
> +    free(tunnel_entry_id_old);
>      smap_destroy(&options);
>  }
>  
> diff --git a/tests/ovn-controller.at b/tests/ovn-controller.at
> index f77e032d4839..0b60806089cf 100644
> --- a/tests/ovn-controller.at
> +++ b/tests/ovn-controller.at
> @@ -2732,3 +2732,47 @@ OVS_WAIT_FOR_OUTPUT([ovn-appctl -t ovn-controller 
> connection-status], [0], [conn
>  
>  OVN_CLEANUP([hv1])
>  AT_CLEANUP
> +
> +OVN_FOR_EACH_NORTHD([
> +AT_SETUP([ovn-controller - tunnel chassis id backward compatibility])
> +
> +ovn_start
> +
> +net_add n1
> +sim_add hv1
> +as hv1
> +ovs-vsctl add-br br-phys
> +ovn_attach n1 br-phys 192.168.0.1
> +
> +ovn-nbctl --wait=sb sync

I assume this is to ensure that the the NB_Global/SB_Global contents are
populated before we continue.

In any case, it probably needs a "check" prefix.

> +wait_row_count Chassis 1 name=hv1
> +
> +ovn-sbctl chassis-add fakechassis geneve 192.168.0.2
> +fakech_tunnel=ovn-fakech-0
> +OVS_WAIT_UNTIL([ovs-vsctl list port $fakech_tunnel])
> +port_uuid=$(ovs-vsctl get port $fakech_tunnel _uuid)
> +
> +AT_CHECK([ovs-vsctl get port $fakech_tunnel external_ids:ovn-chassis-id], 
> [0], [dnl
> +"[email protected]%192.168.0.1"
> +])
> +
> +# Stop ovn-controller without deleting tunnel
> +ovs-appctl --timeout=10 -t ovn-controller exit --restart

This should be ovn-appctl and we should prefix it with "check".  Why
isn't the default 30 second timeout good enough?

I think this should do:
check ovn-appctl -t ovn-controller exit --restart

> +
> +# Change the tunnel external id to the old format, and then start
> +# ovn-controller, pretending we are upgrading from an older version.
> +ovs-vsctl set port $fakech_tunnel 
> external_ids:[email protected]
> +AT_CHECK([ovs-vsctl get port $fakech_tunnel external_ids:ovn-chassis-id], 
> [0], [dnl
> +"[email protected]"
> +])
> +
> +start_daemon ovn-controller
> +
> +# The tunnel id should be updated to the new format but the tunnel's uuid
> +# should kept the same (no recreation).
> +OVS_WAIT_UNTIL([test x$(ovs-vsctl get port $fakech_tunnel 
> external_ids:ovn-chassis-id) = x\"[email protected]%192.168.0.1\"])
> +AT_CHECK([test x"$port_uuid"=$(ovs-vsctl get port $fakech_tunnel _uuid)])
> +
> +OVN_CLEANUP([hv1])
> +AT_CLEANUP
> +])

With the above small issues fixed:

Acked-by: Dumitru Ceara <[email protected]>

Regards,
Dumitru

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to