On Mon, Oct 4, 2021 at 12:37 PM Ihar Hrachyshka <[email protected]> wrote:
>
> The 15-bit port key range used for multicast groups can't be covered
> by 12-bit key space available for port keys in VXLAN. To make
> multicast keys work, we have to transform 16-bit multicast port keys
> to 12-bit keys before fanning out packets through VXLAN tunnels.
> Otherwise significant bits are not retained, and multicast / broadcast
> traffic does not reach ports located on other chassis.
>
> This patch introduces a mapping scheme between core 16-bit multicast
> port keys and 12-bit key range available in VXLAN. The scheme is as
> follows:
>
> 1) Before sending a packet through VXLAN tunnel, the most significant
> bit of a 16-bit port key is copied into the most significant bit of
> 12-bit VXLAN key. The 11 least significant bits of a 16-bit port
> key are copied to the least significant bits of 12-bit VXLAN key.
>
> 2) When receiving a packet through VXLAN tunnel, the most significant
> bit of a VXLAN 12-bit port key is copied into the most significant
> bit of 16-bit port key used in core. The 11 least significant bits
> of a VXLAN 12-bit port key are copied into the least significant
> bits of a 16-bit port key used in core.
>
> This change also implies that the range available for multicast port
> keys is more limited and fits into 11-bit space. The same rule should
> be enforced for unicast port keys, like we already do for tunnel keys
> when a VXLAN encap is present in a cluster. This enforcement is
> implied here but missing in master and will be implemented in a
> separate patch. (The missing enforcement is an oversight of the
> original patch that added support for VXLAN tunnels.)
>
> Fixes: b07f1bc3d068 ("Add VXLAN support for non-VTEP datapath bindings")
> Signed-off-by: Ihar Hrachyshka <[email protected]>
Thanks for fixing this.
The approach taken makes sense to me. I applied this patch to the main branch
and branch-21.09.
If this needs to be backported to other branches, then please submit
backport patches
as it doesn't apply cleanly to branch-21.06.
Numan
>
> --
>
> v1: initial commit
> v2: updated docs
> v2: removed newly added but unused macros
> v3: rebase
> v4: fix memory leak in consider_mc_group
> ---
> controller-vtep/gateway.c | 2 +
> controller/physical.c | 102 ++++++++++++++++++++++++++++++--------
> lib/mcast-group-index.h | 2 +
> ovn-architecture.7.xml | 13 +++--
> tests/ovn.at | 23 +++++++--
> 5 files changed, 112 insertions(+), 30 deletions(-)
>
> diff --git a/controller-vtep/gateway.c b/controller-vtep/gateway.c
> index e9419138b..288772dc4 100644
> --- a/controller-vtep/gateway.c
> +++ b/controller-vtep/gateway.c
> @@ -61,6 +61,8 @@ create_chassis_rec(struct ovsdb_idl_txn *txn, const char
> *name,
> sbrec_encap_set_options(encap_rec, &options);
> sbrec_encap_set_chassis_name(encap_rec, name);
> sbrec_chassis_set_encaps(chassis_rec, &encap_rec, 1);
> + const struct smap oc = SMAP_CONST1(&oc, "is-vtep", "true");
> + sbrec_chassis_set_other_config(chassis_rec, &oc);
>
> return chassis_rec;
> }
> diff --git a/controller/physical.c b/controller/physical.c
> index 0cfb158c8..58e4157f1 100644
> --- a/controller/physical.c
> +++ b/controller/physical.c
> @@ -37,6 +37,7 @@
> #include "openvswitch/ofp-parse.h"
> #include "ovn-controller.h"
> #include "lib/chassis-index.h"
> +#include "lib/mcast-group-index.h"
> #include "lib/ovn-sb-idl.h"
> #include "lib/ovn-util.h"
> #include "physical.h"
> @@ -63,6 +64,7 @@ static void
> load_logical_ingress_metadata(const struct sbrec_port_binding *binding,
> const struct zone_ids *zone_ids,
> struct ofpbuf *ofpacts_p);
> +static int64_t get_vxlan_port_key(int64_t port_key);
>
> /* UUID to identify OF flows not associated with ovsdb rows. */
> static struct uuid *hc_uuid = NULL;
> @@ -160,8 +162,9 @@ put_encapsulation(enum mf_field_id mff_ovn_geneve,
> } else if (tun->type == VXLAN) {
> uint64_t vni = datapath->tunnel_key;
> if (!is_ramp_switch) {
> - /* Only some bits are used for regular tunnels. */
> - vni |= (uint64_t) outport << 12;
> + /* Map southbound 16-bit port key to limited 12-bit range
> + * available for VXLAN, which differs for multicast groups. */
> + vni |= get_vxlan_port_key(outport) << 12;
> }
> put_load(vni, MFF_TUN_ID, 0, 24, ofpacts);
> } else {
> @@ -1372,6 +1375,43 @@ out:
> }
> }
>
> +static int64_t
> +get_vxlan_port_key(int64_t port_key)
> +{
> + if (port_key >= OVN_MIN_MULTICAST) {
> + /* 0b1<11 least significant bits> */
> + return OVN_VXLAN_MIN_MULTICAST |
> + (port_key & (OVN_VXLAN_MIN_MULTICAST - 1));
> + }
> + return port_key;
> +}
> +
> +static void
> +fanout_to_chassis(enum mf_field_id mff_ovn_geneve,
> + struct sset *remote_chassis,
> + const struct hmap *chassis_tunnels,
> + const struct sbrec_datapath_binding *datapath,
> + uint16_t outport, bool is_ramp_switch,
> + struct ofpbuf *remote_ofpacts)
> +{
> + const char *chassis_name;
> + const struct chassis_tunnel *prev = NULL;
> + SSET_FOR_EACH (chassis_name, remote_chassis) {
> + const struct chassis_tunnel *tun
> + = chassis_tunnel_find(chassis_tunnels, chassis_name, NULL);
> + if (!tun) {
> + continue;
> + }
> +
> + if (!prev || tun->type != prev->type) {
> + put_encapsulation(mff_ovn_geneve, tun, datapath,
> + outport, is_ramp_switch, remote_ofpacts);
> + prev = tun;
> + }
> + ofpact_put_OUTPUT(remote_ofpacts)->port = tun->ofport;
> + }
> +}
> +
> static void
> consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
> enum mf_field_id mff_ovn_geneve,
> @@ -1390,6 +1430,7 @@ consider_mc_group(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
> }
>
> struct sset remote_chassis = SSET_INITIALIZER(&remote_chassis);
> + struct sset vtep_chassis = SSET_INITIALIZER(&vtep_chassis);
> struct match match;
>
> match_init_catchall(&match);
> @@ -1398,7 +1439,8 @@ consider_mc_group(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
>
> /* Go through all of the ports in the multicast group:
> *
> - * - For remote ports, add the chassis to 'remote_chassis'.
> + * - For remote ports, add the chassis to 'remote_chassis' or
> + * 'vtep_chassis'.
> *
> * - For local ports (other than logical patch ports), add actions
> * to 'ofpacts' to set the output port and resubmit.
> @@ -1465,7 +1507,12 @@ consider_mc_group(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
> /* Add remote chassis only when localnet port not exist,
> * otherwise multicast will reach remote ports through localnet
> * port. */
> - sset_add(&remote_chassis, port->chassis->name);
> + if (smap_get_bool(&port->chassis->other_config,
> + "is-vtep", false)) {
> + sset_add(&vtep_chassis, port->chassis->name);
> + } else {
> + sset_add(&remote_chassis, port->chassis->name);
> + }
> }
> }
>
> @@ -1490,7 +1537,8 @@ consider_mc_group(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
> *
> * Handle output to the remote chassis in the multicast group, if
> * any. */
> - if (!sset_is_empty(&remote_chassis) || remote_ofpacts.size > 0) {
> + if (!sset_is_empty(&remote_chassis) ||
> + !sset_is_empty(&vtep_chassis) || remote_ofpacts.size > 0) {
> if (remote_ofpacts.size > 0) {
> /* Following delivery to logical patch ports, restore the
> * multicast group as the logical output port. */
> @@ -1498,22 +1546,12 @@ consider_mc_group(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
> &remote_ofpacts);
> }
>
> - const char *chassis_name;
> - const struct chassis_tunnel *prev = NULL;
> - SSET_FOR_EACH (chassis_name, &remote_chassis) {
> - const struct chassis_tunnel *tun
> - = chassis_tunnel_find(chassis_tunnels, chassis_name, NULL);
> - if (!tun) {
> - continue;
> - }
> -
> - if (!prev || tun->type != prev->type) {
> - put_encapsulation(mff_ovn_geneve, tun, mc->datapath,
> - mc->tunnel_key, true, &remote_ofpacts);
> - prev = tun;
> - }
> - ofpact_put_OUTPUT(&remote_ofpacts)->port = tun->ofport;
> - }
> + fanout_to_chassis(mff_ovn_geneve, &remote_chassis, chassis_tunnels,
> + mc->datapath, mc->tunnel_key, false,
> + &remote_ofpacts);
> + fanout_to_chassis(mff_ovn_geneve, &vtep_chassis, chassis_tunnels,
> + mc->datapath, mc->tunnel_key, true,
> + &remote_ofpacts);
>
> if (remote_ofpacts.size) {
> if (local_ports) {
> @@ -1527,6 +1565,7 @@ consider_mc_group(struct ovsdb_idl_index
> *sbrec_port_binding_by_name,
> ofpbuf_uninit(&ofpacts);
> ofpbuf_uninit(&remote_ofpacts);
> sset_destroy(&remote_chassis);
> + sset_destroy(&vtep_chassis);
> }
>
> bool
> @@ -1689,6 +1728,27 @@ physical_run(struct physical_ctx *p_ctx,
> &ofpacts, hc_uuid);
> }
>
> + /* Add VXLAN specific rules to transform port keys
> + * from 12 bits to 16 bits used elsewhere. */
> + HMAP_FOR_EACH (tun, hmap_node, p_ctx->chassis_tunnels) {
> + if (tun->type == VXLAN) {
> + ofpbuf_clear(&ofpacts);
> +
> + struct match match = MATCH_CATCHALL_INITIALIZER;
> + match_set_in_port(&match, tun->ofport);
> + ovs_be64 mcast_bits = htonll((OVN_VXLAN_MIN_MULTICAST << 12));
> + match_set_tun_id_masked(&match, mcast_bits, mcast_bits);
> +
> + put_load(1, MFF_LOG_OUTPORT, 15, 1, &ofpacts);
> + put_move(MFF_TUN_ID, 12, MFF_LOG_OUTPORT, 0, 11, &ofpacts);
> + put_move(MFF_TUN_ID, 0, MFF_LOG_DATAPATH, 0, 12, &ofpacts);
> + put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts);
> +
> + ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 105, 0,
> + &match, &ofpacts, hc_uuid);
> + }
> + }
> +
> /* Handle ramp switch encapsulations. */
> HMAP_FOR_EACH (tun, hmap_node, p_ctx->chassis_tunnels) {
> if (tun->type != VXLAN) {
> diff --git a/lib/mcast-group-index.h b/lib/mcast-group-index.h
> index 72af117a4..5bc725451 100644
> --- a/lib/mcast-group-index.h
> +++ b/lib/mcast-group-index.h
> @@ -23,6 +23,8 @@ struct sbrec_datapath_binding;
> #define OVN_MIN_MULTICAST 32768
> #define OVN_MAX_MULTICAST 65535
>
> +#define OVN_VXLAN_MIN_MULTICAST 2048
> +
> enum ovn_mcast_tunnel_keys {
>
> OVN_MCAST_FLOOD_TUNNEL_KEY = OVN_MIN_MULTICAST,
> diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml
> index 3d2910358..5c5acfaab 100644
> --- a/ovn-architecture.7.xml
> +++ b/ovn-architecture.7.xml
> @@ -2797,11 +2797,14 @@
> </li>
>
> <li>
> - 12-bit logical egress port identifier. IDs 0 through 32767 have the
> same
> - meaning as for logical ingress ports. IDs 32768 through 65535,
> - inclusive, may be assigned to logical multicast groups (see the
> - <code>tunnel_key</code> column in the OVN Southbound
> - <code>Multicast_Group</code> table).
> + 12-bit logical egress port identifier. IDs 0 through 2047 are used for
> + unicast output ports. IDs 2048 through 4095, inclusive, may be assigned
> + to logical multicast groups (see the <code>tunnel_key</code> column in
> + the OVN Southbound <code>Multicast_Group</code> table). For multicast
> + group tunnel keys, a special mapping scheme is used to internally
> + transform from internal OVN 16-bit keys to 12-bit values before sending
> + packets through a VXLAN tunnel, and back from 12-bit tunnel keys to
> + 16-bit values when receiving packets from a VXLAN tunnel.
> </li>
>
> <li>
> diff --git a/tests/ovn.at b/tests/ovn.at
> index 172b5c713..3ae4ef752 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -3271,8 +3271,13 @@ AT_SETUP([VLAN transparency, passthru=true, ARP
> responder disabled])
> ovn_start
>
> net_add net
> -check ovs-vsctl add-br br-phys
> -ovn_attach net br-phys 192.168.0.1
> +for i in 1 2; do
> + sim_add hv-$i
> + as hv-$i
> + check ovs-vsctl add-br br-phys
> + check ovs-vsctl set open . external-ids:ovn-bridge-mappings=phys:br-phys
> + ovn_attach net br-phys 192.168.0.$i 24 vxlan
> +done
>
> check ovn-nbctl ls-add ls
> check ovn-nbctl --wait=sb add Logical-Switch ls other_config
> vlan-passthru=true
> @@ -3283,6 +3288,7 @@ for i in 1 2; do
> done
>
> for i in 1 2; do
> + as hv-$i
> check ovs-vsctl add-port br-int vif$i -- set Interface vif$i
> external-ids:iface-id=lsp$i \
> options:tx_pcap=vif$i-tx.pcap \
> options:rxq_pcap=vif$i-rx.pcap \
> @@ -3298,18 +3304,27 @@ AT_CHECK([grep -w "ls_in_arp_rsp" lsflows | sort],
> [0], [dnl
> table=16(ls_in_arp_rsp ), priority=0 , match=(1), action=(next;)
> ])
>
> +for i in 1 2; do
> + : > $i.expected
> +done
> +
> test_arp() {
> local inport=$1 outport=$2 sha=$3 spa=$4 tpa=$5 reply_ha=$6
> tag=8100fefe
> local
> request=ffffffffffff${sha}${tag}08060001080006040001${sha}${spa}ffffffffffff${tpa}
> - ovs-appctl netdev-dummy/receive vif$inport $request
> + as hv-$inport ovs-appctl netdev-dummy/receive vif$inport $request
> echo $request >> $outport.expected
>
> local
> reply=${sha}${reply_ha}${tag}08060001080006040002${reply_ha}${tpa}${sha}${spa}
> - ovs-appctl netdev-dummy/receive vif$outport $reply
> + as hv-$outport ovs-appctl netdev-dummy/receive vif$outport $reply
> echo $reply >> $inport.expected
> }
>
> +# Pre-populate the hypervisors' ARP tables so that we don't lose any
> +# packets for ARP resolution (native tunneling doesn't queue packets
> +# for ARP resolution).
> +OVN_POPULATE_ARP
> +
> test_arp 1 2 f00000000001 0a000001 0a000002 f00000000002
> test_arp 2 1 f00000000002 0a000002 0a000001 f00000000001
>
> --
> 2.31.1
>
> _______________________________________________
> dev mailing list
> [email protected]
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev