Hi Numan, Ihar, Han, Mark,

would somebody find some time to take a look on this please?

Thanks.

Regards,
Vladislav Odintsov

> On 1 Apr 2022, at 11:32, Vladislav Odintsov <[email protected]> wrote:
> 
> When HW VTEP is used with OVN logical networks routed by
> Logical Routers, only directly-attached network can communicate
> with external VLAN (via vtep LSP). There is no connectivity
> between VLAN network which is located behind VTEP LSP and lswitch1
> behind LR.
> 
> Visual description:
> 
>        ┌──────┐
> ┌─────► │  VM  │
> │       └───┬──┘
> │    ┌──────┴───────┐
> │    │   lswitch1   │
> │    │192.168.0.0/24│
> │    └──────┬───────┘
> │         ┌─┴──┐
> │         │ LR │
> │         └─┬──┘
> │    ┌──────┴───────┐
> │    │   lswitch2   │
> │    │192.168.1.0/24│ ◄──┐
> │    └──────┬───────┘    │
> │       ┌───┴────┐       │
> │FAIL   │vtep LSP│     OK│
> │       └───┬────┘       │
> │        ┌──┴────┐       │
> └────────┤VLAN 10├───────┘
>         └───────┘
> 
> This patch adds support to have connectivity between such routed networks.
> User must set LRP gateway chassis (or ha chassis group) in the lswitch2's
> associated LRP, which will enable answering ARP requests for LR IP from
> VLAN (lswitch2) side.
> 
> Traffic from VLAN to overlay will go from HW VTEP switch through gateway
> chassis to hypervisor and return traffic will go directly from hypervisor
> to HW VTEP switch and then got forwarded to VLAN.
> 
> Now chassis, on which chassis redirect LRP is claimed, will answer ARP 
> requests
> coming from HW VTEP switch. For this reason the hairpin traffic from VTEP 
> switch
> is allowed:
> 
> 1. the blocking OF rule in table `OFTABLE_REMOTE_OUTPUT` which matched on
>   reg10=0x2/0x2 is changed to match on reg10=0x2/0x3. We check if traffic is
>   from VTEP (ramp) switch and also we check if loopback flag was not set and
>   only in this case such traffic is not allowed.
> 2. OF rule in table `OFTABLE_REMOTE_OUTPUT`, which directs traffic to HW VTEP
>   (ramp) switch is modified to flush IN_PORT in order to allow sending traffic
>   to OF port from which this traffic came (hairpin).
> 
> New flows are added to ls_in_ip_hairpin table to support ARP traffic hairpin
> for VTEP switches:
> 1. if traffic has come from VTEP (ramp) switch AND l3dgw ports of associated
>   LSPs reside on current chassis (is_chassis_resident()), such traffic
>   is passed to next table for further ARP responder processing. Priority of
>   this flow is 2000.
> 2. if traffic has come from VTEP (ramp) switch, send it to L2_LKUP table to
>   skip ARP responder processing. We serve ARP requests from VTEP lports only
>   on gateway chassis. Priority of this flow is 1000.
> 
> Signed-off-by: Vladislav Odintsov <[email protected]>
> ---
> controller-vtep/vtep.c  | 31 ++++++++++++++++++++---
> controller/physical.c   | 18 ++++++++++----
> northd/northd.c         | 50 +++++++++++++++++++++++++++++++++----
> northd/ovn-northd.8.xml | 55 +++++++++++++++++++++++++++++------------
> tests/ovn-northd.at     |  9 ++++---
> tests/ovn.at            | 37 +++++++++++++++++++++------
> 6 files changed, 159 insertions(+), 41 deletions(-)
> 
> diff --git a/controller-vtep/vtep.c b/controller-vtep/vtep.c
> index ecca00b54..c3d09425c 100644
> --- a/controller-vtep/vtep.c
> +++ b/controller-vtep/vtep.c
> @@ -311,6 +311,9 @@ vtep_macs_run(struct ovsdb_idl_txn *vtep_idl_txn, struct 
> shash *ucast_macs_rmts,
>                     hash_uint64((uint64_t) vtep_ls->tunnel_key[0]));
>     }
> 
> +    const char *dp, *peer;
> +    const struct sbrec_port_binding *lrp_pb, *peer_pb;
> +
>     SHASH_FOR_EACH (node, non_vtep_pbs) {
>         const struct sbrec_port_binding *port_binding_rec = node->data;
>         const struct sbrec_chassis *chassis_rec;
> @@ -324,7 +327,25 @@ vtep_macs_run(struct ovsdb_idl_txn *vtep_idl_txn, struct 
> shash *ucast_macs_rmts,
>             continue;
>         }
> 
> -        tnl_key = port_binding_rec->datapath->tunnel_key;
> +        if (!strcmp(port_binding_rec->type, "chassisredirect")) {
> +            dp = smap_get(&port_binding_rec->options, "distributed-port");
> +            lrp_pb = shash_find_data(non_vtep_pbs, dp);
> +            if (!lrp_pb) {
> +                continue;
> +            }
> +
> +            peer = smap_get(&lrp_pb->options, "peer");
> +            if (!peer) {
> +                continue;
> +            }
> +
> +            peer_pb = shash_find_data(non_vtep_pbs, peer);
> +            tnl_key = peer_pb->datapath->tunnel_key;
> +        }
> +        else {
> +            tnl_key = port_binding_rec->datapath->tunnel_key;
> +        }
> +
>         HMAP_FOR_EACH_WITH_HASH (ls_node, hmap_node,
>                                  hash_uint64((uint64_t) tnl_key),
>                                  &ls_map) {
> @@ -454,7 +475,7 @@ vtep_macs_run(struct ovsdb_idl_txn *vtep_idl_txn, struct 
> shash *ucast_macs_rmts,
> static bool
> vtep_lswitch_cleanup(struct ovsdb_idl *vtep_idl)
> {
> -   const struct vteprec_logical_switch *vtep_ls;
> +    const struct vteprec_logical_switch *vtep_ls;
>     bool done = true;
> 
>     VTEPREC_LOGICAL_SWITCH_FOR_EACH (vtep_ls, vtep_idl) {
> @@ -572,9 +593,11 @@ vtep_run(struct controller_vtep_ctx *ctx)
>     /* Collects and classifies 'Port_Binding's. */
>     SBREC_PORT_BINDING_FOR_EACH(port_binding_rec, ctx->ovnsb_idl) {
>         struct shash *target =
> -            !strcmp(port_binding_rec->type, "vtep") ? &vtep_pbs : 
> &non_vtep_pbs;
> +            !strcmp(port_binding_rec->type, "vtep") ? &vtep_pbs
> +                                                    : &non_vtep_pbs;
> 
> -        if (!port_binding_rec->chassis) {
> +        if (!port_binding_rec->chassis &&
> +            strcmp(port_binding_rec->type, "patch")) {
>             continue;
>         }
>         shash_add(target, port_binding_rec->logical_port, port_binding_rec);
> diff --git a/controller/physical.c b/controller/physical.c
> index 02fcd5ea8..2a70f4fe0 100644
> --- a/controller/physical.c
> +++ b/controller/physical.c
> @@ -299,9 +299,16 @@ put_remote_port_redirect_overlay(const struct
>         if (!rem_tun) {
>             return;
>         }
> +
> +        bool is_vtep_port = !strcmp(binding->type, "vtep");
> +        /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for ARP/ND
> +         * responder in L3 networks. */
> +        if (is_vtep_port) {
> +            put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, ofpacts_p);
> +        }
> +
>         put_encapsulation(mff_ovn_geneve, tun, binding->datapath, port_key,
> -                          !strcmp(binding->type, "vtep"),
> -                          ofpacts_p);
> +                          is_vtep_port, ofpacts_p);
>         /* Output to tunnel. */
>         ofpact_put_OUTPUT(ofpacts_p)->port = rem_tun->ofport;
>     } else {
> @@ -1841,12 +1848,13 @@ physical_run(struct physical_ctx *p_ctx,
>      * Handles packets received from a VXLAN tunnel which get resubmitted to
>      * OFTABLE_LOG_INGRESS_PIPELINE due to lack of needed metadata in VXLAN,
>      * explicitly skip sending back out any tunnels and resubmit to table 38
> -     * for local delivery.
> +     * for local delivery, except packets which have MLF_ALLOW_LOOPBACK bit
> +     * set.
>      */
>     struct match match;
>     match_init_catchall(&match);
> -    match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0,
> -                         MLF_RCV_FROM_RAMP, MLF_RCV_FROM_RAMP);
> +    match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0, MLF_RCV_FROM_RAMP,
> +                         MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK);
> 
>     /* Resubmit to table 38. */
>     ofpbuf_clear(&ofpacts);
> diff --git a/northd/northd.c b/northd/northd.c
> index a2cf8d6fc..0a055b519 100644
> --- a/northd/northd.c
> +++ b/northd/northd.c
> @@ -199,6 +199,7 @@ enum ovn_stage {
> #define REGBIT_LKUP_FDB           "reg0[11]"
> #define REGBIT_HAIRPIN_REPLY      "reg0[12]"
> #define REGBIT_ACL_LABEL          "reg0[13]"
> +#define REGBIT_FROM_RAMP          "reg0[14]"
> 
> #define REG_ORIG_DIP_IPV4         "reg1"
> #define REG_ORIG_DIP_IPV6         "xxreg1"
> @@ -5510,8 +5511,9 @@ build_lswitch_input_port_sec_op(
>     }
> 
>     if (!strcmp(op->nbsp->type, "vtep")) {
> +        ds_put_format(actions, REGBIT_FROM_RAMP" = 1; ");
>         ds_put_format(actions, "next(pipeline=ingress, table=%d);",
> -                      ovn_stage_get_table(S_SWITCH_IN_L2_LKUP));
> +                      ovn_stage_get_table(S_SWITCH_IN_HAIRPIN));
>     } else {
>         ds_put_cstr(actions, "next;");
>     }
> @@ -6960,6 +6962,45 @@ build_lb_hairpin(struct ovn_datapath *od, struct hmap 
> *lflows)
>     }
> }
> 
> +static void
> +build_vtep_hairpin(struct ovn_datapath *od, struct hmap *lflows)
> +{
> +    /* Ingress Pre-ARP flows for VTEP hairpining traffic. Priority 1000:
> +     * Packets that received from non-VTEP ports should continue processing. 
> */
> +
> +    char *action = xasprintf("next(pipeline=ingress, table=%d);",
> +                             ovn_stage_get_table(S_SWITCH_IN_L2_LKUP));
> +    /* send all traffic from VTEP directly to L2LKP table. */
> +    ovn_lflow_add(lflows, od, S_SWITCH_IN_HAIRPIN, 1000,
> +                  REGBIT_FROM_RAMP" == 1", action);
> +    free(action);
> +
> +    struct ds match = DS_EMPTY_INITIALIZER;
> +    int n_ports = od->n_router_ports;
> +    bool dp_has_l3dgw_ports = false;
> +    for (int i = 0; i < n_ports; i++) {
> +        if (is_l3dgw_port(od->router_ports[i]->peer)) {
> +            ds_put_format(&match, "%sis_chassis_resident(%s)%s",
> +                          i == 0 ? REGBIT_FROM_RAMP" == 1 && (" : "",
> +                          od->router_ports[i]->peer->cr_port->json_key,
> +                          i < n_ports - 1 ? " || " : ")");
> +            dp_has_l3dgw_ports = true;
> +        }
> +    }
> +
> +    /* Ingress pre-arp flow for traffic from VTEP (ramp) switch.
> +    * Priority 2000: Packets, that were received from VTEP (ramp) switch and
> +    * router ports of current datapath are l3dgw ports and they reside on
> +    * current chassis, should be passed to next table for ARP/ND hairpin
> +    * processing.
> +    */
> +    if (dp_has_l3dgw_ports) {
> +        ovn_lflow_add(lflows, od, S_SWITCH_IN_HAIRPIN, 2000, ds_cstr(&match),
> +                      "next;");
> +    }
> +    ds_destroy(&match);
> +}
> +
> /* Build logical flows for the forwarding groups */
> static void
> build_fwd_group_lflows(struct ovn_datapath *od, struct hmap *lflows)
> @@ -7652,6 +7693,7 @@ build_lswitch_lflows_pre_acl_and_acl(struct 
> ovn_datapath *od,
>         build_qos(od, lflows);
>         build_stateful(od, lflows);
>         build_lb_hairpin(od, lflows);
> +        build_vtep_hairpin(od, lflows);
>     }
> }
> 
> @@ -7680,8 +7722,7 @@ build_lswitch_lflows_admission_control(struct 
> ovn_datapath *od,
> }
> 
> /* Ingress table 18: ARP/ND responder, skip requests coming from localnet
> - * and vtep ports. (priority 100); see ovn-northd.8.xml for the
> - * rationale. */
> + * ports. (priority 100); see ovn-northd.8.xml for the rationale. */
> 
> static void
> build_lswitch_arp_nd_responder_skip_local(struct ovn_port *op,
> @@ -7689,8 +7730,7 @@ build_lswitch_arp_nd_responder_skip_local(struct 
> ovn_port *op,
>                                           struct ds *match)
> {
>     if (op->nbsp) {
> -        if ((!strcmp(op->nbsp->type, "localnet")) ||
> -            (!strcmp(op->nbsp->type, "vtep"))) {
> +        if (!strcmp(op->nbsp->type, "localnet")) {
>             ds_clear(match);
>             ds_put_format(match, "inport == %s", op->json_key);
>             ovn_lflow_add_with_lport_and_hint(lflows, op->od,
> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> index 4784bff04..08864acfe 100644
> --- a/northd/ovn-northd.8.xml
> +++ b/northd/ovn-northd.8.xml
> @@ -1094,6 +1094,29 @@
> 
>     <h3>Ingress Table 17: Hairpin</h3>
>     <ul>
> +      <li>
> +        A priority-2000 flow that passes traffic to next table for hairpin 
> ARP
> +        processing with
> +        <code>next;</code> action, which matches on:
> +        <ul>
> +          <li>
> +            <code>reg0[14]</code> register bit, that is set in
> +            ls_in_port_sec_l2 table for traffic received from HW VTEP (ramp)
> +            switch AND
> +          </li>
> +          <li>
> +            all Distrubited Gateway ports, which are associated with current 
> LS
> +            <code>router</code> type LSPs if 
> <code>is_chassis_resident()</code>
> +            for them is true.
> +          </li>
> +        </ul>
> +      </li>
> +      <li>
> +        A priority-1000 flow that matches on <code>reg0[14]</code> register
> +        bit that is set in ls_in_port_sec_l2 table for traffic received from 
> HW
> +        VTEP (ramp) switch. This traffic is passed to ingress table
> +        ls_in_l2_lkup.
> +      </li>
>       <li>
>         A priority-1 flow that hairpins traffic matched by non-default
>         flows in the Pre-Hairpin table. Hairpinning is done at L2, Ethernet
> @@ -1150,27 +1173,27 @@
>     </p>
> 
>     <p>
> -      Note that ARP requests received from <code>localnet</code> or
> -      <code>vtep</code> logical inports can either go directly to VMs, in
> -      which case the VM responds or can hit an ARP responder for a logical
> -      router port if the packet is used to resolve a logical router port
> -      next hop address.  In either case, logical switch ARP responder rules
> -      will not be hit.  It contains these logical flows:
> +      Note that ARP requests received from <code>localnet</code> logical
> +      inports can either go directly to VMs, in which case the VM responds or
> +      can hit an ARP responder for a logical router port if the packet is 
> used
> +      to resolve a logical router port next hop address.  In either case,
> +      logical switch ARP responder rules will not be hit.  It contains these
> +      logical flows:
>      </p>
> 
>     <ul>
>       <li>
>         Priority-100 flows to skip the ARP responder if inport is of type
> -        <code>localnet</code> or <code>vtep</code> and advances directly
> -        to the next table.  ARP requests sent to <code>localnet</code> or
> -        <code>vtep</code> ports can be received by multiple hypervisors.
> -        Now, because the same mac binding rules are downloaded to all
> -        hypervisors, each of the multiple hypervisors will respond.  This
> -        will confuse L2 learning on the source of the ARP requests.  ARP
> -        requests received on an inport of type <code>router</code> are not
> -        expected to hit any logical switch ARP responder flows.  However,
> -        no skip flows are installed for these packets, as there would be
> -        some additional flow cost for this and the value appears limited.
> +        <code>localnet</code> advances directly to the next table.  ARP
> +        requests sent to <code>localnet</code> ports can be received by
> +        multiple hypervisors.  Now, because the same mac binding rules are
> +        downloaded to all hypervisors, each of the multiple hypervisors will
> +        respond.  This will confuse L2 learning on the source of the ARP
> +        requests.  ARP requests received on an inport of type
> +        <code>router</code> are not expected to hit any logical switch ARP
> +        responder flows.  However, no skip flows are installed for these
> +        packets, as there would be some additional flow cost for this and the
> +        value appears limited.
>       </li>
> 
>       <li>
> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> index 17d4f31b3..eab84886a 100644
> --- a/tests/ovn-northd.at
> +++ b/tests/ovn-northd.at
> @@ -2592,9 +2592,10 @@ AT_CHECK([ovn-sbctl lflow-list sw0 | grep 
> ls_in_nat_hairpin | sort | sed 's/tabl
>   table=??(ls_in_nat_hairpin  ), priority=90   , match=(ip && reg0[[12]] == 
> 1), action=(ct_snat;)
> ])
> 
> -AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed 
> 's/table=../table=??/'], [0], [dnl
> +AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed 
> 's/table=../table=??/g'], [0], [dnl
>   table=??(ls_in_hairpin      ), priority=0    , match=(1), action=(next;)
>   table=??(ls_in_hairpin      ), priority=1    , match=((reg0[[6]] == 1 || 
> reg0[[12]] == 1)), action=(eth.dst <-> eth.src; outport = inport; 
> flags.loopback = 1; output;)
> +  table=??(ls_in_hairpin      ), priority=1000 , match=(reg0[[14]] == 1), 
> action=(next(pipeline=ingress, table=??);)
> ])
> 
> check ovn-nbctl -- ls-lb-del sw0 lb0
> @@ -2608,8 +2609,9 @@ AT_CHECK([ovn-sbctl lflow-list sw0 | grep 
> ls_in_nat_hairpin | sort | sed 's/tabl
>   table=??(ls_in_nat_hairpin  ), priority=0    , match=(1), action=(next;)
> ])
> 
> -AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed 
> 's/table=../table=??/'], [0], [dnl
> +AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed 
> 's/table=../table=??/g'], [0], [dnl
>   table=??(ls_in_hairpin      ), priority=0    , match=(1), action=(next;)
> +  table=??(ls_in_hairpin      ), priority=1000 , match=(reg0[[14]] == 1), 
> action=(next(pipeline=ingress, table=??);)
> ])
> 
> check ovn-nbctl -- add load_balancer_group $lbg load_balancer $lb0
> @@ -2627,9 +2629,10 @@ AT_CHECK([ovn-sbctl lflow-list sw0 | grep 
> ls_in_nat_hairpin | sort | sed 's/tabl
>   table=??(ls_in_nat_hairpin  ), priority=90   , match=(ip && reg0[[12]] == 
> 1), action=(ct_snat;)
> ])
> 
> -AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed 
> 's/table=../table=??/'], [0], [dnl
> +AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed 
> 's/table=../table=??/g'], [0], [dnl
>   table=??(ls_in_hairpin      ), priority=0    , match=(1), action=(next;)
>   table=??(ls_in_hairpin      ), priority=1    , match=((reg0[[6]] == 1 || 
> reg0[[12]] == 1)), action=(eth.dst <-> eth.src; outport = inport; 
> flags.loopback = 1; output;)
> +  table=??(ls_in_hairpin      ), priority=1000 , match=(reg0[[14]] == 1), 
> action=(next(pipeline=ingress, table=??);)
> ])
> 
> AT_CLEANUP
> diff --git a/tests/ovn.at b/tests/ovn.at
> index 166b5f72e..0e24f7abb 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -4023,10 +4023,20 @@ ovn-nbctl lsp-set-addresses lp-vtep unknown
> ovn-nbctl lsp-add lsw0 lpr
> ovn-nbctl lr-add lr
> ovn-nbctl lrp-add lr lrp1 f0:00:00:00:00:f1 192.168.1.1/24
> -ovn-nbctl set Logical_Switch_Port lpr type=router \
> -                             options:router-port=lrp1 \
> -    addresses='"f0:00:00:00:00:f1 192.168.1.1"'
> -
> +ovn-nbctl set Logical_Switch_Port lpr \
> +    type=router \
> +    options:router-port=lrp1 \
> +    addresses=router
> +ovn-nbctl lrp-set-gateway-chassis lrp1 hv1
> +
> +ovn-nbctl lsp-add lsw0 lpr2
> +ovn-nbctl lr-add lr2
> +ovn-nbctl lrp-add lr lrp2 f0:00:00:00:00:f2 192.168.1.254/24
> +ovn-nbctl set Logical_Switch_Port lpr2 \
> +    type=router \
> +    options:router-port=lrp2 \
> +    addresses=router
> +ovn-nbctl lrp-set-gateway-chassis lrp2 hv1
> 
> net_add n1               # Network to connect hv1, hv2, and vtep
> net_add n2               # Network to connect vtep and hv3
> @@ -4147,17 +4157,28 @@ for s in 1 2 3; do
>     test_packet $s f0000000ffff f0000000000$s 0${s}66 $unknown           #3
> done
> 
> -# ARP request should not be responded to by logical switch router
> -# type arp responder on HV1 and HV2 and should reach directly to
> -# vif1 and vif2
> +# ARP request from VTEP to LRP should be responded by ARP responder.
> sha=f00000000003
> spa=`ip_to_hex 192 168 1 2`
> tpa=`ip_to_hex 192 168 1 1`
> request=ffffffffffff${sha}08060001080006040001${sha}${spa}ffffffffffff${tpa}
> as hv3 ovs-appctl netdev-dummy/receive vif3 $request
> -echo $request >> 1.expected
> echo $request >> 2.expected
> 
> +lrpmac=f000000000f1
> +response=${sha}${lrpmac}08060001080006040002${lrpmac}${tpa}${sha}${spa}
> +# since lrp1 has gateway chassis set on hv1, hv1 will suppress arp request 
> and
> +# answer with arp reply by OVS directly to vtep lport. all other lports,
> +# except lport from which this request was initiated, will receive arp 
> request.
> +# we expect arp reply packet on hv3
> +echo $response >> 3.expected
> +
> +AT_CHECK([ovn-sbctl lflow-list lsw0 | grep 'reg0[\[14\]]' | sort | sed 
> 's/table=../table=??/g'], [0], [dnl
> +  table=??(ls_in_port_sec_l2  ), priority=50   , match=(inport == 
> "lp-vtep"), action=(reg0[[14]] = 1; next(pipeline=ingress, table=??);)
> +  table=??(ls_in_hairpin      ), priority=1000 , match=(reg0[[14]] == 1), 
> action=(next(pipeline=ingress, table=??);)
> +  table=??(ls_in_hairpin      ), priority=2000 , match=(reg0[[14]] == 1 && 
> (is_chassis_resident("cr-lrp1") || is_chassis_resident("cr-lrp2"))), 
> action=(next;)
> +])
> +
> # dump information with counters
> echo "------ OVN dump ------"
> ovn-nbctl show
> -- 
> 2.26.3
> 

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to