Hi Numan, Ihar, Han, Mark, would somebody find some time to take a look on this please?
Thanks. Regards, Vladislav Odintsov > On 1 Apr 2022, at 11:32, Vladislav Odintsov <[email protected]> wrote: > > When HW VTEP is used with OVN logical networks routed by > Logical Routers, only directly-attached network can communicate > with external VLAN (via vtep LSP). There is no connectivity > between VLAN network which is located behind VTEP LSP and lswitch1 > behind LR. > > Visual description: > > ┌──────┐ > ┌─────► │ VM │ > │ └───┬──┘ > │ ┌──────┴───────┐ > │ │ lswitch1 │ > │ │192.168.0.0/24│ > │ └──────┬───────┘ > │ ┌─┴──┐ > │ │ LR │ > │ └─┬──┘ > │ ┌──────┴───────┐ > │ │ lswitch2 │ > │ │192.168.1.0/24│ ◄──┐ > │ └──────┬───────┘ │ > │ ┌───┴────┐ │ > │FAIL │vtep LSP│ OK│ > │ └───┬────┘ │ > │ ┌──┴────┐ │ > └────────┤VLAN 10├───────┘ > └───────┘ > > This patch adds support to have connectivity between such routed networks. > User must set LRP gateway chassis (or ha chassis group) in the lswitch2's > associated LRP, which will enable answering ARP requests for LR IP from > VLAN (lswitch2) side. > > Traffic from VLAN to overlay will go from HW VTEP switch through gateway > chassis to hypervisor and return traffic will go directly from hypervisor > to HW VTEP switch and then got forwarded to VLAN. > > Now chassis, on which chassis redirect LRP is claimed, will answer ARP > requests > coming from HW VTEP switch. For this reason the hairpin traffic from VTEP > switch > is allowed: > > 1. the blocking OF rule in table `OFTABLE_REMOTE_OUTPUT` which matched on > reg10=0x2/0x2 is changed to match on reg10=0x2/0x3. We check if traffic is > from VTEP (ramp) switch and also we check if loopback flag was not set and > only in this case such traffic is not allowed. > 2. OF rule in table `OFTABLE_REMOTE_OUTPUT`, which directs traffic to HW VTEP > (ramp) switch is modified to flush IN_PORT in order to allow sending traffic > to OF port from which this traffic came (hairpin). > > New flows are added to ls_in_ip_hairpin table to support ARP traffic hairpin > for VTEP switches: > 1. if traffic has come from VTEP (ramp) switch AND l3dgw ports of associated > LSPs reside on current chassis (is_chassis_resident()), such traffic > is passed to next table for further ARP responder processing. Priority of > this flow is 2000. > 2. if traffic has come from VTEP (ramp) switch, send it to L2_LKUP table to > skip ARP responder processing. We serve ARP requests from VTEP lports only > on gateway chassis. Priority of this flow is 1000. > > Signed-off-by: Vladislav Odintsov <[email protected]> > --- > controller-vtep/vtep.c | 31 ++++++++++++++++++++--- > controller/physical.c | 18 ++++++++++---- > northd/northd.c | 50 +++++++++++++++++++++++++++++++++---- > northd/ovn-northd.8.xml | 55 +++++++++++++++++++++++++++++------------ > tests/ovn-northd.at | 9 ++++--- > tests/ovn.at | 37 +++++++++++++++++++++------ > 6 files changed, 159 insertions(+), 41 deletions(-) > > diff --git a/controller-vtep/vtep.c b/controller-vtep/vtep.c > index ecca00b54..c3d09425c 100644 > --- a/controller-vtep/vtep.c > +++ b/controller-vtep/vtep.c > @@ -311,6 +311,9 @@ vtep_macs_run(struct ovsdb_idl_txn *vtep_idl_txn, struct > shash *ucast_macs_rmts, > hash_uint64((uint64_t) vtep_ls->tunnel_key[0])); > } > > + const char *dp, *peer; > + const struct sbrec_port_binding *lrp_pb, *peer_pb; > + > SHASH_FOR_EACH (node, non_vtep_pbs) { > const struct sbrec_port_binding *port_binding_rec = node->data; > const struct sbrec_chassis *chassis_rec; > @@ -324,7 +327,25 @@ vtep_macs_run(struct ovsdb_idl_txn *vtep_idl_txn, struct > shash *ucast_macs_rmts, > continue; > } > > - tnl_key = port_binding_rec->datapath->tunnel_key; > + if (!strcmp(port_binding_rec->type, "chassisredirect")) { > + dp = smap_get(&port_binding_rec->options, "distributed-port"); > + lrp_pb = shash_find_data(non_vtep_pbs, dp); > + if (!lrp_pb) { > + continue; > + } > + > + peer = smap_get(&lrp_pb->options, "peer"); > + if (!peer) { > + continue; > + } > + > + peer_pb = shash_find_data(non_vtep_pbs, peer); > + tnl_key = peer_pb->datapath->tunnel_key; > + } > + else { > + tnl_key = port_binding_rec->datapath->tunnel_key; > + } > + > HMAP_FOR_EACH_WITH_HASH (ls_node, hmap_node, > hash_uint64((uint64_t) tnl_key), > &ls_map) { > @@ -454,7 +475,7 @@ vtep_macs_run(struct ovsdb_idl_txn *vtep_idl_txn, struct > shash *ucast_macs_rmts, > static bool > vtep_lswitch_cleanup(struct ovsdb_idl *vtep_idl) > { > - const struct vteprec_logical_switch *vtep_ls; > + const struct vteprec_logical_switch *vtep_ls; > bool done = true; > > VTEPREC_LOGICAL_SWITCH_FOR_EACH (vtep_ls, vtep_idl) { > @@ -572,9 +593,11 @@ vtep_run(struct controller_vtep_ctx *ctx) > /* Collects and classifies 'Port_Binding's. */ > SBREC_PORT_BINDING_FOR_EACH(port_binding_rec, ctx->ovnsb_idl) { > struct shash *target = > - !strcmp(port_binding_rec->type, "vtep") ? &vtep_pbs : > &non_vtep_pbs; > + !strcmp(port_binding_rec->type, "vtep") ? &vtep_pbs > + : &non_vtep_pbs; > > - if (!port_binding_rec->chassis) { > + if (!port_binding_rec->chassis && > + strcmp(port_binding_rec->type, "patch")) { > continue; > } > shash_add(target, port_binding_rec->logical_port, port_binding_rec); > diff --git a/controller/physical.c b/controller/physical.c > index 02fcd5ea8..2a70f4fe0 100644 > --- a/controller/physical.c > +++ b/controller/physical.c > @@ -299,9 +299,16 @@ put_remote_port_redirect_overlay(const struct > if (!rem_tun) { > return; > } > + > + bool is_vtep_port = !strcmp(binding->type, "vtep"); > + /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for ARP/ND > + * responder in L3 networks. */ > + if (is_vtep_port) { > + put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, ofpacts_p); > + } > + > put_encapsulation(mff_ovn_geneve, tun, binding->datapath, port_key, > - !strcmp(binding->type, "vtep"), > - ofpacts_p); > + is_vtep_port, ofpacts_p); > /* Output to tunnel. */ > ofpact_put_OUTPUT(ofpacts_p)->port = rem_tun->ofport; > } else { > @@ -1841,12 +1848,13 @@ physical_run(struct physical_ctx *p_ctx, > * Handles packets received from a VXLAN tunnel which get resubmitted to > * OFTABLE_LOG_INGRESS_PIPELINE due to lack of needed metadata in VXLAN, > * explicitly skip sending back out any tunnels and resubmit to table 38 > - * for local delivery. > + * for local delivery, except packets which have MLF_ALLOW_LOOPBACK bit > + * set. > */ > struct match match; > match_init_catchall(&match); > - match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0, > - MLF_RCV_FROM_RAMP, MLF_RCV_FROM_RAMP); > + match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0, MLF_RCV_FROM_RAMP, > + MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK); > > /* Resubmit to table 38. */ > ofpbuf_clear(&ofpacts); > diff --git a/northd/northd.c b/northd/northd.c > index a2cf8d6fc..0a055b519 100644 > --- a/northd/northd.c > +++ b/northd/northd.c > @@ -199,6 +199,7 @@ enum ovn_stage { > #define REGBIT_LKUP_FDB "reg0[11]" > #define REGBIT_HAIRPIN_REPLY "reg0[12]" > #define REGBIT_ACL_LABEL "reg0[13]" > +#define REGBIT_FROM_RAMP "reg0[14]" > > #define REG_ORIG_DIP_IPV4 "reg1" > #define REG_ORIG_DIP_IPV6 "xxreg1" > @@ -5510,8 +5511,9 @@ build_lswitch_input_port_sec_op( > } > > if (!strcmp(op->nbsp->type, "vtep")) { > + ds_put_format(actions, REGBIT_FROM_RAMP" = 1; "); > ds_put_format(actions, "next(pipeline=ingress, table=%d);", > - ovn_stage_get_table(S_SWITCH_IN_L2_LKUP)); > + ovn_stage_get_table(S_SWITCH_IN_HAIRPIN)); > } else { > ds_put_cstr(actions, "next;"); > } > @@ -6960,6 +6962,45 @@ build_lb_hairpin(struct ovn_datapath *od, struct hmap > *lflows) > } > } > > +static void > +build_vtep_hairpin(struct ovn_datapath *od, struct hmap *lflows) > +{ > + /* Ingress Pre-ARP flows for VTEP hairpining traffic. Priority 1000: > + * Packets that received from non-VTEP ports should continue processing. > */ > + > + char *action = xasprintf("next(pipeline=ingress, table=%d);", > + ovn_stage_get_table(S_SWITCH_IN_L2_LKUP)); > + /* send all traffic from VTEP directly to L2LKP table. */ > + ovn_lflow_add(lflows, od, S_SWITCH_IN_HAIRPIN, 1000, > + REGBIT_FROM_RAMP" == 1", action); > + free(action); > + > + struct ds match = DS_EMPTY_INITIALIZER; > + int n_ports = od->n_router_ports; > + bool dp_has_l3dgw_ports = false; > + for (int i = 0; i < n_ports; i++) { > + if (is_l3dgw_port(od->router_ports[i]->peer)) { > + ds_put_format(&match, "%sis_chassis_resident(%s)%s", > + i == 0 ? REGBIT_FROM_RAMP" == 1 && (" : "", > + od->router_ports[i]->peer->cr_port->json_key, > + i < n_ports - 1 ? " || " : ")"); > + dp_has_l3dgw_ports = true; > + } > + } > + > + /* Ingress pre-arp flow for traffic from VTEP (ramp) switch. > + * Priority 2000: Packets, that were received from VTEP (ramp) switch and > + * router ports of current datapath are l3dgw ports and they reside on > + * current chassis, should be passed to next table for ARP/ND hairpin > + * processing. > + */ > + if (dp_has_l3dgw_ports) { > + ovn_lflow_add(lflows, od, S_SWITCH_IN_HAIRPIN, 2000, ds_cstr(&match), > + "next;"); > + } > + ds_destroy(&match); > +} > + > /* Build logical flows for the forwarding groups */ > static void > build_fwd_group_lflows(struct ovn_datapath *od, struct hmap *lflows) > @@ -7652,6 +7693,7 @@ build_lswitch_lflows_pre_acl_and_acl(struct > ovn_datapath *od, > build_qos(od, lflows); > build_stateful(od, lflows); > build_lb_hairpin(od, lflows); > + build_vtep_hairpin(od, lflows); > } > } > > @@ -7680,8 +7722,7 @@ build_lswitch_lflows_admission_control(struct > ovn_datapath *od, > } > > /* Ingress table 18: ARP/ND responder, skip requests coming from localnet > - * and vtep ports. (priority 100); see ovn-northd.8.xml for the > - * rationale. */ > + * ports. (priority 100); see ovn-northd.8.xml for the rationale. */ > > static void > build_lswitch_arp_nd_responder_skip_local(struct ovn_port *op, > @@ -7689,8 +7730,7 @@ build_lswitch_arp_nd_responder_skip_local(struct > ovn_port *op, > struct ds *match) > { > if (op->nbsp) { > - if ((!strcmp(op->nbsp->type, "localnet")) || > - (!strcmp(op->nbsp->type, "vtep"))) { > + if (!strcmp(op->nbsp->type, "localnet")) { > ds_clear(match); > ds_put_format(match, "inport == %s", op->json_key); > ovn_lflow_add_with_lport_and_hint(lflows, op->od, > diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml > index 4784bff04..08864acfe 100644 > --- a/northd/ovn-northd.8.xml > +++ b/northd/ovn-northd.8.xml > @@ -1094,6 +1094,29 @@ > > <h3>Ingress Table 17: Hairpin</h3> > <ul> > + <li> > + A priority-2000 flow that passes traffic to next table for hairpin > ARP > + processing with > + <code>next;</code> action, which matches on: > + <ul> > + <li> > + <code>reg0[14]</code> register bit, that is set in > + ls_in_port_sec_l2 table for traffic received from HW VTEP (ramp) > + switch AND > + </li> > + <li> > + all Distrubited Gateway ports, which are associated with current > LS > + <code>router</code> type LSPs if > <code>is_chassis_resident()</code> > + for them is true. > + </li> > + </ul> > + </li> > + <li> > + A priority-1000 flow that matches on <code>reg0[14]</code> register > + bit that is set in ls_in_port_sec_l2 table for traffic received from > HW > + VTEP (ramp) switch. This traffic is passed to ingress table > + ls_in_l2_lkup. > + </li> > <li> > A priority-1 flow that hairpins traffic matched by non-default > flows in the Pre-Hairpin table. Hairpinning is done at L2, Ethernet > @@ -1150,27 +1173,27 @@ > </p> > > <p> > - Note that ARP requests received from <code>localnet</code> or > - <code>vtep</code> logical inports can either go directly to VMs, in > - which case the VM responds or can hit an ARP responder for a logical > - router port if the packet is used to resolve a logical router port > - next hop address. In either case, logical switch ARP responder rules > - will not be hit. It contains these logical flows: > + Note that ARP requests received from <code>localnet</code> logical > + inports can either go directly to VMs, in which case the VM responds or > + can hit an ARP responder for a logical router port if the packet is > used > + to resolve a logical router port next hop address. In either case, > + logical switch ARP responder rules will not be hit. It contains these > + logical flows: > </p> > > <ul> > <li> > Priority-100 flows to skip the ARP responder if inport is of type > - <code>localnet</code> or <code>vtep</code> and advances directly > - to the next table. ARP requests sent to <code>localnet</code> or > - <code>vtep</code> ports can be received by multiple hypervisors. > - Now, because the same mac binding rules are downloaded to all > - hypervisors, each of the multiple hypervisors will respond. This > - will confuse L2 learning on the source of the ARP requests. ARP > - requests received on an inport of type <code>router</code> are not > - expected to hit any logical switch ARP responder flows. However, > - no skip flows are installed for these packets, as there would be > - some additional flow cost for this and the value appears limited. > + <code>localnet</code> advances directly to the next table. ARP > + requests sent to <code>localnet</code> ports can be received by > + multiple hypervisors. Now, because the same mac binding rules are > + downloaded to all hypervisors, each of the multiple hypervisors will > + respond. This will confuse L2 learning on the source of the ARP > + requests. ARP requests received on an inport of type > + <code>router</code> are not expected to hit any logical switch ARP > + responder flows. However, no skip flows are installed for these > + packets, as there would be some additional flow cost for this and the > + value appears limited. > </li> > > <li> > diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at > index 17d4f31b3..eab84886a 100644 > --- a/tests/ovn-northd.at > +++ b/tests/ovn-northd.at > @@ -2592,9 +2592,10 @@ AT_CHECK([ovn-sbctl lflow-list sw0 | grep > ls_in_nat_hairpin | sort | sed 's/tabl > table=??(ls_in_nat_hairpin ), priority=90 , match=(ip && reg0[[12]] == > 1), action=(ct_snat;) > ]) > > -AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed > 's/table=../table=??/'], [0], [dnl > +AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed > 's/table=../table=??/g'], [0], [dnl > table=??(ls_in_hairpin ), priority=0 , match=(1), action=(next;) > table=??(ls_in_hairpin ), priority=1 , match=((reg0[[6]] == 1 || > reg0[[12]] == 1)), action=(eth.dst <-> eth.src; outport = inport; > flags.loopback = 1; output;) > + table=??(ls_in_hairpin ), priority=1000 , match=(reg0[[14]] == 1), > action=(next(pipeline=ingress, table=??);) > ]) > > check ovn-nbctl -- ls-lb-del sw0 lb0 > @@ -2608,8 +2609,9 @@ AT_CHECK([ovn-sbctl lflow-list sw0 | grep > ls_in_nat_hairpin | sort | sed 's/tabl > table=??(ls_in_nat_hairpin ), priority=0 , match=(1), action=(next;) > ]) > > -AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed > 's/table=../table=??/'], [0], [dnl > +AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed > 's/table=../table=??/g'], [0], [dnl > table=??(ls_in_hairpin ), priority=0 , match=(1), action=(next;) > + table=??(ls_in_hairpin ), priority=1000 , match=(reg0[[14]] == 1), > action=(next(pipeline=ingress, table=??);) > ]) > > check ovn-nbctl -- add load_balancer_group $lbg load_balancer $lb0 > @@ -2627,9 +2629,10 @@ AT_CHECK([ovn-sbctl lflow-list sw0 | grep > ls_in_nat_hairpin | sort | sed 's/tabl > table=??(ls_in_nat_hairpin ), priority=90 , match=(ip && reg0[[12]] == > 1), action=(ct_snat;) > ]) > > -AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed > 's/table=../table=??/'], [0], [dnl > +AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed > 's/table=../table=??/g'], [0], [dnl > table=??(ls_in_hairpin ), priority=0 , match=(1), action=(next;) > table=??(ls_in_hairpin ), priority=1 , match=((reg0[[6]] == 1 || > reg0[[12]] == 1)), action=(eth.dst <-> eth.src; outport = inport; > flags.loopback = 1; output;) > + table=??(ls_in_hairpin ), priority=1000 , match=(reg0[[14]] == 1), > action=(next(pipeline=ingress, table=??);) > ]) > > AT_CLEANUP > diff --git a/tests/ovn.at b/tests/ovn.at > index 166b5f72e..0e24f7abb 100644 > --- a/tests/ovn.at > +++ b/tests/ovn.at > @@ -4023,10 +4023,20 @@ ovn-nbctl lsp-set-addresses lp-vtep unknown > ovn-nbctl lsp-add lsw0 lpr > ovn-nbctl lr-add lr > ovn-nbctl lrp-add lr lrp1 f0:00:00:00:00:f1 192.168.1.1/24 > -ovn-nbctl set Logical_Switch_Port lpr type=router \ > - options:router-port=lrp1 \ > - addresses='"f0:00:00:00:00:f1 192.168.1.1"' > - > +ovn-nbctl set Logical_Switch_Port lpr \ > + type=router \ > + options:router-port=lrp1 \ > + addresses=router > +ovn-nbctl lrp-set-gateway-chassis lrp1 hv1 > + > +ovn-nbctl lsp-add lsw0 lpr2 > +ovn-nbctl lr-add lr2 > +ovn-nbctl lrp-add lr lrp2 f0:00:00:00:00:f2 192.168.1.254/24 > +ovn-nbctl set Logical_Switch_Port lpr2 \ > + type=router \ > + options:router-port=lrp2 \ > + addresses=router > +ovn-nbctl lrp-set-gateway-chassis lrp2 hv1 > > net_add n1 # Network to connect hv1, hv2, and vtep > net_add n2 # Network to connect vtep and hv3 > @@ -4147,17 +4157,28 @@ for s in 1 2 3; do > test_packet $s f0000000ffff f0000000000$s 0${s}66 $unknown #3 > done > > -# ARP request should not be responded to by logical switch router > -# type arp responder on HV1 and HV2 and should reach directly to > -# vif1 and vif2 > +# ARP request from VTEP to LRP should be responded by ARP responder. > sha=f00000000003 > spa=`ip_to_hex 192 168 1 2` > tpa=`ip_to_hex 192 168 1 1` > request=ffffffffffff${sha}08060001080006040001${sha}${spa}ffffffffffff${tpa} > as hv3 ovs-appctl netdev-dummy/receive vif3 $request > -echo $request >> 1.expected > echo $request >> 2.expected > > +lrpmac=f000000000f1 > +response=${sha}${lrpmac}08060001080006040002${lrpmac}${tpa}${sha}${spa} > +# since lrp1 has gateway chassis set on hv1, hv1 will suppress arp request > and > +# answer with arp reply by OVS directly to vtep lport. all other lports, > +# except lport from which this request was initiated, will receive arp > request. > +# we expect arp reply packet on hv3 > +echo $response >> 3.expected > + > +AT_CHECK([ovn-sbctl lflow-list lsw0 | grep 'reg0[\[14\]]' | sort | sed > 's/table=../table=??/g'], [0], [dnl > + table=??(ls_in_port_sec_l2 ), priority=50 , match=(inport == > "lp-vtep"), action=(reg0[[14]] = 1; next(pipeline=ingress, table=??);) > + table=??(ls_in_hairpin ), priority=1000 , match=(reg0[[14]] == 1), > action=(next(pipeline=ingress, table=??);) > + table=??(ls_in_hairpin ), priority=2000 , match=(reg0[[14]] == 1 && > (is_chassis_resident("cr-lrp1") || is_chassis_resident("cr-lrp2"))), > action=(next;) > +]) > + > # dump information with counters > echo "------ OVN dump ------" > ovn-nbctl show > -- > 2.26.3 > _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
