Hi, this is a quick reminder. Thanks.
Regards, Vladislav Odintsov > On 15 Apr 2022, at 11:46, Vladislav Odintsov <[email protected]> wrote: > > Hi Numan, Ihar, Han, Mark, > > would somebody find some time to take a look on this please? > > Thanks. > > Regards, > Vladislav Odintsov > >> On 1 Apr 2022, at 11:32, Vladislav Odintsov <[email protected] >> <mailto:[email protected]>> wrote: >> >> When HW VTEP is used with OVN logical networks routed by >> Logical Routers, only directly-attached network can communicate >> with external VLAN (via vtep LSP). There is no connectivity >> between VLAN network which is located behind VTEP LSP and lswitch1 >> behind LR. >> >> Visual description: >> >> ┌──────┐ >> ┌─────► │ VM │ >> │ └───┬──┘ >> │ ┌──────┴───────┐ >> │ │ lswitch1 │ >> │ │192.168.0.0/24│ >> │ └──────┬───────┘ >> │ ┌─┴──┐ >> │ │ LR │ >> │ └─┬──┘ >> │ ┌──────┴───────┐ >> │ │ lswitch2 │ >> │ │192.168.1.0/24│ ◄──┐ >> │ └──────┬───────┘ │ >> │ ┌───┴────┐ │ >> │FAIL │vtep LSP│ OK│ >> │ └───┬────┘ │ >> │ ┌──┴────┐ │ >> └────────┤VLAN 10├───────┘ >> └───────┘ >> >> This patch adds support to have connectivity between such routed networks. >> User must set LRP gateway chassis (or ha chassis group) in the lswitch2's >> associated LRP, which will enable answering ARP requests for LR IP from >> VLAN (lswitch2) side. >> >> Traffic from VLAN to overlay will go from HW VTEP switch through gateway >> chassis to hypervisor and return traffic will go directly from hypervisor >> to HW VTEP switch and then got forwarded to VLAN. >> >> Now chassis, on which chassis redirect LRP is claimed, will answer ARP >> requests >> coming from HW VTEP switch. For this reason the hairpin traffic from VTEP >> switch >> is allowed: >> >> 1. the blocking OF rule in table `OFTABLE_REMOTE_OUTPUT` which matched on >> reg10=0x2/0x2 is changed to match on reg10=0x2/0x3. We check if traffic is >> from VTEP (ramp) switch and also we check if loopback flag was not set and >> only in this case such traffic is not allowed. >> 2. 
OF rule in table `OFTABLE_REMOTE_OUTPUT`, which directs traffic to HW VTEP >> (ramp) switch is modified to flush IN_PORT in order to allow sending >> traffic >> to OF port from which this traffic came (hairpin). >> >> New flows are added to ls_in_ip_hairpin table to support ARP traffic hairpin >> for VTEP switches: >> 1. if traffic has come from VTEP (ramp) switch AND l3dgw ports of associated >> LSPs reside on current chassis (is_chassis_resident()), such traffic >> is passed to next table for further ARP responder processing. Priority of >> this flow is 2000. >> 2. if traffic has come from VTEP (ramp) switch, send it to L2_LKUP table to >> skip ARP responder processing. We serve ARP requests from VTEP lports only >> on gateway chassis. Priority of this flow is 1000. >> >> Signed-off-by: Vladislav Odintsov <[email protected] >> <mailto:[email protected]>> >> --- >> controller-vtep/vtep.c | 31 ++++++++++++++++++++--- >> controller/physical.c | 18 ++++++++++---- >> northd/northd.c | 50 +++++++++++++++++++++++++++++++++---- >> northd/ovn-northd.8.xml | 55 +++++++++++++++++++++++++++++------------ >> tests/ovn-northd.at <http://ovn-northd.at/> | 9 ++++--- >> tests/ovn.at <http://ovn.at/> | 37 +++++++++++++++++++++------ >> 6 files changed, 159 insertions(+), 41 deletions(-) >> >> diff --git a/controller-vtep/vtep.c b/controller-vtep/vtep.c >> index ecca00b54..c3d09425c 100644 >> --- a/controller-vtep/vtep.c >> +++ b/controller-vtep/vtep.c >> @@ -311,6 +311,9 @@ vtep_macs_run(struct ovsdb_idl_txn *vtep_idl_txn, struct >> shash *ucast_macs_rmts, >> hash_uint64((uint64_t) vtep_ls->tunnel_key[0])); >> } >> >> + const char *dp, *peer; >> + const struct sbrec_port_binding *lrp_pb, *peer_pb; >> + >> SHASH_FOR_EACH (node, non_vtep_pbs) { >> const struct sbrec_port_binding *port_binding_rec = node->data; >> const struct sbrec_chassis *chassis_rec; >> @@ -324,7 +327,25 @@ vtep_macs_run(struct ovsdb_idl_txn *vtep_idl_txn, >> struct shash *ucast_macs_rmts, >> continue; >> } >> >> 
- tnl_key = port_binding_rec->datapath->tunnel_key; >> + if (!strcmp(port_binding_rec->type, "chassisredirect")) { >> + dp = smap_get(&port_binding_rec->options, "distributed-port"); >> + lrp_pb = shash_find_data(non_vtep_pbs, dp); >> + if (!lrp_pb) { >> + continue; >> + } >> + >> + peer = smap_get(&lrp_pb->options, "peer"); >> + if (!peer) { >> + continue; >> + } >> + >> + peer_pb = shash_find_data(non_vtep_pbs, peer); >> + tnl_key = peer_pb->datapath->tunnel_key; >> + } >> + else { >> + tnl_key = port_binding_rec->datapath->tunnel_key; >> + } >> + >> HMAP_FOR_EACH_WITH_HASH (ls_node, hmap_node, >> hash_uint64((uint64_t) tnl_key), >> &ls_map) { >> @@ -454,7 +475,7 @@ vtep_macs_run(struct ovsdb_idl_txn *vtep_idl_txn, struct >> shash *ucast_macs_rmts, >> static bool >> vtep_lswitch_cleanup(struct ovsdb_idl *vtep_idl) >> { >> - const struct vteprec_logical_switch *vtep_ls; >> + const struct vteprec_logical_switch *vtep_ls; >> bool done = true; >> >> VTEPREC_LOGICAL_SWITCH_FOR_EACH (vtep_ls, vtep_idl) { >> @@ -572,9 +593,11 @@ vtep_run(struct controller_vtep_ctx *ctx) >> /* Collects and classifies 'Port_Binding's. */ >> SBREC_PORT_BINDING_FOR_EACH(port_binding_rec, ctx->ovnsb_idl) { >> struct shash *target = >> - !strcmp(port_binding_rec->type, "vtep") ? &vtep_pbs : >> &non_vtep_pbs; >> + !strcmp(port_binding_rec->type, "vtep") ? 
&vtep_pbs >> + : &non_vtep_pbs; >> >> - if (!port_binding_rec->chassis) { >> + if (!port_binding_rec->chassis && >> + strcmp(port_binding_rec->type, "patch")) { >> continue; >> } >> shash_add(target, port_binding_rec->logical_port, port_binding_rec); >> diff --git a/controller/physical.c b/controller/physical.c >> index 02fcd5ea8..2a70f4fe0 100644 >> --- a/controller/physical.c >> +++ b/controller/physical.c >> @@ -299,9 +299,16 @@ put_remote_port_redirect_overlay(const struct >> if (!rem_tun) { >> return; >> } >> + >> + bool is_vtep_port = !strcmp(binding->type, "vtep"); >> + /* rewrite MFF_IN_PORT to bypass OpenFlow loopback check for ARP/ND >> + * responder in L3 networks. */ >> + if (is_vtep_port) { >> + put_load(ofp_to_u16(OFPP_NONE), MFF_IN_PORT, 0, 16, ofpacts_p); >> + } >> + >> put_encapsulation(mff_ovn_geneve, tun, binding->datapath, port_key, >> - !strcmp(binding->type, "vtep"), >> - ofpacts_p); >> + is_vtep_port, ofpacts_p); >> /* Output to tunnel. */ >> ofpact_put_OUTPUT(ofpacts_p)->port = rem_tun->ofport; >> } else { >> @@ -1841,12 +1848,13 @@ physical_run(struct physical_ctx *p_ctx, >> * Handles packets received from a VXLAN tunnel which get resubmitted to >> * OFTABLE_LOG_INGRESS_PIPELINE due to lack of needed metadata in VXLAN, >> * explicitly skip sending back out any tunnels and resubmit to table 38 >> - * for local delivery. >> + * for local delivery, except packets which have MLF_ALLOW_LOOPBACK bit >> + * set. >> */ >> struct match match; >> match_init_catchall(&match); >> - match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0, >> - MLF_RCV_FROM_RAMP, MLF_RCV_FROM_RAMP); >> + match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0, >> MLF_RCV_FROM_RAMP, >> + MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK); >> >> /* Resubmit to table 38. 
*/ >> ofpbuf_clear(&ofpacts); >> diff --git a/northd/northd.c b/northd/northd.c >> index a2cf8d6fc..0a055b519 100644 >> --- a/northd/northd.c >> +++ b/northd/northd.c >> @@ -199,6 +199,7 @@ enum ovn_stage { >> #define REGBIT_LKUP_FDB "reg0[11]" >> #define REGBIT_HAIRPIN_REPLY "reg0[12]" >> #define REGBIT_ACL_LABEL "reg0[13]" >> +#define REGBIT_FROM_RAMP "reg0[14]" >> >> #define REG_ORIG_DIP_IPV4 "reg1" >> #define REG_ORIG_DIP_IPV6 "xxreg1" >> @@ -5510,8 +5511,9 @@ build_lswitch_input_port_sec_op( >> } >> >> if (!strcmp(op->nbsp->type, "vtep")) { >> + ds_put_format(actions, REGBIT_FROM_RAMP" = 1; "); >> ds_put_format(actions, "next(pipeline=ingress, table=%d);", >> - ovn_stage_get_table(S_SWITCH_IN_L2_LKUP)); >> + ovn_stage_get_table(S_SWITCH_IN_HAIRPIN)); >> } else { >> ds_put_cstr(actions, "next;"); >> } >> @@ -6960,6 +6962,45 @@ build_lb_hairpin(struct ovn_datapath *od, struct hmap >> *lflows) >> } >> } >> >> +static void >> +build_vtep_hairpin(struct ovn_datapath *od, struct hmap *lflows) >> +{ >> + /* Ingress Pre-ARP flows for VTEP hairpining traffic. Priority 1000: >> + * Packets that received from non-VTEP ports should continue >> processing. */ >> + >> + char *action = xasprintf("next(pipeline=ingress, table=%d);", >> + ovn_stage_get_table(S_SWITCH_IN_L2_LKUP)); >> + /* send all traffic from VTEP directly to L2LKP table. */ >> + ovn_lflow_add(lflows, od, S_SWITCH_IN_HAIRPIN, 1000, >> + REGBIT_FROM_RAMP" == 1", action); >> + free(action); >> + >> + struct ds match = DS_EMPTY_INITIALIZER; >> + int n_ports = od->n_router_ports; >> + bool dp_has_l3dgw_ports = false; >> + for (int i = 0; i < n_ports; i++) { >> + if (is_l3dgw_port(od->router_ports[i]->peer)) { >> + ds_put_format(&match, "%sis_chassis_resident(%s)%s", >> + i == 0 ? REGBIT_FROM_RAMP" == 1 && (" : "", >> + od->router_ports[i]->peer->cr_port->json_key, >> + i < n_ports - 1 ? 
" || " : ")"); >> + dp_has_l3dgw_ports = true; >> + } >> + } >> + >> + /* Ingress pre-arp flow for traffic from VTEP (ramp) switch. >> + * Priority 2000: Packets, that were received from VTEP (ramp) switch and >> + * router ports of current datapath are l3dgw ports and they reside on >> + * current chassis, should be passed to next table for ARP/ND hairpin >> + * processing. >> + */ >> + if (dp_has_l3dgw_ports) { >> + ovn_lflow_add(lflows, od, S_SWITCH_IN_HAIRPIN, 2000, >> ds_cstr(&match), >> + "next;"); >> + } >> + ds_destroy(&match); >> +} >> + >> /* Build logical flows for the forwarding groups */ >> static void >> build_fwd_group_lflows(struct ovn_datapath *od, struct hmap *lflows) >> @@ -7652,6 +7693,7 @@ build_lswitch_lflows_pre_acl_and_acl(struct >> ovn_datapath *od, >> build_qos(od, lflows); >> build_stateful(od, lflows); >> build_lb_hairpin(od, lflows); >> + build_vtep_hairpin(od, lflows); >> } >> } >> >> @@ -7680,8 +7722,7 @@ build_lswitch_lflows_admission_control(struct >> ovn_datapath *od, >> } >> >> /* Ingress table 18: ARP/ND responder, skip requests coming from localnet >> - * and vtep ports. (priority 100); see ovn-northd.8.xml for the >> - * rationale. */ >> + * ports. (priority 100); see ovn-northd.8.xml for the rationale. 
*/ >> >> static void >> build_lswitch_arp_nd_responder_skip_local(struct >> ovn_port *op, >> @@ -7689,8 +7730,7 @@ build_lswitch_arp_nd_responder_skip_local(struct >> ovn_port *op, >> struct ds *match) >> { >> if (op->nbsp) { >> - if ((!strcmp(op->nbsp->type, "localnet")) || >> - (!strcmp(op->nbsp->type, "vtep"))) { >> + if (!strcmp(op->nbsp->type, "localnet")) { >> ds_clear(match); >> ds_put_format(match, "inport == %s", op->json_key); >> ovn_lflow_add_with_lport_and_hint(lflows, op->od, >> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml >> index 4784bff04..08864acfe 100644 >> --- a/northd/ovn-northd.8.xml >> +++ b/northd/ovn-northd.8.xml >> @@ -1094,6 +1094,29 @@ >> >> <h3>Ingress Table 17: Hairpin</h3> >> <ul> >> + <li> >> + A priority-2000 flow that passes traffic to next table for hairpin >> ARP >> + processing with >> + <code>next;</code> action, which matches on: >> + <ul> >> + <li> >> + <code>reg0[14]</code> register bit, that is set in >> + ls_in_port_sec_l2 table for traffic received from HW VTEP (ramp) >> + switch AND >> + </li> >> + <li> >> + all Distributed Gateway ports, which are associated with >> current LS >> + <code>router</code> type LSPs if >> <code>is_chassis_resident()</code> >> + for them is true. >> + </li> >> + </ul> >> + </li> >> + <li> >> + A priority-1000 flow that matches on <code>reg0[14]</code> register >> + bit that is set in ls_in_port_sec_l2 table for traffic received >> from HW >> + VTEP (ramp) switch. This traffic is passed to ingress table >> + ls_in_l2_lkup. >> + </li> >> <li> >> A priority-1 flow that hairpins traffic matched by non-default >> flows in the Pre-Hairpin table.
Hairpinning is done at L2, Ethernet >> @@ -1150,27 +1173,27 @@ >> </p> >> >> <p> >> - Note that ARP requests received from <code>localnet</code> or >> - <code>vtep</code> logical inports can either go directly to VMs, in >> - which case the VM responds or can hit an ARP responder for a logical >> - router port if the packet is used to resolve a logical router port >> - next hop address. In either case, logical switch ARP responder rules >> - will not be hit. It contains these logical flows: >> + Note that ARP requests received from <code>localnet</code> logical >> + inports can either go directly to VMs, in which case the VM responds >> or >> + can hit an ARP responder for a logical router port if the packet is >> used >> + to resolve a logical router port next hop address. In either case, >> + logical switch ARP responder rules will not be hit. It contains these >> + logical flows: >> </p> >> >> <ul> >> <li> >> Priority-100 flows to skip the ARP responder if inport is of type >> - <code>localnet</code> or <code>vtep</code> and advances directly >> - to the next table. ARP requests sent to <code>localnet</code> or >> - <code>vtep</code> ports can be received by multiple hypervisors. >> - Now, because the same mac binding rules are downloaded to all >> - hypervisors, each of the multiple hypervisors will respond. This >> - will confuse L2 learning on the source of the ARP requests. ARP >> - requests received on an inport of type <code>router</code> are not >> - expected to hit any logical switch ARP responder flows. However, >> - no skip flows are installed for these packets, as there would be >> - some additional flow cost for this and the value appears limited. >> + <code>localnet</code> advances directly to the next table. ARP >> + requests sent to <code>localnet</code> ports can be received by >> + multiple hypervisors. Now, because the same mac binding rules are >> + downloaded to all hypervisors, each of the multiple hypervisors will >> + respond. 
This will confuse L2 learning on the source of the ARP >> + requests. ARP requests received on an inport of type >> + <code>router</code> are not expected to hit any logical switch ARP >> + responder flows. However, no skip flows are installed for these >> + packets, as there would be some additional flow cost for this and >> the >> + value appears limited. >> </li> >> >> <li> >> diff --git a/tests/ovn-northd.at <http://ovn-northd.at/> >> b/tests/ovn-northd.at <http://ovn-northd.at/> >> index 17d4f31b3..eab84886a 100644 >> --- a/tests/ovn-northd.at <http://ovn-northd.at/> >> +++ b/tests/ovn-northd.at <http://ovn-northd.at/> >> @@ -2592,9 +2592,10 @@ AT_CHECK([ovn-sbctl lflow-list sw0 | grep >> ls_in_nat_hairpin | sort | sed 's/tabl >> table=??(ls_in_nat_hairpin ), priority=90 , match=(ip && reg0[[12]] == >> 1), action=(ct_snat;) >> ]) >> >> -AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed >> 's/table=../table=??/'], [0], [dnl >> +AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed >> 's/table=../table=??/g'], [0], [dnl >> table=??(ls_in_hairpin ), priority=0 , match=(1), action=(next;) >> table=??(ls_in_hairpin ), priority=1 , match=((reg0[[6]] == 1 || >> reg0[[12]] == 1)), action=(eth.dst <-> eth.src; outport = inport; >> flags.loopback = 1; output;) >> + table=??(ls_in_hairpin ), priority=1000 , match=(reg0[[14]] == 1), >> action=(next(pipeline=ingress, table=??);) >> ]) >> >> check ovn-nbctl -- ls-lb-del sw0 lb0 >> @@ -2608,8 +2609,9 @@ AT_CHECK([ovn-sbctl lflow-list sw0 | grep >> ls_in_nat_hairpin | sort | sed 's/tabl >> table=??(ls_in_nat_hairpin ), priority=0 , match=(1), action=(next;) >> ]) >> >> -AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed >> 's/table=../table=??/'], [0], [dnl >> +AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed >> 's/table=../table=??/g'], [0], [dnl >> table=??(ls_in_hairpin ), priority=0 , match=(1), action=(next;) >> + table=??(ls_in_hairpin ), 
priority=1000 , match=(reg0[[14]] == 1), >> action=(next(pipeline=ingress, table=??);) >> ]) >> >> check ovn-nbctl -- add load_balancer_group $lbg load_balancer $lb0 >> @@ -2627,9 +2629,10 @@ AT_CHECK([ovn-sbctl lflow-list sw0 | grep >> ls_in_nat_hairpin | sort | sed 's/tabl >> table=??(ls_in_nat_hairpin ), priority=90 , match=(ip && reg0[[12]] == >> 1), action=(ct_snat;) >> ]) >> >> -AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed >> 's/table=../table=??/'], [0], [dnl >> +AT_CHECK([ovn-sbctl lflow-list sw0 | grep ls_in_hairpin | sort | sed >> 's/table=../table=??/g'], [0], [dnl >> table=??(ls_in_hairpin ), priority=0 , match=(1), action=(next;) >> table=??(ls_in_hairpin ), priority=1 , match=((reg0[[6]] == 1 || >> reg0[[12]] == 1)), action=(eth.dst <-> eth.src; outport = inport; >> flags.loopback = 1; output;) >> + table=??(ls_in_hairpin ), priority=1000 , match=(reg0[[14]] == 1), >> action=(next(pipeline=ingress, table=??);) >> ]) >> >> AT_CLEANUP >> diff --git a/tests/ovn.at <http://ovn.at/> b/tests/ovn.at <http://ovn.at/> >> index 166b5f72e..0e24f7abb 100644 >> --- a/tests/ovn.at <http://ovn.at/> >> +++ b/tests/ovn.at <http://ovn.at/> >> @@ -4023,10 +4023,20 @@ ovn-nbctl lsp-set-addresses lp-vtep unknown >> ovn-nbctl lsp-add lsw0 lpr >> ovn-nbctl lr-add lr >> ovn-nbctl lrp-add lr lrp1 f0:00:00:00:00:f1 192.168.1.1/24 >> -ovn-nbctl set Logical_Switch_Port lpr type=router \ >> - options:router-port=lrp1 \ >> - addresses='"f0:00:00:00:00:f1 192.168.1.1"' >> - >> +ovn-nbctl set Logical_Switch_Port lpr \ >> + type=router \ >> + options:router-port=lrp1 \ >> + addresses=router >> +ovn-nbctl lrp-set-gateway-chassis lrp1 hv1 >> + >> +ovn-nbctl lsp-add lsw0 lpr2 >> +ovn-nbctl lr-add lr2 >> +ovn-nbctl lrp-add lr lrp2 f0:00:00:00:00:f2 192.168.1.254/24 >> +ovn-nbctl set Logical_Switch_Port lpr2 \ >> + type=router \ >> + options:router-port=lrp2 \ >> + addresses=router >> +ovn-nbctl lrp-set-gateway-chassis lrp2 hv1 >> >> net_add n1 # Network to connect 
hv1, hv2, and vtep >> net_add n2 # Network to connect vtep and hv3 >> @@ -4147,17 +4157,28 @@ for s in 1 2 3; do >> test_packet $s f0000000ffff f0000000000$s 0${s}66 $unknown #3 >> done >> >> -# ARP request should not be responded to by logical switch router >> -# type arp responder on HV1 and HV2 and should reach directly to >> -# vif1 and vif2 >> +# ARP request from VTEP to LRP should be responded by ARP responder. >> sha=f00000000003 >> spa=`ip_to_hex 192 168 1 2` >> tpa=`ip_to_hex 192 168 1 1` >> request=ffffffffffff${sha}08060001080006040001${sha}${spa}ffffffffffff${tpa} >> as hv3 ovs-appctl netdev-dummy/receive vif3 $request >> -echo $request >> 1.expected >> echo $request >> 2.expected >> >> +lrpmac=f000000000f1 >> +response=${sha}${lrpmac}08060001080006040002${lrpmac}${tpa}${sha}${spa} >> +# since lrp1 has gateway chassis set on hv1, hv1 will suppress arp request >> and >> +# answer with arp reply by OVS directly to vtep lport. all other lports, >> +# except lport from which this request was initiated, will receive arp >> request. >> +# we expect arp reply packet on hv3 >> +echo $response >> 3.expected >> + >> +AT_CHECK([ovn-sbctl lflow-list lsw0 | grep 'reg0[\[14\]]' | sort | sed >> 's/table=../table=??/g'], [0], [dnl >> + table=??(ls_in_port_sec_l2 ), priority=50 , match=(inport == >> "lp-vtep"), action=(reg0[[14]] = 1; next(pipeline=ingress, table=??);) >> + table=??(ls_in_hairpin ), priority=1000 , match=(reg0[[14]] == 1), >> action=(next(pipeline=ingress, table=??);) >> + table=??(ls_in_hairpin ), priority=2000 , match=(reg0[[14]] == 1 && >> (is_chassis_resident("cr-lrp1") || is_chassis_resident("cr-lrp2"))), >> action=(next;) >> +]) >> + >> # dump information with counters >> echo "------ OVN dump ------" >> ovn-nbctl show >> -- >> 2.26.3 >> > _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
