Introduce the capbility to periodically send ARP/ND packets for ECMP nexthops in order to resolve their L2 address. This is a preliminary patch to introduce the capability to flush stale ECMP CT entries.
Signed-off-by: Lorenzo Bianconi <[email protected]> --- NEWS | 5 + controller/ovn-controller.8.xml | 10 ++ controller/ovn-controller.c | 2 + controller/pinctrl.c | 284 +++++++++++++++++++++++++++++++- controller/pinctrl.h | 2 + 5 files changed, 300 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index da3aba739..1f8f54d5d 100644 --- a/NEWS +++ b/NEWS @@ -4,6 +4,11 @@ Post v24.09.0 hash (with specified hash fields) for ECMP routes while choosing nexthop. - ovn-ic: Add support for route tag to prevent route learning. + - Add "arp-max-timeout-sec" config option to vswitchd external-ids to + cap the time between when ovn-controller sends ARP/ND packets for + ECMP-nexthop. + By default ovn-controller continuously sends ARP/ND packets for + ECMP-nexthop. OVN v24.09.0 - 13 Sep 2024 -------------------------- diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml index aeaa374c1..7f95a9932 100644 --- a/controller/ovn-controller.8.xml +++ b/controller/ovn-controller.8.xml @@ -385,6 +385,16 @@ cap for the exponential backoff used by <code>ovn-controller</code> to send GARPs packets. </dd> + <dt><code>external_ids:arp-nd-max-timeout-sec</code></dt> + <dd> + When used, this configuration value specifies the maximum timeout + (in seconds) between two consecutive ARP/ND packets sent by + <code>ovn-controller</code> to resolve ECMP nexthop mac address. + <code>ovn-controller</code> by default continuously sends ARP/ND + packets. Setting <code>external_ids:arp-nd-max-timeout-sec</code> + allows to cap for the exponential backoff used by <code>ovn-controller + </code> to send ARPs/NDs packets. + </dd> <dt><code>external_ids:ovn-bridge-remote</code></dt> <dd> <p> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c index ab9593850..c93995b94 100644 --- a/controller/ovn-controller.c +++ b/controller/ovn-controller.c @@ -5699,6 +5699,8 @@ main(int argc, char *argv[]) sbrec_mac_binding_table_get( ovnsb_idl_loop.idl), sbrec_bfd_table_get(ovnsb_idl_loop.idl), + sbrec_ecmp_nexthop_table_get( + ovnsb_idl_loop.idl), br_int, chassis, &runtime_data->local_datapaths, &runtime_data->active_tunnels, diff --git a/controller/pinctrl.c b/controller/pinctrl.c index 07cd140e0..d97b10710 100644 --- a/controller/pinctrl.c +++ b/controller/pinctrl.c @@ -170,6 +170,9 @@ static struct seq *pinctrl_main_seq; static long long int garp_rarp_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT; static bool garp_rarp_continuous; +static long long int arp_nd_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT; +static bool arp_nd_continuous; + static void *pinctrl_handler(void *arg); struct pinctrl { @@ -230,13 +233,17 @@ static void run_activated_ports( const struct sbrec_chassis *chassis); static void init_send_garps_rarps(void); +static void init_send_arps_nds(void); static void destroy_send_garps_rarps(void); +static void destroy_send_arps_nds(void); static void send_garp_rarp_wait(long long int send_garp_rarp_time); +static void send_arp_nd_wait(long long int send_arp_nd_time); static void send_garp_rarp_prepare( struct ovsdb_idl_txn *ovnsb_idl_txn, struct ovsdb_idl_index *sbrec_port_binding_by_datapath, struct ovsdb_idl_index *sbrec_port_binding_by_name, struct ovsdb_idl_index *sbrec_mac_binding_by_lport_ip, + const struct sbrec_ecmp_nexthop_table *ecmp_nh_table, const struct ovsrec_bridge *, const struct sbrec_chassis *, const struct hmap *local_datapaths, @@ -246,6 +253,9 @@ static void send_garp_rarp_prepare( static void send_garp_rarp_run(struct rconn *swconn, long long int *send_garp_rarp_time) OVS_REQUIRES(pinctrl_mutex); +static void send_arp_nd_run(struct rconn *swconn, + long long int *send_arp_nd_time) + OVS_REQUIRES(pinctrl_mutex); static void pinctrl_handle_nd_na(struct rconn *swconn, const struct flow *ip_flow, const struct match *md, @@ -555,6 +565,7 @@ pinctrl_init(void) { init_put_mac_bindings(); init_send_garps_rarps(); + init_send_arps_nds(); init_ipv6_ras(); init_ipv6_prefixd(); init_buffered_packets_ctx(); @@ -3997,6 +4008,7 @@ pinctrl_handler(void *arg_) static long long int send_ipv6_ra_time = LLONG_MAX; /* Next GARP/RARP announcement in ms. */ static long long int send_garp_rarp_time = LLONG_MAX; + static long long int send_arp_nd_time = LLONG_MAX; /* Next multicast query (IGMP) in ms. */ static long long int send_mcast_query_time = LLONG_MAX; static long long int svc_monitors_next_run_time = LLONG_MAX; @@ -4034,6 +4046,7 @@ pinctrl_handler(void *arg_) if (may_inject_pkts()) { ovs_mutex_lock(&pinctrl_mutex); send_garp_rarp_run(swconn, &send_garp_rarp_time); + send_arp_nd_run(swconn, &send_arp_nd_time); send_ipv6_ras(swconn, &send_ipv6_ra_time); send_ipv6_prefixd(swconn, &send_prefixd_time); send_mac_binding_buffered_pkts(swconn); @@ -4052,6 +4065,7 @@ pinctrl_handler(void *arg_) rconn_recv_wait(swconn); if (rconn_is_connected(swconn)) { send_garp_rarp_wait(send_garp_rarp_time); + send_arp_nd_wait(send_arp_nd_time); ipv6_ra_wait(send_ipv6_ra_time); ip_mcast_querier_wait(send_mcast_query_time); svc_monitors_wait(svc_monitors_next_run_time); @@ -4148,6 +4162,7 @@ pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn, const struct sbrec_service_monitor_table *svc_mon_table, const struct sbrec_mac_binding_table *mac_binding_table, const struct sbrec_bfd_table *bfd_table, + const struct sbrec_ecmp_nexthop_table *ecmp_nh_table, const struct ovsrec_bridge *br_int, const struct sbrec_chassis *chassis, const struct hmap *local_datapaths, @@ -4164,8 +4179,9 @@ pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn, sbrec_port_binding_by_key, chassis); send_garp_rarp_prepare(ovnsb_idl_txn, sbrec_port_binding_by_datapath, sbrec_port_binding_by_name, - sbrec_mac_binding_by_lport_ip, br_int, chassis, - local_datapaths, active_tunnels, ovs_table); + sbrec_mac_binding_by_lport_ip, ecmp_nh_table, + br_int, chassis, local_datapaths, active_tunnels, + ovs_table); prepare_ipv6_ras(local_active_ports_ras, sbrec_port_binding_by_name); prepare_ipv6_prefixd(ovnsb_idl_txn, sbrec_port_binding_by_name, local_active_ports_ipv6_pd, chassis, @@ -4700,6 +4716,7 @@ pinctrl_destroy(void) latch_destroy(&pinctrl.pinctrl_thread_exit); rconn_destroy(pinctrl.swconn); destroy_send_garps_rarps(); + destroy_send_arps_nds(); destroy_ipv6_ras(); destroy_ipv6_prefixd(); destroy_buffered_packets_ctx(); @@ -5208,6 +5225,150 @@ send_garp_rarp_update(struct ovsdb_idl_txn *ovnsb_idl_txn, } } +struct arp_nd_data { + struct hmap_node hmap_node; + struct eth_addr ea; /* Ethernet address of port. */ + struct in6_addr src_ip; /* IP address of port. */ + struct in6_addr dst_ip; /* Destination IP address */ + long long int announce_time; /* Next announcement in ms. */ + int backoff; /* Backoff timeout for the next + * announcement (in msecs). */ + uint32_t dp_key; /* Datapath used to output this GARP. */ + uint32_t port_key; /* Port to inject the GARP into. */ +}; + +static struct hmap send_arp_nd_data; + +static void +init_send_arps_nds(void) +{ + hmap_init(&send_arp_nd_data); +} + +static void +destroy_send_arps_nds(void) +{ + struct arp_nd_data *e; + HMAP_FOR_EACH_POP (e, hmap_node, &send_arp_nd_data) { + free(e); + } + hmap_destroy(&send_arp_nd_data); +} + +static struct arp_nd_data * +arp_nd_find_data(const struct sbrec_port_binding *pb, + const struct in6_addr *nexthop) +{ + uint32_t hash = 0; + + hash = hash_add_in6_addr(hash, nexthop); + hash = hash_add(hash, pb->datapath->tunnel_key); + hash = hash_add(hash, pb->tunnel_key); + + struct arp_nd_data *e; + HMAP_FOR_EACH_WITH_HASH (e, hmap_node, hash, &send_arp_nd_data) { + if (ipv6_addr_equals(&e->dst_ip, nexthop) && + e->port_key == pb->tunnel_key) { + return e; + } + } + + return NULL; +} + +static bool +arp_nd_find_is_stale(const struct arp_nd_data *e, + const struct sbrec_ecmp_nexthop_table *ecmp_nh_table, + const struct sbrec_chassis *chassis) +{ + const struct sbrec_ecmp_nexthop *sb_ecmp_nexthop; + SBREC_ECMP_NEXTHOP_TABLE_FOR_EACH (sb_ecmp_nexthop, ecmp_nh_table) { + const struct sbrec_port_binding *pb = sb_ecmp_nexthop->port; + if (pb->chassis != chassis) { + continue; + } + + struct lport_addresses laddrs; + if (!extract_ip_addresses(sb_ecmp_nexthop->nexthop, &laddrs)) { + continue; + } + + struct in6_addr dst_ip = laddrs.n_ipv4_addrs + ? in6_addr_mapped_ipv4(laddrs.ipv4_addrs[0].addr) + : laddrs.ipv6_addrs[0].addr; + destroy_lport_addresses(&laddrs); + + if (pb->tunnel_key == e->port_key && + pb->datapath->tunnel_key == e->dp_key && + ipv6_addr_equals(&e->dst_ip, &dst_ip)) { + return false; + } + } + return true; +} + +static struct arp_nd_data * +arp_nd_alloc_data(const struct eth_addr ea, + struct in6_addr src_ip, struct in6_addr dst_ip, + const struct sbrec_port_binding *pb) +{ + struct arp_nd_data *e = xmalloc(sizeof *e); + e->ea = ea; + e->src_ip = src_ip; + e->dst_ip = dst_ip; + e->announce_time = time_msec() + 1000; + e->backoff = 1000; /* msec. */ + e->dp_key = pb->datapath->tunnel_key; + e->port_key = pb->tunnel_key; + + uint32_t hash = 0; + hash = hash_add_in6_addr(hash, &dst_ip); + hash = hash_add(hash, e->dp_key); + hash = hash_add(hash, e->port_key); + hmap_insert(&send_arp_nd_data, &e->hmap_node, hash); + notify_pinctrl_handler(); + + return e; +} + +/* Add or update a vif for which ARPs need to be announced. */ +static void +send_arp_nd_update(const struct sbrec_port_binding *pb, const char *nexthop, + long long int max_arp_timeout, bool continuous_arp_nd) +{ + struct lport_addresses laddrs; + if (!extract_ip_addresses(nexthop, &laddrs)) { + return; + } + + struct in6_addr dst_ip = laddrs.n_ipv4_addrs + ? in6_addr_mapped_ipv4(laddrs.ipv4_addrs[0].addr) + : laddrs.ipv6_addrs[0].addr; + destroy_lport_addresses(&laddrs); + + struct arp_nd_data *e = arp_nd_find_data(pb, &dst_ip); + if (!e) { + if (!extract_lsp_addresses(pb->mac[0], &laddrs)) { + return; + } + + if (laddrs.n_ipv4_addrs) { + arp_nd_alloc_data(laddrs.ea, + in6_addr_mapped_ipv4(laddrs.ipv4_addrs[0].addr), + dst_ip, pb); + } else if (laddrs.n_ipv6_addrs) { + arp_nd_alloc_data(laddrs.ea, laddrs.ipv6_addrs[0].addr, + dst_ip, pb); + } + destroy_lport_addresses(&laddrs); + } else if (max_arp_timeout != arp_nd_max_timeout || + continuous_arp_nd != arp_nd_continuous) { + /* reset backoff */ + e->announce_time = time_msec() + 1000; + e->backoff = 1000; /* msec. */ + } +} + /* Remove a vif from GARP announcements. */ static void send_garp_rarp_delete(const char *lport) @@ -6546,6 +6707,16 @@ send_garp_rarp_wait(long long int send_garp_rarp_time) } } +static void +send_arp_nd_wait(long long int send_arp_nd_time) +{ + /* Set the poll timer for next arp packet only if there is data to + * be sent. */ + if (hmap_count(&send_arp_nd_data)) { + poll_timer_wait_until(send_arp_nd_time); + } +} + /* Called with in the pinctrl_handler thread context. */ static void send_garp_rarp_run(struct rconn *swconn, long long int *send_garp_rarp_time) @@ -6568,6 +6739,84 @@ send_garp_rarp_run(struct rconn *swconn, long long int *send_garp_rarp_time) } } +static long long int +send_arp_nd(struct rconn *swconn, struct arp_nd_data *e, + long long int current_time) + OVS_REQUIRES(pinctrl_mutex) +{ + if (current_time < e->announce_time) { + return e->announce_time; + } + + /* Compose a ARP request packet. */ + uint64_t packet_stub[128 / 8]; + struct dp_packet packet; + dp_packet_use_stub(&packet, packet_stub, sizeof packet_stub); + if (IN6_IS_ADDR_V4MAPPED(&e->src_ip)) { + compose_arp(&packet, ARP_OP_REQUEST, e->ea, eth_addr_zero, + true, in6_addr_get_mapped_ipv4(&e->src_ip), + in6_addr_get_mapped_ipv4(&e->dst_ip)); + } else { + compose_nd_ns(&packet, e->ea, &e->src_ip, &e->dst_ip); + } + + /* Inject ARP request. */ + uint64_t ofpacts_stub[4096 / 8]; + struct ofpbuf ofpacts = OFPBUF_STUB_INITIALIZER(ofpacts_stub); + enum ofp_version version = rconn_get_version(swconn); + put_load(e->dp_key, MFF_LOG_DATAPATH, 0, 64, &ofpacts); + put_load(e->port_key, MFF_LOG_OUTPORT, 0, 32, &ofpacts); + struct ofpact_resubmit *resubmit = ofpact_put_RESUBMIT(&ofpacts); + resubmit->in_port = OFPP_CONTROLLER; + resubmit->table_id = OFTABLE_LOCAL_OUTPUT; + + struct ofputil_packet_out po = { + .packet = dp_packet_data(&packet), + .packet_len = dp_packet_size(&packet), + .buffer_id = UINT32_MAX, + .ofpacts = ofpacts.data, + .ofpacts_len = ofpacts.size, + }; + match_set_in_port(&po.flow_metadata, OFPP_CONTROLLER); + enum ofputil_protocol proto = ofputil_protocol_from_ofp_version(version); + queue_msg(swconn, ofputil_encode_packet_out(&po, proto)); + dp_packet_uninit(&packet); + ofpbuf_uninit(&ofpacts); + + /* Set the next announcement. At most 5 announcements are sent for a + * vif if arp_nd_max_timeout is not specified otherwise cap the max + * timeout to arp_nd_max_timeout. */ + if (arp_nd_continuous || e->backoff < arp_nd_max_timeout) { + e->announce_time = current_time + e->backoff; + } else { + e->announce_time = LLONG_MAX; + } + e->backoff = MIN(arp_nd_max_timeout, e->backoff * 2); + + return e->announce_time; +} + +static void +send_arp_nd_run(struct rconn *swconn, long long int *send_arp_nd_time) + OVS_REQUIRES(pinctrl_mutex) +{ + if (!hmap_count(&send_arp_nd_data)) { + return; + } + + /* Send ARPs, and update the next announcement. */ + long long int current_time = time_msec(); + *send_arp_nd_time = LLONG_MAX; + + struct arp_nd_data *e; + HMAP_FOR_EACH (e, hmap_node, &send_arp_nd_data) { + long long int next_announce = send_arp_nd(swconn, e, current_time); + if (*send_arp_nd_time > next_announce) { + *send_arp_nd_time = next_announce; + } + } +} + /* Called by pinctrl_run(). Runs with in the main ovn-controller * thread context. */ static void @@ -6575,6 +6824,7 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn, struct ovsdb_idl_index *sbrec_port_binding_by_datapath, struct ovsdb_idl_index *sbrec_port_binding_by_name, struct ovsdb_idl_index *sbrec_mac_binding_by_lport_ip, + const struct sbrec_ecmp_nexthop_table *ecmp_nh_table, const struct ovsrec_bridge *br_int, const struct sbrec_chassis *chassis, const struct hmap *local_datapaths, @@ -6587,7 +6837,8 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn, struct sset nat_ip_keys = SSET_INITIALIZER(&nat_ip_keys); struct shash nat_addresses; unsigned long long garp_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT; - bool garp_continuous = false; + unsigned long long max_arp_nd_timeout = GARP_RARP_DEF_MAX_TIMEOUT; + bool garp_continuous = false, continuous_arp_nd = true; const struct ovsrec_open_vswitch *cfg = ovsrec_open_vswitch_table_first(ovs_table); if (cfg) { @@ -6597,6 +6848,11 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn, if (!garp_max_timeout) { garp_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT; } + + max_arp_nd_timeout = smap_get_ullong( + &cfg->external_ids, "arp-nd-max-timeout-sec", + GARP_RARP_DEF_MAX_TIMEOUT / 1000) * 1000; + continuous_arp_nd = !!max_arp_nd_timeout; } shash_init(&nat_addresses); @@ -6610,6 +6866,7 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn, &nat_ip_keys, &local_l3gw_ports, chassis, active_tunnels, &nat_addresses); + /* For deleted ports and deleted nat ips, remove from * send_garp_rarp_data. */ struct shash_node *iter; @@ -6645,6 +6902,24 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn, } } + struct arp_nd_data *e; + const struct sbrec_ecmp_nexthop *sb_ecmp_nexthop; + HMAP_FOR_EACH_SAFE (e, hmap_node, &send_arp_nd_data) { + if (arp_nd_find_is_stale(e, ecmp_nh_table, chassis)) { + hmap_remove(&send_arp_nd_data, &e->hmap_node); + free(e); + notify_pinctrl_handler(); + } + } + + SBREC_ECMP_NEXTHOP_TABLE_FOR_EACH (sb_ecmp_nexthop, ecmp_nh_table) { + const struct sbrec_port_binding *pb = sb_ecmp_nexthop->port; + if (pb && !strcmp(pb->type, "l3gateway") && pb->chassis == chassis) { + send_arp_nd_update(pb, sb_ecmp_nexthop->nexthop, + max_arp_nd_timeout, continuous_arp_nd); + } + } + /* pinctrl_handler thread will send the GARPs. */ sset_destroy(&localnet_vifs); @@ -6662,6 +6937,9 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn, garp_rarp_max_timeout = garp_max_timeout; garp_rarp_continuous = garp_continuous; + + arp_nd_max_timeout = max_arp_nd_timeout; + arp_nd_continuous = continuous_arp_nd; } static bool diff --git a/controller/pinctrl.h b/controller/pinctrl.h index 3462b670c..5c8ea7aea 100644 --- a/controller/pinctrl.h +++ b/controller/pinctrl.h @@ -36,6 +36,7 @@ struct sbrec_dns_table; struct sbrec_controller_event_table; struct sbrec_service_monitor_table; struct sbrec_bfd_table; +struct sbrec_ecmp_nexthop_table; struct sbrec_port_binding; struct sbrec_mac_binding_table; @@ -54,6 +55,7 @@ void pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn, const struct sbrec_service_monitor_table *, const struct sbrec_mac_binding_table *, const struct sbrec_bfd_table *, + const struct sbrec_ecmp_nexthop_table *, const struct ovsrec_bridge *, const struct sbrec_chassis *, const struct hmap *local_datapaths, const struct sset *active_tunnels, -- 2.47.0 _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
