Introduce the capability to periodically send ARP request packets for ECMP nexthops in order to resolve their L2 addresses. This is a preliminary patch to introduce the capability to flush stale ECMP CT entries.
Signed-off-by: Lorenzo Bianconi <[email protected]> --- controller/ovn-controller.8.xml | 11 ++ controller/ovn-controller.c | 2 + controller/pinctrl.c | 248 +++++++++++++++++++++++++++++--- controller/pinctrl.h | 2 + 4 files changed, 241 insertions(+), 22 deletions(-) diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml index 17d762810..2846dcc39 100644 --- a/controller/ovn-controller.8.xml +++ b/controller/ovn-controller.8.xml @@ -384,6 +384,17 @@ cap for the exponential backoff used by <code>ovn-controller</code> to send GARPs packets. </dd> + <dt><code>external_ids:arp-max-timeout-sec</code></dt> + <dd> + When used, this configuration value specifies the maximum timeout + (in seconds) between two consecutive ARP packets sent by + <code>ovn-controller</code> to resolve ECMP nexthop MAC addresses. + <code>ovn-controller</code> by default sends just 4 ARP packets + with an exponential backoff timeout. + Setting <code>external_ids:arp-max-timeout-sec</code> allows capping + the exponential backoff used by <code>ovn-controller</code> + to send ARP packets. 
+ </dd> <dt><code>external_ids:ovn-bridge-remote</code></dt> <dd> <p> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c index c48667887..1094e3107 100644 --- a/controller/ovn-controller.c +++ b/controller/ovn-controller.c @@ -5650,6 +5650,8 @@ main(int argc, char *argv[]) sbrec_mac_binding_table_get( ovnsb_idl_loop.idl), sbrec_bfd_table_get(ovnsb_idl_loop.idl), + sbrec_ecmp_nexthop_table_get( + ovnsb_idl_loop.idl), br_int, chassis, &runtime_data->local_datapaths, &runtime_data->active_tunnels, diff --git a/controller/pinctrl.c b/controller/pinctrl.c index c86b4f940..1ef081cb2 100644 --- a/controller/pinctrl.c +++ b/controller/pinctrl.c @@ -170,6 +170,9 @@ static struct seq *pinctrl_main_seq; static long long int garp_rarp_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT; static bool garp_rarp_continuous; +static long long int arp_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT; +static bool arp_continuous; + static void *pinctrl_handler(void *arg); struct pinctrl { @@ -229,13 +232,17 @@ static void run_activated_ports( const struct sbrec_chassis *chassis); static void init_send_garps_rarps(void); +static void init_send_arps(void); static void destroy_send_garps_rarps(void); +static void destroy_send_arps(void); static void send_garp_rarp_wait(long long int send_garp_rarp_time); +static void send_arp_wait(long long int send_arp_time); static void send_garp_rarp_prepare( struct ovsdb_idl_txn *ovnsb_idl_txn, struct ovsdb_idl_index *sbrec_port_binding_by_datapath, struct ovsdb_idl_index *sbrec_port_binding_by_name, struct ovsdb_idl_index *sbrec_mac_binding_by_lport_ip, + const struct sbrec_ecmp_nexthop_table *ecmp_nh_table, const struct ovsrec_bridge *, const struct sbrec_chassis *, const struct hmap *local_datapaths, @@ -245,6 +252,9 @@ static void send_garp_rarp_prepare( static void send_garp_rarp_run(struct rconn *swconn, long long int *send_garp_rarp_time) OVS_REQUIRES(pinctrl_mutex); +static void send_arp_run(struct rconn *swconn, + long long int 
*send_arp_time) + OVS_REQUIRES(pinctrl_mutex); static void pinctrl_handle_nd_na(struct rconn *swconn, const struct flow *ip_flow, const struct match *md, @@ -554,6 +564,7 @@ pinctrl_init(void) { init_put_mac_bindings(); init_send_garps_rarps(); + init_send_arps(); init_ipv6_ras(); init_ipv6_prefixd(); init_buffered_packets_ctx(); @@ -4000,6 +4011,7 @@ pinctrl_handler(void *arg_) static long long int send_ipv6_ra_time = LLONG_MAX; /* Next GARP/RARP announcement in ms. */ static long long int send_garp_rarp_time = LLONG_MAX; + static long long int send_arp_time = LLONG_MAX; /* Next multicast query (IGMP) in ms. */ static long long int send_mcast_query_time = LLONG_MAX; static long long int svc_monitors_next_run_time = LLONG_MAX; @@ -4037,6 +4049,7 @@ pinctrl_handler(void *arg_) if (may_inject_pkts()) { ovs_mutex_lock(&pinctrl_mutex); send_garp_rarp_run(swconn, &send_garp_rarp_time); + send_arp_run(swconn, &send_arp_time); send_ipv6_ras(swconn, &send_ipv6_ra_time); send_ipv6_prefixd(swconn, &send_prefixd_time); send_mac_binding_buffered_pkts(swconn); @@ -4055,6 +4068,7 @@ pinctrl_handler(void *arg_) rconn_recv_wait(swconn); if (rconn_is_connected(swconn)) { send_garp_rarp_wait(send_garp_rarp_time); + send_arp_wait(send_arp_time); ipv6_ra_wait(send_ipv6_ra_time); ip_mcast_querier_wait(send_mcast_query_time); svc_monitors_wait(svc_monitors_next_run_time); @@ -4151,6 +4165,7 @@ pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn, const struct sbrec_service_monitor_table *svc_mon_table, const struct sbrec_mac_binding_table *mac_binding_table, const struct sbrec_bfd_table *bfd_table, + const struct sbrec_ecmp_nexthop_table *ecmp_nh_table, const struct ovsrec_bridge *br_int, const struct sbrec_chassis *chassis, const struct hmap *local_datapaths, @@ -4167,8 +4182,9 @@ pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn, sbrec_port_binding_by_key, chassis); send_garp_rarp_prepare(ovnsb_idl_txn, sbrec_port_binding_by_datapath, sbrec_port_binding_by_name, - 
sbrec_mac_binding_by_lport_ip, br_int, chassis, - local_datapaths, active_tunnels, ovs_table); + sbrec_mac_binding_by_lport_ip, ecmp_nh_table, + br_int, chassis, local_datapaths, active_tunnels, + ovs_table); prepare_ipv6_ras(local_active_ports_ras, sbrec_port_binding_by_name); prepare_ipv6_prefixd(ovnsb_idl_txn, sbrec_port_binding_by_name, local_active_ports_ipv6_pd, chassis, @@ -4702,6 +4718,7 @@ pinctrl_destroy(void) latch_destroy(&pinctrl.pinctrl_thread_exit); rconn_destroy(pinctrl.swconn); destroy_send_garps_rarps(); + destroy_send_arps(); destroy_ipv6_ras(); destroy_ipv6_prefixd(); destroy_buffered_packets_ctx(); @@ -5029,7 +5046,8 @@ wait_put_mac_bindings(struct ovsdb_idl_txn *ovnsb_idl_txn) */ struct garp_rarp_data { struct eth_addr ea; /* Ethernet address of port. */ - ovs_be32 ipv4; /* Ipv4 address of port. */ + ovs_be32 src_ipv4; /* Ipv4 address of port. */ + ovs_be32 dst_ipv4; /* Destination Ipv4 address. */ long long int announce_time; /* Next announcement in ms. */ int backoff; /* Backoff timeout for the next * announcement (in msecs). */ @@ -5052,19 +5070,37 @@ destroy_send_garps_rarps(void) shash_destroy_free_data(&send_garp_rarp_data); } +/* Contains ARPs data to be sent to track ECMP next-hop mac address. Protected + * by pinctrl_mutex. */ +static struct shash send_arp_data; + +static void +init_send_arps(void) +{ + shash_init(&send_arp_data); +} + +static void +destroy_send_arps(void) +{ + shash_destroy_free_data(&send_arp_data); +} + /* Runs with in the main ovn-controller thread context. 
*/ static void -add_garp_rarp(const char *name, const struct eth_addr ea, ovs_be32 ip, - uint32_t dp_key, uint32_t port_key) +add_garp_rarp(const char *name, const struct eth_addr ea, ovs_be32 src_ip, + ovs_be32 dst_ip, uint32_t dp_key, uint32_t port_key, + struct shash *shash) { struct garp_rarp_data *garp_rarp = xmalloc(sizeof *garp_rarp); garp_rarp->ea = ea; - garp_rarp->ipv4 = ip; + garp_rarp->src_ipv4 = src_ip; + garp_rarp->dst_ipv4 = dst_ip; garp_rarp->announce_time = time_msec() + 1000; garp_rarp->backoff = 1000; /* msec. */ garp_rarp->dp_key = dp_key; garp_rarp->port_key = port_key; - shash_add(&send_garp_rarp_data, name, garp_rarp); + shash_add(shash, name, garp_rarp); /* Notify pinctrl_handler so that it can wakeup and process * these GARP/RARP requests. */ @@ -5112,10 +5148,11 @@ send_garp_rarp_update(struct ovsdb_idl_txn *ovnsb_idl_txn, garp_rarp->backoff = 1000; /* msec. */ } } else if (ovnsb_idl_txn) { - add_garp_rarp(name, laddrs->ea, + add_garp_rarp(name, laddrs->ea, laddrs->ipv4_addrs[i].addr, laddrs->ipv4_addrs[i].addr, binding_rec->datapath->tunnel_key, - binding_rec->tunnel_key); + binding_rec->tunnel_key, + &send_garp_rarp_data); send_garp_locally(ovnsb_idl_txn, sbrec_mac_binding_by_lport_ip, local_datapaths, binding_rec, laddrs->ea, @@ -5142,9 +5179,10 @@ send_garp_rarp_update(struct ovsdb_idl_txn *ovnsb_idl_txn, garp_rarp->backoff = 1000; /* msec. 
*/ } } else { - add_garp_rarp(name, laddrs->ea, - 0, binding_rec->datapath->tunnel_key, - binding_rec->tunnel_key); + add_garp_rarp(name, laddrs->ea, 0, 0, + binding_rec->datapath->tunnel_key, + binding_rec->tunnel_key, + &send_garp_rarp_data); } free(name); } @@ -5182,10 +5220,9 @@ send_garp_rarp_update(struct ovsdb_idl_txn *ovnsb_idl_txn, ip = laddrs.ipv4_addrs[0].addr; } - add_garp_rarp(binding_rec->logical_port, - laddrs.ea, ip, + add_garp_rarp(binding_rec->logical_port, laddrs.ea, ip, ip, binding_rec->datapath->tunnel_key, - binding_rec->tunnel_key); + binding_rec->tunnel_key, &send_garp_rarp_data); if (ip) { send_garp_locally(ovnsb_idl_txn, sbrec_mac_binding_by_lport_ip, local_datapaths, binding_rec, laddrs.ea, ip); @@ -5196,12 +5233,43 @@ send_garp_rarp_update(struct ovsdb_idl_txn *ovnsb_idl_txn, } } +/* Add or update a vif for which ARPs need to be announced. */ +static void +send_arp_update(const struct sbrec_port_binding *pb, const char *nexthop, + long long int max_arp_timeout, bool continuous_arp) +{ + volatile struct garp_rarp_data *arp = shash_find_data(&send_arp_data, + nexthop); + if (arp) { + arp->dp_key = pb->datapath->tunnel_key; + arp->port_key = pb->tunnel_key; + if (max_arp_timeout != arp_max_timeout || + continuous_arp != arp_continuous) { + /* reset backoff */ + arp->announce_time = time_msec() + 1000; + arp->backoff = 1000; /* msec. */ + } + } else { + struct lport_addresses laddrs; + if (!extract_lsp_addresses(pb->mac[0], &laddrs)) { + return; + } + if (laddrs.n_ipv4_addrs) { + ovs_be32 dst_ip; + inet_pton(AF_INET, nexthop, &dst_ip); + add_garp_rarp(nexthop, laddrs.ea, laddrs.ipv4_addrs[0].addr, + dst_ip, pb->datapath->tunnel_key, pb->tunnel_key, + &send_arp_data); + } + destroy_lport_addresses(&laddrs); + } +} + /* Remove a vif from GARP announcements. 
*/ static void -send_garp_rarp_delete(const char *lport) +send_garp_rarp_delete(struct shash *shash, const char *lport) { - struct garp_rarp_data *garp_rarp = shash_find_and_delete - (&send_garp_rarp_data, lport); + struct garp_rarp_data *garp_rarp = shash_find_and_delete(shash, lport); free(garp_rarp); notify_pinctrl_handler(); } @@ -5220,9 +5288,9 @@ send_garp_rarp(struct rconn *swconn, struct garp_rarp_data *garp_rarp, uint64_t packet_stub[128 / 8]; struct dp_packet packet; dp_packet_use_stub(&packet, packet_stub, sizeof packet_stub); - if (garp_rarp->ipv4) { + if (garp_rarp->src_ipv4) { compose_arp(&packet, ARP_OP_REQUEST, garp_rarp->ea, eth_addr_zero, - true, garp_rarp->ipv4, garp_rarp->ipv4); + true, garp_rarp->src_ipv4, garp_rarp->dst_ipv4); } else { compose_rarp(&packet, garp_rarp->ea); } @@ -6524,6 +6592,25 @@ get_nat_addresses_and_keys(struct ovsdb_idl_index *sbrec_port_binding_by_name, } } +static void +get_local_ecmp_nexthop_map( + const struct sbrec_ecmp_nexthop_table *ecmp_nh_table, + struct ovsdb_idl_index *sbrec_port_binding_by_name, + const struct sbrec_chassis *chassis, + struct smap *local_ecmp_nexthop_map) +{ + const struct sbrec_ecmp_nexthop *sb_ecmp_nexthop; + SBREC_ECMP_NEXTHOP_TABLE_FOR_EACH (sb_ecmp_nexthop, ecmp_nh_table) { + const struct sbrec_port_binding *pb = + lport_lookup_by_name(sbrec_port_binding_by_name, + sb_ecmp_nexthop->port); + if (pb && !strcmp(pb->type, "l3gateway") && pb->chassis == chassis) { + smap_add_once(local_ecmp_nexthop_map, sb_ecmp_nexthop->nexthop, + sb_ecmp_nexthop->port); + } + } +} + static void send_garp_rarp_wait(long long int send_garp_rarp_time) { @@ -6534,6 +6621,16 @@ send_garp_rarp_wait(long long int send_garp_rarp_time) } } +static void +send_arp_wait(long long int send_arp_time) +{ + /* Set the poll timer for next arp packet only if there is data to + * be sent. 
*/ + if (!shash_is_empty(&send_arp_data)) { + poll_timer_wait_until(send_arp_time); + } +} + /* Called with in the pinctrl_handler thread context. */ static void send_garp_rarp_run(struct rconn *swconn, long long int *send_garp_rarp_time) @@ -6556,6 +6653,80 @@ send_garp_rarp_run(struct rconn *swconn, long long int *send_garp_rarp_time) } } +static long long int +send_arp(struct rconn *swconn, struct garp_rarp_data *garp_rarp, + long long int current_time) + OVS_REQUIRES(pinctrl_mutex) +{ + if (current_time < garp_rarp->announce_time) { + return garp_rarp->announce_time; + } + + /* Compose a ARP request packet. */ + uint64_t packet_stub[128 / 8]; + struct dp_packet packet; + dp_packet_use_stub(&packet, packet_stub, sizeof packet_stub); + compose_arp(&packet, ARP_OP_REQUEST, garp_rarp->ea, eth_addr_zero, + true, garp_rarp->src_ipv4, garp_rarp->dst_ipv4); + + /* Inject ARP request. */ + uint64_t ofpacts_stub[4096 / 8]; + struct ofpbuf ofpacts = OFPBUF_STUB_INITIALIZER(ofpacts_stub); + enum ofp_version version = rconn_get_version(swconn); + put_load(garp_rarp->dp_key, MFF_LOG_DATAPATH, 0, 64, &ofpacts); + put_load(garp_rarp->port_key, MFF_LOG_OUTPORT, 0, 32, &ofpacts); + struct ofpact_resubmit *resubmit = ofpact_put_RESUBMIT(&ofpacts); + resubmit->in_port = OFPP_CONTROLLER; + resubmit->table_id = OFTABLE_LOCAL_OUTPUT; + + struct ofputil_packet_out po = { + .packet = dp_packet_data(&packet), + .packet_len = dp_packet_size(&packet), + .buffer_id = UINT32_MAX, + .ofpacts = ofpacts.data, + .ofpacts_len = ofpacts.size, + }; + match_set_in_port(&po.flow_metadata, OFPP_CONTROLLER); + enum ofputil_protocol proto = ofputil_protocol_from_ofp_version(version); + queue_msg(swconn, ofputil_encode_packet_out(&po, proto)); + dp_packet_uninit(&packet); + ofpbuf_uninit(&ofpacts); + + /* Set the next announcement. At most 5 announcements are sent for a + * vif if arp_max_timeout is not specified otherwise cap the max + * timeout to arp_max_timeout. 
*/ + if (arp_continuous || garp_rarp->backoff < arp_max_timeout) { + garp_rarp->announce_time = current_time + garp_rarp->backoff; + } else { + garp_rarp->announce_time = LLONG_MAX; + } + garp_rarp->backoff = MIN(arp_max_timeout, garp_rarp->backoff * 2); + + return garp_rarp->announce_time; +} + +static void +send_arp_run(struct rconn *swconn, long long int *send_arp_time) + OVS_REQUIRES(pinctrl_mutex) +{ + if (shash_is_empty(&send_arp_data)) { + return; + } + + /* Send ARPs, and update the next announcement. */ + long long int current_time = time_msec(); + *send_arp_time = LLONG_MAX; + + struct shash_node *iter; + SHASH_FOR_EACH (iter, &send_arp_data) { + long long int next_announce = send_arp(swconn, iter->data, + current_time); + if (*send_arp_time > next_announce) { + *send_arp_time = next_announce; + } + } +} + /* Called by pinctrl_run(). Runs with in the main ovn-controller * thread context. */ static void @@ -6563,6 +6734,7 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn, struct ovsdb_idl_index *sbrec_port_binding_by_datapath, struct ovsdb_idl_index *sbrec_port_binding_by_name, struct ovsdb_idl_index *sbrec_mac_binding_by_lport_ip, + const struct sbrec_ecmp_nexthop_table *ecmp_nh_table, const struct ovsrec_bridge *br_int, const struct sbrec_chassis *chassis, const struct hmap *local_datapaths, @@ -6572,10 +6744,13 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn, { struct sset localnet_vifs = SSET_INITIALIZER(&localnet_vifs); struct sset local_l3gw_ports = SSET_INITIALIZER(&local_l3gw_ports); + struct smap local_ecmp_nexthop_map = + SMAP_INITIALIZER(&local_ecmp_nexthop_map); struct sset nat_ip_keys = SSET_INITIALIZER(&nat_ip_keys); struct shash nat_addresses; unsigned long long garp_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT; - bool garp_continuous = false; + unsigned long long max_arp_timeout = GARP_RARP_DEF_MAX_TIMEOUT; + bool garp_continuous = false, continuous_arp = true; const struct ovsrec_open_vswitch *cfg = 
ovsrec_open_vswitch_table_first(ovs_table); if (cfg) { @@ -6585,6 +6760,11 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn, if (!garp_max_timeout) { garp_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT; } + + max_arp_timeout = smap_get_ullong( + &cfg->external_ids, "arp-max-timeout-sec", + GARP_RARP_DEF_MAX_TIMEOUT / 1000) * 1000; + continuous_arp = !!max_arp_timeout; } shash_init(&nat_addresses); @@ -6598,13 +6778,23 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn, &nat_ip_keys, &local_l3gw_ports, chassis, active_tunnels, &nat_addresses); + + get_local_ecmp_nexthop_map(ecmp_nh_table, sbrec_port_binding_by_name, + chassis, &local_ecmp_nexthop_map); + /* For deleted ports and deleted nat ips, remove from * send_garp_rarp_data. */ struct shash_node *iter; SHASH_FOR_EACH_SAFE (iter, &send_garp_rarp_data) { if (!sset_contains(&localnet_vifs, iter->name) && !sset_contains(&nat_ip_keys, iter->name)) { - send_garp_rarp_delete(iter->name); + send_garp_rarp_delete(&send_garp_rarp_data, iter->name); + } + } + + SHASH_FOR_EACH_SAFE (iter, &send_arp_data) { + if (!smap_get(&local_ecmp_nexthop_map, iter->name)) { + send_garp_rarp_delete(&send_arp_data, iter->name); } } @@ -6633,10 +6823,21 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn, } } + struct smap_node *node; + SMAP_FOR_EACH (node, &local_ecmp_nexthop_map) { + const struct sbrec_port_binding *pb = lport_lookup_by_name( + sbrec_port_binding_by_name, node->value); + if (pb) { + send_arp_update(pb, node->key, max_arp_timeout, + continuous_arp); + } + } + /* pinctrl_handler thread will send the GARPs. 
*/ sset_destroy(&localnet_vifs); sset_destroy(&local_l3gw_ports); + smap_destroy(&local_ecmp_nexthop_map); SHASH_FOR_EACH_SAFE (iter, &nat_addresses) { struct lport_addresses *laddrs = iter->data; @@ -6650,6 +6851,9 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn, garp_rarp_max_timeout = garp_max_timeout; garp_rarp_continuous = garp_continuous; + + arp_max_timeout = max_arp_timeout; + arp_continuous = continuous_arp; } static bool diff --git a/controller/pinctrl.h b/controller/pinctrl.h index 3462b670c..5c8ea7aea 100644 --- a/controller/pinctrl.h +++ b/controller/pinctrl.h @@ -36,6 +36,7 @@ struct sbrec_dns_table; struct sbrec_controller_event_table; struct sbrec_service_monitor_table; struct sbrec_bfd_table; +struct sbrec_ecmp_nexthop_table; struct sbrec_port_binding; struct sbrec_mac_binding_table; @@ -54,6 +55,7 @@ void pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn, const struct sbrec_service_monitor_table *, const struct sbrec_mac_binding_table *, const struct sbrec_bfd_table *, + const struct sbrec_ecmp_nexthop_table *, const struct ovsrec_bridge *, const struct sbrec_chassis *, const struct hmap *local_datapaths, const struct sset *active_tunnels, -- 2.46.0 _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
