On 11/15/22 10:44, Lorenzo Bianconi wrote:
>> On Mon, Oct 24, 2022 at 11:29 AM Lorenzo Bianconi <
>> lorenzo.bianc...@redhat.com> wrote:
>>
>>> Improve buffered packet management in ovn-controller avoid useless loop
>>> in run_buffered_binding routine and using datapath key and output port
>>> key as buffered_packets_map hashmap hash. Add new selftest for buffered
>>> packets.
>>>
>>> Signed-off-by: Lorenzo Bianconi <lorenzo.bianc...@redhat.com>
>>> ---

Hi Lorenzo,

Thanks for the patch!  I have a few comments below.

>>> Changes since v2:
>>> - improve hash function
>>> Changes since v1:
>>> - improve code readability
>>> ---
>>>  controller/pinctrl.c | 118 ++++++++++++++++++++++++++++------------
>>>  tests/system-ovn.at  | 124 +++++++++++++++++++++++++++++++++++++++++++
>>>  2 files changed, 208 insertions(+), 34 deletions(-)
>>>
>>> diff --git a/controller/pinctrl.c b/controller/pinctrl.c
>>> index 8859cb080..dfcd0cea8 100644
>>> --- a/controller/pinctrl.c
>>> +++ b/controller/pinctrl.c
>>> @@ -182,6 +182,7 @@ static void destroy_buffered_packets_map(void);
>>>  static void
>>>  run_buffered_binding(struct ovsdb_idl_index
>>> *sbrec_mac_binding_by_lport_ip,
>>>                       struct ovsdb_idl_index
>>> *sbrec_port_binding_by_datapath,
>>> +                     struct ovsdb_idl_index *sbrec_port_binding_by_name,
>>>                       const struct hmap *local_datapaths)
>>>      OVS_REQUIRES(pinctrl_mutex);
>>>
>>> @@ -1430,6 +1431,9 @@ struct buffered_packets {
>>>      struct in6_addr ip;
>>>      struct eth_addr ea;
>>>
>>> +    uint64_t dp_key;
>>> +    uint64_t port_key;
>>> +
>>>      long long int timestamp;
>>>
>>>      struct buffer_info data[BUFFER_QUEUE_DEPTH];
>>> @@ -1556,38 +1560,38 @@ buffered_packets_map_gc(void)
>>>  }
>>>
>>>  static struct buffered_packets *
>>> -pinctrl_find_buffered_packets(const struct in6_addr *ip, uint32_t hash)
>>> +pinctrl_find_buffered_packets(uint64_t dp_key, uint64_t port_key,
>>> +                              uint32_t hash)
>>>  {
>>>      struct buffered_packets *qp;
>>> -
>>> -    HMAP_FOR_EACH_WITH_HASH (qp, hmap_node, hash,
>>> -                             &buffered_packets_map) {
>>> -        if (IN6_ARE_ADDR_EQUAL(&qp->ip, ip)) {
>>> +    HMAP_FOR_EACH_WITH_HASH (qp, hmap_node, hash, &buffered_packets_map) {
>>> +        if (qp->dp_key == dp_key && qp->port_key == port_key) {
>>>              return qp;
>>>          }
>>>      }
>>>      return NULL;
>>>  }
>>>
>>> +static uint32_t
>>> +pinctrl_buffer_apcket_hash(uint64_t dp_key, uint64_t port_key)
>>>
>>
>> nit: typo in the function name "pinctrl_buffer_apcket_hash" ->
>> "pinctrl_buffer_packet_hash".
>> I guess this can be fixed during merge, let's see if others agree.
>>
>>
>>> +{
>>> +    uint32_t hash = 0;
>>> +    hash = hash_add64(hash, port_key);
>>> +    hash = hash_add64(hash, dp_key);
>>> +    return hash_finish(hash, 16);
>>> +}
>>> +
>>>  /* Called with in the pinctrl_handler thread context. */
>>>  static int
>>>  pinctrl_handle_buffered_packets(struct dp_packet *pkt_in,
>>>                                  const struct match *md, bool is_arp)
>>>      OVS_REQUIRES(pinctrl_mutex)
>>>  {
>>> -    struct buffered_packets *bp;
>>> -    struct dp_packet *clone;
>>> -    struct in6_addr addr;
>>> -
>>> -    if (is_arp) {
>>> -        addr = in6_addr_mapped_ipv4(htonl(md->flow.regs[0]));
>>> -    } else {
>>> -        ovs_be128 ip6 = hton128(flow_get_xxreg(&md->flow, 0));
>>> -        memcpy(&addr, &ip6, sizeof addr);
>>> -    }
>>> -
>>> -    uint32_t hash = hash_bytes(&addr, sizeof addr, 0);
>>> -    bp = pinctrl_find_buffered_packets(&addr, hash);
>>> +    uint64_t dp_key = ntohll(md->flow.metadata);
>>> +    uint64_t oport_key = md->flow.regs[MFF_LOG_OUTPORT - MFF_REG0];
>>> +    uint32_t hash = pinctrl_buffer_apcket_hash(dp_key, oport_key);
>>> +    struct buffered_packets *bp
>>> +        = pinctrl_find_buffered_packets(dp_key, oport_key, hash);
>>>      if (!bp) {
>>>          if (hmap_count(&buffered_packets_map) >= 1000) {

Before your patch we would hit this hard-coded limit if there were 1000
next hops for which we were buffering packets.

Now, with your change, we will hit this limit only if there are 1000
ports for which we are buffering packets.  This seems very unlikely and
I wonder if this will ever happen in practice.  Do we need to change its
value?  Also, let's add a define for it somewhere to make it a bit less
"magic".

>>>              COVERAGE_INC(pinctrl_drop_buffered_packets_map);
>>> @@ -1597,12 +1601,20 @@ pinctrl_handle_buffered_packets(struct dp_packet
>>> *pkt_in,
>>>          bp = xmalloc(sizeof *bp);
>>>          hmap_insert(&buffered_packets_map, &bp->hmap_node, hash);
>>>          bp->head = bp->tail = 0;
>>> -        bp->ip = addr;
>>> +        if (is_arp) {
>>> +            bp->ip = in6_addr_mapped_ipv4(htonl(md->flow.regs[0]));
>>> +        } else {
>>> +            ovs_be128 ip6 = hton128(flow_get_xxreg(&md->flow, 0));
>>> +            memcpy(&bp->ip, &ip6, sizeof bp->ip);
>>> +        }
>>> +        bp->dp_key = dp_key;
>>> +        bp->port_key = oport_key;
>>>      }
>>>      bp->timestamp = time_msec();
>>>      /* clone the packet to send it later with correct L2 address */
>>> -    clone = dp_packet_clone_data(dp_packet_data(pkt_in),
>>> -                                 dp_packet_size(pkt_in));
>>> +    struct dp_packet *clone
>>> +        = dp_packet_clone_data(dp_packet_data(pkt_in),
>>> +                               dp_packet_size(pkt_in));
>>>      buffered_push_packet(bp, clone, md);
>>>
>>>      return 0;
>>> @@ -3586,6 +3598,7 @@ pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn,
>>>                    sbrec_ip_multicast_opts);
>>>      run_buffered_binding(sbrec_mac_binding_by_lport_ip,
>>>                           sbrec_port_binding_by_datapath,
>>> +                         sbrec_port_binding_by_name,
>>>                           local_datapaths);
>>>      sync_svc_monitors(ovnsb_idl_txn, svc_mon_table,
>>> sbrec_port_binding_by_name,
>>>                        chassis);
>>> @@ -4354,15 +4367,28 @@ run_put_mac_bindings(struct ovsdb_idl_txn
>>> *ovnsb_idl_txn,
>>>      }
>>>  }
>>>
>>> +static struct buffered_packets *
>>> +pinctrl_get_buffered_packets(uint64_t dp_key, uint64_t port_key)
>>> +{
>>> +    uint32_t hash = pinctrl_buffer_apcket_hash(dp_key, port_key);
>>> +    return pinctrl_find_buffered_packets(dp_key, port_key, hash);
>>> +}
>>> +
>>>  static void
>>>  run_buffered_binding(struct ovsdb_idl_index
>>> *sbrec_mac_binding_by_lport_ip,
>>>                       struct ovsdb_idl_index
>>> *sbrec_port_binding_by_datapath,
>>> +                     struct ovsdb_idl_index *sbrec_port_binding_by_name,
>>>                       const struct hmap *local_datapaths)
>>>      OVS_REQUIRES(pinctrl_mutex)
>>>  {
>>>      const struct local_datapath *ld;
>>>      bool notify = false;
>>>
>>> +    if (!hmap_count(&buffered_packets_map)) {
>>> +        /* No work to do. */
>>> +        return;
>>> +    }
>>> +
>>>      HMAP_FOR_EACH (ld, hmap_node, local_datapaths) {
>>>          /* MAC_Binding.logical_port will always belong to a
>>>           * a router datapath. Hence we can skip logical switch
>>> @@ -4381,20 +4407,44 @@ run_buffered_binding(struct ovsdb_idl_index
>>> *sbrec_mac_binding_by_lport_ip,
>>>              if (strcmp(pb->type, "patch") && strcmp(pb->type,
>>> "l3gateway")) {
>>>                  continue;
>>>              }
>>> -            struct buffered_packets *cur_qp;
>>> -            HMAP_FOR_EACH_SAFE (cur_qp, hmap_node, &buffered_packets_map)
>>> {
>>> -                struct ds ip_s = DS_EMPTY_INITIALIZER;
>>> -                ipv6_format_mapped(&cur_qp->ip, &ip_s);
>>> -                const struct sbrec_mac_binding *b = mac_binding_lookup(
>>> -                        sbrec_mac_binding_by_lport_ip, pb->logical_port,
>>> -                        ds_cstr(&ip_s));
>>> -                if (b && ovs_scan(b->mac, ETH_ADDR_SCAN_FMT,
>>> -                                  ETH_ADDR_SCAN_ARGS(cur_qp->ea))) {
>>> -                    hmap_remove(&buffered_packets_map,
>>> &cur_qp->hmap_node);
>>> -                    ovs_list_push_back(&buffered_mac_bindings,
>>> &cur_qp->list);
>>> -                    notify = true;
>>> +
>>> +            struct buffered_packets *cur_qp
>>> +                = pinctrl_get_buffered_packets(ld->datapath->tunnel_key,
>>> +                                               pb->tunnel_key);
>>> +            if (!cur_qp) {
>>> +                /* If primary lookup fails, check chassisredirect port
>>> +                 * for distributed gw router port use-case. */
>>> +                char *redirect_name = xasprintf("cr-%s",
>>> pb->logical_port);
>>> +                const struct sbrec_port_binding *cr_pb
>>> +                    = lport_lookup_by_name(sbrec_port_binding_by_name,
>>> +                                           redirect_name);
>>> +                free(redirect_name);
>>> +                if (cr_pb) {
>>> +                    cur_qp = pinctrl_get_buffered_packets(
>>> +                            ld->datapath->tunnel_key, cr_pb->tunnel_key);
>>>                  }
>>> -                ds_destroy(&ip_s);
>>> +            }
>>> +
>>> +            if (!cur_qp) {
>>> +                continue;
>>> +            }
>>> +
>>> +            struct ds ip_s = DS_EMPTY_INITIALIZER;
>>> +            ipv6_format_mapped(&cur_qp->ip, &ip_s);
>>> +            const struct sbrec_mac_binding *b = mac_binding_lookup(
>>> +                    sbrec_mac_binding_by_lport_ip, pb->logical_port,
>>> +                    ds_cstr(&ip_s));
>>> +            if (b && ovs_scan(b->mac, ETH_ADDR_SCAN_FMT,
>>> +                              ETH_ADDR_SCAN_ARGS(cur_qp->ea))) {
>>> +                hmap_remove(&buffered_packets_map, &cur_qp->hmap_node);
>>> +                ovs_list_push_back(&buffered_mac_bindings, &cur_qp->list);
>>> +                notify = true;
>>> +            }
>>> +            ds_destroy(&ip_s);
>>> +
>>> +            if (!hmap_count(&buffered_packets_map)) {
>>> +                /* No more work to do. */
>>> +                break;
>>>              }
>>>          }
>>>          sbrec_port_binding_index_destroy_row(target);
>>> diff --git a/tests/system-ovn.at b/tests/system-ovn.at
>>> index 20c058415..a186a6e63 100644
>>> --- a/tests/system-ovn.at
>>> +++ b/tests/system-ovn.at
>>> @@ -8597,3 +8597,127 @@ OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port
>>> patch-.*/d
>>>  /connection dropped.*/d"])
>>>  AT_CLEANUP
>>>  ])
>>> +
>>> +OVN_FOR_EACH_NORTHD([
>>> +AT_SETUP([IP buffering])
>>> +AT_KEYWORDS([ip-buffering])
>>> +AT_SKIP_IF([test $HAVE_TCPDUMP = no])
>>> +
>>> +ovn_start
>>> +OVS_TRAFFIC_VSWITCHD_START()
>>> +
>>> +ADD_BR([br-int])
>>> +ADD_BR([br-ext])
>>> +
>>> +ovs-ofctl add-flow br-ext action=normal
>>> +# Set external-ids in br-int needed for ovn-controller
>>> +ovs-vsctl \
>>> +        -- set Open_vSwitch . external-ids:system-id=hv1 \
>>> +        -- set Open_vSwitch .
>>> external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
>>> +        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
>>> +        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
>>> +        -- set bridge br-int fail-mode=secure
>>> other-config:disable-in-band=true
>>> +
>>> +# Start ovn-controller
>>> +start_daemon ovn-controller
>>> +
>>> +ADD_NAMESPACES(sw01)
>>> +ADD_VETH(sw01, sw01, br-int, "192.168.1.2/24", "f0:00:00:01:02:03", \
>>> +         "192.168.1.1")
>>> +ADD_NAMESPACES(sw11)
>>> +ADD_VETH(sw11, sw11, br-int, "192.168.2.2/24", "f0:00:00:02:02:03", \
>>> +         "192.168.2.1")
>>> +ADD_NAMESPACES(remote)
>>> +ADD_VETH(remote, remote, br-ext, "172.16.1.2/24", "f0:00:00:01:02:05", \
>>> +         "172.16.1.1")
>>> +ADD_NAMESPACES(remote1)
>>> +ADD_VETH(remote1, remote1, br-ext, "172.16.1.4/24", "f0:00:00:01:02:06",
>>> \
>>> +         "172.16.1.1")
>>> +
>>> +NS_CHECK_EXEC([remote], [tcpdump -c 3 -nneei remote -Q in src 192.168.1.2
>>> and dst 172.16.1.2 and icmp > icmp.pcap &])
>>> +NS_CHECK_EXEC([remote], [tcpdump -c 1 -nneei remote -Q in arp and
>>> arp[[24:4]]==0xac100102 > arp.pcap &])
>>> +NS_CHECK_EXEC([remote1], [tcpdump -c 3 -nneei remote1 -Q in src
>>> 172.16.1.3 and dst 172.16.1.4 and icmp > icmp1.pcap 2>/dev/null &])
>>> +NS_CHECK_EXEC([remote1], [tcpdump -c 1 -nneei remote1 -Q in arp and
>>> arp[[24:4]]==0xac100104 > arp1.pcap 2>/dev/null &])

Please use NETNS_DAEMONIZE() instead.

>>> +
>>> +ovn-nbctl create Logical_Router name=R1 options:chassis=hv1

Please use "check ovn-nbctl .." everywhere applicable.

>>> +ovn-nbctl ls-add sw0
>>> +ovn-nbctl ls-add sw1
>>> +ovn-nbctl ls-add public
>>> +
>>> +ovn-nbctl lrp-add R1 rp-sw0 00:00:01:01:02:03 192.168.1.1/24
>>> +ovn-nbctl lrp-add R1 rp-sw1 00:00:03:01:02:03 192.168.2.1/24
>>> +ovn-nbctl lrp-add R1 rp-public 00:00:02:01:02:03 172.16.1.1/24
>>> +
>>> +ovn-nbctl lsp-add sw0 sw0-rp -- set Logical_Switch_Port sw0-rp \
>>> +    type=router options:router-port=rp-sw0 \
>>> +    -- lsp-set-addresses sw0-rp router
>>> +ovn-nbctl lsp-add sw1 sw1-rp -- set Logical_Switch_Port sw1-rp \
>>> +    type=router options:router-port=rp-sw1 \
>>> +    -- lsp-set-addresses sw1-rp router
>>> +
>>> +ovn-nbctl lsp-add public public-rp -- set Logical_Switch_Port public-rp \
>>> +    type=router options:router-port=rp-public \
>>> +    -- lsp-set-addresses public-rp router
>>> +
>>> +ovn-nbctl lsp-add sw0 sw01 \
>>> +    -- lsp-set-addresses sw01 "f0:00:00:01:02:03 192.168.1.2"
>>> +
>>> +ovn-nbctl lsp-add sw1 sw11 \
>>> +    -- lsp-set-addresses sw11 "f0:00:00:02:02:03 192.168.2.2"
>>> +
>>> +ovn-nbctl lr-nat-add R1 dnat_and_snat 172.16.1.3 192.168.2.2 sw11
>>> 00:00:02:02:03:10
>>> +
>>> +AT_CHECK([ovs-vsctl set Open_vSwitch .
>>> external-ids:ovn-bridge-mappings=phynet:br-ext])
>>> +ovn-nbctl lsp-add public public1 \
>>> +        -- lsp-set-addresses public1 unknown \
>>> +        -- lsp-set-type public1 localnet \
>>> +        -- lsp-set-options public1 network_name=phynet
>>> +
>>> +OVN_POPULATE_ARP
>>> +

We need to wait for the ports to be claimed and for OVS flows to be
installed.  We're missing:

wait_for_ports_up
check ovn-nbctl --wait=hv sync

Without this the ping below fails on my machine.

>>> +NS_CHECK_EXEC([sw01], [ping -q -c 3 -i 0.3 -w 2 172.16.1.2 |
>>> FORMAT_PING], \
>>> +[0], [dnl
>>> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
>>> +])
>>> +
>>> +OVS_WAIT_UNTIL([
>>> +        total_arp_pkts=$(cat arp.pcap | wc -l)
>>> +        test "${total_arp_pkts}" = "1"
>>> +])
>>> +
>>> +OVS_WAIT_UNTIL([
>>> +        total_icmp_pkts=$(cat icmp.pcap | wc -l)
>>> +        test "${total_icmp_pkts}" = "3"
>>> +])
>>> +
>>> +NS_CHECK_EXEC([sw11], [ping -q -c 3 -i 0.3 -w 2 172.16.1.4 |
>>> FORMAT_PING], \
>>> +[0], [dnl
>>> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
>>> +])
>>> +
>>> +OVS_WAIT_UNTIL([
>>> +        total_arp1_pkts=$(cat arp1.pcap | wc -l)
>>> +        test "${total_arp1_pkts}" = "1"
>>> +])
>>> +
>>> +OVS_WAIT_UNTIL([
>>> +        total_icmp1_pkts=$(cat icmp1.pcap | wc -l)
>>> +        test "${total_icmp1_pkts}" = "3"
>>> +])
>>> +
>>> +OVS_APP_EXIT_AND_WAIT([ovn-controller])
>>> +
>>> +as ovn-sb
>>> +OVS_APP_EXIT_AND_WAIT([ovsdb-server])
>>> +
>>> +as ovn-nb
>>> +OVS_APP_EXIT_AND_WAIT([ovsdb-server])
>>> +
>>> +as northd
>>> +OVS_APP_EXIT_AND_WAIT([NORTHD_TYPE])
>>> +
>>> +as
>>> +OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
>>> +/connection dropped.*/d"])
>>> +AT_CLEANUP
>>> +])
>>> --
>>> 2.37.3
>>>
>>>
>> Other than that it looks good, thanks.
>>
>> Acked-by: Ales Musil <amu...@redhat.com>
> 
> ack, let's see if I need to repost.
> 
> Regards,
> Lorenzo
> 


Thanks,
Dumitru

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to