In real-world vSwitch deployments, handling a few thousand flows,
EMC is quickly saturated, so its optimal usage is critical to
reach the highest packet forwarding speed of the vSwitch.

EMC lookup is initiated based on the hash value of the packet.
In case the packet does not already have a stored hash value
during processing, the miniflow_hash_5tuple() function is invoked
in the datapath. While packets entering the vSwitch from an
external interface usually have valid hashes (pre-computed by NICs
supporting RSS), the ones coming from vhostuser ports (internal
packets from VMs) do not.

Non-IP traffic received from the VMs experiences very poor EMC hit
rates, and hence poor forwarding performance, because
miniflow_hash_5tuple() returns the same hash value for all such packets.
These packets therefore hit the same EMC entries and cause collisions if
there are more than two distinct megaflows with traffic in the PMD.

The purpose of the patch is to compute proper hashes with sufficient
entropy for EMC lookup also for non-IP traffic to avoid constant EMC
thrashing. The hash calculation has been extended to handle unrecognized
ethernet types and MPLS, using the header fields that are valid for a
specific protocol.

Forwarding of non-IP packets in NFVI scenarios is very likely to happen
based on MAC addresses and/or VLAN tags.
By implementing a special case for hashing MPLS labels, this change
prepares the ground for a separate commit that will enable hash
recalculation for MPLS packets received from L3 GRE tunnels. Today we
skip re-computation of the hash and the original GRE hash is only
updated with the increased recirc_depth.

Signed-off-by: Gabor Halasz <gabor.hal...@ericsson.com>
---
 lib/flow.c | 195 ++++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 143 insertions(+), 52 deletions(-)

diff --git a/lib/flow.c b/lib/flow.c
index 09b66b8..f47e9a4 100644
--- a/lib/flow.c
+++ b/lib/flow.c
@@ -1967,7 +1967,8 @@ flow_wildcards_set_xxreg_mask(struct flow_wildcards *wc, int idx,
     flow_set_xxreg(&wc->masks, idx, mask);
 }
 
-/* Calculates the 5-tuple hash from the given miniflow.
+/* Calculates the 5-tuple (for valid IP packets) or other (L2/MPLS)
+ * hash from the given miniflow.
  * This returns the same value as flow_hash_5tuple for the corresponding
  * flow. */
 uint32_t
@@ -1975,85 +1976,175 @@ miniflow_hash_5tuple(const struct miniflow *flow, uint32_t basis)
 {
     BUILD_ASSERT_DECL(FLOW_WC_SEQ == 40);
     uint32_t hash = basis;
+    ovs_be16 dl_type;
+    uint8_t nw_proto;
 
-    if (flow) {
-        ovs_be16 dl_type = MINIFLOW_GET_BE16(flow, dl_type);
-        uint8_t nw_proto;
+    if (!flow) {
+        return hash_finish(hash, 42); /* Arbitrary number. */
+    }
 
-        if (dl_type == htons(ETH_TYPE_IPV6)) {
-            struct flowmap map = FLOWMAP_EMPTY_INITIALIZER;
-            uint64_t value;
+    dl_type = MINIFLOW_GET_BE16(flow, dl_type);
 
-            FLOWMAP_SET(&map, ipv6_src);
-            FLOWMAP_SET(&map, ipv6_dst);
+    if (dl_type == htons(ETH_TYPE_IPV6)) {
+        uint64_t value;
 
-            MINIFLOW_FOR_EACH_IN_FLOWMAP(value, flow, map) {
-                hash = hash_add64(hash, value);
-            }
-        } else if (dl_type == htons(ETH_TYPE_IP)
-                   || dl_type == htons(ETH_TYPE_ARP)) {
-            hash = hash_add(hash, MINIFLOW_GET_U32(flow, nw_src));
-            hash = hash_add(hash, MINIFLOW_GET_U32(flow, nw_dst));
-        } else {
-            goto out;
-        }
+        struct flowmap map = FLOWMAP_EMPTY_INITIALIZER;
+        FLOWMAP_SET(&map, ipv6_src);
+        FLOWMAP_SET(&map, ipv6_dst);
 
         nw_proto = MINIFLOW_GET_U8(flow, nw_proto);
         hash = hash_add(hash, nw_proto);
-        if (nw_proto != IPPROTO_TCP && nw_proto != IPPROTO_UDP
-            && nw_proto != IPPROTO_SCTP && nw_proto != IPPROTO_ICMP
-            && nw_proto != IPPROTO_ICMPV6) {
-            goto out;
+
+        MINIFLOW_FOR_EACH_IN_FLOWMAP(value, flow, map) {
+            hash = hash_add64(hash, value);
+        }
+        switch (nw_proto) {
+        case IPPROTO_TCP:
+        case IPPROTO_UDP:
+        case IPPROTO_SCTP:
+        case IPPROTO_ICMP:
+        case IPPROTO_IGMP:
+        case IPPROTO_ICMPV6:
+            /* Ports are valid, add both ports at once. */
+            hash = hash_add(hash,
+                   (OVS_FORCE uint32_t) miniflow_get_ports(flow));
+            break;
+        default:
+            break;
+        }
+    } else if (dl_type == htons(ETH_TYPE_IP)) {
+        nw_proto = MINIFLOW_GET_U8(flow, nw_proto);
+        hash = hash_add(hash, nw_proto);
+        hash = hash_add(hash, MINIFLOW_GET_U32(flow, nw_src));
+        hash = hash_add(hash, MINIFLOW_GET_U32(flow, nw_dst));
+        switch (nw_proto) {
+        case IPPROTO_TCP:
+        case IPPROTO_UDP:
+        case IPPROTO_SCTP:
+        case IPPROTO_ICMP:
+        case IPPROTO_IGMP:
+            /* Ports are valid, add both ports at once. */
+            hash = hash_add(hash,
+                   (OVS_FORCE uint32_t) miniflow_get_ports(flow));
+            break;
+        default:
+            break;
+        }
+    } else if (eth_type_mpls(dl_type)) {
+        /* Hash MPLS labels. */
+        BUILD_ASSERT_DECL(FLOW_MAX_MPLS_LABELS <= 4);
+        ovs_u128 mh = MINIFLOW_GET_U128(flow, mpls_lse);
+        int i;
+        for (i = 0; i < FLOW_MAX_MPLS_LABELS; i++) {
+            hash = hash_add(hash, mh.u32[i]);
+            if (((struct mpls_hdr *) &mh.u32[i])->mpls_lse.lo
+                & htons(1 << MPLS_BOS_SHIFT)) {
+                break;
+            }
         }
+    } else {
+        /* Other ethertype, hash L2 instead. */
+        /* Hash dst and src MAC (96 bits).*/
+        ovs_u128 macs = MINIFLOW_GET_U128(flow, dl_dst);
+        hash = hash_add64(hash, macs.u64.lo);
+        hash = hash_add(hash, macs.u32[2]);
 
-        /* Add both ports at once. */
-        hash = hash_add(hash, (OVS_FORCE uint32_t) miniflow_get_ports(flow));
+        /* Hash VLAN vids.*/
+        for (int i = 0; i < FLOW_MAX_VLAN_HEADERS / 2; i++) {
+            uint32_t vlan_vids = 0;
+            vlan_vids |= ((uint32_t) miniflow_get_vid(flow, 2 * i)) << 16;
+            vlan_vids |= (uint32_t) miniflow_get_vid(flow, 2 * i + 1);
+            hash = hash_add(hash, vlan_vids);
+        }
     }
-out:
-    return hash_finish(hash, 42);
+
+    return hash_finish(hash, 42); /* Arbitrary number. */
 }
 
 ASSERT_SEQUENTIAL_SAME_WORD(tp_src, tp_dst);
 ASSERT_SEQUENTIAL(ipv6_src, ipv6_dst);
 
-/* Calculates the 5-tuple hash from the given flow. */
+/* Calculates the 5-tuple (for valid IP packets) or other (L2/MPLS)
+ * hash from the given flow. */
 uint32_t
 flow_hash_5tuple(const struct flow *flow, uint32_t basis)
 {
     BUILD_ASSERT_DECL(FLOW_WC_SEQ == 40);
     uint32_t hash = basis;
 
-    if (flow) {
+    if (!flow) {
+        return hash_finish(hash, 42); /* Arbitrary number. */
+    }
 
-        if (flow->dl_type == htons(ETH_TYPE_IPV6)) {
-            const uint64_t *flow_u64 = (const uint64_t *)flow;
-            int ofs = offsetof(struct flow, ipv6_src) / 8;
-            int end = ofs + 2 * sizeof flow->ipv6_src / 8;
+    if (flow->dl_type == htons(ETH_TYPE_IPV6)) {
+        const uint64_t *flow_u64 = (const uint64_t *)flow;
+        int ofs = offsetof(struct flow, ipv6_src) / sizeof(uint64_t);
+        int end = ofs + 2 * sizeof flow->ipv6_src / sizeof(uint64_t);
 
-            for (;ofs < end; ofs++) {
-                hash = hash_add64(hash, flow_u64[ofs]);
-            }
-        } else if (flow->dl_type == htons(ETH_TYPE_IP)
-                   || flow->dl_type == htons(ETH_TYPE_ARP)) {
-            hash = hash_add(hash, (OVS_FORCE uint32_t) flow->nw_src);
-            hash = hash_add(hash, (OVS_FORCE uint32_t) flow->nw_dst);
-        } else {
-            goto out;
-        }
+        hash = hash_add(hash, flow->nw_proto);
 
+        for (;ofs < end; ofs++) {
+            hash = hash_add64(hash, flow_u64[ofs]);
+        }
+
+        switch (flow->nw_proto) {
+        case IPPROTO_TCP:
+        case IPPROTO_UDP:
+        case IPPROTO_SCTP:
+        case IPPROTO_ICMP:
+        case IPPROTO_IGMP:
+        case IPPROTO_ICMPV6:
+            /* Ports are valid, add both ports at once. */
+            hash = hash_add(hash,
+                        ((const uint32_t *) flow)
+                        [offsetof(struct flow, tp_src) / sizeof(uint32_t)]);
+            break;
+        default:
+            break;
+        }
+    } else if (flow->dl_type == htons(ETH_TYPE_IP)) {
         hash = hash_add(hash, flow->nw_proto);
-        if (flow->nw_proto != IPPROTO_TCP && flow->nw_proto != IPPROTO_UDP
-            && flow->nw_proto != IPPROTO_SCTP && flow->nw_proto != IPPROTO_ICMP
-            && flow->nw_proto != IPPROTO_ICMPV6) {
-            goto out;
+        hash = hash_add(hash, (OVS_FORCE uint32_t) flow->nw_src);
+        hash = hash_add(hash, (OVS_FORCE uint32_t) flow->nw_dst);
+        switch (flow->nw_proto) {
+        case IPPROTO_TCP:
+        case IPPROTO_UDP:
+        case IPPROTO_SCTP:
+        case IPPROTO_ICMP:
+        case IPPROTO_IGMP:
+            /* Ports are valid, add both ports at once. */
+            hash = hash_add(hash,
+                        ((const uint32_t *) flow)
+                        [offsetof(struct flow, tp_src) / sizeof(uint32_t)]);
+            break;
+        default:
+            break;
+        }
+    } else if (eth_type_mpls(flow->dl_type)) {
+        /* Hash MPLS labels. */
+        int i;
+        int n_labels = flow_count_mpls_labels(flow, NULL);
+        for (i = 0; i < n_labels; i++) {
+            hash = hash_add(hash, (OVS_FORCE uint32_t) flow->mpls_lse[i]);
         }
+    } else {
+        /* Other ethertype, hash L2 instead. */
+        /* Hash dst and src MAC (96 bits): first 64 bits, then last 32. */
+        hash = hash_add64(hash, *((uint64_t *) &flow->dl_dst));
+        hash = hash_add(hash, *(const uint32_t *)
+                        ((const uint8_t *) &flow->dl_dst + sizeof(uint64_t)));
 
-        /* Add both ports at once. */
-        hash = hash_add(hash,
-                        ((const uint32_t *)flow)[offsetof(struct flow, tp_src)
-                                                 / sizeof(uint32_t)]);
+        /* Hash VLAN vids, two 16-bit vids packed per 32-bit word. */
+        for (int i = 0; i < FLOW_MAX_VLAN_HEADERS / 2; i++) {
+            uint32_t vlan_vids = 0;
+            vlan_vids |= ((uint32_t) vlan_tci_to_vid(
+                                     flow->vlans[2 * i].tci)) << 16;
+            vlan_vids |= (uint32_t) vlan_tci_to_vid(
+                                    flow->vlans[2 * i + 1].tci);
+            hash = hash_add(hash, vlan_vids);
+        }
     }
-out:
+
     return hash_finish(hash, 42); /* Arbitrary number. */
 }
 
-- 
1.9.1

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to