On 17.07.2019 13:05, Ilya Maximets wrote:
> On 15.07.2019 19:36, Harry van Haaren wrote:
>> This commit refactors the generic implementation. The
>> goal of this refactor is to simplify the code to enable
>> "specialization" of the functions at compile time.
>>
>> Given compile-time knowledge of loop-trip counts, the
>> compiler is able to unroll loops and create optimized
>> code sequences.
>>
>> In order to enable these compiler optimizations, we must
>> refactor the code to pass the loop-trip counts to functions
>> as compile time constants.
>>
>> This patch allows the number of miniflow-bits set per "unit"
>> in the miniflow to be passed around as a function argument.
>>
>> Note that this patch does NOT yet take advantage of doing so;
>> this is only a refactor to enable it in the next patches.
>>
>> Signed-off-by: Harry van Haaren <[email protected]>
>> Tested-by: Malvika Gupta <[email protected]>
>>
>> ---
>>
>> v11:
>> - Rebased to previous changes
>> - Fix typo in commit message (Ian)
>> - Fix variable declaration spacing (Ian)
>> - Remove function names from comments (Ian)
>> - Replace magic 8 with sizeof(uint64_t) (Ian)
>> - Capitalize and end comments with a full stop. (Ian/Ilya)
>> - Add build time assert to validate FLOWMAP_UNITS (Ilya)
>> - Add note on ALWAYS_INLINE operation
>> - Add space after ULLONG_FOR_EACH_1 (Ilya)
>> - Use hash_add_words64() instead of rolling our own loop (Ilya)
>> Note that hash_words64_inline() calls hash_finish() with a
>> fixed value, so it was not the right hash function for this
>> usage. Used hash_add_words64() and a manual hash_finish() to
>> reuse as much of the hashing code as possible.
>>
>> v10:
>> - Rebase updates from previous patches
>> - Fix whitespace indentation of func params
>> - Removed restrict keyword, Windows CI failing when it is used (Ian)
>> - Fix integer 0 used to set NULL pointer (Ilya)
>> - Postpone free() call on cls->blocks_scratch (Ilya)
>> - Fix indentation of a function
>>
>> v9:
>> - Use count_1bits in favour of __builtin_popcount (Ilya)
>> - Use ALWAYS_INLINE instead of __attribute__ syntax (Ilya)
>>
>> v8:
>> - Rework block_cache and mf_masks to avoid variable-length array
>> due to compiler issues. Provisioning for the worst case is not a
>> good solution due to the magnitude of over-provisioning required.
>> - Rework netdev_flatten function removing unused parameter
>> ---
>> lib/dpif-netdev-lookup-generic.c | 214 +++++++++++++++++++++++++------
>> lib/dpif-netdev-private.h | 8 +-
>> lib/dpif-netdev.c | 77 ++++++++++-
>> 3 files changed, 258 insertions(+), 41 deletions(-)
>>
>> diff --git a/lib/dpif-netdev-lookup-generic.c b/lib/dpif-netdev-lookup-generic.c
>> index 833abf54f..abd166fc3 100644
>> --- a/lib/dpif-netdev-lookup-generic.c
>> +++ b/lib/dpif-netdev-lookup-generic.c
>> @@ -30,68 +30,210 @@
>> #include "packets.h"
>> #include "pvector.h"
>>
>> -/* Returns a hash value for the bits of 'key' where there are 1-bits in
>> - * 'mask'. */
>> -static inline uint32_t
>> -netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
>> - const struct netdev_flow_key *mask)
>> +VLOG_DEFINE_THIS_MODULE(dpif_lookup_generic);
>
> dpif_netdev_lookup_generic?
>
> BTW, this might not be needed if we remove the logging in patch #5.
>
>> +
>> +/* Lookup functions below depends on the internal structure of flowmap. */
>> +BUILD_ASSERT_DECL(FLOWMAP_UNITS == 2);
>> +
>> +/* Given a packet, table and mf_masks, this function iterates over each bit
>> + * set in the subtable, and calculates the appropriate metadata to store in
>> + * the blocks_scratch[].
>> + *
>> + * The results of the blocks_scratch[] can be used for hashing, and later for
>> + * verification of whether a rule matches the given packet.
>> + */
>> +static inline void
>> +netdev_flow_key_flatten_unit(const uint64_t *pkt_blocks,
>> + const uint64_t *tbl_blocks,
>> + const uint64_t *mf_masks,
>> + uint64_t *blocks_scratch,
>> + const uint64_t pkt_mf_bits,
>> + const uint32_t count)
>> {
>> - const uint64_t *p = miniflow_get_values(&mask->mf);
>> - uint32_t hash = 0;
>> - uint64_t value;
>> + uint32_t i;
>> +
>> + for (i = 0; i < count; i++) {
>> + uint64_t mf_mask = mf_masks[i];
>> + /* Calculate the block index for the packet metadata. */
>> + uint64_t idx_bits = mf_mask & pkt_mf_bits;
>> + const uint32_t pkt_idx = count_1bits(idx_bits);
>> +
>> + /* Check if the packet has the subtable miniflow bit set. If yes, the
>> + * block at the above pkt_idx will be stored, otherwise it is masked
>> + * out to be zero.
>> + */
>> + uint64_t pkt_has_mf_bit = (mf_mask + 1) & pkt_mf_bits;
>> + uint64_t no_bit = ((!pkt_has_mf_bit) > 0) - 1;
>>
>> - NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP (value, key, mask->mf.map) {
>> - hash = hash_add64(hash, value & *p);
>> - p++;
>> + /* Mask packet block by table block, and mask to zero if packet
>> + * doesn't actually contain this block of metadata.
>> + */
>> + blocks_scratch[i] = pkt_blocks[pkt_idx] & tbl_blocks[i] & no_bit;
>> }
>> +}
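
The no_bit computation above took me a second read, so spelling it out
for other reviewers: mf_masks[i] has all bits below the subtable's i-th
set bit, so mf_mask + 1 is exactly that bit, and no_bit expands its
presence in the packet miniflow into an all-ones or all-zero word.
A standalone sketch (my own illustration, not part of the patch):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t pkt_mf_bits = 0x5; /* Packet miniflow has bits 0 and 2 set. */
    uint64_t mf_mask = 0x3;     /* Subtable mask for bit 2: bits 0..1. */

    /* mf_mask + 1 == 0x4, i.e. the subtable's bit itself. */
    uint64_t has_bit = (mf_mask + 1) & pkt_mf_bits;
    /* !has_bit is 0 here, so (0 > 0) - 1 wraps to all-ones; if the
     * packet lacked the bit, (1 > 0) - 1 would give zero. */
    uint64_t no_bit = ((!has_bit) > 0) - 1;

    /* no_bit is UINT64_MAX when the packet has the bit, 0 otherwise,
     * so ANDing with it keeps or zeroes the block without a branch. */
    printf("no_bit = 0x%" PRIx64 "\n", no_bit); /* 0xffffffffffffffff */
    return 0;
}
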
>> +
>> +/* This function takes a packet and subtable and writes an array of uint64_t
>> + * blocks. The blocks contain the metadata that the subtable matches on, in
>> + * the same order as the subtable, allowing linear iteration over the blocks.
>> + *
>> + * To calculate the blocks contents, the netdev_flow_key_flatten_unit
>> + * function is called twice, once for each "unit" of the miniflow. This
>> + * call can be inlined by the compiler for performance.
>> + *
>> + * Note that the u0_count and u1_count variables can be compile-time
>> + * constants, allowing the loop in the inlined flatten_unit() function to be
>> + * unrolled at compile time, or removed entirely given the known loop-trip
>> + * counts. The compile-time optimizations enabled by this design improve
>> + * performance.
>> + */
>> +static inline void
>> +netdev_flow_key_flatten(const struct netdev_flow_key *key,
>> + const struct netdev_flow_key *mask,
>> + const uint64_t *mf_masks,
>> + uint64_t *blocks_scratch,
>> + const uint32_t u0_count,
>> + const uint32_t u1_count)
>> +{
>> + /* Load mask from subtable, mask with packet mf, popcount to get idx. */
>> + const uint64_t *pkt_blocks = miniflow_get_values(&key->mf);
>> + const uint64_t *tbl_blocks = miniflow_get_values(&mask->mf);
>>
>> - return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
>> + /* Packet miniflow bits to be masked by pre-calculated mf_masks. */
>> + const uint64_t pkt_bits_u0 = key->mf.map.bits[0];
>> + const uint32_t pkt_bits_u0_pop = count_1bits(pkt_bits_u0);
>> + const uint64_t pkt_bits_u1 = key->mf.map.bits[1];
>> +
>> + /* Unit 0 flattening */
>> + netdev_flow_key_flatten_unit(&pkt_blocks[0],
>> + &tbl_blocks[0],
>> + &mf_masks[0],
>> + &blocks_scratch[0],
>> + pkt_bits_u0,
>> + u0_count);
>> +
>> + /* Unit 1 flattening:
>> + * Move the pointers forward in the arrays based on u0 offsets, NOTE:
>> + * 1) pkt blocks indexed by actual popcount of u0, which is NOT always
>> + * the same as the amount of bits set in the subtable.
>> + * 2) mf_masks, tbl_block and blocks_scratch are all "flat" arrays, so
>> + * the index is always u0_count.
>> + */
>> + netdev_flow_key_flatten_unit(&pkt_blocks[pkt_bits_u0_pop],
>> + &tbl_blocks[u0_count],
>> + &mf_masks[u0_count],
>> + &blocks_scratch[u0_count],
>> + pkt_bits_u1,
>> + u1_count);
>> }
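
To make note 1) above concrete: if the packet miniflow has, say, 6 blocks
in unit 0 but the subtable only matches on 2 of them, the unit-1 call reads
packet blocks starting at pkt_blocks[6] (the packet's actual unit-0
popcount), while tbl_blocks, mf_masks and blocks_scratch all continue at
index 2 (the subtable's u0_count). Mixing up the two offsets would silently
read the wrong blocks.
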
>>
>> -uint32_t
>> -dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
>> - uint32_t keys_map,
>> - const struct netdev_flow_key *keys[],
>> - struct dpcls_rule **rules)
>> +/* Compares a rule and the blocks representing a key, returns 1 on a match. */
>> +static inline uint64_t
>> +netdev_rule_matches_key(const struct dpcls_rule *rule,
>> + const uint32_t mf_bits_total,
>> + const uint64_t *blocks_scratch)
>> {
>> - int i;
>> - uint32_t found_map;
>> + const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
>> + const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
>> + uint64_t not_match = 0;
>> +
>> + for (int i = 0; i < mf_bits_total; i++) {
>> + not_match |= (blocks_scratch[i] & maskp[i]) != keyp[i];
>> + }
>>
>> - /* Compute hashes for the remaining keys. Each search-key is
>> - * masked with the subtable's mask to avoid hashing the wildcarded
>> - * bits. */
>> + /* invert result to show match as 1 */
>
> Missed comment (style).
>
>> + return !not_match;
>> +}
>> +
>> +/* Const prop version of the function: note that mf bits total and u0 are
>> + * explicitly passed in here, while they're also available at runtime from the
>> + * subtable pointer. By making them compile time, we enable the compiler to
>> + * unroll loops and flatten out code-sequences based on the knowledge of the
>> + * mf_bits_* compile time values. This results in improved performance.
>> + *
>> + * Note: this function is marked with ALWAYS_INLINE to ensure the compiler
>> + * inlines the below code, and then uses the compile time constants to make
>> + * specialized versions of the runtime code. Without ALWAYS_INLINE, the
>> + * compiler might decide to not inline, and performance will suffer.
>> + */
>> +static inline uint32_t ALWAYS_INLINE
>> +lookup_generic_impl(struct dpcls_subtable *subtable,
>> + uint64_t *blocks_scratch,
>> + uint32_t keys_map,
>> + const struct netdev_flow_key *keys[],
>> + struct dpcls_rule **rules,
>> + const uint32_t bit_count_u0,
>> + const uint32_t bit_count_u1)
>> +{
>> + const uint32_t n_pkts = count_1bits(keys_map);
>> + ovs_assert(NETDEV_MAX_BURST >= n_pkts);
>> uint32_t hashes[NETDEV_MAX_BURST];
>> +
>> + const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
>> + uint64_t *mf_masks = subtable->mf_masks;
>> + int i;
>> +
>> + /* Flatten the packet metadata into the blocks_scratch[] using subtable. */
>> + ULLONG_FOR_EACH_1 (i, keys_map) {
>> + netdev_flow_key_flatten(keys[i],
>> + &subtable->mask,
>> + mf_masks,
>> + &blocks_scratch[i * bit_count_total],
>> + bit_count_u0,
>> + bit_count_u1);
>> + }
>> +
>> + /* Hash the now linearized blocks of packet metadata. */
>> ULLONG_FOR_EACH_1 (i, keys_map) {
>> - hashes[i] = netdev_flow_key_hash_in_mask(keys[i], &subtable->mask);
>> + uint64_t *block_ptr = &blocks_scratch[i * bit_count_total];
>> + uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
>> + hashes[i] = hash_finish(hash, bit_count_total * 8);
You mentioned before that 'hash_words64_inline()' uses a different finish
value, but I don't think preserving the exact hash value matters here.
Even 'hash_words64()' should be fine, because it checks whether the
argument is a compile-time constant and calls 'hash_words64_inline()'
in that case.
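i.e., assuming hash_words64() keeps its usual (p, n_words, basis)
signature, the two lines above could collapse to something like:

    hashes[i] = hash_words64(&blocks_scratch[i * bit_count_total],
                             bit_count_total, 0);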
>> }
>>
>> - /* Lookup. */
>> + /* Lookup: this returns a bitmask of packets where the hash table had
>> + * an entry for the given hash key. Presence of a hash key does not
>> + * guarantee matching the key, as there can be hash collisions.
>> + */
>> + uint32_t found_map;
>> const struct cmap_node *nodes[NETDEV_MAX_BURST];
>> +
>> found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
>>
>> - /* Check results. When the i-th bit of found_map is set, it means
>> - * that a set of nodes with a matching hash value was found for the
>> - * i-th search-key. Due to possible hash collisions we need to check
>> - * which of the found rules, if any, really matches our masked
>> - * search-key. */
>> + /* Verify that packet actually matched rule. If not found, a hash
>> + * collision has taken place, so continue searching with the next node.
>> + */
>> ULLONG_FOR_EACH_1 (i, found_map) {
>> struct dpcls_rule *rule;
>>
>> CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
>> - if (OVS_LIKELY(dpcls_rule_matches_key(rule, keys[i]))) {
>> + const uint32_t cidx = i * bit_count_total;
>> + uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
>> + &blocks_scratch[cidx]);
>> +
>> + if (OVS_LIKELY(match)) {
>> rules[i] = rule;
>> - /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
>> - * within one second optimization interval. */
>> subtable->hit_cnt++;
>> goto next;
>> }
>> }
>> - /* None of the found rules was a match. Reset the i-th bit to
>> - * keep searching this key in the next subtable. */
>> - ULLONG_SET0(found_map, i); /* Did not match. */
>> +
>> + /* None of the found rules was a match. Clear the i-th bit to
>> + * search for this key in the next subtable. */
>> + ULLONG_SET0(found_map, i);
>> next:
>> - ; /* Keep Sparse happy. */
>> + ; /* Keep Sparse happy. */
>
> You're changing the indentation of this comment back and forth a few times
> around the patch series.
>
>> }
>>
>> return found_map;
>> }
>> +
>> +/* Generic lookup function that uses runtime provided mf bits for iterating. */
>> +uint32_t
>> +dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
>> + uint64_t *blocks_scratch,
>> + uint32_t keys_map,
>> + const struct netdev_flow_key *keys[],
>> + struct dpcls_rule **rules)
>> +{
>> + return lookup_generic_impl(subtable, blocks_scratch, keys_map, keys, rules,
>> + subtable->mf_bits_set_unit0,
>> + subtable->mf_bits_set_unit1);
>> +}
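
To show where this refactor is heading (the specialized variants land in a
later patch of this series; the function name below is hypothetical), a
specialization is just a thin wrapper that passes compile-time constants,
letting the compiler fully unroll the flatten_unit() loops:

/* Sketch of a specialized lookup: subtables with 5 bits set in miniflow
 * unit 0 and 1 bit in unit 1. Not part of this patch. */
static uint32_t
dpcls_subtable_lookup_mf_u0w5_u1w1(struct dpcls_subtable *subtable,
                                   uint64_t *blocks_scratch,
                                   uint32_t keys_map,
                                   const struct netdev_flow_key *keys[],
                                   struct dpcls_rule **rules)
{
    return lookup_generic_impl(subtable, blocks_scratch, keys_map, keys,
                               rules, 5, 1);
}
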
>> diff --git a/lib/dpif-netdev-private.h b/lib/dpif-netdev-private.h
>> index b235e23c8..e1ceaa641 100644
>> --- a/lib/dpif-netdev-private.h
>> +++ b/lib/dpif-netdev-private.h
>> @@ -56,13 +56,15 @@ struct dpcls_rule {
>> */
>> typedef
>> uint32_t (*dpcls_subtable_lookup_func)(struct dpcls_subtable *subtable,
>> + uint64_t *blocks_scratch,
>> uint32_t keys_map,
>> const struct netdev_flow_key *keys[],
>> struct dpcls_rule **rules);
>>
>> -/* Prototype for generic lookup func, using same code path as before. */
>> +/* Prototype for generic lookup func, using generic scalar code path. */
>> uint32_t
>> dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
>> + uint64_t *blocks_scratch,
>> uint32_t keys_map,
>> const struct netdev_flow_key *keys[],
>> struct dpcls_rule **rules);
>> @@ -84,13 +86,13 @@ struct dpcls_subtable {
>> uint8_t mf_bits_set_unit0;
>> uint8_t mf_bits_set_unit1;
>>
>> - /* the lookup function to use for this subtable. If there is a known
>> + /* The lookup function to use for this subtable. If there is a known
>> * property of the subtable (eg: only 3 bits of miniflow metadata is
>> * used for the lookup) then this can point at an optimized version of
>> * the lookup function for this particular subtable. */
>> dpcls_subtable_lookup_func lookup_func;
>>
>> - /* caches the masks to match a packet to, reducing runtime calculations */
>> + /* Caches the masks to match a packet to, reducing runtime calculations. */
>
> Above two changes should be part of the previous patches.
>
> BTW, introduction of the 'mf_masks' and 'mf_bits*' should happen in this
> patch,
> not in the "dpif-netdev: Move dpcls lookup structures to .h".
>
>> uint64_t *mf_masks;
>>
>> struct netdev_flow_key mask; /* Wildcards for fields (const). */
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index b42ca35e3..8acc1445a 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -233,6 +233,15 @@ struct dpcls {
>> odp_port_t in_port;
>> struct cmap subtables_map;
>> struct pvector subtables;
>> +
>> + /* Region of memory for this DPCLS instance to use as scratch.
>> + * Size is garaunteed to be large enough to hold all blocks required for
>
> s/garaunteed/guaranteed/
>
>> + * the subtables to match on. This allows each dpcls lookup to flatten
>> + * the packet miniflows into this blocks_scratch area, without using
>> + * variable lenght arrays. This region is allocated on subtable create, and
>
> s/lenght/length/
>
>> + * will be resized as required if a larger subtable is added. */
>> + uint64_t *blocks_scratch;
>> + uint32_t blocks_scratch_size;
>> };
>>
>> /* Data structure to keep packet order till fastpath processing. */
>> @@ -7649,6 +7658,7 @@ static void
>> dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
>> {
>> cmap_destroy(&subtable->rules);
>> + ovsrcu_postpone(free, subtable->mf_masks);
>> ovsrcu_postpone(free, subtable);
>> }
>>
>> @@ -7659,6 +7669,8 @@ dpcls_init(struct dpcls *cls)
>> {
>> cmap_init(&cls->subtables_map);
>> pvector_init(&cls->subtables);
>> + cls->blocks_scratch = NULL;
>> + cls->blocks_scratch_size = 0;
>> }
>>
>> static void
>> @@ -7686,6 +7698,7 @@ dpcls_destroy(struct dpcls *cls)
>> }
>> cmap_destroy(&cls->subtables_map);
>> pvector_destroy(&cls->subtables);
>> + ovsrcu_postpone(free, cls->blocks_scratch);
>> }
>> }
>>
>> @@ -7701,7 +7714,28 @@ dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
>> subtable->hit_cnt = 0;
>> netdev_flow_key_clone(&subtable->mask, mask);
>>
>> - /* Decide which hash/lookup/verify function to use. */
>> + /* The count of bits in the mask defines the space required for masks.
>> + * Then call gen_masks() to create the appropriate masks, avoiding the cost
>> + * of doing runtime calculations. */
>> + uint32_t unit0 = count_1bits(mask->mf.map.bits[0]);
>> + uint32_t unit1 = count_1bits(mask->mf.map.bits[1]);
>> + subtable->mf_bits_set_unit0 = unit0;
>> + subtable->mf_bits_set_unit1 = unit1;
>> +
>> + subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
>> + netdev_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
>> +
>> + /* Allocate blocks scratch space only if subtable requires more size than
>> + * is currently allocated. */
>> + const uint32_t blocks_required_per_pkt = unit0 + unit1;
>> + if (cls->blocks_scratch_size < blocks_required_per_pkt) {
>> + free(cls->blocks_scratch);
>> + cls->blocks_scratch = xmalloc(sizeof(uint64_t) * NETDEV_MAX_BURST *
>> + blocks_required_per_pkt);
>> + cls->blocks_scratch_size = blocks_required_per_pkt;
>> + }
>> +
>> + /* Assign the generic lookup - this works with any miniflow fingerprint. */
>> subtable->lookup_func = dpcls_subtable_lookup_generic;
>>
>> cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
>> @@ -7846,6 +7880,43 @@ dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
>> }
>> }
>>
>> +/* Inner loop for mask generation of a unit, see netdev_flow_key_gen_masks. */
>> +static inline void
>> +netdev_flow_key_gen_mask_unit(uint64_t iter,
>> + const uint64_t count,
>> + uint64_t *mf_masks)
>> +{
>> + int i;
>> + for (i = 0; i < count; i++) {
>> + uint64_t lowest_bit = (iter & -iter);
>> + iter &= ~lowest_bit;
>> + mf_masks[i] = (lowest_bit - 1);
>> + }
>> + /* Checks that count has covered all bits in the iter bitmap. */
>> + ovs_assert(iter == 0);
>> +}
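
For reference, a quick standalone check of what this produces (my own
sketch, not part of the patch): for a table map with bits 2 and 4 set, the
loop isolates each lowest set bit in turn and stores the mask of all bits
below it.

#include <assert.h>
#include <stdint.h>

int
main(void)
{
    uint64_t iter = 0x14; /* Table map: bits 2 and 4 set. */
    uint64_t mf_masks[2];

    for (int i = 0; i < 2; i++) {
        uint64_t lowest_bit = iter & -iter; /* Isolate lowest set bit. */
        iter &= ~lowest_bit;
        mf_masks[i] = lowest_bit - 1;       /* All bits below it. */
    }
    assert(mf_masks[0] == 0x3); /* Below bit 2. */
    assert(mf_masks[1] == 0xf); /* Below bit 4. */
    return 0;
}

ANDing mf_masks[i] with a packet's miniflow map and popcounting the result
is exactly the pkt_idx computation in netdev_flow_key_flatten_unit().
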
>> +
>> +/* Generate a mask for each block in the miniflow, based on the bits set. This
>> + * allows easily masking packets with the generated array here, without
>> + * calculations. This replaces runtime-calculating the masks.
>> + * @param key The table to generate the mf_masks for
>> + * @param mf_masks Pointer to a u64 array of at least *mf_bits* in size
>> + * @param mf_bits_total Number of bits set in the whole miniflow (both units)
>> + * @param mf_bits_unit0 Number of bits set in unit0 of the miniflow
>> + */
>> +void
>> +netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
>> + uint64_t *mf_masks,
>> + const uint32_t mf_bits_u0,
>> + const uint32_t mf_bits_u1)
>> +{
>> + uint64_t iter_u0 = tbl->mf.map.bits[0];
>> + uint64_t iter_u1 = tbl->mf.map.bits[1];
>> +
>> + netdev_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
>> + netdev_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
>> +}
>> +
>> /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
>> * in 'mask' the values in 'key' and 'target' are the same. */
>> bool
>> @@ -7886,6 +7957,7 @@ dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
>> BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
>>
>> struct dpcls_subtable *subtable;
>> + uint64_t *blocks_scratch = cls->blocks_scratch;
>>
>> uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
>>
>> @@ -7906,7 +7978,8 @@ dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
>> * non-overlapping. */
>> PVECTOR_FOR_EACH (subtable, &cls->subtables) {
>> /* Call the subtable specific lookup function. */
>> - found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
>> + found_map = subtable->lookup_func(subtable, blocks_scratch, keys_map,
>> + keys, rules);
>>
>> /* Count the number of subtables searched for this packet match. This
>> * estimates the "spread" of subtables looked at per matched packet. */
>>