Re: [ovs-dev] [PATCH v3 01/12] dpcls: Use 32 packet batches for lookups.
Thanks Jarno for spotting this, will do the suggested change. Antonio > -Original Message- > From: dev [mailto:[email protected]] On Behalf Of Jarno > Rajahalme > Sent: Tuesday, October 18, 2016 6:26 PM > To: Bodireddy, Bhanuprakash > Cc: [email protected] > Subject: Re: [ovs-dev] [PATCH v3 01/12] dpcls: Use 32 packet batches for > lookups. > > See one comment below, > > Jarno > > > On Oct 14, 2016, at 7:37 AM, Bhanuprakash Bodireddy > wrote: > > > > This patch increases the number of packets processed in a batch during a > > lookup from 16 to 32. Processing batches of 32 packets improves > > performance and also one of the internal loops can be avoided here. > > > > Signed-off-by: Antonio Fischetti > > Co-authored-by: Bhanuprakash Bodireddy > > > Signed-off-by: Bhanuprakash Bodireddy > > --- > > lib/dpif-netdev.c | 110 ++-- > -- > > 1 file changed, 45 insertions(+), 65 deletions(-) > > > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > > index eb9f764..0a4f338 100644 > > --- a/lib/dpif-netdev.c > > +++ b/lib/dpif-netdev.c > > @@ -4985,23 +4985,21 @@ dpcls_lookup(struct dpcls *cls, const struct > netdev_flow_key keys[], > > int *num_lookups_p) > > { > > /* The received 'cnt' miniflows are the search-keys that will be > processed > > - * in batches of 16 elements. N_MAPS will contain the number of > these > > - * 16-elements batches. i.e. for 'cnt' = 32, N_MAPS will be 2. > The batch > > - * size 16 was experimentally found faster than 8 or 32. */ > > -typedef uint16_t map_type; > > + * to find a matching entry into the available subtables. > > + * The number of bits in map_type is equal to NETDEV_MAX_BURST. */ > > +typedef uint32_t map_type; > > #define MAP_BITS (sizeof(map_type) * CHAR_BIT) > > +BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST); > > > > -#if !defined(__CHECKER__) && !defined(_WIN32) > > -const int N_MAPS = DIV_ROUND_UP(cnt, MAP_BITS); > > -#else > > -enum { N_MAPS = DIV_ROUND_UP(NETDEV_MAX_BURST, MAP_BITS) }; > > -#endif > > -map_type maps[N_MAPS]; > > struct dpcls_subtable *subtable; > > > > -memset(maps, 0xff, sizeof maps); > > -if (cnt % MAP_BITS) { > > -maps[N_MAPS - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra > bits. */ > > +map_type keys_map = TYPE_MAXIMUM(map_type); > > +map_type found_map; > > +uint32_t hashes[MAP_BITS]; > > +const struct cmap_node *nodes[MAP_BITS]; > > + > > +if (cnt != NETDEV_MAX_BURST) { > > +keys_map >>= NETDEV_MAX_BURST - cnt; /* Clear extra bits. */ > > ‘keys_maps’ has MAP_BITS bits set, and after this is should only have the > ‘cnt’ LSBs set. The assert above allows NETDEV_MAX_BURST to be less than > MAP_BITS, hence MAP_BITS must be used here instead. > > Otherwise looks good to me, so with this change: > > Acked-by: Jarno Rajahalme > > > > } > > memset(rules, 0, cnt * sizeof *rules); > > > > @@ -5015,61 +5013,43 @@ dpcls_lookup(struct dpcls *cls, const struct > netdev_flow_key keys[], > > * search-key, the search for that key can stop because the rules > are > > * non-overlapping. */ > > PVECTOR_FOR_EACH (subtable, &cls->subtables) { > > -const struct netdev_flow_key *mkeys = keys; > > -struct dpcls_rule **mrules = rules; > > -map_type remains = 0; > > -int m; > > - > > -BUILD_ASSERT_DECL(sizeof remains == sizeof *maps); > > - > > -/* Loops on each batch of 16 search-keys. */ > > -for (m = 0; m < N_MAPS; m++, mkeys += MAP_BITS, mrules += > MAP_BITS) { > > -uint32_t hashes[MAP_BITS]; > > -const struct cmap_node *nodes[MAP_BITS]; > > -unsigned long map = maps[m]; > > -int i; > > - > > -if (!map) { > > -continue; /* Skip empty maps. */ > > -} > > - > > -/* Compute hashes for the remaining keys. Each search-key > is > > - * masked with the subtable's mask to avoid hashing the > wildcarded > > - * bits. */ > > -ULLONG_FOR_EACH_1(i, map) { > > -hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i], > > - &subtable- > >mask); >
Re: [ovs-dev] [PATCH v3 01/12] dpcls: Use 32 packet batches for lookups.
See one comment below,
Jarno
> On Oct 14, 2016, at 7:37 AM, Bhanuprakash Bodireddy
> wrote:
>
> This patch increases the number of packets processed in a batch during a
> lookup from 16 to 32. Processing batches of 32 packets improves
> performance and also one of the internal loops can be avoided here.
>
> Signed-off-by: Antonio Fischetti
> Co-authored-by: Bhanuprakash Bodireddy
> Signed-off-by: Bhanuprakash Bodireddy
> ---
> lib/dpif-netdev.c | 110 ++
> 1 file changed, 45 insertions(+), 65 deletions(-)
>
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index eb9f764..0a4f338 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -4985,23 +4985,21 @@ dpcls_lookup(struct dpcls *cls, const struct
> netdev_flow_key keys[],
> int *num_lookups_p)
> {
> /* The received 'cnt' miniflows are the search-keys that will be processed
> - * in batches of 16 elements. N_MAPS will contain the number of these
> - * 16-elements batches. i.e. for 'cnt' = 32, N_MAPS will be 2. The
> batch
> - * size 16 was experimentally found faster than 8 or 32. */
> -typedef uint16_t map_type;
> + * to find a matching entry into the available subtables.
> + * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
> +typedef uint32_t map_type;
> #define MAP_BITS (sizeof(map_type) * CHAR_BIT)
> +BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
>
> -#if !defined(__CHECKER__) && !defined(_WIN32)
> -const int N_MAPS = DIV_ROUND_UP(cnt, MAP_BITS);
> -#else
> -enum { N_MAPS = DIV_ROUND_UP(NETDEV_MAX_BURST, MAP_BITS) };
> -#endif
> -map_type maps[N_MAPS];
> struct dpcls_subtable *subtable;
>
> -memset(maps, 0xff, sizeof maps);
> -if (cnt % MAP_BITS) {
> -maps[N_MAPS - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra bits.
> */
> +map_type keys_map = TYPE_MAXIMUM(map_type);
> +map_type found_map;
> +uint32_t hashes[MAP_BITS];
> +const struct cmap_node *nodes[MAP_BITS];
> +
> +if (cnt != NETDEV_MAX_BURST) {
> +keys_map >>= NETDEV_MAX_BURST - cnt; /* Clear extra bits. */
‘keys_maps’ has MAP_BITS bits set, and after this is should only have the ‘cnt’
LSBs set. The assert above allows NETDEV_MAX_BURST to be less than MAP_BITS,
hence MAP_BITS must be used here instead.
Otherwise looks good to me, so with this change:
Acked-by: Jarno Rajahalme
> }
> memset(rules, 0, cnt * sizeof *rules);
>
> @@ -5015,61 +5013,43 @@ dpcls_lookup(struct dpcls *cls, const struct
> netdev_flow_key keys[],
> * search-key, the search for that key can stop because the rules are
> * non-overlapping. */
> PVECTOR_FOR_EACH (subtable, &cls->subtables) {
> -const struct netdev_flow_key *mkeys = keys;
> -struct dpcls_rule **mrules = rules;
> -map_type remains = 0;
> -int m;
> -
> -BUILD_ASSERT_DECL(sizeof remains == sizeof *maps);
> -
> -/* Loops on each batch of 16 search-keys. */
> -for (m = 0; m < N_MAPS; m++, mkeys += MAP_BITS, mrules += MAP_BITS) {
> -uint32_t hashes[MAP_BITS];
> -const struct cmap_node *nodes[MAP_BITS];
> -unsigned long map = maps[m];
> -int i;
> -
> -if (!map) {
> -continue; /* Skip empty maps. */
> -}
> -
> -/* Compute hashes for the remaining keys. Each search-key is
> - * masked with the subtable's mask to avoid hashing the
> wildcarded
> - * bits. */
> -ULLONG_FOR_EACH_1(i, map) {
> -hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i],
> - &subtable->mask);
> -}
> -/* Lookup. */
> -map = cmap_find_batch(&subtable->rules, map, hashes, nodes);
> -/* Check results. When the i-th bit of map is set, it means
> that a
> - * set of nodes with a matching hash value was found for the i-th
> - * search-key. Due to possible hash collisions we need to check
> - * which of the found rules, if any, really matches our masked
> - * search-key. */
> -ULLONG_FOR_EACH_1(i, map) {
> -struct dpcls_rule *rule;
> -
> -CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
> -if (OVS_LIKELY(dpcls_rule_matches_key(rule, &mkeys[i])))
> {
> -mrules[i] = rule;
> -/* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
> - * within one second optimization interval */
> -subtable->hit_cnt++;
> -lookups_match += subtable_pos;
> -goto next;
> -}
> +int i;
> +
> +/* Compute hashes for the remaining keys. Each search-key is
> + * masked with the subtable's mask
Re: [ovs-dev] [PATCH v3 01/12] dpcls: Use 32 packet batches for lookups.
>-Original Message- >From: Daniele Di Proietto [mailto:[email protected]] >Sent: Tuesday, October 18, 2016 4:04 AM >To: Bodireddy, Bhanuprakash >Cc: [email protected] >Subject: Re: [ovs-dev] [PATCH v3 01/12] dpcls: Use 32 packet batches for >lookups. > > > >2016-10-14 7:37 GMT-07:00 Bhanuprakash Bodireddy >: >This patch increases the number of packets processed in a batch during a >lookup from 16 to 32. Processing batches of 32 packets improves >performance and also one of the internal loops can be avoided here. > >Signed-off-by: Antonio Fischetti >Co-authored-by: Bhanuprakash Bodireddy > >Signed-off-by: Bhanuprakash Bodireddy > > >I guess Co-authored-by should have Antonio? My mistake, I am sending out the remaining 4 patches separately with appropriate tags. >Also, the (already existing code) is not trivial, I'd like to take another >look at it Sure, let us know if you have more comments. Regards, Bhanuprakash. >Thanks, >Daniele > >--- > lib/dpif-netdev.c | 110 ++--- >- > 1 file changed, 45 insertions(+), 65 deletions(-) > >diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c >index eb9f764..0a4f338 100644 >--- a/lib/dpif-netdev.c >+++ b/lib/dpif-netdev.c >@@ -4985,23 +4985,21 @@ dpcls_lookup(struct dpcls *cls, const struct >netdev_flow_key keys[], > int *num_lookups_p) > { > /* The received 'cnt' miniflows are the search-keys that will be processed >- * in batches of 16 elements. N_MAPS will contain the number of these >- * 16-elements batches. i.e. for 'cnt' = 32, N_MAPS will be 2. The batch >- * size 16 was experimentally found faster than 8 or 32. */ >- typedef uint16_t map_type; >+ * to find a matching entry into the available subtables. >+ * The number of bits in map_type is equal to NETDEV_MAX_BURST. */ >+ typedef uint32_t map_type; > #define MAP_BITS (sizeof(map_type) * CHAR_BIT) >+ BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST); > >-#if !defined(__CHECKER__) && !defined(_WIN32) >- const int N_MAPS = DIV_ROUND_UP(cnt, MAP_BITS); >-#else >- enum { N_MAPS = DIV_ROUND_UP(NETDEV_MAX_BURST, MAP_BITS) }; >-#endif >- map_type maps[N_MAPS]; > struct dpcls_subtable *subtable; > >- memset(maps, 0xff, sizeof maps); >- if (cnt % MAP_BITS) { >- maps[N_MAPS - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra bits. >*/ >+ map_type keys_map = TYPE_MAXIMUM(map_type); >+ map_type found_map; >+ uint32_t hashes[MAP_BITS]; >+ const struct cmap_node *nodes[MAP_BITS]; >+ >+ if (cnt != NETDEV_MAX_BURST) { >+ keys_map >>= NETDEV_MAX_BURST - cnt; /* Clear extra bits. */ > } > memset(rules, 0, cnt * sizeof *rules); > >@@ -5015,61 +5013,43 @@ dpcls_lookup(struct dpcls *cls, const struct >netdev_flow_key keys[], > * search-key, the search for that key can stop because the rules are > * non-overlapping. */ > PVECTOR_FOR_EACH (subtable, &cls->subtables) { >- const struct netdev_flow_key *mkeys = keys; >- struct dpcls_rule **mrules = rules; >- map_type remains = 0; >- int m; >- >- BUILD_ASSERT_DECL(sizeof remains == sizeof *maps); >- >- /* Loops on each batch of 16 search-keys. */ >- for (m = 0; m < N_MAPS; m++, mkeys += MAP_BITS, mrules += >MAP_BITS) { >- uint32_t hashes[MAP_BITS]; >- const struct cmap_node *nodes[MAP_BITS]; >- unsigned long map = maps[m]; >- int i; >- >- if (!map) { >- continue; /* Skip empty maps. */ >- } >- >- /* Compute hashes for the remaining keys. Each search-key is >- * masked with the subtable's mask to avoid hashing the wildcarded >- * bits. */ >- ULLONG_FOR_EACH_1(i, map) { >- hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i], >- &subtable->mask); >- } >- /* Lookup. */ >- map = cmap_find_batch(&subtable->rules, map, hashes, nodes); >- /* Check results. When the i-th bit of map is set, it means that >a >- * set of nodes with a matching hash value was found for the i-th >- * search-key. Due to possible hash collisions we need to check >- * which of the found rules, if any, really matches our masked >- * search-key. */ >- ULLONG_FOR_EACH_1(i, map) { >- struct dpcls_rule *rule; >- >- CMAP_NODE_FOR_EACH (rule, cmap
Re: [ovs-dev] [PATCH v3 01/12] dpcls: Use 32 packet batches for lookups.
2016-10-14 7:37 GMT-07:00 Bhanuprakash Bodireddy < [email protected]>: > This patch increases the number of packets processed in a batch during a > lookup from 16 to 32. Processing batches of 32 packets improves > performance and also one of the internal loops can be avoided here. > > Signed-off-by: Antonio Fischetti > Co-authored-by: Bhanuprakash Bodireddy > Signed-off-by: Bhanuprakash Bodireddy > I guess Co-authored-by should have Antonio? Also, the (already existing code) is not trivial, I'd like to take another look at it Thanks, Daniele > --- > lib/dpif-netdev.c | 110 ++ > > 1 file changed, 45 insertions(+), 65 deletions(-) > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > index eb9f764..0a4f338 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -4985,23 +4985,21 @@ dpcls_lookup(struct dpcls *cls, const struct > netdev_flow_key keys[], > int *num_lookups_p) > { > /* The received 'cnt' miniflows are the search-keys that will be > processed > - * in batches of 16 elements. N_MAPS will contain the number of these > - * 16-elements batches. i.e. for 'cnt' = 32, N_MAPS will be 2. The > batch > - * size 16 was experimentally found faster than 8 or 32. */ > -typedef uint16_t map_type; > + * to find a matching entry into the available subtables. > + * The number of bits in map_type is equal to NETDEV_MAX_BURST. */ > +typedef uint32_t map_type; > #define MAP_BITS (sizeof(map_type) * CHAR_BIT) > +BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST); > > -#if !defined(__CHECKER__) && !defined(_WIN32) > -const int N_MAPS = DIV_ROUND_UP(cnt, MAP_BITS); > -#else > -enum { N_MAPS = DIV_ROUND_UP(NETDEV_MAX_BURST, MAP_BITS) }; > -#endif > -map_type maps[N_MAPS]; > struct dpcls_subtable *subtable; > > -memset(maps, 0xff, sizeof maps); > -if (cnt % MAP_BITS) { > -maps[N_MAPS - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra > bits. */ > +map_type keys_map = TYPE_MAXIMUM(map_type); > +map_type found_map; > +uint32_t hashes[MAP_BITS]; > +const struct cmap_node *nodes[MAP_BITS]; > + > +if (cnt != NETDEV_MAX_BURST) { > +keys_map >>= NETDEV_MAX_BURST - cnt; /* Clear extra bits. */ > } > memset(rules, 0, cnt * sizeof *rules); > > @@ -5015,61 +5013,43 @@ dpcls_lookup(struct dpcls *cls, const struct > netdev_flow_key keys[], > * search-key, the search for that key can stop because the rules are > * non-overlapping. */ > PVECTOR_FOR_EACH (subtable, &cls->subtables) { > -const struct netdev_flow_key *mkeys = keys; > -struct dpcls_rule **mrules = rules; > -map_type remains = 0; > -int m; > - > -BUILD_ASSERT_DECL(sizeof remains == sizeof *maps); > - > -/* Loops on each batch of 16 search-keys. */ > -for (m = 0; m < N_MAPS; m++, mkeys += MAP_BITS, mrules += > MAP_BITS) { > -uint32_t hashes[MAP_BITS]; > -const struct cmap_node *nodes[MAP_BITS]; > -unsigned long map = maps[m]; > -int i; > - > -if (!map) { > -continue; /* Skip empty maps. */ > -} > - > -/* Compute hashes for the remaining keys. Each search-key is > - * masked with the subtable's mask to avoid hashing the > wildcarded > - * bits. */ > -ULLONG_FOR_EACH_1(i, map) { > -hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i], > - &subtable->mask); > -} > -/* Lookup. */ > -map = cmap_find_batch(&subtable->rules, map, hashes, nodes); > -/* Check results. When the i-th bit of map is set, it means > that a > - * set of nodes with a matching hash value was found for the > i-th > - * search-key. Due to possible hash collisions we need to > check > - * which of the found rules, if any, really matches our masked > - * search-key. */ > -ULLONG_FOR_EACH_1(i, map) { > -struct dpcls_rule *rule; > - > -CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) { > -if (OVS_LIKELY(dpcls_rule_matches_key(rule, > &mkeys[i]))) { > -mrules[i] = rule; > -/* Even at 20 Mpps the 32-bit hit_cnt cannot wrap > - * within one second optimization interval */ > -subtable->hit_cnt++; > -lookups_match += subtable_pos; > -goto next; > -} > +int i; > + > +/* Compute hashes for the remaining keys. Each search-key is > + * masked with the subtable's mask to avoid hashing the wildcarded > + * bits. */ > +ULLONG_FOR_EACH_1(i, keys_map) { > +hashes[i] = netd
