Re: [ovs-dev] [PATCH v3 01/12] dpcls: Use 32 packet batches for lookups.

2016-10-19 Thread Fischetti, Antonio
Thanks Jarno for spotting this, will do the suggested change.

Antonio

> -Original Message-
> From: dev [mailto:[email protected]] On Behalf Of Jarno
> Rajahalme
> Sent: Tuesday, October 18, 2016 6:26 PM
> To: Bodireddy, Bhanuprakash 
> Cc: [email protected]
> Subject: Re: [ovs-dev] [PATCH v3 01/12] dpcls: Use 32 packet batches for
> lookups.
> 
> See one comment below,
> 
>   Jarno
> 
> > On Oct 14, 2016, at 7:37 AM, Bhanuprakash Bodireddy
>  wrote:
> >
> > This patch increases the number of packets processed in a batch during a
> > lookup from 16 to 32. Processing batches of 32 packets improves
> > performance and also one of the internal loops can be avoided here.
> >
> > Signed-off-by: Antonio Fischetti 
> > Co-authored-by: Bhanuprakash Bodireddy
> 
> > Signed-off-by: Bhanuprakash Bodireddy 
> > ---
> > lib/dpif-netdev.c | 110 ++--
> --
> > 1 file changed, 45 insertions(+), 65 deletions(-)
> >
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index eb9f764..0a4f338 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -4985,23 +4985,21 @@ dpcls_lookup(struct dpcls *cls, const struct
> netdev_flow_key keys[],
> >  int *num_lookups_p)
> > {
> > /* The received 'cnt' miniflows are the search-keys that will be
> processed
> > - * in batches of 16 elements.  N_MAPS will contain the number of
> these
> > - * 16-elements batches.  i.e. for 'cnt' = 32, N_MAPS will be 2.
> The batch
> > - * size 16 was experimentally found faster than 8 or 32. */
> > -typedef uint16_t map_type;
> > + * to find a matching entry into the available subtables.
> > + * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
> > +typedef uint32_t map_type;
> > #define MAP_BITS (sizeof(map_type) * CHAR_BIT)
> > +BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
> >
> > -#if !defined(__CHECKER__) && !defined(_WIN32)
> > -const int N_MAPS = DIV_ROUND_UP(cnt, MAP_BITS);
> > -#else
> > -enum { N_MAPS = DIV_ROUND_UP(NETDEV_MAX_BURST, MAP_BITS) };
> > -#endif
> > -map_type maps[N_MAPS];
> > struct dpcls_subtable *subtable;
> >
> > -memset(maps, 0xff, sizeof maps);
> > -if (cnt % MAP_BITS) {
> > -maps[N_MAPS - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra
> bits. */
> > +map_type keys_map = TYPE_MAXIMUM(map_type);
> > +map_type found_map;
> > +uint32_t hashes[MAP_BITS];
> > +const struct cmap_node *nodes[MAP_BITS];
> > +
> > +if (cnt != NETDEV_MAX_BURST) {
> > +keys_map >>= NETDEV_MAX_BURST - cnt; /* Clear extra bits. */
> 
> ‘keys_maps’ has MAP_BITS bits set, and after this is should only have the
> ‘cnt’ LSBs set. The assert above allows NETDEV_MAX_BURST to be less than
> MAP_BITS, hence MAP_BITS must be used here instead.
> 
> Otherwise looks good to me, so with this change:
> 
> Acked-by: Jarno Rajahalme 
> 
> 
> > }
> > memset(rules, 0, cnt * sizeof *rules);
> >
> > @@ -5015,61 +5013,43 @@ dpcls_lookup(struct dpcls *cls, const struct
> netdev_flow_key keys[],
> >  * search-key, the search for that key can stop because the rules
> are
> >  * non-overlapping. */
> > PVECTOR_FOR_EACH (subtable, &cls->subtables) {
> > -const struct netdev_flow_key *mkeys = keys;
> > -struct dpcls_rule **mrules = rules;
> > -map_type remains = 0;
> > -int m;
> > -
> > -BUILD_ASSERT_DECL(sizeof remains == sizeof *maps);
> > -
> > -/* Loops on each batch of 16 search-keys. */
> > -for (m = 0; m < N_MAPS; m++, mkeys += MAP_BITS, mrules +=
> MAP_BITS) {
> > -uint32_t hashes[MAP_BITS];
> > -const struct cmap_node *nodes[MAP_BITS];
> > -unsigned long map = maps[m];
> > -int i;
> > -
> > -if (!map) {
> > -continue; /* Skip empty maps. */
> > -}
> > -
> > -/* Compute hashes for the remaining keys.  Each search-key
> is
> > - * masked with the subtable's mask to avoid hashing the
> wildcarded
> > - * bits. */
> > -ULLONG_FOR_EACH_1(i, map) {
> > -hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i],
> > - &subtable-
> >mask);
> 

Re: [ovs-dev] [PATCH v3 01/12] dpcls: Use 32 packet batches for lookups.

2016-10-18 Thread Jarno Rajahalme
See one comment below,

  Jarno

> On Oct 14, 2016, at 7:37 AM, Bhanuprakash Bodireddy 
>  wrote:
> 
> This patch increases the number of packets processed in a batch during a
> lookup from 16 to 32. Processing batches of 32 packets improves
> performance and also one of the internal loops can be avoided here.
> 
> Signed-off-by: Antonio Fischetti 
> Co-authored-by: Bhanuprakash Bodireddy 
> Signed-off-by: Bhanuprakash Bodireddy 
> ---
> lib/dpif-netdev.c | 110 ++
> 1 file changed, 45 insertions(+), 65 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index eb9f764..0a4f338 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -4985,23 +4985,21 @@ dpcls_lookup(struct dpcls *cls, const struct 
> netdev_flow_key keys[],
>  int *num_lookups_p)
> {
> /* The received 'cnt' miniflows are the search-keys that will be processed
> - * in batches of 16 elements.  N_MAPS will contain the number of these
> - * 16-elements batches.  i.e. for 'cnt' = 32, N_MAPS will be 2.  The 
> batch
> - * size 16 was experimentally found faster than 8 or 32. */
> -typedef uint16_t map_type;
> + * to find a matching entry into the available subtables.
> + * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
> +typedef uint32_t map_type;
> #define MAP_BITS (sizeof(map_type) * CHAR_BIT)
> +BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
> 
> -#if !defined(__CHECKER__) && !defined(_WIN32)
> -const int N_MAPS = DIV_ROUND_UP(cnt, MAP_BITS);
> -#else
> -enum { N_MAPS = DIV_ROUND_UP(NETDEV_MAX_BURST, MAP_BITS) };
> -#endif
> -map_type maps[N_MAPS];
> struct dpcls_subtable *subtable;
> 
> -memset(maps, 0xff, sizeof maps);
> -if (cnt % MAP_BITS) {
> -maps[N_MAPS - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra bits. 
> */
> +map_type keys_map = TYPE_MAXIMUM(map_type);
> +map_type found_map;
> +uint32_t hashes[MAP_BITS];
> +const struct cmap_node *nodes[MAP_BITS];
> +
> +if (cnt != NETDEV_MAX_BURST) {
> +keys_map >>= NETDEV_MAX_BURST - cnt; /* Clear extra bits. */

‘keys_maps’ has MAP_BITS bits set, and after this is should only have the ‘cnt’ 
LSBs set. The assert above allows NETDEV_MAX_BURST to be less than MAP_BITS, 
hence MAP_BITS must be used here instead.

Otherwise looks good to me, so with this change:

Acked-by: Jarno Rajahalme 


> }
> memset(rules, 0, cnt * sizeof *rules);
> 
> @@ -5015,61 +5013,43 @@ dpcls_lookup(struct dpcls *cls, const struct 
> netdev_flow_key keys[],
>  * search-key, the search for that key can stop because the rules are
>  * non-overlapping. */
> PVECTOR_FOR_EACH (subtable, &cls->subtables) {
> -const struct netdev_flow_key *mkeys = keys;
> -struct dpcls_rule **mrules = rules;
> -map_type remains = 0;
> -int m;
> -
> -BUILD_ASSERT_DECL(sizeof remains == sizeof *maps);
> -
> -/* Loops on each batch of 16 search-keys. */
> -for (m = 0; m < N_MAPS; m++, mkeys += MAP_BITS, mrules += MAP_BITS) {
> -uint32_t hashes[MAP_BITS];
> -const struct cmap_node *nodes[MAP_BITS];
> -unsigned long map = maps[m];
> -int i;
> -
> -if (!map) {
> -continue; /* Skip empty maps. */
> -}
> -
> -/* Compute hashes for the remaining keys.  Each search-key is
> - * masked with the subtable's mask to avoid hashing the 
> wildcarded
> - * bits. */
> -ULLONG_FOR_EACH_1(i, map) {
> -hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i],
> - &subtable->mask);
> -}
> -/* Lookup. */
> -map = cmap_find_batch(&subtable->rules, map, hashes, nodes);
> -/* Check results.  When the i-th bit of map is set, it means 
> that a
> - * set of nodes with a matching hash value was found for the i-th
> - * search-key.  Due to possible hash collisions we need to check
> - * which of the found rules, if any, really matches our masked
> - * search-key. */
> -ULLONG_FOR_EACH_1(i, map) {
> -struct dpcls_rule *rule;
> -
> -CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
> -if (OVS_LIKELY(dpcls_rule_matches_key(rule, &mkeys[i]))) 
> {
> -mrules[i] = rule;
> -/* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
> - * within one second optimization interval  */
> -subtable->hit_cnt++;
> -lookups_match += subtable_pos;
> -goto next;
> -}
> +int i;
> +
> +/* Compute hashes for the remaining keys.  Each search-key is
> + * masked with the subtable's mask

Re: [ovs-dev] [PATCH v3 01/12] dpcls: Use 32 packet batches for lookups.

2016-10-18 Thread Bodireddy, Bhanuprakash
>-Original Message-
>From: Daniele Di Proietto [mailto:[email protected]]
>Sent: Tuesday, October 18, 2016 4:04 AM
>To: Bodireddy, Bhanuprakash 
>Cc: [email protected]
>Subject: Re: [ovs-dev] [PATCH v3 01/12] dpcls: Use 32 packet batches for
>lookups.
>
>
>
>2016-10-14 7:37 GMT-07:00 Bhanuprakash Bodireddy
>:
>This patch increases the number of packets processed in a batch during a
>lookup from 16 to 32. Processing batches of 32 packets improves
>performance and also one of the internal loops can be avoided here.
>
>Signed-off-by: Antonio Fischetti 
>Co-authored-by: Bhanuprakash Bodireddy
>
>Signed-off-by: Bhanuprakash Bodireddy
>
>
>I guess Co-authored-by should have Antonio?
My mistake, I am sending out the remaining 4 patches separately with 
appropriate tags. 

>Also, the (already existing code) is not trivial, I'd like to take another 
>look at it
Sure, let us know if you have more comments. 

Regards,
Bhanuprakash. 

>Thanks,
>Daniele
>
>---
> lib/dpif-netdev.c | 110 ++---
>-
> 1 file changed, 45 insertions(+), 65 deletions(-)
>
>diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>index eb9f764..0a4f338 100644
>--- a/lib/dpif-netdev.c
>+++ b/lib/dpif-netdev.c
>@@ -4985,23 +4985,21 @@ dpcls_lookup(struct dpcls *cls, const struct
>netdev_flow_key keys[],
>              int *num_lookups_p)
> {
>     /* The received 'cnt' miniflows are the search-keys that will be processed
>-     * in batches of 16 elements.  N_MAPS will contain the number of these
>-     * 16-elements batches.  i.e. for 'cnt' = 32, N_MAPS will be 2.  The batch
>-     * size 16 was experimentally found faster than 8 or 32. */
>-    typedef uint16_t map_type;
>+     * to find a matching entry into the available subtables.
>+     * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
>+    typedef uint32_t map_type;
> #define MAP_BITS (sizeof(map_type) * CHAR_BIT)
>+    BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
>
>-#if !defined(__CHECKER__) && !defined(_WIN32)
>-    const int N_MAPS = DIV_ROUND_UP(cnt, MAP_BITS);
>-#else
>-    enum { N_MAPS = DIV_ROUND_UP(NETDEV_MAX_BURST, MAP_BITS) };
>-#endif
>-    map_type maps[N_MAPS];
>     struct dpcls_subtable *subtable;
>
>-    memset(maps, 0xff, sizeof maps);
>-    if (cnt % MAP_BITS) {
>-        maps[N_MAPS - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra bits.
>*/
>+    map_type keys_map = TYPE_MAXIMUM(map_type);
>+    map_type found_map;
>+    uint32_t hashes[MAP_BITS];
>+    const struct cmap_node *nodes[MAP_BITS];
>+
>+    if (cnt != NETDEV_MAX_BURST) {
>+        keys_map >>= NETDEV_MAX_BURST - cnt; /* Clear extra bits. */
>     }
>     memset(rules, 0, cnt * sizeof *rules);
>
>@@ -5015,61 +5013,43 @@ dpcls_lookup(struct dpcls *cls, const struct
>netdev_flow_key keys[],
>      * search-key, the search for that key can stop because the rules are
>      * non-overlapping. */
>     PVECTOR_FOR_EACH (subtable, &cls->subtables) {
>-        const struct netdev_flow_key *mkeys = keys;
>-        struct dpcls_rule **mrules = rules;
>-        map_type remains = 0;
>-        int m;
>-
>-        BUILD_ASSERT_DECL(sizeof remains == sizeof *maps);
>-
>-        /* Loops on each batch of 16 search-keys. */
>-        for (m = 0; m < N_MAPS; m++, mkeys += MAP_BITS, mrules +=
>MAP_BITS) {
>-            uint32_t hashes[MAP_BITS];
>-            const struct cmap_node *nodes[MAP_BITS];
>-            unsigned long map = maps[m];
>-            int i;
>-
>-            if (!map) {
>-                continue; /* Skip empty maps. */
>-            }
>-
>-            /* Compute hashes for the remaining keys.  Each search-key is
>-             * masked with the subtable's mask to avoid hashing the wildcarded
>-             * bits. */
>-            ULLONG_FOR_EACH_1(i, map) {
>-                hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i],
>-                                                         &subtable->mask);
>-            }
>-            /* Lookup. */
>-            map = cmap_find_batch(&subtable->rules, map, hashes, nodes);
>-            /* Check results.  When the i-th bit of map is set, it means that 
>a
>-             * set of nodes with a matching hash value was found for the i-th
>-             * search-key.  Due to possible hash collisions we need to check
>-             * which of the found rules, if any, really matches our masked
>-             * search-key. */
>-            ULLONG_FOR_EACH_1(i, map) {
>-                struct dpcls_rule *rule;
>-
>-                CMAP_NODE_FOR_EACH (rule, cmap

Re: [ovs-dev] [PATCH v3 01/12] dpcls: Use 32 packet batches for lookups.

2016-10-17 Thread Daniele Di Proietto
2016-10-14 7:37 GMT-07:00 Bhanuprakash Bodireddy <
[email protected]>:

> This patch increases the number of packets processed in a batch during a
> lookup from 16 to 32. Processing batches of 32 packets improves
> performance and also one of the internal loops can be avoided here.
>
> Signed-off-by: Antonio Fischetti 
> Co-authored-by: Bhanuprakash Bodireddy 
> Signed-off-by: Bhanuprakash Bodireddy 
>

I guess Co-authored-by should have Antonio?

Also, the (already existing code) is not trivial, I'd like to take another
look at it

Thanks,

Daniele


> ---
>  lib/dpif-netdev.c | 110 ++
> 
>  1 file changed, 45 insertions(+), 65 deletions(-)
>
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index eb9f764..0a4f338 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -4985,23 +4985,21 @@ dpcls_lookup(struct dpcls *cls, const struct
> netdev_flow_key keys[],
>   int *num_lookups_p)
>  {
>  /* The received 'cnt' miniflows are the search-keys that will be
> processed
> - * in batches of 16 elements.  N_MAPS will contain the number of these
> - * 16-elements batches.  i.e. for 'cnt' = 32, N_MAPS will be 2.  The
> batch
> - * size 16 was experimentally found faster than 8 or 32. */
> -typedef uint16_t map_type;
> + * to find a matching entry into the available subtables.
> + * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
> +typedef uint32_t map_type;
>  #define MAP_BITS (sizeof(map_type) * CHAR_BIT)
> +BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
>
> -#if !defined(__CHECKER__) && !defined(_WIN32)
> -const int N_MAPS = DIV_ROUND_UP(cnt, MAP_BITS);
> -#else
> -enum { N_MAPS = DIV_ROUND_UP(NETDEV_MAX_BURST, MAP_BITS) };
> -#endif
> -map_type maps[N_MAPS];
>  struct dpcls_subtable *subtable;
>
> -memset(maps, 0xff, sizeof maps);
> -if (cnt % MAP_BITS) {
> -maps[N_MAPS - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra
> bits. */
> +map_type keys_map = TYPE_MAXIMUM(map_type);
> +map_type found_map;
> +uint32_t hashes[MAP_BITS];
> +const struct cmap_node *nodes[MAP_BITS];
> +
> +if (cnt != NETDEV_MAX_BURST) {
> +keys_map >>= NETDEV_MAX_BURST - cnt; /* Clear extra bits. */
>  }
>  memset(rules, 0, cnt * sizeof *rules);
>
> @@ -5015,61 +5013,43 @@ dpcls_lookup(struct dpcls *cls, const struct
> netdev_flow_key keys[],
>   * search-key, the search for that key can stop because the rules are
>   * non-overlapping. */
>  PVECTOR_FOR_EACH (subtable, &cls->subtables) {
> -const struct netdev_flow_key *mkeys = keys;
> -struct dpcls_rule **mrules = rules;
> -map_type remains = 0;
> -int m;
> -
> -BUILD_ASSERT_DECL(sizeof remains == sizeof *maps);
> -
> -/* Loops on each batch of 16 search-keys. */
> -for (m = 0; m < N_MAPS; m++, mkeys += MAP_BITS, mrules +=
> MAP_BITS) {
> -uint32_t hashes[MAP_BITS];
> -const struct cmap_node *nodes[MAP_BITS];
> -unsigned long map = maps[m];
> -int i;
> -
> -if (!map) {
> -continue; /* Skip empty maps. */
> -}
> -
> -/* Compute hashes for the remaining keys.  Each search-key is
> - * masked with the subtable's mask to avoid hashing the
> wildcarded
> - * bits. */
> -ULLONG_FOR_EACH_1(i, map) {
> -hashes[i] = netdev_flow_key_hash_in_mask(&mkeys[i],
> - &subtable->mask);
> -}
> -/* Lookup. */
> -map = cmap_find_batch(&subtable->rules, map, hashes, nodes);
> -/* Check results.  When the i-th bit of map is set, it means
> that a
> - * set of nodes with a matching hash value was found for the
> i-th
> - * search-key.  Due to possible hash collisions we need to
> check
> - * which of the found rules, if any, really matches our masked
> - * search-key. */
> -ULLONG_FOR_EACH_1(i, map) {
> -struct dpcls_rule *rule;
> -
> -CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
> -if (OVS_LIKELY(dpcls_rule_matches_key(rule,
> &mkeys[i]))) {
> -mrules[i] = rule;
> -/* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
> - * within one second optimization interval  */
> -subtable->hit_cnt++;
> -lookups_match += subtable_pos;
> -goto next;
> -}
> +int i;
> +
> +/* Compute hashes for the remaining keys.  Each search-key is
> + * masked with the subtable's mask to avoid hashing the wildcarded
> + * bits. */
> +ULLONG_FOR_EACH_1(i, keys_map) {
> +hashes[i] = netd