Re: [PATCH bpf-next v2 1/1] bpf, lpm: make longest_prefix_match() faster

2018-11-22 Thread Daniel Borkmann
On 11/22/2018 06:39 AM, Eric Dumazet wrote:
> At LPC 2018 in Vancouver, Vlad Dumitrescu mentioned that
> longest_prefix_match() has a high cost [1].
> 
> One reason for that cost is a loop handling one byte at a time.
> 
> We can handle more bytes at a time if enough attention is paid
> to endianness.
> 
> I was able to remove ~55% of longest_prefix_match() CPU costs.
> 
> [1] https://linuxplumbersconf.org/event/2/contributions/88/attachments/76/87/lpc-bpf-2018-shaping.pdf
> 
> Signed-off-by: Eric Dumazet 
> Cc: Vlad Dumitrescu 
> Cc: Alexei Starovoitov 
> Cc: Daniel Borkmann 

Looks good, applied to bpf-next, thanks!


[PATCH bpf-next v2 1/1] bpf, lpm: make longest_prefix_match() faster

2018-11-21 Thread Eric Dumazet
At LPC 2018 in Vancouver, Vlad Dumitrescu mentioned that longest_prefix_match()
has a high cost [1].

One reason for that cost is a loop handling one byte at a time.

We can handle more bytes at a time if enough attention is paid
to endianness.

I was able to remove ~55% of longest_prefix_match() CPU costs.

[1] https://linuxplumbersconf.org/event/2/contributions/88/attachments/76/87/lpc-bpf-2018-shaping.pdf

Signed-off-by: Eric Dumazet 
Cc: Vlad Dumitrescu 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
---
v2: fixed Daniel and Alexei email addresses... :/
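
For illustration only (not part of the patch): the core trick as a
stand-alone user-space sketch, in which __builtin_bswap32() and
__builtin_clz() stand in for the kernel's be32_to_cpu() and fls():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Count the matching leading bits of two 4-byte, network-order keys.
 * On a little-endian host the XOR result must be byte-swapped so that
 * the first key byte lands in the most significant bits; only then
 * does a leading-zero count give the common prefix length.
 */
static unsigned int match32(const uint8_t *a, const uint8_t *b)
{
        uint32_t x, y, diff;

        memcpy(&x, a, 4);
        memcpy(&y, b, 4);
        diff = x ^ y;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        diff = __builtin_bswap32(diff);   /* be32_to_cpu() */
#endif
        /* __builtin_clz(0) is undefined, hence the guard; the kernel's
         * "32 - fls(diff)" handles diff == 0 naturally, as fls(0) == 0.
         */
        return diff ? __builtin_clz(diff) : 32;
}

int main(void)
{
        uint8_t a[4] = { 192, 168, 0, 0 };    /* 192.168.0.0 */
        uint8_t b[4] = { 192, 168, 128, 0 };  /* 192.168.128.0 */

        printf("%u\n", match32(a, b));        /* prints 16 */
        return 0;
}

Without the byte swap, a little-endian load would put the first key
byte into the low-order bits and the leading-zero count would be
meaningless; that is the endianness attention mentioned above.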

 kernel/bpf/lpm_trie.c | 59 +++
 1 file changed, 49 insertions(+), 10 deletions(-)

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 9058317ba9de2eae4f28e8ca98c3c30bc6167c24..bfd4882e1106c699b62217eef0c2cc92f31077c6 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -168,20 +168,59 @@ static size_t longest_prefix_match(const struct lpm_trie *trie,
   const struct lpm_trie_node *node,
   const struct bpf_lpm_trie_key *key)
 {
-   size_t prefixlen = 0;
-   size_t i;
+   u32 limit = min(node->prefixlen, key->prefixlen);
+   u32 prefixlen = 0, i = 0;
 
-   for (i = 0; i < trie->data_size; i++) {
-   size_t b;
+   BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32));
+   BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32));
 
-   b = 8 - fls(node->data[i] ^ key->data[i]);
-   prefixlen += b;
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)
 
-   if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen)
-   return min(node->prefixlen, key->prefixlen);
+   /* data_size >= 16 has very small probability.
+* We do not use a loop for optimal code generation.
+*/
+   if (trie->data_size >= 8) {
+   u64 diff = be64_to_cpu(*(__be64 *)node->data ^
+  *(__be64 *)key->data);
 
-   if (b < 8)
-   break;
+   prefixlen = 64 - fls64(diff);
+   if (prefixlen >= limit)
+   return limit;
+   if (diff)
+   return prefixlen;
+   i = 8;
+   }
+#endif
+
+   while (trie->data_size >= i + 4) {
+   u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^
+  *(__be32 *)&key->data[i]);
+
+   prefixlen += 32 - fls(diff);
+   if (prefixlen >= limit)
+   return limit;
+   if (diff)
+   return prefixlen;
+   i += 4;
+   }
+
+   if (trie->data_size >= i + 2) {
+   u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^
+  *(__be16 *)&key->data[i]);
+
+   prefixlen += 16 - fls(diff);
+   if (prefixlen >= limit)
+   return limit;
+   if (diff)
+   return prefixlen;
+   i += 2;
+   }
+
+   if (trie->data_size >= i + 1) {
+   prefixlen += 8 - fls(node->data[i] ^ key->data[i]);
+
+   if (prefixlen >= limit)
+   return limit;
}
 
return prefixlen;
-- 
2.19.1.1215.g8438c0b245-goog
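
For the curious, a quick user-space cross-check (again illustrative
only, and only a sketch: it assumes a little-endian host and a
data_size that is a multiple of 4) that the word-at-a-time scheme,
including the limit handling, agrees with the byte-at-a-time loop it
replaces:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define DATA_SIZE 8   /* 4 for IPv4 keys, 16 for IPv6 */

static unsigned int fls32(uint32_t x)   /* kernel-style fls() */
{
        return x ? 32 - __builtin_clz(x) : 0;
}

/* The removed code: one byte per iteration. */
static unsigned int match_bytes(const uint8_t *a, const uint8_t *b,
                                unsigned int limit)
{
        unsigned int prefixlen = 0, i;

        for (i = 0; i < DATA_SIZE; i++) {
                unsigned int bits = 8 - fls32(a[i] ^ b[i]);

                prefixlen += bits;
                if (prefixlen >= limit)
                        return limit;
                if (bits < 8)
                        break;
        }
        return prefixlen;
}

/* The new scheme: 32 bits per iteration, byte-swapped before the
 * bit scan (the be32_to_cpu() step in the patch).
 */
static unsigned int match_words(const uint8_t *a, const uint8_t *b,
                                unsigned int limit)
{
        unsigned int prefixlen = 0, i;

        for (i = 0; i < DATA_SIZE; i += 4) {
                uint32_t x, y, diff;

                memcpy(&x, a + i, 4);
                memcpy(&y, b + i, 4);
                diff = __builtin_bswap32(x ^ y);  /* LE host assumed */
                prefixlen += 32 - fls32(diff);
                if (prefixlen >= limit)
                        return limit;
                if (diff)
                        return prefixlen;
        }
        return prefixlen;
}

int main(void)
{
        uint8_t a[DATA_SIZE], b[DATA_SIZE];
        unsigned int n, i, limit;

        for (n = 0; n < 100000; n++) {
                for (i = 0; i < DATA_SIZE; i++) {
                        a[i] = (uint8_t)rand();
                        /* bias toward long common prefixes */
                        b[i] = (rand() & 1) ? a[i] : (uint8_t)rand();
                }
                limit = rand() % (8 * DATA_SIZE + 1);
                assert(match_bytes(a, b, limit) ==
                       match_words(a, b, limit));
        }
        return 0;
}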