This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 8ca152a02 ORC-1610: Reduce the number of hash computation in
`CuckooSetBytes`
8ca152a02 is described below
commit 8ca152a0285804eec475b0374668b226659e4143
Author: sychen <[email protected]>
AuthorDate: Wed Feb 7 11:06:53 2024 -0800
ORC-1610: Reduce the number of hash computation in `CuckooSetBytes`
### What changes were proposed in this pull request?
Add boundary conditions on "length" with the min/max length stored in the
hashes.
### Why are the changes needed?
https://issues.apache.org/jira/browse/HIVE-24205
> This would significantly reduce the number of hash computation that needs
to happen.
```
main insert:00:00:00.689
main lookup:00:00:01.124
PR insert:00:00:00.628
PR lookup:00:00:01.055
```
```java
Test
public void testLen() {
int maxSize = 200000;
Random gen = new Random();
String[] strings = new String[maxSize];
for (int i = 0; i < maxSize; i++) {
strings[i] = RandomStringUtils.random(Math.abs(gen.nextInt(1000)));
}
byte[][] values = getByteArrays(strings);
StopWatch mainSW = new StopWatch();
// load set
mainSW.start();
CuckooSetBytes main = new CuckooSetBytes(strings.length);
main.fastLookup = false;
for (byte[] v : values) {
main.insert(v);
}
mainSW.split();
System.out.println("main insert:" + mainSW);
// test that the values we added are there
for (byte[] v : values) {
assertTrue(main.lookup(v, 0, v.length));
}
mainSW.stop();
System.out.println("main lookup:" + mainSW);
StopWatch prSW = new StopWatch();
prSW.start();
CuckooSetBytes pr = new CuckooSetBytes(strings.length);
pr.fastLookup = true;
for (byte[] v : values) {
pr.insert(v);
}
prSW.split();
System.out.println("PR insert:" + prSW);
for (byte[] v : values) {
assertTrue(pr.lookup(v, 0, v.length));
}
prSW.stop();
System.out.println("PR lookup:" + prSW);
}
```
### How was this patch tested?
GA
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #1785 from cxzl25/ORC-1610.
Authored-by: sychen <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit 4eff23aa3ff6180f00cab472b91c3f67f82d6683)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
java/core/src/java/org/apache/orc/util/CuckooSetBytes.java | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/java/core/src/java/org/apache/orc/util/CuckooSetBytes.java
b/java/core/src/java/org/apache/orc/util/CuckooSetBytes.java
index cabe3289c..f49347019 100644
--- a/java/core/src/java/org/apache/orc/util/CuckooSetBytes.java
+++ b/java/core/src/java/org/apache/orc/util/CuckooSetBytes.java
@@ -45,6 +45,8 @@ public class CuckooSetBytes {
private int rehashCount = 0;
private static final long INT_MASK = 0x00000000ffffffffL;
private static final long BYTE_MASK = 0x00000000000000ffL;
+ private int maxLen;
+ private int minLen = Integer.MAX_VALUE;
// some prime numbers spaced about at powers of 2 in magnitude
static final int[] primes = {7, 13, 17, 23, 31, 53, 67, 89, 127, 269, 571,
1019, 2089,
4507, 8263, 16361, 32327, 65437, 131111, 258887, 525961, 999983, 2158909,
4074073,
@@ -84,6 +86,9 @@ public class CuckooSetBytes {
* and ending at start+len is present in the set.
*/
public boolean lookup(byte[] b, int start, int len) {
+ if (len < minLen || len > maxLen) {
+ return false;
+ }
return entryEqual(t1, h1(b, start, len), b, start, len) ||
entryEqual(t2, h2(b, start, len), b, start, len);
@@ -98,6 +103,8 @@ public class CuckooSetBytes {
if (lookup(x, 0, x.length)) {
return;
}
+ minLen = Math.min(minLen, x.length);
+ maxLen = Math.max(maxLen, x.length);
// Try to insert up to n times. Rehash if that fails.
for(int i = 0; i != n; i++) {