This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 4eff23aa3 ORC-1610: Reduce the number of hash computation in 
`CuckooSetBytes`
4eff23aa3 is described below

commit 4eff23aa3ff6180f00cab472b91c3f67f82d6683
Author: sychen <[email protected]>
AuthorDate: Wed Feb 7 11:06:53 2024 -0800

    ORC-1610: Reduce the number of hash computation in `CuckooSetBytes`
    
    ### What changes were proposed in this pull request?
    Add boundary conditions on "length" with the min/max length stored in the 
hashes.
    
    ### Why are the changes needed?
    https://issues.apache.org/jira/browse/HIVE-24205
    
    > This would significantly reduce the number of hash computation that needs 
to happen.
    
    ```
    main insert:00:00:00.689
    main lookup:00:00:01.124
    PR insert:00:00:00.628
    PR lookup:00:00:01.055
    ```
    
    ```java
      Test
      public void testLen() {
        int maxSize = 200000;
        Random gen = new Random();
        String[] strings = new String[maxSize];
        for (int i = 0; i < maxSize; i++) {
          strings[i] = RandomStringUtils.random(Math.abs(gen.nextInt(1000)));
        }
        byte[][] values = getByteArrays(strings);
    
        StopWatch mainSW = new StopWatch();
    
        // load set
        mainSW.start();
        CuckooSetBytes main = new CuckooSetBytes(strings.length);
        main.fastLookup = false;
        for (byte[] v : values) {
          main.insert(v);
        }
        mainSW.split();
        System.out.println("main insert:" + mainSW);
        // test that the values we added are there
        for (byte[] v : values) {
          assertTrue(main.lookup(v, 0, v.length));
        }
        mainSW.stop();
        System.out.println("main lookup:" + mainSW);
    
        StopWatch prSW = new StopWatch();
        prSW.start();
        CuckooSetBytes pr = new CuckooSetBytes(strings.length);
        pr.fastLookup = true;
        for (byte[] v : values) {
          pr.insert(v);
        }
        prSW.split();
        System.out.println("PR insert:" + prSW);
        for (byte[] v : values) {
          assertTrue(pr.lookup(v, 0, v.length));
        }
        prSW.stop();
        System.out.println("PR lookup:" + prSW);
      }
    ```
    ### How was this patch tested?
    GA
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #1785 from cxzl25/ORC-1610.
    
    Authored-by: sychen <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 java/core/src/java/org/apache/orc/util/CuckooSetBytes.java | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/java/core/src/java/org/apache/orc/util/CuckooSetBytes.java 
b/java/core/src/java/org/apache/orc/util/CuckooSetBytes.java
index cabe3289c..f49347019 100644
--- a/java/core/src/java/org/apache/orc/util/CuckooSetBytes.java
+++ b/java/core/src/java/org/apache/orc/util/CuckooSetBytes.java
@@ -45,6 +45,8 @@ public class CuckooSetBytes {
   private int rehashCount = 0;
   private static final long INT_MASK  = 0x00000000ffffffffL;
   private static final long BYTE_MASK = 0x00000000000000ffL;
+  private int maxLen;
+  private int minLen = Integer.MAX_VALUE;
   // some prime numbers spaced about at powers of 2 in magnitude
   static final int[] primes = {7, 13, 17, 23, 31, 53, 67, 89, 127, 269, 571, 
1019, 2089,
     4507, 8263, 16361, 32327, 65437, 131111, 258887, 525961, 999983, 2158909, 
4074073,
@@ -84,6 +86,9 @@ public class CuckooSetBytes {
    * and ending at start+len is present in the set.
    */
   public boolean lookup(byte[] b, int start, int len) {
+    if (len < minLen || len > maxLen) {
+      return false;
+    }
 
     return entryEqual(t1, h1(b, start, len), b, start, len) ||
            entryEqual(t2, h2(b, start, len), b, start, len);
@@ -98,6 +103,8 @@ public class CuckooSetBytes {
     if (lookup(x, 0, x.length)) {
       return;
     }
+    minLen = Math.min(minLen, x.length);
+    maxLen = Math.max(maxLen, x.length);
 
     // Try to insert up to n times. Rehash if that fails.
     for(int i = 0; i != n; i++) {

Reply via email to