This is an automated email from the ASF dual-hosted git repository.

yufei pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/main by this push:
     new b531e97f66 Core: Extract filePath comparator into its own class (#10664)
b531e97f66 is described below

commit b531e97f66ef2bf80f3167152e268be0ce25f459
Author: Denys Kuzmenko <[email protected]>
AuthorDate: Mon Aug 5 22:43:34 2024 +0200

    Core: Extract filePath comparator into its own class (#10664)
---
 .../java/org/apache/iceberg/types/Comparators.java | 41 ++++++++++++++++++++++
 .../java/org/apache/iceberg/deletes/Deletes.java   | 31 +++-------------
 2 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/api/src/main/java/org/apache/iceberg/types/Comparators.java b/api/src/main/java/org/apache/iceberg/types/Comparators.java
index d09d9f5395..a803afac10 100644
--- a/api/src/main/java/org/apache/iceberg/types/Comparators.java
+++ b/api/src/main/java/org/apache/iceberg/types/Comparators.java
@@ -173,6 +173,10 @@ public class Comparators {
     return CharSeqComparator.INSTANCE;
   }
 
+  public static Comparator<CharSequence> filePath() {
+    return FilePathComparator.INSTANCE;
+  }
+
   private static class NullsFirst<T> implements Comparator<T> {
     private static final NullsFirst<?> INSTANCE = new NullsFirst<>();
 
@@ -351,4 +355,41 @@ public class Comparators {
       return Integer.compare(s1.length(), s2.length());
     }
   }
+
+  private static class FilePathComparator implements Comparator<CharSequence> {
+    private static final FilePathComparator INSTANCE = new FilePathComparator();
+
+    private FilePathComparator() {}
+
+    @Override
+    public int compare(CharSequence s1, CharSequence s2) {
+      if (s1 == s2) {
+        return 0;
+      }
+      int count = s1.length();
+
+      int cmp = Integer.compare(count, s2.length());
+      if (cmp != 0) {
+        return cmp;
+      }
+
+      if (s1 instanceof String && s2 instanceof String) {
+        cmp = Integer.compare(s1.hashCode(), s2.hashCode());
+        if (cmp != 0) {
+          return cmp;
+        }
+      }
+      // File paths inside a delete file normally have more identical chars at the beginning. For
+      // example, a typical
+      // path is like "s3:/bucket/db/table/data/partition/00000-0-[uuid]-00001.parquet".
+      // The uuid is where the difference starts. So it's faster to find the first diff backward.
+      for (int i = count - 1; i >= 0; i--) {
+        cmp = Character.compare(s1.charAt(i), s2.charAt(i));
+        if (cmp != 0) {
+          return cmp;
+        }
+      }
+      return 0;
+    }
+  }
 }
diff --git a/core/src/main/java/org/apache/iceberg/deletes/Deletes.java b/core/src/main/java/org/apache/iceberg/deletes/Deletes.java
index ff20ba53ff..cef57cd167 100644
--- a/core/src/main/java/org/apache/iceberg/deletes/Deletes.java
+++ b/core/src/main/java/org/apache/iceberg/deletes/Deletes.java
@@ -36,6 +36,7 @@ import org.apache.iceberg.io.FilterIterator;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
 import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.types.Comparators;
 import org.apache.iceberg.types.Types;
 import org.apache.iceberg.util.CharSequenceMap;
 import org.apache.iceberg.util.Filter;
@@ -398,33 +399,9 @@ public class Deletes {
 
     @Override
     protected boolean shouldKeep(T posDelete) {
-      return charSeqEquals(dataLocation, (CharSequence) FILENAME_ACCESSOR.get(posDelete));
-    }
-
-    private boolean charSeqEquals(CharSequence s1, CharSequence s2) {
-      if (s1 == s2) {
-        return true;
-      }
-
-      int count = s1.length();
-      if (count != s2.length()) {
-        return false;
-      }
-
-      if (s1 instanceof String && s2 instanceof String && s1.hashCode() != s2.hashCode()) {
-        return false;
-      }
-
-      // File paths inside a delete file normally have more identical chars at the beginning. For
-      // example, a typical
-      // path is like "s3:/bucket/db/table/data/partition/00000-0-[uuid]-00001.parquet".
-      // The uuid is where the difference starts. So it's faster to find the first diff backward.
-      for (int i = count - 1; i >= 0; i--) {
-        if (s1.charAt(i) != s2.charAt(i)) {
-          return false;
-        }
-      }
-      return true;
+      return Comparators.filePath()
+              .compare(dataLocation, (CharSequence) FILENAME_ACCESSOR.get(posDelete))
+          == 0;
     }
   }
 }

Reply via email to