This is an automated email from the ASF dual-hosted git repository.

findepi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/main by this push:
     new 122c4408b1 API: Reduce 'Scanning table' log verbosity for long list of strings (#14757)
122c4408b1 is described below

commit 122c4408b101bcf846440348040caef016e16626
Author: Raunaq Morarka <[email protected]>
AuthorDate: Wed Dec 10 17:56:27 2025 +0530

    API: Reduce 'Scanning table' log verbosity for long list of strings (#14757)
---
 .../org/apache/iceberg/expressions/ExpressionUtil.java  | 17 ++++++++---------
 .../apache/iceberg/expressions/TestExpressionUtil.java  | 10 ++++++++++
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java
index d3dc00d914..9bb2b71343 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java
@@ -24,7 +24,6 @@ import java.time.ZoneOffset;
 import java.time.temporal.ChronoUnit;
 import java.util.List;
 import java.util.Locale;
-import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.function.Function;
 import java.util.regex.Pattern;
@@ -69,7 +68,6 @@ public class ExpressionUtil {
           
"\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}(:\\d{2}(.\\d{7,9})?)?([-+]\\d{2}:\\d{2}|Z)");
 
   static final int LONG_IN_PREDICATE_ABBREVIATION_THRESHOLD = 10;
-  private static final int LONG_IN_PREDICATE_ABBREVIATION_MIN_GAIN = 5;
 
   private ExpressionUtil() {}
 
@@ -502,19 +500,20 @@ public class ExpressionUtil {
 
   private static List<String> abbreviateValues(List<String> sanitizedValues) {
     if (sanitizedValues.size() >= LONG_IN_PREDICATE_ABBREVIATION_THRESHOLD) {
-      Set<String> distinctValues = ImmutableSet.copyOf(sanitizedValues);
-      if (distinctValues.size()
-          <= sanitizedValues.size() - LONG_IN_PREDICATE_ABBREVIATION_MIN_GAIN) 
{
-        List<String> abbreviatedList = 
Lists.newArrayListWithCapacity(distinctValues.size() + 1);
-        abbreviatedList.addAll(distinctValues);
+      List<String> distinctValues = 
ImmutableSet.copyOf(sanitizedValues).asList();
+      int abbreviatedSize =
+          Math.min(distinctValues.size(), 
LONG_IN_PREDICATE_ABBREVIATION_THRESHOLD);
+      List<String> abbreviatedList = 
Lists.newArrayListWithCapacity(abbreviatedSize + 1);
+      abbreviatedList.addAll(distinctValues.subList(0, abbreviatedSize));
+      if (abbreviatedSize < sanitizedValues.size()) {
         abbreviatedList.add(
             String.format(
                 Locale.ROOT,
                 "... (%d values hidden, %d in total)",
-                sanitizedValues.size() - distinctValues.size(),
+                sanitizedValues.size() - abbreviatedSize,
                 sanitizedValues.size()));
-        return abbreviatedList;
       }
+      return abbreviatedList;
     }
     return sanitizedValues;
   }
diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java
index ca08951b1f..d9fe26eacc 100644
--- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java
+++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java
@@ -115,6 +115,16 @@ public class TestExpressionUtil {
         .as("Sanitized string should be abbreviated")
         .isEqualTo("test IN ((2-digit-int), (3-digit-int), ... (8 values 
hidden, 10 in total))");
 
+    Object[] tooLongStringsList =
+        IntStream.range(0, 
ExpressionUtil.LONG_IN_PREDICATE_ABBREVIATION_THRESHOLD + 5)
+            .mapToObj(i -> "string_" + i)
+            .toArray();
+
+    assertThat(ExpressionUtil.toSanitizedString(Expressions.in("test", 
tooLongStringsList)))
+        .as("Sanitized string should be abbreviated")
+        .isEqualTo(
+            "test IN ((hash-14128790), (hash-1056a62b), (hash-22fd6340), 
(hash-3f9d20e4), (hash-136200f0), (hash-25fc9033), (hash-681d31e2), 
(hash-6c1796d4), (hash-382d143e), (hash-272f4e5b), ... (5 values hidden, 15 in 
total))");
+
     // The sanitization resulting in an expression tree does not abbreviate
     List<String> expectedValues = Lists.newArrayList();
     expectedValues.addAll(Collections.nCopies(5, "(2-digit-int)"));

Reply via email to