This is an automated email from the ASF dual-hosted git repository.
lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-paimon.git
The following commit(s) were added to refs/heads/master by this push:
new a279d77ea [core] Fix bugs in TruncateFieldStatsCollector
a279d77ea is described below
commit a279d77ea3c3e0ef1162a9c5eef7fbac86e1f7d6
Author: JingsongLi <[email protected]>
AuthorDate: Wed Jul 5 00:04:27 2023 +0800
[core] Fix bugs in TruncateFieldStatsCollector
---
.../statistics/TruncateFieldStatsCollector.java | 39 ++++++++++++++------
.../paimon/statistics/FieldStatsCollectorTest.java | 43 ++++++++++++++++++++++
2 files changed, 70 insertions(+), 12 deletions(-)
diff --git
a/paimon-common/src/main/java/org/apache/paimon/statistics/TruncateFieldStatsCollector.java
b/paimon-common/src/main/java/org/apache/paimon/statistics/TruncateFieldStatsCollector.java
index bae77bed6..a8ff78133 100644
---
a/paimon-common/src/main/java/org/apache/paimon/statistics/TruncateFieldStatsCollector.java
+++
b/paimon-common/src/main/java/org/apache/paimon/statistics/TruncateFieldStatsCollector.java
@@ -25,8 +25,6 @@ import org.apache.paimon.data.serializer.Serializer;
import org.apache.paimon.format.FieldStats;
import org.apache.paimon.utils.Preconditions;
-import javax.annotation.Nullable;
-
import java.util.regex.Pattern;
/**
@@ -39,6 +37,8 @@ public class TruncateFieldStatsCollector extends
AbstractFieldStatsCollector {
private final int length;
+ private boolean failed = false;
+
public TruncateFieldStatsCollector(int length) {
Preconditions.checkArgument(length > 0, "Truncate length should larger
than zero.");
this.length = length;
@@ -62,35 +62,49 @@ public class TruncateFieldStatsCollector extends
AbstractFieldStatsCollector {
Comparable<Object> c = (Comparable<Object>) field;
if (minValue == null || c.compareTo(minValue) < 0) {
- minValue = truncateMin(field, fieldSerializer);
+ minValue = fieldSerializer.copy(truncateMin(field));
}
if (maxValue == null || c.compareTo(maxValue) > 0) {
- maxValue = truncateMax(field, fieldSerializer);
+ Object max = truncateMax(field);
+ // may fail
+ if (max != null) {
+ maxValue = fieldSerializer.copy(truncateMax(field));
+ }
}
}
@Override
public FieldStats convert(FieldStats source) {
- return new FieldStats(
- truncateMin(source.minValue(), null),
- truncateMax(source.maxValue(), null),
- source.nullCount());
+ Object min = truncateMin(source.minValue());
+ Object max = truncateMax(source.maxValue());
+ if (max == null) {
+ return new FieldStats(null, null, source.nullCount());
+ }
+ return new FieldStats(min, max, source.nullCount());
+ }
+
+ @Override
+ public FieldStats result() {
+ if (failed) {
+ return new FieldStats(null, null, nullCount);
+ }
+ return new FieldStats(minValue, maxValue, nullCount);
}
/** @return a truncated value less or equal than the old value. */
- private Object truncateMin(Object field, @Nullable Serializer<Object>
fieldSerializer) {
+ private Object truncateMin(Object field) {
if (field == null) {
return null;
}
if (field instanceof BinaryString) {
return ((BinaryString) field).substring(0, length);
} else {
- return fieldSerializer == null ? field :
fieldSerializer.copy(field);
+ return field;
}
}
/** @return a value greater or equal than the old value. */
- private Object truncateMax(Object field, @Nullable Serializer<Object>
fieldSerializer) {
+ private Object truncateMax(Object field) {
if (field == null) {
return null;
}
@@ -119,9 +133,10 @@ public class TruncateFieldStatsCollector extends
AbstractFieldStatsCollector {
return
BinaryString.fromString(truncatedStringBuilder.toString());
}
}
+ failed = true;
return null; // Cannot find a valid upper bound
} else {
- return fieldSerializer == null ? field :
fieldSerializer.copy(field);
+ return field;
}
}
}
diff --git
a/paimon-common/src/test/java/org/apache/paimon/statistics/FieldStatsCollectorTest.java
b/paimon-common/src/test/java/org/apache/paimon/statistics/FieldStatsCollectorTest.java
index 80121d805..5285f0691 100644
---
a/paimon-common/src/test/java/org/apache/paimon/statistics/FieldStatsCollectorTest.java
+++
b/paimon-common/src/test/java/org/apache/paimon/statistics/FieldStatsCollectorTest.java
@@ -22,6 +22,7 @@ package org.apache.paimon.statistics;
import org.apache.paimon.data.BinaryString;
import org.apache.paimon.data.GenericRow;
+import org.apache.paimon.data.serializer.BinaryStringSerializer;
import org.apache.paimon.data.serializer.InternalSerializers;
import org.apache.paimon.data.serializer.Serializer;
import org.apache.paimon.format.FieldStats;
@@ -39,6 +40,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
+import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy;
/** Test for {@link FieldStatsCollector}. */
@@ -166,6 +168,47 @@ public class FieldStatsCollectorTest {
Assertions.assertEquals(BinaryString.fromString("\uD83E\uDD19"),
fieldStats.maxValue());
}
+ @Test
+ public void testTruncateCopied() {
+ TruncateFieldStatsCollector collector = new
TruncateFieldStatsCollector(16);
+ BinaryString str = BinaryString.fromString("str");
+ collector.collect(str, (Serializer) BinaryStringSerializer.INSTANCE);
+ FieldStats stats = collector.result();
+ assertThat(stats.minValue()).isNotSameAs(str);
+ assertThat(stats.maxValue()).isNotSameAs(str);
+ }
+
+ @Test
+ public void testFullCopied() {
+ FullFieldStatsCollector collector = new FullFieldStatsCollector();
+ BinaryString str = BinaryString.fromString("str");
+ collector.collect(str, (Serializer) BinaryStringSerializer.INSTANCE);
+ FieldStats stats = collector.result();
+ assertThat(stats.minValue()).isNotSameAs(str);
+ assertThat(stats.maxValue()).isNotSameAs(str);
+ }
+
+ @Test
+ public void testTruncateFail() {
+ TruncateFieldStatsCollector collector = new
TruncateFieldStatsCollector(3);
+
+ StringBuilder builder = new StringBuilder();
+ builder.appendCodePoint(Character.MAX_CODE_POINT);
+ builder.appendCodePoint(Character.MAX_CODE_POINT);
+ builder.appendCodePoint(Character.MAX_CODE_POINT);
+ builder.appendCodePoint(Character.MAX_CODE_POINT);
+ BinaryString str = BinaryString.fromString(builder.toString());
+
+ collector.collect(str, (Serializer) BinaryStringSerializer.INSTANCE);
+ FieldStats stats = collector.result();
+ assertThat(stats.minValue()).isNull();
+ assertThat(stats.maxValue()).isNull();
+
+ stats = collector.convert(new FieldStats(str, str, 0L));
+ assertThat(stats.minValue()).isNull();
+ assertThat(stats.maxValue()).isNull();
+ }
+
private void check(
List<GenericRow> rows,
int column,