This is an automated email from the ASF dual-hosted git repository. haonan pushed a commit to branch iotdb in repository https://gitbox.apache.org/repos/asf/tsfile.git
commit 014f1ae275b984ace30c96c9100fdd30c57af5b6 Author: Zhihao Shen <[email protected]> AuthorDate: Thu Jul 18 09:23:59 2024 +0800 Optimize time and value in filter by TsFile statistics. --- .../read/filter/operator/TimeFilterOperators.java | 32 ++++++- .../read/filter/operator/ValueFilterOperators.java | 101 ++++++++++++++++++++- .../tsfile/read/filter/StatisticsFilterTest.java | 68 ++++++++++++++ 3 files changed, 194 insertions(+), 7 deletions(-) diff --git a/java/tsfile/src/main/java/org/apache/tsfile/read/filter/operator/TimeFilterOperators.java b/java/tsfile/src/main/java/org/apache/tsfile/read/filter/operator/TimeFilterOperators.java index 2c3214b4..ed0d0c57 100644 --- a/java/tsfile/src/main/java/org/apache/tsfile/read/filter/operator/TimeFilterOperators.java +++ b/java/tsfile/src/main/java/org/apache/tsfile/read/filter/operator/TimeFilterOperators.java @@ -20,7 +20,6 @@ package org.apache.tsfile.read.filter.operator; import org.apache.tsfile.read.common.TimeRange; -import org.apache.tsfile.read.filter.basic.DisableStatisticsTimeFilter; import org.apache.tsfile.read.filter.basic.Filter; import org.apache.tsfile.read.filter.basic.OperatorType; import org.apache.tsfile.read.filter.basic.TimeFilter; @@ -492,16 +491,22 @@ public final class TimeFilterOperators { } // base class for TimeIn, TimeNotIn - abstract static class TimeColumnSetFilter extends DisableStatisticsTimeFilter { + abstract static class TimeColumnSetFilter extends TimeFilter { protected final Set<Long> candidates; + protected final Long candidatesMin; + protected final Long candidatesMax; protected TimeColumnSetFilter(Set<Long> candidates) { this.candidates = Objects.requireNonNull(candidates, "candidates cannot be null"); + this.candidatesMin = Collections.min(candidates); + this.candidatesMax = Collections.max(candidates); } protected TimeColumnSetFilter(ByteBuffer buffer) { this.candidates = ReadWriteIOUtils.readLongSet(buffer); + this.candidatesMin = Collections.min(candidates); + this.candidatesMax = Collections.max(candidates); } @Override @@ -548,6 +553,17 @@ public final class TimeFilterOperators { return candidates.contains(time); } + @Override + public boolean satisfyStartEndTime(long startTime, long endTime) { + return startTime <= candidatesMax && endTime >= candidatesMin; + } + + @Override + public boolean containStartEndTime(long startTime, long endTime) { + // Make `allSatisfy` always return false + return false; + } + @Override public List<TimeRange> getTimeRanges() { List<TimeRange> res = new ArrayList<>(); @@ -583,6 +599,18 @@ public final class TimeFilterOperators { return !candidates.contains(time); } + @Override + public boolean satisfyStartEndTime(long startTime, long endTime) { + // Make `canSkip` always return false + return true; + } + + @Override + public boolean containStartEndTime(long startTime, long endTime) { + // Make `allSatisfy` always return false + return false; + } + @Override public List<TimeRange> getTimeRanges() { return Collections.singletonList(new TimeRange(Long.MIN_VALUE, Long.MAX_VALUE)); diff --git a/java/tsfile/src/main/java/org/apache/tsfile/read/filter/operator/ValueFilterOperators.java b/java/tsfile/src/main/java/org/apache/tsfile/read/filter/operator/ValueFilterOperators.java index c2b7284c..905c364b 100644 --- a/java/tsfile/src/main/java/org/apache/tsfile/read/filter/operator/ValueFilterOperators.java +++ b/java/tsfile/src/main/java/org/apache/tsfile/read/filter/operator/ValueFilterOperators.java @@ -35,6 +35,7 @@ import java.io.DataOutputStream; import java.io.IOException; import java.io.Serializable; import java.nio.ByteBuffer; +import java.util.Collections; import java.util.Objects; import java.util.Optional; import java.util.Set; @@ -706,18 +707,24 @@ public final class ValueFilterOperators { } // base class for ValueIn, ValueNotIn - abstract static class ValueColumnSetFilter<T> extends DisableStatisticsValueFilter { + abstract static class ValueColumnSetFilter<T extends Comparable<T>> extends ValueFilter { protected final Set<T> candidates; + protected final T candidatesMin; + protected final T candidatesMax; protected ValueColumnSetFilter(int measurementIndex, Set<T> candidates) { super(measurementIndex); this.candidates = Objects.requireNonNull(candidates, "candidates cannot be null"); + this.candidatesMin = candidates.size() != 0 ? Collections.min(candidates) : null; + this.candidatesMax = candidates.size() != 0 ? Collections.max(candidates) : null; } protected ValueColumnSetFilter(ByteBuffer buffer) { super(buffer); candidates = ReadWriteIOUtils.readObjectSet(buffer); + this.candidatesMin = candidates.size() != 0 ? Collections.min(candidates) : null; + this.candidatesMax = candidates.size() != 0 ? Collections.max(candidates) : null; } @Override @@ -753,8 +760,7 @@ public final class ValueFilterOperators { } } - public static final class ValueIn<T> extends ValueColumnSetFilter<T> { - + public static final class ValueIn<T extends Comparable<T>> extends ValueColumnSetFilter<T> { public ValueIn(int measurementIndex, Set<T> candidates) { super(measurementIndex, candidates); } @@ -765,7 +771,78 @@ public final class ValueFilterOperators { @Override public boolean valueSatisfy(Object value) { - return candidates.contains(value); + return candidates.contains((T) value); + } + + @Override + public boolean canSkip(IMetadata metadata) { + Optional<Statistics<? extends Serializable>> statistics = + metadata.getMeasurementStatistics(measurementIndex); + + // All values are null, but candidates do not contain null + if ((!statistics.isPresent() || isAllNulls(statistics.get())) && candidates.size() != 0) { + return true; + } + + // All values are not null, but candidate is one null value + if (!metadata.hasNullValue(measurementIndex) && candidates.size() == 0) { + return true; + } + + if (statistics.isPresent()) { + T valuesMin = (T) statistics.get().getMinValue(); + T valuesMax = (T) statistics.get().getMaxValue(); + // All values are same + if (valuesMin.compareTo(valuesMax) == 0) { + return !candidates.contains(valuesMin); + } else { + if (candidates.size() != 0) { + // All values are less than min, or greater than max + if (candidatesMin.compareTo(valuesMax) > 0) { + return true; + } + if (candidatesMax.compareTo(valuesMin) < 0) { + return true; + } + } + } + } + + return false; + } + + @Override + protected boolean canSkip(Statistics<? extends Serializable> statistics) { + throw new NotImplementedException(); + } + + @Override + public boolean allSatisfy(IMetadata metadata) { + Optional<Statistics<? extends Serializable>> statistics = + metadata.getMeasurementStatistics(measurementIndex); + + // All values are null, and candidate contains null + // Note null value cannot be added to set + if ((!statistics.isPresent() || isAllNulls(statistics.get())) && candidates.size() == 0) { + return true; + } + + // All values are same + if (statistics.isPresent()) { + T valuesMin = (T) statistics.get().getMinValue(); + T valuesMax = (T) statistics.get().getMaxValue(); + // All values are same + if (valuesMin.compareTo(valuesMax) == 0) { + return candidates.contains(valuesMin); + } + } + + return false; + } + + @Override + protected boolean allSatisfy(Statistics<? extends Serializable> statistics) { + throw new NotImplementedException(); } @Override @@ -777,9 +854,13 @@ public final class ValueFilterOperators { public OperatorType getOperatorType() { return OperatorType.VALUE_IN; } + + private boolean isAllNulls(Statistics<? extends Serializable> statistics) { + return statistics.getCount() == 0; + } } - public static final class ValueNotIn<T> extends ValueColumnSetFilter<T> { + public static final class ValueNotIn<T extends Comparable<T>> extends ValueColumnSetFilter<T> { public ValueNotIn(int measurementIndex, Set<T> candidates) { super(measurementIndex, candidates); @@ -794,6 +875,16 @@ public final class ValueFilterOperators { return !candidates.contains(value); } + @Override + protected boolean canSkip(Statistics<? extends Serializable> statistics) { + return false; + } + + @Override + protected boolean allSatisfy(Statistics<? extends Serializable> statistics) { + return false; + } + @Override public Filter reverse() { return new ValueIn<>(measurementIndex, candidates); diff --git a/java/tsfile/src/test/java/org/apache/tsfile/read/filter/StatisticsFilterTest.java b/java/tsfile/src/test/java/org/apache/tsfile/read/filter/StatisticsFilterTest.java index 2acf3e35..d967756f 100644 --- a/java/tsfile/src/test/java/org/apache/tsfile/read/filter/StatisticsFilterTest.java +++ b/java/tsfile/src/test/java/org/apache/tsfile/read/filter/StatisticsFilterTest.java @@ -30,6 +30,9 @@ import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import java.util.HashSet; +import java.util.Set; + import static org.apache.tsfile.read.filter.FilterTestUtil.newAlignedMetadata; import static org.apache.tsfile.read.filter.FilterTestUtil.newMetadata; import static org.apache.tsfile.read.filter.operator.Not.CONTAIN_NOT_ERR_MSG; @@ -280,4 +283,69 @@ public class StatisticsFilterTest { Assert.assertFalse(valueIsNotNull.allSatisfy(alignedMetadata2)); Assert.assertFalse(valueIsNotNull.allSatisfy(alignedMetadata3)); } + + @Test + public void testIn() { + // Non-null candidates + Set<Long> candidates = new HashSet<>(); + candidates.add(1L); + candidates.add(10L); + + // Time + Filter timeIn = TimeFilterApi.in(candidates); + Assert.assertFalse(timeIn.canSkip(metadata1)); + Assert.assertTrue(timeIn.canSkip(metadata2)); + Assert.assertFalse(timeIn.canSkip(metadata3)); + Assert.assertFalse(timeIn.allSatisfy(metadata1)); + Assert.assertFalse(timeIn.allSatisfy(metadata2)); + Assert.assertFalse(timeIn.allSatisfy(metadata3)); + // Value + Filter valueIn = ValueFilterApi.in(candidates); + Assert.assertFalse(valueIn.canSkip(metadata1)); + Assert.assertTrue(valueIn.canSkip(metadata2)); + Assert.assertFalse(valueIn.canSkip(metadata3)); + Assert.assertTrue(valueIn.canSkip(alignedMetadata3)); + Assert.assertFalse(valueIn.allSatisfy(metadata1)); + Assert.assertFalse(valueIn.allSatisfy(metadata2)); + Assert.assertTrue(valueIn.allSatisfy(metadata3)); + Assert.assertFalse(valueIn.allSatisfy(alignedMetadata3)); + + // Null candidate + Set<Long> nullCandidate = new HashSet<>(); + candidates.add(null); + + // Time cannot be null + Filter valueIn2 = ValueFilterApi.in(nullCandidate); + Assert.assertTrue(valueIn2.canSkip(metadata1)); + Assert.assertTrue(valueIn2.canSkip(metadata2)); + Assert.assertTrue(valueIn2.canSkip(metadata3)); + Assert.assertFalse(valueIn2.canSkip(alignedMetadata3)); + Assert.assertFalse(valueIn2.allSatisfy(metadata1)); + Assert.assertFalse(valueIn2.allSatisfy(metadata2)); + Assert.assertFalse(valueIn2.allSatisfy(metadata3)); + Assert.assertTrue(valueIn2.allSatisfy(alignedMetadata3)); + } + + @Test + public void testNotIn() { + Set<Long> candidates = new HashSet<>(); + candidates.add(1L); + candidates.add(10L); + + Filter timeNotIn = TimeFilterApi.notIn(candidates); + Assert.assertFalse(timeNotIn.canSkip(metadata1)); + Assert.assertFalse(timeNotIn.canSkip(metadata2)); + Assert.assertFalse(timeNotIn.canSkip(metadata3)); + Assert.assertFalse(timeNotIn.allSatisfy(metadata1)); + Assert.assertFalse(timeNotIn.allSatisfy(metadata2)); + Assert.assertFalse(timeNotIn.allSatisfy(metadata3)); + + Filter valueNotIn = ValueFilterApi.notIn(candidates); + Assert.assertFalse(valueNotIn.canSkip(metadata1)); + Assert.assertFalse(valueNotIn.canSkip(metadata2)); + Assert.assertFalse(valueNotIn.canSkip(metadata3)); + Assert.assertFalse(valueNotIn.allSatisfy(metadata1)); + Assert.assertFalse(valueNotIn.allSatisfy(metadata2)); + Assert.assertFalse(valueNotIn.allSatisfy(metadata3)); + } }
