This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/master by this push:
new 60b03ef ORC-705: Predicate evaluation should take into account writer
calendar (#588)
60b03ef is described below
commit 60b03ef5c8ed175be715b90763f32aa6ca21bc97
Author: Panagiotis Garefalakis <[email protected]>
AuthorDate: Wed Dec 30 08:18:12 2020 +0200
ORC-705: Predicate evaluation should take into account writer calendar
(#588)
### What changes were proposed in this pull request?
RecordReaderImpl should pass down the writer calendar info
(writerUsedProlepticGregorian) when evaluating predicates to make sure column
stats are properly deserialized (affects TimestampStatistics).
### Why are the changes needed?
Correct evaluation of predicates with Timestamps
### How was this patch tested?
TestRecordReaderImpl.testPredEvalTimestampStatsDiffWriter
---
.../java/org/apache/orc/impl/RecordReaderImpl.java | 10 ++--
.../org/apache/orc/impl/TestRecordReaderImpl.java | 54 +++++++++++++++++++++-
2 files changed, 58 insertions(+), 6 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index 781e13c..c05a4e7 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -480,19 +480,20 @@ public class RecordReaderImpl implements RecordReader {
OrcFile.WriterVersion writerVersion,
TypeDescription type) {
return evaluatePredicateProto(statsProto, predicate, kind, encoding,
bloomFilter,
- writerVersion, type, false);
+ writerVersion, type, true, false);
}
/**
* Evaluate a predicate with respect to the statistics from the column
* that is referenced in the predicate.
* Includes option to specify if timestamp column stats values
- * should be in UTC.
+ * should be in UTC and if the file writer used proleptic Gregorian calendar.
* @param statsProto the statistics for the column mentioned in the predicate
* @param predicate the leaf predicate we need to evaluation
* @param bloomFilter the bloom filter
* @param writerVersion the version of software that wrote the file
* @param type what is the kind of this column
+ * @param writerUsedProlepticGregorian file written using the proleptic
Gregorian calendar
* @param useUTCTimestamp
* @return the set of truth values that may be returned for the given
* predicate.
@@ -505,8 +506,9 @@ public class RecordReaderImpl implements RecordReader {
OrcProto.BloomFilter bloomFilter,
OrcFile.WriterVersion writerVersion,
TypeDescription type,
+ boolean
writerUsedProlepticGregorian,
boolean useUTCTimestamp) {
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, statsProto);
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, statsProto,
writerUsedProlepticGregorian, true);
ValueRange range = getValueRange(cs, predicate, useUTCTimestamp);
// files written before ORC-135 stores timestamp wrt to local timezone
causing issues with PPD.
@@ -1031,7 +1033,7 @@ public class RecordReaderImpl implements RecordReader {
predicate, bfk, encodings.get(columnIx), bf,
writerVersion, evolution.getFileSchema().
findSubtype(columnIx),
- useUTCTimestamp);
+ writerUsedProlepticGregorian, useUTCTimestamp);
} catch (Exception e) {
exceptionCount[pred] += 1;
if (e instanceof SargCastException) {
diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
index 5350efb..4375d3a 100644
--- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
@@ -593,7 +593,22 @@ public class TestRecordReaderImpl {
return RecordReaderImpl.evaluatePredicateProto(stats, predicate, null,
encoding, null,
include135 ? OrcFile.WriterVersion.ORC_135:
OrcFile.WriterVersion.ORC_101,
- TypeDescription.createTimestamp(), useUTCTimestamp);
+ TypeDescription.createTimestamp(), true, useUTCTimestamp);
+ }
+
+ static TruthValue
evaluateTimestampWithWriterCalendar(OrcProto.ColumnStatistics stats,
+ PredicateLeaf
predicate,
+ boolean include135,
+ boolean
writerUsedProlepticGregorian,
+ boolean
useUTCTimestamp) {
+ OrcProto.ColumnEncoding encoding =
+ OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT)
+ .build();
+ return RecordReaderImpl.evaluatePredicateProto(stats, predicate, null,
+ encoding, null,
+ include135 ? OrcFile.WriterVersion.ORC_135:
OrcFile.WriterVersion.ORC_101,
+ TypeDescription.createTimestamp(), writerUsedProlepticGregorian,
useUTCTimestamp);
}
static TruthValue evaluateTimestampBloomfilter(OrcProto.ColumnStatistics
stats,
@@ -616,7 +631,7 @@ public class TestRecordReaderImpl {
BloomFilterIO.serialize(builder, bloom);
return RecordReaderImpl.evaluatePredicateProto(stats, predicate, kind,
encoding.build(), builder.build(), version,
- TypeDescription.createTimestamp(), useUTCTimestamp);
+ TypeDescription.createTimestamp(), true, useUTCTimestamp);
}
@Test
@@ -896,6 +911,41 @@ public class TestRecordReaderImpl {
}
@Test
+ public void testPredEvalTimestampStatsDiffWriter() {
+ // Proleptic - NoUTC
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP,
"x",
+ Timestamp.valueOf("1017-01-01 00:00:00"), null);
+ assertEquals(TruthValue.YES_NO,
+ evaluateTimestampWithWriterCalendar(createTimestampStats("1017-01-01
00:00:00", "1017-01-01 00:00:00"),
+ pred, true, true, false));
+
+ // NoProleptic - NoUTC -> 1016-12-26 00:00:00.0
+ long predTime =
DateUtils.convertTimeToProleptic(Timestamp.valueOf("1017-01-01
00:00:00").getTime(), false);
+ PredicateLeaf pred2 = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP,
"x", new Timestamp(predTime), null);
+ assertEquals(TruthValue.YES_NO,
+ evaluateTimestampWithWriterCalendar(createTimestampStats("1017-01-01
00:00:00", "1017-01-01 00:00:00"),
+ pred2, true, false, false));
+
+ // NoProleptic - UTC -> 1016-12-25 16:00:00.0
+ predTime = DateUtils.convertTimeToProleptic(getUtcTimestamp("1017-01-01
00:00:00"), true);
+ PredicateLeaf pred3 = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP,
"x", new Timestamp(predTime), null);
+ assertEquals(TruthValue.YES_NO,
+ evaluateTimestampWithWriterCalendar(createTimestampStats("1017-01-01
00:00:00", "1017-01-01 00:00:00"),
+ pred3, true, false, true));
+
+ // Proleptic - UTC -> 1016-12-31 16:00:00.0
+ predTime = getUtcTimestamp("1017-01-01 00:00:00");
+ PredicateLeaf pred4 =
createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(predTime), null);
+ assertEquals(TruthValue.YES_NO,
+ evaluateTimestampWithWriterCalendar(createTimestampStats("1017-01-01
00:00:00", "1017-01-01 00:00:00"),
+ pred4, true, true, true));
+ }
+
+ @Test
public void testPredEvalWithTimestampStats() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP,