This is an automated email from the ASF dual-hosted git repository.
omalley pushed a commit to branch branch-1.6
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-1.6 by this push:
new 865ad54 ORC-611: Support TS Min/max statistics with ns precision
865ad54 is described below
commit 865ad5499332ebf90b482f4e0465c6c26694c823
Author: Panos Garefalakis <[email protected]>
AuthorDate: Mon Mar 16 16:18:29 2020 +0000
ORC-611: Support TS Min/max statistics with ns precision
Fixes #497
Change-Id: I06bad2cfe438502035c20393ca325decf9d1c4f1
Signed-off-by: Owen O'Malley <[email protected]>
---
.../org/apache/orc/TimestampColumnStatistics.java | 2 +-
.../org/apache/orc/impl/ColumnStatisticsImpl.java | 141 ++++++++++----
.../orc/impl/writer/TimestampTreeWriter.java | 4 +-
.../test/org/apache/orc/TestColumnStatistics.java | 134 +++++++++++++
.../test/org/apache/orc/TestOrcTimestampPPD.java | 208 +++++++++++++++++++++
.../src/test/org/apache/orc/TestVectorOrcFile.java | 7 +-
.../apache/orc/impl/TestColumnStatisticsImpl.java | 55 +++++-
proto/orc_proto.proto | 3 +
8 files changed, 502 insertions(+), 52 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/TimestampColumnStatistics.java
b/java/core/src/java/org/apache/orc/TimestampColumnStatistics.java
index b095a68..c5abd42 100644
--- a/java/core/src/java/org/apache/orc/TimestampColumnStatistics.java
+++ b/java/core/src/java/org/apache/orc/TimestampColumnStatistics.java
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
index 587e33b..eb37a82 100644
--- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
@@ -1647,8 +1647,14 @@ public class ColumnStatisticsImpl implements
ColumnStatistics {
private static class TimestampStatisticsImpl extends ColumnStatisticsImpl
implements TimestampColumnStatistics {
- private Long minimum = null;
- private Long maximum = null;
+ private static final int DEFAULT_MIN_NANOS = 000_000;
+ private static final int DEFAULT_MAX_NANOS = 999_999;
+
+ private long minimum = Long.MAX_VALUE;
+ private long maximum = Long.MIN_VALUE;
+
+ private int minNanos = DEFAULT_MIN_NANOS;
+ private int maxNanos = DEFAULT_MAX_NANOS;
TimestampStatisticsImpl() {
}
@@ -1679,31 +1685,51 @@ public class ColumnStatisticsImpl implements
ColumnStatistics {
minimum = DateUtils.convertTime(timestampStats.getMinimumUtc(),
writerUsedProlepticGregorian, convertToProlepticGregorian, true);
}
+ if (timestampStats.hasMaximumNanos()) {
+ maxNanos = timestampStats.getMaximumNanos() - 1;
+ }
+ if (timestampStats.hasMinimumNanos()) {
+ minNanos = timestampStats.getMinimumNanos() - 1;
+ }
}
@Override
public void reset() {
super.reset();
- minimum = null;
- maximum = null;
+ minimum = Long.MAX_VALUE;
+ maximum = Long.MIN_VALUE;
+ minNanos = DEFAULT_MIN_NANOS;
+ maxNanos = DEFAULT_MAX_NANOS;
}
@Override
public void updateTimestamp(Timestamp value) {
long millis = SerializationUtils.convertToUtc(TimeZone.getDefault(),
value.getTime());
- updateTimestamp(millis);
+ // keep only the last 6 digits (the sub-millisecond part) for ns precision
+ updateTimestamp(millis, value.getNanos() % 1_000_000);
}
@Override
- public void updateTimestamp(long value) {
- if (minimum == null) {
- minimum = value;
- maximum = value;
- } else if (minimum > value) {
+ public void updateTimestamp(long value, int nanos) {
+ if (minimum > maximum) {
minimum = value;
- } else if (maximum < value) {
maximum = value;
+ minNanos = nanos;
+ maxNanos = nanos;
+ } else {
+ if (minimum >= value) {
+ if (minimum > value || nanos < minNanos) {
+ minNanos = nanos;
+ }
+ minimum = value;
+ }
+ if (maximum <= value) {
+ if (maximum < value || nanos > maxNanos) {
+ maxNanos = nanos;
+ }
+ maximum = value;
+ }
}
}
@@ -1711,19 +1737,31 @@ public class ColumnStatisticsImpl implements
ColumnStatistics {
public void merge(ColumnStatisticsImpl other) {
if (other instanceof TimestampStatisticsImpl) {
TimestampStatisticsImpl timestampStats = (TimestampStatisticsImpl)
other;
- if (minimum == null) {
- minimum = timestampStats.minimum;
- maximum = timestampStats.maximum;
- } else if (timestampStats.minimum != null) {
- if (minimum > timestampStats.minimum) {
+ if (count == 0) {
+ if (timestampStats.count != 0) {
+ minimum = timestampStats.minimum;
+ maximum = timestampStats.maximum;
+ minNanos = timestampStats.minNanos;
+ maxNanos = timestampStats.maxNanos;
+ }
+ } else if (timestampStats.count != 0) {
+ if (minimum >= timestampStats.minimum) {
+ if (minimum > timestampStats.minimum ||
+ minNanos > timestampStats.minNanos) {
+ minNanos = timestampStats.minNanos;
+ }
minimum = timestampStats.minimum;
}
- if (maximum < timestampStats.maximum) {
+ if (maximum <= timestampStats.maximum) {
+ if (maximum < timestampStats.maximum ||
+ maxNanos < timestampStats.maxNanos) {
+ maxNanos = timestampStats.maxNanos;
+ }
maximum = timestampStats.maximum;
}
}
} else {
- if (isStatsExists() && minimum != null) {
+ if (isStatsExists() && count != 0) {
throw new IllegalArgumentException("Incompatible merging of
timestamp column statistics");
}
}
@@ -1735,9 +1773,15 @@ public class ColumnStatisticsImpl implements
ColumnStatistics {
OrcProto.ColumnStatistics.Builder result = super.serialize();
OrcProto.TimestampStatistics.Builder timestampStats =
OrcProto.TimestampStatistics
.newBuilder();
- if (getNumberOfValues() != 0 && minimum != null) {
+ if (getNumberOfValues() != 0) {
timestampStats.setMinimumUtc(minimum);
timestampStats.setMaximumUtc(maximum);
+ if (minNanos != DEFAULT_MIN_NANOS) {
+ timestampStats.setMinimumNanos(minNanos + 1);
+ }
+ if (maxNanos != DEFAULT_MAX_NANOS) {
+ timestampStats.setMaximumNanos(maxNanos + 1);
+ }
}
result.setTimestampStatistics(timestampStats);
return result;
@@ -1745,32 +1789,54 @@ public class ColumnStatisticsImpl implements
ColumnStatistics {
@Override
public Timestamp getMinimum() {
- return minimum == null ? null :
- new
Timestamp(SerializationUtils.convertFromUtc(TimeZone.getDefault(),
- minimum));
+ if (minimum > maximum) {
+ return null;
+ } else {
+ Timestamp ts = new Timestamp(SerializationUtils.
+ convertFromUtc(TimeZone.getDefault(), minimum));
+ ts.setNanos(ts.getNanos() + minNanos);
+ return ts;
+ }
}
@Override
public Timestamp getMaximum() {
- return maximum == null ? null :
- new
Timestamp(SerializationUtils.convertFromUtc(TimeZone.getDefault(),
- maximum));
+ if (minimum > maximum) {
+ return null;
+ } else {
+ Timestamp ts = new Timestamp(SerializationUtils.convertFromUtc(
+ TimeZone.getDefault(), maximum));
+ ts.setNanos(ts.getNanos() + maxNanos);
+ return ts;
+ }
}
@Override
public Timestamp getMinimumUTC() {
- return minimum == null ? null : new Timestamp(minimum);
+ if (minimum > maximum) {
+ return null;
+ } else {
+ Timestamp ts = new Timestamp(minimum);
+ ts.setNanos(ts.getNanos() + minNanos);
+ return ts;
+ }
}
@Override
public Timestamp getMaximumUTC() {
- return maximum == null ? null : new Timestamp(maximum);
+ if (minimum > maximum) {
+ return null;
+ } else {
+ Timestamp ts = new Timestamp(maximum);
+ ts.setNanos(ts.getNanos() + maxNanos);
+ return ts;
+ }
}
@Override
public String toString() {
StringBuilder buf = new StringBuilder(super.toString());
- if (minimum != null || maximum != null) {
+ if (minimum <= maximum) {
buf.append(" min: ");
buf.append(getMinimum());
buf.append(" max: ");
@@ -1793,21 +1859,15 @@ public class ColumnStatisticsImpl implements
ColumnStatistics {
TimestampStatisticsImpl that = (TimestampStatisticsImpl) o;
- if (minimum != null ? !minimum.equals(that.minimum) : that.minimum !=
null) {
- return false;
- }
- if (maximum != null ? !maximum.equals(that.maximum) : that.maximum !=
null) {
- return false;
- }
-
- return true;
+ return minimum == that.minimum && maximum == that.maximum &&
+ minNanos == that.minNanos && maxNanos == that.maxNanos;
}
@Override
public int hashCode() {
int result = super.hashCode();
- result = 31 * result + (minimum != null ? minimum.hashCode() : 0);
- result = 31 * result + (maximum != null ? maximum.hashCode() : 0);
+ result = 31 * result + (int) minimum;
+ result = 31 * result + (int) maximum;
return result;
}
}
@@ -1824,7 +1884,7 @@ public class ColumnStatisticsImpl implements
ColumnStatistics {
@Override
public void updateTimestamp(Timestamp value) {
- updateTimestamp(value.getTime());
+ updateTimestamp(value.getTime(), value.getNanos() % 1_000_000);
}
@Override
@@ -1934,7 +1994,8 @@ public class ColumnStatisticsImpl implements
ColumnStatistics {
throw new UnsupportedOperationException("Can't update timestamp");
}
- public void updateTimestamp(long value) {
+ // must be overridden by statistics classes that track timestamps
+ public void updateTimestamp(long value, int nanos) {
throw new UnsupportedOperationException("Can't update timestamp");
}
diff --git
a/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java
b/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java
index e23413f..dc4dfd2 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java
@@ -111,7 +111,7 @@ public class TimestampTreeWriter extends TreeWriterBase {
}
long utc = vec.isUTC() || alwaysUTC ?
millis : SerializationUtils.convertToUtc(localTimezone, millis);
- indexStatistics.updateTimestamp(utc);
+ indexStatistics.updateTimestamp(utc, newNanos % 1_000_000);
if (createBloomFilter) {
if (bloomFilter != null) {
bloomFilter.addLong(millis);
@@ -139,7 +139,7 @@ public class TimestampTreeWriter extends TreeWriterBase {
millis : SerializationUtils.convertToUtc(localTimezone, millis);
seconds.write(secs - epoch);
nanos.write(formatNanos(newNanos));
- indexStatistics.updateTimestamp(utc);
+ indexStatistics.updateTimestamp(utc, newNanos % 1_000_000);
if (createBloomFilter) {
if (bloomFilter != null) {
bloomFilter.addLong(millis);
diff --git a/java/core/src/test/org/apache/orc/TestColumnStatistics.java
b/java/core/src/test/org/apache/orc/TestColumnStatistics.java
index fe2700f..8cf9dee 100644
--- a/java/core/src/test/org/apache/orc/TestColumnStatistics.java
+++ b/java/core/src/test/org/apache/orc/TestColumnStatistics.java
@@ -442,6 +442,8 @@ public class TestColumnStatistics {
stats1.updateTimestamp(new Timestamp(100));
stats2.updateTimestamp(new Timestamp(1));
stats2.updateTimestamp(new Timestamp(1000));
+ stats1.increment(2);
+ stats2.increment(2);
stats1.merge(stats2);
TimestampColumnStatistics typed = (TimestampColumnStatistics) stats1;
assertEquals(1, typed.getMinimum().getTime());
@@ -449,6 +451,7 @@ public class TestColumnStatistics {
stats1.reset();
stats1.updateTimestamp(new Timestamp(-10));
stats1.updateTimestamp(new Timestamp(10000));
+ stats1.increment(2);
stats1.merge(stats2);
assertEquals(-10, typed.getMinimum().getTime());
assertEquals(10000, typed.getMaximum().getTime());
@@ -489,6 +492,7 @@ public class TestColumnStatistics {
stats1.reset();
stats1.updateTimestamp(parseTime(format, "1999-04-04 00:00:00"));
stats1.updateTimestamp(parseTime(format, "2009-03-08 12:00:00"));
+ stats1.increment(2);
stats1.merge(stats2);
assertEquals("1999-04-04 00:00:00.0", typed.getMinimum().toString());
assertEquals("2009-03-08 12:00:00.0", typed.getMaximum().toString());
@@ -505,6 +509,136 @@ public class TestColumnStatistics {
}
@Test
+ public void testTimestampNanoPrecision() {
+ TypeDescription schema = TypeDescription.createTimestamp();
+
+ TimeZone original = TimeZone.getDefault();
+ TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
+
+ ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+ stats1.updateTimestamp(Timestamp.valueOf( "2000-04-02 03:31:00.000"));
+ stats1.increment(1);
+ TimestampColumnStatistics typed = (TimestampColumnStatistics) stats1;
+
+ // Base case no nanos
+ assertEquals("2000-04-02 03:31:00.0", typed.getMinimum().toString());
+ assertEquals("2000-04-02 03:31:00.0", typed.getMaximum().toString());
+
+ // Add nano precision to min
+ stats1.updateTimestamp(Timestamp.valueOf( "2000-04-01 03:30:00.0005"));
+ stats1.increment(1);
+ assertEquals("2000-04-01 03:30:00.0005", typed.getMinimum().toString());
+ assertEquals("2000-04-02 03:31:00.0", typed.getMaximum().toString());
+
+ // Change max with precision
+ stats1.updateTimestamp(Timestamp.valueOf( "2000-04-04 03:30:00.0008"));
+ stats1.increment(1);
+ assertEquals("2000-04-01 03:30:00.0005", typed.getMinimum().toString());
+ assertEquals("2000-04-04 03:30:00.0008", typed.getMaximum().toString());
+
+ // Equal max (same millis) with nano diff
+ stats1.updateTimestamp(Timestamp.valueOf( "2000-04-04 03:30:00.0009"));
+ stats1.increment(1);
+ assertEquals("2000-04-01 03:30:00.0005", typed.getMinimum().toString());
+ assertEquals("2000-04-04 03:30:00.0009", typed.getMaximum().toString());
+
+ // Test serialisation/deserialisation
+ OrcProto.ColumnStatistics serial = stats1.serialize().build();
+ ColumnStatisticsImpl stats2 =
+ ColumnStatisticsImpl.deserialize(schema, serial);
+ TimestampColumnStatistics typed2 = (TimestampColumnStatistics) stats2;
+ assertEquals("2000-04-01 03:30:00.0005", typed2.getMinimum().toString());
+ assertEquals("2000-04-04 03:30:00.0009", typed2.getMaximum().toString());
+ assertEquals(4L, typed2.getNumberOfValues());
+ TimeZone.setDefault(original);
+ }
+
+ @Test
+ public void testTimestampNanoPrecisionMerge() {
+ TypeDescription schema = TypeDescription.createTimestamp();
+
+ TimeZone original = TimeZone.getDefault();
+ TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
+
+ ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+ ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+ stats1.updateTimestamp(Timestamp.valueOf("2000-04-02 03:30:00.0001"));
+ stats1.updateTimestamp(Timestamp.valueOf( "2000-04-02 01:30:00.0009"));
+ stats1.increment(2);
+
+ stats2.updateTimestamp(Timestamp.valueOf( "2000-04-02 01:30:00.00088"));
+ stats2.updateTimestamp(Timestamp.valueOf( "2000-04-02 03:30:00.00001"));
+ stats2.increment(2);
+
+ TimestampColumnStatistics typed = (TimestampColumnStatistics) stats1;
+ assertEquals("2000-04-02 01:30:00.0009", typed.getMinimum().toString());
+ assertEquals("2000-04-02 03:30:00.0001", typed.getMaximum().toString());
+
+ TimestampColumnStatistics typed2 = (TimestampColumnStatistics) stats2;
+ assertEquals("2000-04-02 03:30:00.00001", typed2.getMaximum().toString());
+ assertEquals("2000-04-02 01:30:00.00088", typed2.getMinimum().toString());
+
+ // make sure merge goes down to ns precision
+ stats1.merge(stats2);
+ assertEquals("2000-04-02 01:30:00.00088", typed.getMinimum().toString());
+ assertEquals("2000-04-02 03:30:00.0001", typed.getMaximum().toString());
+
+ stats1.reset();
+ assertEquals(null, typed.getMinimum());
+ assertEquals(null, typed.getMaximum());
+
+ stats1.updateTimestamp(Timestamp.valueOf( "1999-04-04 00:00:00.000231"));
+ stats1.updateTimestamp(Timestamp.valueOf( "2009-03-08 12:00:00.000654"));
+ stats1.increment(2);
+
+ stats1.merge(stats2);
+ assertEquals("1999-04-04 00:00:00.000231", typed.getMinimum().toString());
+ assertEquals("2009-03-08 12:00:00.000654", typed.getMaximum().toString());
+ TimeZone.setDefault(original);
+ }
+
+ @Test
+ public void testNegativeTimestampNanoPrecision() {
+ TypeDescription schema = TypeDescription.createTimestamp();
+
+ TimeZone original = TimeZone.getDefault();
+ TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
+
+ ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+ ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+ stats1.updateTimestamp(Timestamp.valueOf("1960-04-02 03:30:00.0001"));
+ stats1.updateTimestamp(Timestamp.valueOf( "1969-12-31 16:00:00.0009"));
+ stats1.increment(2);
+
+ stats2.updateTimestamp(Timestamp.valueOf( "1962-04-02 01:30:00.00088"));
+ stats2.updateTimestamp(Timestamp.valueOf( "1969-12-31 16:00:00.00001"));
+ stats2.increment(2);
+
+ stats1.merge(stats2);
+
+ TimestampColumnStatistics typed = (TimestampColumnStatistics) stats1;
+ assertEquals("1960-04-02 03:30:00.0001", typed.getMinimum().toString());
+ assertEquals("1969-12-31 16:00:00.0009", typed.getMaximum().toString());
+
+ stats1.reset();
+ assertEquals(null, typed.getMinimum());
+ assertEquals(null, typed.getMaximum());
+
+ stats1.updateTimestamp(Timestamp.valueOf("1969-12-31 15:00:00.0005"));
+ stats1.increment(1);
+
+ assertEquals("1969-12-31 15:00:00.0005", typed.getMinimum().toString());
+ assertEquals("1969-12-31 15:00:00.0005", typed.getMaximum().toString());
+
+ stats1.updateTimestamp(Timestamp.valueOf("1969-12-31 15:00:00.00055"));
+ stats1.increment(1);
+
+ assertEquals("1969-12-31 15:00:00.0005", typed.getMinimum().toString());
+ assertEquals("1969-12-31 15:00:00.00055", typed.getMaximum().toString());
+ TimeZone.setDefault(original);
+ }
+
+ @Test
public void testDecimalMerge() throws Exception {
TypeDescription schema = TypeDescription.createDecimal()
.withPrecision(38).withScale(16);
diff --git a/java/core/src/test/org/apache/orc/TestOrcTimestampPPD.java
b/java/core/src/test/org/apache/orc/TestOrcTimestampPPD.java
new file mode 100644
index 0000000..dfe7f7c
--- /dev/null
+++ b/java/core/src/test/org/apache/orc/TestOrcTimestampPPD.java
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import com.google.common.collect.Lists;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl;
+import org.apache.orc.impl.RecordReaderImpl;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+import java.io.File;
+import java.io.IOException;
+import java.sql.Timestamp;
+import java.util.List;
+import java.util.TimeZone;
+
+import static junit.framework.Assert.assertEquals;
+
+public class TestOrcTimestampPPD {
+ Path workDir =
+ new Path(System.getProperty("test.tmp.dir", "target" + File.separator +
"test" + File.separator + "tmp"));
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+ static TimeZone defaultTimeZone = TimeZone.getDefault();
+
+ public TestOrcTimestampPPD() {
+ }
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcTimestampPPD." +
testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @After
+ public void restoreTimeZone() {
+ TimeZone.setDefault(defaultTimeZone);
+ }
+
+ public static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator
operator,
+ PredicateLeaf.Type type,
+ String columnName,
+ Object literal,
+ List<Object> literalList) {
+ return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName,
+ literal, literalList);
+ }
+
+ @Test
+ // ORC-611 : PPD evaluation with min-max stats for sub-millisecond timestamps
+ public void testSubMsTimestampWriterStats() throws Exception {
+ TypeDescription schema = TypeDescription.createTimestamp();
+ TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+
OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000).bufferSize(10000)
+ .version(OrcFile.Version.CURRENT));
+
+ List<Timestamp> tslist = Lists.newArrayList();
+ tslist.add(Timestamp.valueOf("1970-01-01 00:00:00.0005"));
+
+ VectorizedRowBatch batch = schema.createRowBatch();
+ TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
+ for (Timestamp t : tslist) {
+ times.set(batch.size++, t);
+ }
+ times.isRepeating = true;
+ writer.addRowBatch(batch);
+ // Done writing to file
+ writer.close();
+
+ TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
+ // Now reading
+ Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf).filesystem(fs));
+
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ times = (TimestampColumnVector) batch.cols[0];
+ while (rows.nextBatch(batch)) {
+ for (int r = 0; r < batch.size; ++r) {
+ Assert.assertEquals(tslist.get(0), times.asScratchTimestamp(r));
+ Assert.assertEquals(tslist.get(0).getNanos(),
times.asScratchTimestamp(r).getNanos());
+ }
+ }
+ rows.close();
+ ColumnStatistics[] colStats = reader.getStatistics();
+ Timestamp gotMin = ((TimestampColumnStatistics) colStats[0]).getMinimum();
+ assertEquals("1970-01-01 00:00:00.0005", gotMin.toString());
+
+ Timestamp gotMax = ((TimestampColumnStatistics) colStats[0]).getMaximum();
+ assertEquals("1970-01-01 00:00:00.0005", gotMax.toString());
+
+ PredicateLeaf pred = createPredicateLeaf(PredicateLeaf.Operator.EQUALS,
PredicateLeaf.Type.TIMESTAMP, "c",
+ Timestamp.valueOf("1970-01-01 00:00:00.0005"), null);
+ // Make sure PPD is now passing
+ Assert.assertEquals(SearchArgument.TruthValue.YES,
RecordReaderImpl.evaluatePredicate(colStats[0], pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.LESS_THAN_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "c",
+ Timestamp.valueOf("1970-01-01 00:00:00.0005"), null);
+ Assert.assertEquals(SearchArgument.TruthValue.YES_NO,
RecordReaderImpl.evaluatePredicate(colStats[0], pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.LESS_THAN,
PredicateLeaf.Type.TIMESTAMP, "c",
+ Timestamp.valueOf("1970-01-01 00:00:00.0005"), null);
+ Assert.assertEquals(SearchArgument.TruthValue.NO,
RecordReaderImpl.evaluatePredicate(colStats[0], pred, null));
+ }
+
+ @Test
+ public void testSubMsComplexStats() throws IOException {
+ TypeDescription schema = TypeDescription.createTimestamp();
+ TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+
OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000).bufferSize(10000)
+ .version(OrcFile.Version.CURRENT));
+
+ List<Timestamp> tslist = Lists.newArrayList();
+ tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.001109"));
+ tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.001279"));
+ tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.001499"));
+ tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.0067891"));
+ tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.005199"));
+ tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.006789"));
+
+ VectorizedRowBatch batch = schema.createRowBatch();
+ TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
+ for (Timestamp ts: tslist) {
+ times.set(batch.size++, ts);
+ }
+ times.isRepeating = false;
+ writer.addRowBatch(batch);
+ // Done writing to file
+ writer.close();
+
+ TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
+ // Now reading
+ Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf).filesystem(fs));
+
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ times = (TimestampColumnVector) batch.cols[0];
+ while (rows.nextBatch(batch)) {
+ for (int r = 0; r < batch.size; ++r) {
+ Assert.assertEquals(tslist.get(r), times.asScratchTimestamp(r));
+ Assert.assertEquals(tslist.get(r).getNanos(),
times.asScratchTimestamp(r).getNanos());
+ }
+ }
+ rows.close();
+ ColumnStatistics[] colStats = reader.getStatistics();
+ Timestamp gotMin = ((TimestampColumnStatistics) colStats[0]).getMinimum();
+ assertEquals("2037-01-01 00:00:00.001109", gotMin.toString());
+
+ Timestamp gotMax = ((TimestampColumnStatistics) colStats[0]).getMaximum();
+ assertEquals("2037-01-01 00:00:00.0067891", gotMax.toString());
+
+ // PPD EQUALS with nano precision passing
+ PredicateLeaf pred = createPredicateLeaf(PredicateLeaf.Operator.EQUALS,
PredicateLeaf.Type.TIMESTAMP, "c",
+ Timestamp.valueOf("2037-01-01 00:00:00.001109"), null);
+ Assert.assertEquals(SearchArgument.TruthValue.YES_NO,
RecordReaderImpl.evaluatePredicate(colStats[0], pred, null));
+
+ // PPD EQUALS with ms precision NOT passing
+ pred = createPredicateLeaf(PredicateLeaf.Operator.EQUALS,
PredicateLeaf.Type.TIMESTAMP, "c",
+ Timestamp.valueOf("2037-01-01 00:00:001"), null);
+ Assert.assertEquals(SearchArgument.TruthValue.NO,
RecordReaderImpl.evaluatePredicate(colStats[0], pred, null));
+
+ // PPD LESS_THAN with ns precision passing
+ pred = createPredicateLeaf(PredicateLeaf.Operator.LESS_THAN,
PredicateLeaf.Type.TIMESTAMP, "c",
+ Timestamp.valueOf("2037-01-01 00:00:00.006789"), null);
+ Assert.assertEquals(SearchArgument.TruthValue.YES_NO,
RecordReaderImpl.evaluatePredicate(colStats[0], pred, null));
+
+ // PPD LESS_THAN with ms precision passing
+ pred = createPredicateLeaf(PredicateLeaf.Operator.LESS_THAN,
PredicateLeaf.Type.TIMESTAMP, "c",
+ Timestamp.valueOf("2037-01-01 00:00:00.002"), null);
+ Assert.assertEquals(SearchArgument.TruthValue.YES_NO,
RecordReaderImpl.evaluatePredicate(colStats[0], pred, null));
+ }
+}
+
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index bc2ff94..f827202 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -1434,9 +1434,10 @@ public class TestVectorOrcFile {
writer.close();
// check the stats to make sure they match up to the millisecond
+ // ORC-611 update: nanoseconds are now supported!
ColumnStatistics[] stats = writer.getStatistics();
TimestampColumnStatistics tsStat = (TimestampColumnStatistics) stats[1];
- assertEquals(String.format("%04d-12-12 12:34:56.0", maxYear - 1),
+ assertEquals(String.format("%04d-12-12 12:34:56.0001", maxYear - 1),
tsStat.getMaximum().toString());
assertEquals(String.format("%04d-05-05 12:34:56.0", minYear),
tsStat.getMinimum().toString());
@@ -1784,9 +1785,9 @@ public class TestVectorOrcFile {
((TimestampColumnStatistics) stats[1]).getMinimum().toString());
assertEquals("1969-12-31 19:00:00.0",
((TimestampColumnStatistics) stats[1]).getMinimumUTC().toString());
- assertEquals("2037-05-05 12:34:56.203",
+ assertEquals("2037-05-05 12:34:56.2037",
((TimestampColumnStatistics) stats[1]).getMaximum().toString());
- assertEquals("2037-05-05 08:34:56.203",
+ assertEquals("2037-05-05 08:34:56.2037",
((TimestampColumnStatistics) stats[1]).getMaximumUTC().toString());
// check the instant statistics
diff --git
a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
index 45274b1..9b8fed1 100644
--- a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -21,7 +21,6 @@ package org.apache.orc.impl;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.orc.ColumnStatistics;
import org.apache.orc.DecimalColumnStatistics;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcProto;
@@ -40,7 +39,7 @@ import static org.junit.Assert.assertTrue;
public class TestColumnStatisticsImpl {
@Test
- public void testUpdateDate() throws Exception {
+ public void testUpdateDate() {
ColumnStatisticsImpl stat =
ColumnStatisticsImpl.create(TypeDescription.createDate());
DateWritable date = new DateWritable(16400);
stat.increment();
@@ -83,12 +82,56 @@ public class TestColumnStatisticsImpl {
TimestampColumnStatistics stats =
(TimestampColumnStatistics) reader.getStatistics()[0];
assertEquals("1995-01-01 00:00:00.688", stats.getMinimum().toString());
- assertEquals("2037-01-01 00:00:00.0", stats.getMaximum().toString());
+ // ORC-611: add TS stats nanosecond support for older files by using (max
TS + 0.999999 ms)
+ assertEquals("2037-01-01 00:00:00.000999999",
stats.getMaximum().toString());
TimeZone.setDefault(original);
}
@Test
- public void testDecimal64Overflow() throws IOException {
+ public void testTimestamps() {
+ TimeZone original = TimeZone.getDefault();
+ TimeZone.setDefault(TimeZone.getTimeZone("UTC"));
+ TypeDescription instant = TypeDescription.createTimestampInstant();
+ ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(instant);
+ TimestampColumnStatistics dstats = (TimestampColumnStatistics) stats;
+ assertEquals(null, dstats.getMinimumUTC());
+ assertEquals(null, dstats.getMaximumUTC());
+ stats.updateTimestamp(123, 456789);
+ stats.updateTimestamp(1234, 567890);
+ stats.increment(2);
+ assertEquals("1970-01-01 00:00:00.123456789",
dstats.getMinimum().toString());
+ assertEquals("1970-01-01 00:00:01.23456789",
dstats.getMaximum().toString());
+ stats.updateTimestamp(123, 400000);
+ stats.updateTimestamp(1234, 600000);
+ assertEquals("1970-01-01 00:00:00.1234", dstats.getMinimum().toString());
+ assertEquals("1970-01-01 00:00:01.2346", dstats.getMaximum().toString());
+ stats.updateTimestamp(122, 300000);
+ stats.updateTimestamp(1235, 400000);
+ assertEquals("1970-01-01 00:00:00.1223", dstats.getMinimum().toString());
+ assertEquals("1970-01-01 00:00:01.2354", dstats.getMaximum().toString());
+ stats.merge(stats);
+ assertEquals("1970-01-01 00:00:00.1223", dstats.getMinimum().toString());
+ assertEquals("1970-01-01 00:00:01.2354", dstats.getMaximum().toString());
+ ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(instant);
+ stats2.updateTimestamp(100, 1);
+ stats2.increment(1);
+ TimestampColumnStatistics dstats2 = (TimestampColumnStatistics) stats2;
+ assertEquals("1970-01-01 00:00:00.100000001",
dstats2.getMinimum().toString());
+ assertEquals("1970-01-01 00:00:00.100000001",
dstats2.getMaximum().toString());
+ stats.merge(stats2);
+ assertEquals("1970-01-01 00:00:00.100000001",
dstats.getMinimum().toString());
+ assertEquals("1970-01-01 00:00:01.2354", dstats.getMaximum().toString());
+ stats2.updateTimestamp(2000, 123456);
+ assertEquals("1970-01-01 00:00:00.100000001",
dstats2.getMinimum().toString());
+ assertEquals("1970-01-01 00:00:02.000123456",
dstats2.getMaximum().toString());
+ stats.merge(stats2);
+ assertEquals("1970-01-01 00:00:00.100000001",
dstats.getMinimum().toString());
+ assertEquals("1970-01-01 00:00:02.000123456",
dstats.getMaximum().toString());
+ TimeZone.setDefault(original);
+ }
+
+ @Test
+ public void testDecimal64Overflow() {
TypeDescription schema = TypeDescription.fromString("decimal(18,6)");
OrcProto.ColumnStatistics.Builder pb =
OrcProto.ColumnStatistics.newBuilder();
@@ -150,7 +193,7 @@ public class TestColumnStatisticsImpl {
}
@Test
- public void testCollectionColumnStats() throws Exception {
+ public void testCollectionColumnStats() {
/* test List */
final ColumnStatisticsImpl statList =
ColumnStatisticsImpl.create(TypeDescription.createList(TypeDescription.createInt()));
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
index b552554..06ab150 100644
--- a/proto/orc_proto.proto
+++ b/proto/orc_proto.proto
@@ -67,6 +67,9 @@ message TimestampStatistics {
optional sint64 maximum = 2;
optional sint64 minimumUtc = 3;
optional sint64 maximumUtc = 4;
+ // store the lower 6 TS digits (sub-ms nanos, encoded as value + 1) for min/max to achieve nanosecond precision
+ optional int32 minimumNanos = 5;
+ optional int32 maximumNanos = 6;
}
message BinaryStatistics {