This is an automated email from the ASF dual-hosted git repository.

omalley pushed a commit to branch branch-1.6
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/branch-1.6 by this push:
     new 865ad54  ORC-611: Support TS Min/max statistics with ns precision
865ad54 is described below

commit 865ad5499332ebf90b482f4e0465c6c26694c823
Author: Panos Garefalakis <[email protected]>
AuthorDate: Mon Mar 16 16:18:29 2020 +0000

    ORC-611: Support TS Min/max statistics with ns precision
    
    Fixes #497
    
    Change-Id: I06bad2cfe438502035c20393ca325decf9d1c4f1
    Signed-off-by: Owen O'Malley <[email protected]>
---
 .../org/apache/orc/TimestampColumnStatistics.java  |   2 +-
 .../org/apache/orc/impl/ColumnStatisticsImpl.java  | 141 ++++++++++----
 .../orc/impl/writer/TimestampTreeWriter.java       |   4 +-
 .../test/org/apache/orc/TestColumnStatistics.java  | 134 +++++++++++++
 .../test/org/apache/orc/TestOrcTimestampPPD.java   | 208 +++++++++++++++++++++
 .../src/test/org/apache/orc/TestVectorOrcFile.java |   7 +-
 .../apache/orc/impl/TestColumnStatisticsImpl.java  |  55 +++++-
 proto/orc_proto.proto                              |   3 +
 8 files changed, 502 insertions(+), 52 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/TimestampColumnStatistics.java 
b/java/core/src/java/org/apache/orc/TimestampColumnStatistics.java
index b095a68..c5abd42 100644
--- a/java/core/src/java/org/apache/orc/TimestampColumnStatistics.java
+++ b/java/core/src/java/org/apache/orc/TimestampColumnStatistics.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java 
b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
index 587e33b..eb37a82 100644
--- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
@@ -1647,8 +1647,14 @@ public class ColumnStatisticsImpl implements 
ColumnStatistics {
 
   private static class TimestampStatisticsImpl extends ColumnStatisticsImpl
       implements TimestampColumnStatistics {
-    private Long minimum = null;
-    private Long maximum = null;
+    private static final int DEFAULT_MIN_NANOS = 000_000;
+    private static final int DEFAULT_MAX_NANOS = 999_999;
+
+    private long minimum = Long.MAX_VALUE;
+    private long maximum = Long.MIN_VALUE;
+
+    private int minNanos = DEFAULT_MIN_NANOS;
+    private int maxNanos = DEFAULT_MAX_NANOS;
 
     TimestampStatisticsImpl() {
     }
@@ -1679,31 +1685,51 @@ public class ColumnStatisticsImpl implements 
ColumnStatistics {
         minimum = DateUtils.convertTime(timestampStats.getMinimumUtc(),
             writerUsedProlepticGregorian, convertToProlepticGregorian, true);
       }
+      if (timestampStats.hasMaximumNanos()) {
+        maxNanos = timestampStats.getMaximumNanos() - 1;
+      }
+      if (timestampStats.hasMinimumNanos()) {
+        minNanos = timestampStats.getMinimumNanos() - 1;
+      }
     }
 
     @Override
     public void reset() {
       super.reset();
-      minimum = null;
-      maximum = null;
+      minimum = Long.MAX_VALUE;
+      maximum = Long.MIN_VALUE;
+      minNanos = DEFAULT_MIN_NANOS;
+      maxNanos = DEFAULT_MAX_NANOS;
     }
 
     @Override
     public void updateTimestamp(Timestamp value) {
       long millis = SerializationUtils.convertToUtc(TimeZone.getDefault(),
           value.getTime());
-      updateTimestamp(millis);
+      // keep only the lower 6 digits of the nanos (the sub-millisecond part) for ns precision
+      updateTimestamp(millis, value.getNanos() % 1_000_000);
     }
 
     @Override
-    public void updateTimestamp(long value) {
-      if (minimum == null) {
-        minimum = value;
-        maximum = value;
-      } else if (minimum > value) {
+    public void updateTimestamp(long value, int nanos) {
+      if (minimum > maximum) {
         minimum = value;
-      } else if (maximum < value) {
         maximum = value;
+        minNanos = nanos;
+        maxNanos = nanos;
+      } else {
+        if (minimum >= value) {
+          if (minimum > value || nanos < minNanos) {
+            minNanos = nanos;
+          }
+          minimum = value;
+        }
+        if (maximum <= value) {
+          if (maximum < value || nanos > maxNanos) {
+            maxNanos = nanos;
+          }
+          maximum = value;
+        }
       }
     }
 
@@ -1711,19 +1737,31 @@ public class ColumnStatisticsImpl implements 
ColumnStatistics {
     public void merge(ColumnStatisticsImpl other) {
       if (other instanceof TimestampStatisticsImpl) {
         TimestampStatisticsImpl timestampStats = (TimestampStatisticsImpl) 
other;
-        if (minimum == null) {
-          minimum = timestampStats.minimum;
-          maximum = timestampStats.maximum;
-        } else if (timestampStats.minimum != null) {
-          if (minimum > timestampStats.minimum) {
+        if (count == 0) {
+          if (timestampStats.count != 0) {
+            minimum = timestampStats.minimum;
+            maximum = timestampStats.maximum;
+            minNanos = timestampStats.minNanos;
+            maxNanos = timestampStats.maxNanos;
+          }
+        } else if (timestampStats.count != 0) {
+          if (minimum >= timestampStats.minimum) {
+            if (minimum > timestampStats.minimum ||
+                minNanos > timestampStats.minNanos) {
+              minNanos = timestampStats.minNanos;
+            }
             minimum = timestampStats.minimum;
           }
-          if (maximum < timestampStats.maximum) {
+          if (maximum <= timestampStats.maximum) {
+            if (maximum < timestampStats.maximum ||
+                maxNanos < timestampStats.maxNanos) {
+              maxNanos = timestampStats.maxNanos;
+            }
             maximum = timestampStats.maximum;
           }
         }
       } else {
-        if (isStatsExists() && minimum != null) {
+        if (isStatsExists() && count != 0) {
           throw new IllegalArgumentException("Incompatible merging of 
timestamp column statistics");
         }
       }
@@ -1735,9 +1773,15 @@ public class ColumnStatisticsImpl implements 
ColumnStatistics {
       OrcProto.ColumnStatistics.Builder result = super.serialize();
       OrcProto.TimestampStatistics.Builder timestampStats = 
OrcProto.TimestampStatistics
           .newBuilder();
-      if (getNumberOfValues() != 0 && minimum != null) {
+      if (getNumberOfValues() != 0) {
         timestampStats.setMinimumUtc(minimum);
         timestampStats.setMaximumUtc(maximum);
+        if (minNanos != DEFAULT_MIN_NANOS) {
+          timestampStats.setMinimumNanos(minNanos + 1);
+        }
+        if (maxNanos != DEFAULT_MAX_NANOS) {
+          timestampStats.setMaximumNanos(maxNanos + 1);
+        }
       }
       result.setTimestampStatistics(timestampStats);
       return result;
@@ -1745,32 +1789,54 @@ public class ColumnStatisticsImpl implements 
ColumnStatistics {
 
     @Override
     public Timestamp getMinimum() {
-      return minimum == null ? null :
-          new 
Timestamp(SerializationUtils.convertFromUtc(TimeZone.getDefault(),
-              minimum));
+      if (minimum > maximum) {
+        return null;
+      } else {
+        Timestamp ts = new Timestamp(SerializationUtils.
+            convertFromUtc(TimeZone.getDefault(), minimum));
+        ts.setNanos(ts.getNanos() + minNanos);
+        return ts;
+      }
     }
 
     @Override
     public Timestamp getMaximum() {
-      return maximum == null ? null :
-          new 
Timestamp(SerializationUtils.convertFromUtc(TimeZone.getDefault(),
-              maximum));
+      if (minimum > maximum) {
+        return null;
+      } else {
+        Timestamp ts = new Timestamp(SerializationUtils.convertFromUtc(
+            TimeZone.getDefault(), maximum));
+        ts.setNanos(ts.getNanos() + maxNanos);
+        return ts;
+      }
     }
 
     @Override
     public Timestamp getMinimumUTC() {
-      return minimum == null ? null : new Timestamp(minimum);
+      if (minimum > maximum) {
+        return null;
+      } else {
+        Timestamp ts = new Timestamp(minimum);
+        ts.setNanos(ts.getNanos() + minNanos);
+        return ts;
+      }
     }
 
     @Override
     public Timestamp getMaximumUTC() {
-      return maximum == null ? null : new Timestamp(maximum);
+      if (minimum > maximum) {
+        return null;
+      } else {
+        Timestamp ts = new Timestamp(maximum);
+        ts.setNanos(ts.getNanos() + maxNanos);
+        return ts;
+      }
     }
 
     @Override
     public String toString() {
       StringBuilder buf = new StringBuilder(super.toString());
-      if (minimum != null || maximum != null) {
+      if (minimum <= maximum) {
         buf.append(" min: ");
         buf.append(getMinimum());
         buf.append(" max: ");
@@ -1793,21 +1859,15 @@ public class ColumnStatisticsImpl implements 
ColumnStatistics {
 
       TimestampStatisticsImpl that = (TimestampStatisticsImpl) o;
 
-      if (minimum != null ? !minimum.equals(that.minimum) : that.minimum != 
null) {
-        return false;
-      }
-      if (maximum != null ? !maximum.equals(that.maximum) : that.maximum != 
null) {
-        return false;
-      }
-
-      return true;
+      return minimum == that.minimum && maximum == that.maximum &&
+          minNanos == that.minNanos && maxNanos == that.maxNanos;
     }
 
     @Override
     public int hashCode() {
       int result = super.hashCode();
-      result = 31 * result + (minimum != null ? minimum.hashCode() : 0);
-      result = 31 * result + (maximum != null ? maximum.hashCode() : 0);
+      result = 31 * result + (int) minimum;
+      result = 31 * result + (int) maximum;
       return result;
     }
   }
@@ -1824,7 +1884,7 @@ public class ColumnStatisticsImpl implements 
ColumnStatistics {
 
     @Override
     public void updateTimestamp(Timestamp value) {
-      updateTimestamp(value.getTime());
+      updateTimestamp(value.getTime(), value.getNanos() % 1_000_000);
     }
 
     @Override
@@ -1934,7 +1994,8 @@ public class ColumnStatisticsImpl implements 
ColumnStatistics {
     throw new UnsupportedOperationException("Can't update timestamp");
   }
 
-  public void updateTimestamp(long value) {
+  // subclasses that track timestamp statistics have to override this
+  public void updateTimestamp(long value, int nanos) {
     throw new UnsupportedOperationException("Can't update timestamp");
   }
 
diff --git 
a/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java 
b/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java
index e23413f..dc4dfd2 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java
@@ -111,7 +111,7 @@ public class TimestampTreeWriter extends TreeWriterBase {
         }
         long utc = vec.isUTC() || alwaysUTC ?
             millis : SerializationUtils.convertToUtc(localTimezone, millis);
-        indexStatistics.updateTimestamp(utc);
+        indexStatistics.updateTimestamp(utc, newNanos % 1_000_000);
         if (createBloomFilter) {
           if (bloomFilter != null) {
             bloomFilter.addLong(millis);
@@ -139,7 +139,7 @@ public class TimestampTreeWriter extends TreeWriterBase {
               millis : SerializationUtils.convertToUtc(localTimezone, millis);
           seconds.write(secs - epoch);
           nanos.write(formatNanos(newNanos));
-          indexStatistics.updateTimestamp(utc);
+          indexStatistics.updateTimestamp(utc, newNanos % 1_000_000);
           if (createBloomFilter) {
             if (bloomFilter != null) {
               bloomFilter.addLong(millis);
diff --git a/java/core/src/test/org/apache/orc/TestColumnStatistics.java 
b/java/core/src/test/org/apache/orc/TestColumnStatistics.java
index fe2700f..8cf9dee 100644
--- a/java/core/src/test/org/apache/orc/TestColumnStatistics.java
+++ b/java/core/src/test/org/apache/orc/TestColumnStatistics.java
@@ -442,6 +442,8 @@ public class TestColumnStatistics {
     stats1.updateTimestamp(new Timestamp(100));
     stats2.updateTimestamp(new Timestamp(1));
     stats2.updateTimestamp(new Timestamp(1000));
+    stats1.increment(2);
+    stats2.increment(2);
     stats1.merge(stats2);
     TimestampColumnStatistics typed = (TimestampColumnStatistics) stats1;
     assertEquals(1, typed.getMinimum().getTime());
@@ -449,6 +451,7 @@ public class TestColumnStatistics {
     stats1.reset();
     stats1.updateTimestamp(new Timestamp(-10));
     stats1.updateTimestamp(new Timestamp(10000));
+    stats1.increment(2);
     stats1.merge(stats2);
     assertEquals(-10, typed.getMinimum().getTime());
     assertEquals(10000, typed.getMaximum().getTime());
@@ -489,6 +492,7 @@ public class TestColumnStatistics {
     stats1.reset();
     stats1.updateTimestamp(parseTime(format, "1999-04-04 00:00:00"));
     stats1.updateTimestamp(parseTime(format, "2009-03-08 12:00:00"));
+    stats1.increment(2);
     stats1.merge(stats2);
     assertEquals("1999-04-04 00:00:00.0", typed.getMinimum().toString());
     assertEquals("2009-03-08 12:00:00.0", typed.getMaximum().toString());
@@ -505,6 +509,136 @@ public class TestColumnStatistics {
   }
 
   @Test
+  public void testTimestampNanoPrecision() {
+    TypeDescription schema = TypeDescription.createTimestamp();
+
+    TimeZone original = TimeZone.getDefault();
+    TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
+
+    ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+    stats1.updateTimestamp(Timestamp.valueOf( "2000-04-02 03:31:00.000"));
+    stats1.increment(1);
+    TimestampColumnStatistics typed = (TimestampColumnStatistics) stats1;
+
+    // Base case no nanos
+    assertEquals("2000-04-02 03:31:00.0", typed.getMinimum().toString());
+    assertEquals("2000-04-02 03:31:00.0", typed.getMaximum().toString());
+
+    // Add nano precision to min
+    stats1.updateTimestamp(Timestamp.valueOf( "2000-04-01 03:30:00.0005"));
+    stats1.increment(1);
+    assertEquals("2000-04-01 03:30:00.0005", typed.getMinimum().toString());
+    assertEquals("2000-04-02 03:31:00.0", typed.getMaximum().toString());
+
+    // Change max with precision
+    stats1.updateTimestamp(Timestamp.valueOf( "2000-04-04 03:30:00.0008"));
+    stats1.increment(1);
+    assertEquals("2000-04-01 03:30:00.0005", typed.getMinimum().toString());
+    assertEquals("2000-04-04 03:30:00.0008", typed.getMaximum().toString());
+
+    // Same millisecond as the current max, differing only in nanos
+    stats1.updateTimestamp(Timestamp.valueOf( "2000-04-04 03:30:00.0009"));
+    stats1.increment(1);
+    assertEquals("2000-04-01 03:30:00.0005", typed.getMinimum().toString());
+    assertEquals("2000-04-04 03:30:00.0009", typed.getMaximum().toString());
+
+    // Test serialisation/deserialisation
+    OrcProto.ColumnStatistics serial = stats1.serialize().build();
+    ColumnStatisticsImpl stats2 =
+        ColumnStatisticsImpl.deserialize(schema, serial);
+    TimestampColumnStatistics typed2 = (TimestampColumnStatistics) stats2;
+    assertEquals("2000-04-01 03:30:00.0005", typed2.getMinimum().toString());
+    assertEquals("2000-04-04 03:30:00.0009", typed2.getMaximum().toString());
+    assertEquals(4L, typed2.getNumberOfValues());
+    TimeZone.setDefault(original);
+  }
+
+  @Test
+  public void testTimestampNanoPrecisionMerge() {
+    TypeDescription schema = TypeDescription.createTimestamp();
+
+    TimeZone original = TimeZone.getDefault();
+    TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
+
+    ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+    ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+    stats1.updateTimestamp(Timestamp.valueOf("2000-04-02 03:30:00.0001"));
+    stats1.updateTimestamp(Timestamp.valueOf( "2000-04-02 01:30:00.0009"));
+    stats1.increment(2);
+
+    stats2.updateTimestamp(Timestamp.valueOf( "2000-04-02 01:30:00.00088"));
+    stats2.updateTimestamp(Timestamp.valueOf( "2000-04-02 03:30:00.00001"));
+    stats2.increment(2);
+
+    TimestampColumnStatistics typed = (TimestampColumnStatistics) stats1;
+    assertEquals("2000-04-02 01:30:00.0009", typed.getMinimum().toString());
+    assertEquals("2000-04-02 03:30:00.0001", typed.getMaximum().toString());
+
+    TimestampColumnStatistics typed2 = (TimestampColumnStatistics) stats2;
+    assertEquals("2000-04-02 03:30:00.00001", typed2.getMaximum().toString());
+    assertEquals("2000-04-02 01:30:00.00088", typed2.getMinimum().toString());
+
+    // make sure merge goes down to ns precision
+    stats1.merge(stats2);
+    assertEquals("2000-04-02 01:30:00.00088", typed.getMinimum().toString());
+    assertEquals("2000-04-02 03:30:00.0001", typed.getMaximum().toString());
+
+    stats1.reset();
+    assertEquals(null, typed.getMinimum());
+    assertEquals(null, typed.getMaximum());
+
+    stats1.updateTimestamp(Timestamp.valueOf( "1999-04-04 00:00:00.000231"));
+    stats1.updateTimestamp(Timestamp.valueOf( "2009-03-08 12:00:00.000654"));
+    stats1.increment(2);
+
+    stats1.merge(stats2);
+    assertEquals("1999-04-04 00:00:00.000231", typed.getMinimum().toString());
+    assertEquals("2009-03-08 12:00:00.000654", typed.getMaximum().toString());
+    TimeZone.setDefault(original);
+  }
+
+  @Test
+  public void testNegativeTimestampNanoPrecision() {
+    TypeDescription schema = TypeDescription.createTimestamp();
+
+    TimeZone original = TimeZone.getDefault();
+    TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
+
+    ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+    ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+    stats1.updateTimestamp(Timestamp.valueOf("1960-04-02 03:30:00.0001"));
+    stats1.updateTimestamp(Timestamp.valueOf( "1969-12-31 16:00:00.0009"));
+    stats1.increment(2);
+
+    stats2.updateTimestamp(Timestamp.valueOf( "1962-04-02 01:30:00.00088"));
+    stats2.updateTimestamp(Timestamp.valueOf( "1969-12-31 16:00:00.00001"));
+    stats2.increment(2);
+
+    stats1.merge(stats2);
+
+    TimestampColumnStatistics typed = (TimestampColumnStatistics) stats1;
+    assertEquals("1960-04-02 03:30:00.0001", typed.getMinimum().toString());
+    assertEquals("1969-12-31 16:00:00.0009", typed.getMaximum().toString());
+
+    stats1.reset();
+    assertEquals(null, typed.getMinimum());
+    assertEquals(null, typed.getMaximum());
+
+    stats1.updateTimestamp(Timestamp.valueOf("1969-12-31 15:00:00.0005"));
+    stats1.increment(1);
+
+    assertEquals("1969-12-31 15:00:00.0005", typed.getMinimum().toString());
+    assertEquals("1969-12-31 15:00:00.0005", typed.getMaximum().toString());
+
+    stats1.updateTimestamp(Timestamp.valueOf("1969-12-31 15:00:00.00055"));
+    stats1.increment(1);
+
+    assertEquals("1969-12-31 15:00:00.0005", typed.getMinimum().toString());
+    assertEquals("1969-12-31 15:00:00.00055", typed.getMaximum().toString());
+    TimeZone.setDefault(original);
+  }
+
+  @Test
   public void testDecimalMerge() throws Exception {
     TypeDescription schema = TypeDescription.createDecimal()
         .withPrecision(38).withScale(16);
diff --git a/java/core/src/test/org/apache/orc/TestOrcTimestampPPD.java 
b/java/core/src/test/org/apache/orc/TestOrcTimestampPPD.java
new file mode 100644
index 0000000..dfe7f7c
--- /dev/null
+++ b/java/core/src/test/org/apache/orc/TestOrcTimestampPPD.java
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import com.google.common.collect.Lists;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl;
+import org.apache.orc.impl.RecordReaderImpl;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+import java.io.File;
+import java.io.IOException;
+import java.sql.Timestamp;
+import java.util.List;
+import java.util.TimeZone;
+
+import static junit.framework.Assert.assertEquals;
+
+public class TestOrcTimestampPPD {
+  Path workDir =
+      new Path(System.getProperty("test.tmp.dir", "target" + File.separator + 
"test" + File.separator + "tmp"));
+  Configuration conf;
+  FileSystem fs;
+  Path testFilePath;
+  static TimeZone defaultTimeZone = TimeZone.getDefault();
+
+  public TestOrcTimestampPPD() {
+  }
+
+  @Rule
+  public TestName testCaseName = new TestName();
+
+  @Before
+  public void openFileSystem() throws Exception {
+    conf = new Configuration();
+    fs = FileSystem.getLocal(conf);
+    testFilePath = new Path(workDir, "TestOrcTimestampPPD." + 
testCaseName.getMethodName() + ".orc");
+    fs.delete(testFilePath, false);
+  }
+
+  @After
+  public void restoreTimeZone() {
+    TimeZone.setDefault(defaultTimeZone);
+  }
+
+  public static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator 
operator,
+      PredicateLeaf.Type type,
+      String columnName,
+      Object literal,
+      List<Object> literalList) {
+    return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName,
+        literal, literalList);
+  }
+
+  @Test
+  // ORC-611 : PPD evaluation with min-max stats for sub-millisecond timestamps
+  public void testSubMsTimestampWriterStats() throws Exception {
+    TypeDescription schema = TypeDescription.createTimestamp();
+    TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
+
+    Writer writer = OrcFile.createWriter(testFilePath,
+        
OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000).bufferSize(10000)
+            .version(OrcFile.Version.CURRENT));
+
+    List<Timestamp> tslist = Lists.newArrayList();
+    tslist.add(Timestamp.valueOf("1970-01-01 00:00:00.0005"));
+
+    VectorizedRowBatch batch = schema.createRowBatch();
+    TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
+    for (Timestamp t : tslist) {
+      times.set(batch.size++, t);
+    }
+    times.isRepeating = true;
+    writer.addRowBatch(batch);
+    // Done writing to file
+    writer.close();
+
+    TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
+    // Now reading
+    Reader reader = OrcFile.createReader(testFilePath, 
OrcFile.readerOptions(conf).filesystem(fs));
+
+    RecordReader rows = reader.rows();
+    batch = reader.getSchema().createRowBatch();
+    times = (TimestampColumnVector) batch.cols[0];
+    while (rows.nextBatch(batch)) {
+      for (int r = 0; r < batch.size; ++r) {
+        Assert.assertEquals(tslist.get(0), times.asScratchTimestamp(r));
+        Assert.assertEquals(tslist.get(0).getNanos(), 
times.asScratchTimestamp(r).getNanos());
+      }
+    }
+    rows.close();
+    ColumnStatistics[] colStats = reader.getStatistics();
+    Timestamp gotMin = ((TimestampColumnStatistics) colStats[0]).getMinimum();
+    assertEquals("1970-01-01 00:00:00.0005", gotMin.toString());
+
+    Timestamp gotMax = ((TimestampColumnStatistics) colStats[0]).getMaximum();
+    assertEquals("1970-01-01 00:00:00.0005", gotMax.toString());
+
+    PredicateLeaf pred = createPredicateLeaf(PredicateLeaf.Operator.EQUALS, 
PredicateLeaf.Type.TIMESTAMP, "c",
+        Timestamp.valueOf("1970-01-01 00:00:00.0005"), null);
+    // Make sure PPD is now passing
+    Assert.assertEquals(SearchArgument.TruthValue.YES, 
RecordReaderImpl.evaluatePredicate(colStats[0], pred, null));
+
+    pred = createPredicateLeaf(PredicateLeaf.Operator.LESS_THAN_EQUALS, 
PredicateLeaf.Type.TIMESTAMP, "c",
+        Timestamp.valueOf("1970-01-01 00:00:00.0005"), null);
+    Assert.assertEquals(SearchArgument.TruthValue.YES_NO, 
RecordReaderImpl.evaluatePredicate(colStats[0], pred, null));
+
+    pred = createPredicateLeaf(PredicateLeaf.Operator.LESS_THAN, 
PredicateLeaf.Type.TIMESTAMP, "c",
+        Timestamp.valueOf("1970-01-01 00:00:00.0005"), null);
+    Assert.assertEquals(SearchArgument.TruthValue.NO, 
RecordReaderImpl.evaluatePredicate(colStats[0], pred, null));
+  }
+
+  @Test
+  public void testSubMsComplexStats() throws IOException {
+    TypeDescription schema = TypeDescription.createTimestamp();
+    TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
+
+    Writer writer = OrcFile.createWriter(testFilePath,
+        
OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000).bufferSize(10000)
+            .version(OrcFile.Version.CURRENT));
+
+    List<Timestamp> tslist = Lists.newArrayList();
+    tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.001109"));
+    tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.001279"));
+    tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.001499"));
+    tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.0067891"));
+    tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.005199"));
+    tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.006789"));
+
+    VectorizedRowBatch batch = schema.createRowBatch();
+    TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
+    for (Timestamp ts: tslist) {
+      times.set(batch.size++, ts);
+    }
+    times.isRepeating = false;
+    writer.addRowBatch(batch);
+    // Done writing to file
+    writer.close();
+
+    TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
+    // Now reading
+    Reader reader = OrcFile.createReader(testFilePath, 
OrcFile.readerOptions(conf).filesystem(fs));
+
+    RecordReader rows = reader.rows();
+    batch = reader.getSchema().createRowBatch();
+    times = (TimestampColumnVector) batch.cols[0];
+    while (rows.nextBatch(batch)) {
+      for (int r = 0; r < batch.size; ++r) {
+        Assert.assertEquals(tslist.get(r), times.asScratchTimestamp(r));
+        Assert.assertEquals(tslist.get(r).getNanos(), 
times.asScratchTimestamp(r).getNanos());
+      }
+    }
+    rows.close();
+    ColumnStatistics[] colStats = reader.getStatistics();
+    Timestamp gotMin = ((TimestampColumnStatistics) colStats[0]).getMinimum();
+    assertEquals("2037-01-01 00:00:00.001109", gotMin.toString());
+
+    Timestamp gotMax = ((TimestampColumnStatistics) colStats[0]).getMaximum();
+    assertEquals("2037-01-01 00:00:00.0067891", gotMax.toString());
+
+    // PPD EQUALS with nano precision passing
+    PredicateLeaf pred = createPredicateLeaf(PredicateLeaf.Operator.EQUALS, 
PredicateLeaf.Type.TIMESTAMP, "c",
+        Timestamp.valueOf("2037-01-01 00:00:00.001109"), null);
+    Assert.assertEquals(SearchArgument.TruthValue.YES_NO, 
RecordReaderImpl.evaluatePredicate(colStats[0], pred, null));
+
+    // PPD EQUALS with ms precision NOT passing
+    pred = createPredicateLeaf(PredicateLeaf.Operator.EQUALS, 
PredicateLeaf.Type.TIMESTAMP, "c",
+        Timestamp.valueOf("2037-01-01 00:00:001"), null);
+    Assert.assertEquals(SearchArgument.TruthValue.NO, 
RecordReaderImpl.evaluatePredicate(colStats[0], pred, null));
+
+    // PPD LESS_THAN with ns precision passing
+    pred = createPredicateLeaf(PredicateLeaf.Operator.LESS_THAN, 
PredicateLeaf.Type.TIMESTAMP, "c",
+        Timestamp.valueOf("2037-01-01 00:00:00.006789"), null);
+    Assert.assertEquals(SearchArgument.TruthValue.YES_NO, 
RecordReaderImpl.evaluatePredicate(colStats[0], pred, null));
+
+    // PPD LESS_THAN with ms precision passing
+    pred = createPredicateLeaf(PredicateLeaf.Operator.LESS_THAN, 
PredicateLeaf.Type.TIMESTAMP, "c",
+        Timestamp.valueOf("2037-01-01 00:00:00.002"), null);
+    Assert.assertEquals(SearchArgument.TruthValue.YES_NO, 
RecordReaderImpl.evaluatePredicate(colStats[0], pred, null));
+  }
+}
+
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java 
b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index bc2ff94..f827202 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -1434,9 +1434,10 @@ public class TestVectorOrcFile {
     writer.close();
 
     // check the stats to make sure they match up to the millisecond
+    // ORC-611 update: nanoseconds are now supported!
     ColumnStatistics[] stats = writer.getStatistics();
     TimestampColumnStatistics tsStat = (TimestampColumnStatistics) stats[1];
-    assertEquals(String.format("%04d-12-12 12:34:56.0", maxYear - 1),
+    assertEquals(String.format("%04d-12-12 12:34:56.0001", maxYear - 1),
         tsStat.getMaximum().toString());
     assertEquals(String.format("%04d-05-05 12:34:56.0", minYear),
         tsStat.getMinimum().toString());
@@ -1784,9 +1785,9 @@ public class TestVectorOrcFile {
         ((TimestampColumnStatistics) stats[1]).getMinimum().toString());
     assertEquals("1969-12-31 19:00:00.0",
         ((TimestampColumnStatistics) stats[1]).getMinimumUTC().toString());
-    assertEquals("2037-05-05 12:34:56.203",
+    assertEquals("2037-05-05 12:34:56.2037",
         ((TimestampColumnStatistics) stats[1]).getMaximum().toString());
-    assertEquals("2037-05-05 08:34:56.203",
+    assertEquals("2037-05-05 08:34:56.2037",
         ((TimestampColumnStatistics) stats[1]).getMaximumUTC().toString());
 
     // check the instant statistics
diff --git 
a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java 
b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
index 45274b1..9b8fed1 100644
--- a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -21,7 +21,6 @@ package org.apache.orc.impl;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.orc.ColumnStatistics;
 import org.apache.orc.DecimalColumnStatistics;
 import org.apache.orc.OrcFile;
 import org.apache.orc.OrcProto;
@@ -40,7 +39,7 @@ import static org.junit.Assert.assertTrue;
 public class TestColumnStatisticsImpl {
 
   @Test
-  public void testUpdateDate() throws Exception {
+  public void testUpdateDate() {
     ColumnStatisticsImpl stat = 
ColumnStatisticsImpl.create(TypeDescription.createDate());
     DateWritable date = new DateWritable(16400);
     stat.increment();
@@ -83,12 +82,56 @@ public class TestColumnStatisticsImpl {
     TimestampColumnStatistics stats =
         (TimestampColumnStatistics) reader.getStatistics()[0];
     assertEquals("1995-01-01 00:00:00.688", stats.getMinimum().toString());
-    assertEquals("2037-01-01 00:00:00.0", stats.getMaximum().toString());
+    // ORC-611: add TS stats nanosecond support for older files by using (max 
TS + 0.999999 ms)
+    assertEquals("2037-01-01 00:00:00.000999999", 
stats.getMaximum().toString());
     TimeZone.setDefault(original);
   }
 
   @Test
-  public void testDecimal64Overflow() throws IOException {
+  public void testTimestamps() {
+    TimeZone original = TimeZone.getDefault();
+    TimeZone.setDefault(TimeZone.getTimeZone("UTC"));
+    TypeDescription instant = TypeDescription.createTimestampInstant();
+    ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(instant);
+    TimestampColumnStatistics dstats = (TimestampColumnStatistics) stats;
+    assertEquals(null, dstats.getMinimumUTC());
+    assertEquals(null, dstats.getMaximumUTC());
+    stats.updateTimestamp(123, 456789);
+    stats.updateTimestamp(1234, 567890);
+    stats.increment(2);
+    assertEquals("1970-01-01 00:00:00.123456789", 
dstats.getMinimum().toString());
+    assertEquals("1970-01-01 00:00:01.23456789", 
dstats.getMaximum().toString());
+    stats.updateTimestamp(123, 400000);
+    stats.updateTimestamp(1234, 600000);
+    assertEquals("1970-01-01 00:00:00.1234", dstats.getMinimum().toString());
+    assertEquals("1970-01-01 00:00:01.2346", dstats.getMaximum().toString());
+    stats.updateTimestamp(122, 300000);
+    stats.updateTimestamp(1235, 400000);
+    assertEquals("1970-01-01 00:00:00.1223", dstats.getMinimum().toString());
+    assertEquals("1970-01-01 00:00:01.2354", dstats.getMaximum().toString());
+    stats.merge(stats);
+    assertEquals("1970-01-01 00:00:00.1223", dstats.getMinimum().toString());
+    assertEquals("1970-01-01 00:00:01.2354", dstats.getMaximum().toString());
+    ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(instant);
+    stats2.updateTimestamp(100, 1);
+    stats2.increment(1);
+    TimestampColumnStatistics dstats2 = (TimestampColumnStatistics) stats2;
+    assertEquals("1970-01-01 00:00:00.100000001", 
dstats2.getMinimum().toString());
+    assertEquals("1970-01-01 00:00:00.100000001", 
dstats2.getMaximum().toString());
+    stats.merge(stats2);
+    assertEquals("1970-01-01 00:00:00.100000001", 
dstats.getMinimum().toString());
+    assertEquals("1970-01-01 00:00:01.2354", dstats.getMaximum().toString());
+    stats2.updateTimestamp(2000, 123456);
+    assertEquals("1970-01-01 00:00:00.100000001", 
dstats2.getMinimum().toString());
+    assertEquals("1970-01-01 00:00:02.000123456", 
dstats2.getMaximum().toString());
+    stats.merge(stats2);
+    assertEquals("1970-01-01 00:00:00.100000001", 
dstats.getMinimum().toString());
+    assertEquals("1970-01-01 00:00:02.000123456", 
dstats.getMaximum().toString());
+    TimeZone.setDefault(original);
+  }
+
+  @Test
+  public void testDecimal64Overflow() {
     TypeDescription schema = TypeDescription.fromString("decimal(18,6)");
     OrcProto.ColumnStatistics.Builder pb =
         OrcProto.ColumnStatistics.newBuilder();
@@ -150,7 +193,7 @@ public class TestColumnStatisticsImpl {
   }
 
   @Test
-  public void testCollectionColumnStats() throws Exception {
+  public void testCollectionColumnStats() {
     /* test List */
     final ColumnStatisticsImpl statList = 
ColumnStatisticsImpl.create(TypeDescription.createList(TypeDescription.createInt()));
 
diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto
index b552554..06ab150 100644
--- a/proto/orc_proto.proto
+++ b/proto/orc_proto.proto
@@ -67,6 +67,9 @@ message TimestampStatistics {
   optional sint64 maximum = 2;
   optional sint64 minimumUtc = 3;
   optional sint64 maximumUtc = 4;
+  // store the sub-millisecond nanoseconds (lower 6 digits) of min/max, encoded as value + 1, for ns precision
+  optional int32 minimumNanos = 5;
+  optional int32 maximumNanos = 6;
 }
 
 message BinaryStatistics {

Reply via email to