Repository: orc
Updated Branches:
  refs/heads/master 51b6b6ce3 -> 9c105b92a
ORC-306 Correct pre-1970 timestamps that were off by one second.

Fixes #220

Signed-off-by: Owen O'Malley <omal...@apache.org>

Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/9c105b92
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/9c105b92
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/9c105b92

Branch: refs/heads/master
Commit: 9c105b92a0c2ab9c624b7bffd3c8b3a91d892175
Parents: 51b6b6c
Author: Owen O'Malley <omal...@apache.org>
Authored: Mon Feb 26 15:27:52 2018 -0800
Committer: Owen O'Malley <omal...@apache.org>
Committed: Fri Mar 2 10:49:30 2018 -0800

----------------------------------------------------------------------
 .../org/apache/orc/impl/TreeReaderFactory.java | 12 ++--
 .../orc/impl/writer/TimestampTreeWriter.java | 31 ++++++----
 .../test/org/apache/orc/TestVectorOrcFile.java | 59 +++++++++++---------
 3 files changed, 60 insertions(+), 42 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/orc/blob/9c105b92/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
index 1891737..b33ad11 100644
--- a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
+++ b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
@@ -19,6 +19,7 @@ package org.apache.orc.impl;
 
 import java.io.EOFException;
 import java.io.IOException;
+import java.sql.Timestamp;
 import java.text.DateFormat;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
@@ -49,7 +50,6 @@ import org.apache.orc.impl.writer.TimestampTreeWriter;
  * Factory for creating ORC tree readers.
*/ public class TreeReaderFactory { - public interface Context { SchemaEvolution getSchemaEvolution(); @@ -977,12 +977,12 @@ public class TreeReaderFactory { for (int i = 0; i < batchSize; i++) { if (result.noNulls || !result.isNull[i]) { - long millis = data.next() + base_timestamp; - int newNanos = parseNanos(nanos.next()); - if (millis < 0 && newNanos != 0) { - millis -= 1; + final int newNanos = parseNanos(nanos.next()); + long millis = (data.next() + base_timestamp) + * TimestampTreeWriter.MILLIS_PER_SECOND + newNanos / 1_000_000; + if (millis < 0 && newNanos > 999_999) { + millis -= TimestampTreeWriter.MILLIS_PER_SECOND; } - millis *= TimestampTreeWriter.MILLIS_PER_SECOND; long offset = 0; // If reader and writer time zones have different rules, adjust the timezone difference // between reader and writer taking day light savings into account. http://git-wip-us.apache.org/repos/asf/orc/blob/9c105b92/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java index fae108e..1694ca1 100644 --- a/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java +++ b/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java @@ -75,11 +75,16 @@ public class TimestampTreeWriter extends TreeWriterBase { int length) throws IOException { super.writeBatch(vector, offset, length); TimestampColumnVector vec = (TimestampColumnVector) vector; - Timestamp val; if (vector.isRepeating) { if (vector.noNulls || !vector.isNull[0]) { - val = vec.asScratchTimestamp(0); - long millis = val.getTime(); + // ignore the bottom three digits from the vec.time field + final long secs = vec.time[0] / MILLIS_PER_SECOND; + final int newNanos = vec.nanos[0]; + // set the millis based on the top three digits of the nanos + long millis = secs 
* MILLIS_PER_SECOND + newNanos / 1_000_000; + if (millis < 0 && newNanos > 999_999) { + millis -= MILLIS_PER_SECOND; + } long utc = SerializationUtils.convertToUtc(localTimezone, millis); indexStatistics.updateTimestamp(utc); if (createBloomFilter) { @@ -88,22 +93,26 @@ public class TimestampTreeWriter extends TreeWriterBase { } bloomFilterUtf8.addLong(utc); } - final long secs = millis / MILLIS_PER_SECOND - baseEpochSecsLocalTz; - final long nano = formatNanos(val.getNanos()); + final long nano = formatNanos(vec.nanos[0]); for (int i = 0; i < length; ++i) { - seconds.write(secs); + seconds.write(secs - baseEpochSecsLocalTz); nanos.write(nano); } } } else { for (int i = 0; i < length; ++i) { if (vec.noNulls || !vec.isNull[i + offset]) { - val = vec.asScratchTimestamp(i + offset); - long millis = val.getTime(); - long secs = millis / MILLIS_PER_SECOND - baseEpochSecsLocalTz; + // ignore the bottom three digits from the vec.time field + final long secs = vec.time[i + offset] / MILLIS_PER_SECOND; + final int newNanos = vec.nanos[i + offset]; + // set the millis based on the top three digits of the nanos + long millis = secs * MILLIS_PER_SECOND + newNanos / 1_000_000; + if (millis < 0 && newNanos > 999_999) { + millis -= MILLIS_PER_SECOND; + } long utc = SerializationUtils.convertToUtc(localTimezone, millis); - seconds.write(secs); - nanos.write(formatNanos(val.getNanos())); + seconds.write(secs - baseEpochSecsLocalTz); + nanos.write(formatNanos(newNanos)); indexStatistics.updateTimestamp(utc); if (createBloomFilter) { if (bloomFilter != null) { http://git-wip-us.apache.org/repos/asf/orc/blob/9c105b92/java/core/src/test/org/apache/orc/TestVectorOrcFile.java ---------------------------------------------------------------------- diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java index 3992d4e..b254fb1 100644 --- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java +++ 
b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java @@ -1362,18 +1362,34 @@ public class TestVectorOrcFile { .blockPadding(false)); VectorizedRowBatch batch = schema.createRowBatch(); batch.size = 1000; + TimestampColumnVector timestampColVector = (TimestampColumnVector) batch.cols[0]; for (int year = minYear; year < maxYear; ++year) { - for (int ms = 1000; ms < 2000; ++ms) { - TimestampColumnVector timestampColVector = (TimestampColumnVector) batch.cols[0]; - timestampColVector.set(ms - 1000, - Timestamp.valueOf(year + - "-05-05 12:34:56." + ms)); - ((LongColumnVector) batch.cols[1]).vector[ms - 1000] = - new DateWritable(new Date(year - 1900, 11, 25)).getDays(); + for (int row = 0; row < 1000; ++row) { + String timeStr = String.format("%04d-05-05 12:34:56.%04d", year, 2*row); + timestampColVector.set(row, Timestamp.valueOf(timeStr)); } + ((LongColumnVector) batch.cols[1]).vector[0] = + new DateWritable(new Date(year - 1900, 11, 25)).getDays(); + batch.cols[1].isRepeating = true; writer.addRowBatch(batch); } + + // add one more row to check the statistics for the jvm bug case + batch.size = 1; + String timeStr = String.format("%04d-12-12 12:34:56.0001", maxYear-1); + timestampColVector.set(0, Timestamp.valueOf(timeStr)); + writer.addRowBatch(batch); writer.close(); + + // check the stats to make sure they match up to the millisecond + ColumnStatistics[] stats = writer.getStatistics(); + TimestampColumnStatistics tsStat = (TimestampColumnStatistics) stats[1]; + assertEquals(String.format("%04d-12-12 12:34:56.0", maxYear - 1), + tsStat.getMaximum().toString()); + assertEquals(String.format("%04d-05-05 12:34:56.0", minYear), + tsStat.getMinimum().toString()); + + // read back the rows Reader reader = OrcFile.createReader(file, OrcFile.readerOptions(conf)); RecordReader rows = reader.rows(); @@ -1383,27 +1399,20 @@ public class TestVectorOrcFile { for (int year = minYear; year < maxYear; ++year) { rows.nextBatch(batch); assertEquals(1000, batch.size); - for(int 
ms = 1000; ms < 2000; ++ms) { - StringBuilder buffer = new StringBuilder(); - times.stringifyValue(buffer, ms - 1000); - String expected = Integer.toString(year) + "-05-05 12:34:56."; - // suppress the final zeros on the string by dividing by the largest - // power of 10 that divides evenly. - int roundedMs = ms; - for(int round = 1000; round > 0; round /= 10) { - if (ms % round == 0) { - roundedMs = ms / round; - break; - } - } - expected += roundedMs; - assertEquals(expected, buffer.toString()); - assertEquals(Integer.toString(year) + "-12-25", - new DateWritable((int) dates.vector[ms - 1000]).toString()); + for(int row = 0; row < 1000; ++row) { + Timestamp expected = Timestamp.valueOf( + String.format("%04d-05-05 12:34:56.%04d", year, 2*row)); + assertEquals("ms row " + row + " " + expected, expected.getTime(), + times.time[row]); + assertEquals("nanos row " + row + " " + expected, expected.getNanos(), + times.nanos[row]); + assertEquals("year " + year + " row " + row, + Integer.toString(year) + "-12-25", + new DateWritable((int) dates.vector[row]).toString()); } } rows.nextBatch(batch); - assertEquals(0, batch.size); + assertEquals(1, batch.size); } @Test