Repository: orc
Updated Branches:
  refs/heads/branch-1.4 084ddbc78 -> 6c4865ad9
ORC-306 Correct pre-1970 timestamps that were off by one second.

Fixes #220

Signed-off-by: Owen O'Malley <omal...@apache.org>

Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/6c4865ad
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/6c4865ad
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/6c4865ad

Branch: refs/heads/branch-1.4
Commit: 6c4865ad9acb75c35d97206f31b4dd9e0a3a7cb4
Parents: 084ddbc
Author: Owen O'Malley <omal...@apache.org>
Authored: Mon Feb 26 15:27:52 2018 -0800
Committer: Owen O'Malley <omal...@apache.org>
Committed: Tue Mar 20 15:26:06 2018 -0700

----------------------------------------------------------------------
 .../org/apache/orc/impl/TreeReaderFactory.java | 12 ++--
 .../java/org/apache/orc/impl/WriterImpl.java | 31 ++++++----
 .../test/org/apache/orc/TestVectorOrcFile.java | 59 +++++++++++---------
 3 files changed, 60 insertions(+), 42 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/orc/blob/6c4865ad/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
index 9649be9..08a4359 100644
--- a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
+++ b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
@@ -19,6 +19,7 @@ package org.apache.orc.impl;
 import java.io.EOFException;
 import java.io.IOException;
+import java.sql.Timestamp;
 import java.text.DateFormat;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
@@ -48,7 +49,6 @@ import org.apache.orc.OrcProto;
  * Factory for creating ORC tree readers.
*/ public class TreeReaderFactory { - public interface Context { SchemaEvolution getSchemaEvolution(); @@ -975,12 +975,12 @@ public class TreeReaderFactory { for (int i = 0; i < batchSize; i++) { if (result.noNulls || !result.isNull[i]) { - long millis = data.next() + base_timestamp; - int newNanos = parseNanos(nanos.next()); - if (millis < 0 && newNanos != 0) { - millis -= 1; + final int newNanos = parseNanos(nanos.next()); + long millis = (data.next() + base_timestamp) + * WriterImpl.MILLIS_PER_SECOND + newNanos / 1_000_000; + if (millis < 0 && newNanos > 999_999) { + millis -= WriterImpl.MILLIS_PER_SECOND; } - millis *= WriterImpl.MILLIS_PER_SECOND; long offset = 0; // If reader and writer time zones have different rules, adjust the timezone difference // between reader and writer taking day light savings into account. http://git-wip-us.apache.org/repos/asf/orc/blob/6c4865ad/java/core/src/java/org/apache/orc/impl/WriterImpl.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java b/java/core/src/java/org/apache/orc/impl/WriterImpl.java index 32820e1..abd398e 100644 --- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java +++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java @@ -1797,11 +1797,16 @@ public class WriterImpl implements Writer, MemoryManager.Callback { int length) throws IOException { super.writeBatch(vector, offset, length); TimestampColumnVector vec = (TimestampColumnVector) vector; - Timestamp val; if (vector.isRepeating) { if (vector.noNulls || !vector.isNull[0]) { - val = vec.asScratchTimestamp(0); - long millis = val.getTime(); + // ignore the bottom three digits from the vec.time field + final long secs = vec.time[0] / MILLIS_PER_SECOND; + final int newNanos = vec.nanos[0]; + // set the millis based on the top three digits of the nanos + long millis = secs * MILLIS_PER_SECOND + newNanos / 1_000_000; + if (millis < 0 && newNanos > 999_999) { + 
millis -= MILLIS_PER_SECOND; + } long utc = SerializationUtils.convertToUtc(localTimezone, millis); indexStatistics.updateTimestamp(utc); if (createBloomFilter) { @@ -1810,22 +1815,26 @@ public class WriterImpl implements Writer, MemoryManager.Callback { } bloomFilterUtf8.addLong(utc); } - final long secs = millis / MILLIS_PER_SECOND - baseEpochSecsLocalTz; - final long nano = formatNanos(val.getNanos()); + final long nano = formatNanos(vec.nanos[0]); for(int i=0; i < length; ++i) { - seconds.write(secs); + seconds.write(secs - baseEpochSecsLocalTz); nanos.write(nano); } } } else { for(int i=0; i < length; ++i) { if (vec.noNulls || !vec.isNull[i + offset]) { - val = vec.asScratchTimestamp(i + offset); - long millis = val.getTime(); - long secs = millis / MILLIS_PER_SECOND - baseEpochSecsLocalTz; + // ignore the bottom three digits from the vec.time field + final long secs = vec.time[i + offset] / MILLIS_PER_SECOND; + final int newNanos = vec.nanos[i + offset]; + // set the millis based on the top three digits of the nanos + long millis = secs * MILLIS_PER_SECOND + newNanos / 1_000_000; + if (millis < 0 && newNanos > 999_999) { + millis -= MILLIS_PER_SECOND; + } long utc = SerializationUtils.convertToUtc(localTimezone, millis); - seconds.write(secs); - nanos.write(formatNanos(val.getNanos())); + seconds.write(secs - baseEpochSecsLocalTz); + nanos.write(formatNanos(newNanos)); indexStatistics.updateTimestamp(utc); if (createBloomFilter) { if (bloomFilter != null) { http://git-wip-us.apache.org/repos/asf/orc/blob/6c4865ad/java/core/src/test/org/apache/orc/TestVectorOrcFile.java ---------------------------------------------------------------------- diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java index 4ca4a40..182f969 100644 --- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java +++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java @@ -1362,18 +1362,34 @@ public class 
TestVectorOrcFile { .blockPadding(false)); VectorizedRowBatch batch = schema.createRowBatch(); batch.size = 1000; + TimestampColumnVector timestampColVector = (TimestampColumnVector) batch.cols[0]; for (int year = minYear; year < maxYear; ++year) { - for (int ms = 1000; ms < 2000; ++ms) { - TimestampColumnVector timestampColVector = (TimestampColumnVector) batch.cols[0]; - timestampColVector.set(ms - 1000, - Timestamp.valueOf(year + - "-05-05 12:34:56." + ms)); - ((LongColumnVector) batch.cols[1]).vector[ms - 1000] = - new DateWritable(new Date(year - 1900, 11, 25)).getDays(); + for (int row = 0; row < 1000; ++row) { + String timeStr = String.format("%04d-05-05 12:34:56.%04d", year, 2*row); + timestampColVector.set(row, Timestamp.valueOf(timeStr)); } + ((LongColumnVector) batch.cols[1]).vector[0] = + new DateWritable(new Date(year - 1900, 11, 25)).getDays(); + batch.cols[1].isRepeating = true; writer.addRowBatch(batch); } + + // add one more row to check the statistics for the jvm bug case + batch.size = 1; + String timeStr = String.format("%04d-12-12 12:34:56.0001", maxYear-1); + timestampColVector.set(0, Timestamp.valueOf(timeStr)); + writer.addRowBatch(batch); writer.close(); + + // check the stats to make sure they match up to the millisecond + ColumnStatistics[] stats = writer.getStatistics(); + TimestampColumnStatistics tsStat = (TimestampColumnStatistics) stats[1]; + assertEquals(String.format("%04d-12-12 12:34:56.0", maxYear - 1), + tsStat.getMaximum().toString()); + assertEquals(String.format("%04d-05-05 12:34:56.0", minYear), + tsStat.getMinimum().toString()); + + // read back the rows Reader reader = OrcFile.createReader(file, OrcFile.readerOptions(conf)); RecordReader rows = reader.rows(); @@ -1383,27 +1399,20 @@ public class TestVectorOrcFile { for (int year = minYear; year < maxYear; ++year) { rows.nextBatch(batch); assertEquals(1000, batch.size); - for(int ms = 1000; ms < 2000; ++ms) { - StringBuilder buffer = new StringBuilder(); - 
times.stringifyValue(buffer, ms - 1000); - String expected = Integer.toString(year) + "-05-05 12:34:56."; - // suppress the final zeros on the string by dividing by the largest - // power of 10 that divides evenly. - int roundedMs = ms; - for(int round = 1000; round > 0; round /= 10) { - if (ms % round == 0) { - roundedMs = ms / round; - break; - } - } - expected += roundedMs; - assertEquals(expected, buffer.toString()); - assertEquals(Integer.toString(year) + "-12-25", - new DateWritable((int) dates.vector[ms - 1000]).toString()); + for(int row = 0; row < 1000; ++row) { + Timestamp expected = Timestamp.valueOf( + String.format("%04d-05-05 12:34:56.%04d", year, 2*row)); + assertEquals("ms row " + row + " " + expected, expected.getTime(), + times.time[row]); + assertEquals("nanos row " + row + " " + expected, expected.getNanos(), + times.nanos[row]); + assertEquals("year " + year + " row " + row, + Integer.toString(year) + "-12-25", + new DateWritable((int) dates.vector[row]).toString()); } } rows.nextBatch(batch); - assertEquals(0, batch.size); + assertEquals(1, batch.size); } @Test