[ https://issues.apache.org/jira/browse/ORC-322?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Gang Wu updated ORC-322: ------------------------ Fix Version/s: 1.5.0 > c++ writer should not adjust gmtOffset when writing timestamps > -------------------------------------------------------------- > > Key: ORC-322 > URL: https://issues.apache.org/jira/browse/ORC-322 > Project: ORC > Issue Type: Bug > Components: C++ > Reporter: Quanlong Huang > Assignee: Gang Wu > Priority: Major > Fix For: 1.5.0 > > > The c++ TimestampColumnWriter will adjust timestamp with gmtOffset: > {code:java} > void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch, > uint64_t offset, > uint64_t numValues) { > ...... > int64_t *secs = tsBatch->data.data() + offset; > int64_t *nanos = tsBatch->nanoseconds.data() + offset; > ...... > bool hasNull = false; > for (uint64_t i = 0; i < numValues; ++i) { > if (notNull == nullptr || notNull[i]) { > // TimestampVectorBatch already stores data in UTC > int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000; > tsStats->increase(1); > tsStats->update(millsUTC); > secs[i] -= timezone.getVariant(secs[i]).gmtOffset; <-- should not > adjust with gmtOffset here > secs[i] -= timezone.getEpoch(); > nanos[i] = formatNano(nanos[i]); > } else if (!hasNull) { > hasNull = true; > } > } > tsStats->setHasNull(hasNull); > secRleEncoder->add(secs, numValues, notNull); > nanoRleEncoder->add(nanos, numValues, notNull); > } > {code} > The java reader doesn't adjust this: > {code:java} > public void writeBatch(ColumnVector vector, int offset, > int length) throws IOException { > super.writeBatch(vector, offset, length); > TimestampColumnVector vec = (TimestampColumnVector) vector; > if (vector.isRepeating) { > ...... > } else { > for (int i = 0; i < length; ++i) { > if (vec.noNulls || !vec.isNull[i + offset]) { > // ignore the bottom three digits from the vec.time field > final long secs = vec.time[i + offset] / MILLIS_PER_SECOND; > final int newNanos = vec.nanos[i + offset]; > // set the millis based on the top three digits of the nanos > long millis = secs * MILLIS_PER_SECOND + newNanos / 1_000_000; > if (millis < 0 && newNanos > 999_999) { > millis -= MILLIS_PER_SECOND; > } > long utc = SerializationUtils.convertToUtc(localTimezone, millis); > seconds.write(secs - baseEpochSecsLocalTz); <-- only adjust with > ORC epoch > nanos.write(formatNanos(newNanos)); > indexStatistics.updateTimestamp(utc); > if (createBloomFilter) { > if (bloomFilter != null) { > bloomFilter.addLong(millis); > } > bloomFilterUtf8.addLong(utc); > } > } > } > } > } > {code} > This is a follow-up of ORC-320. I think there's a wrong assumption in c++ > codes that timestamps given to the writer's TimestampVectorBatch are equal to > timestamps got from the reader's TimestampVectorBatch (see codes in > TestWriter::writeTimestamp). Noticed that there're gmtOffset processing > logics in the c++ TimestampColumnPrinter. > To reveal the bug: > {code:bash} > $ cat /etc/timezone > America/Los_Angeles > $ cat ts.csv # prepare a csv file with two timestamps > 1520762399 > 1520762400 > $ date -d @1520762399 > Sun Mar 11 01:59:59 PST 2018 > $ date -d @1520762400 > Sun Mar 11 03:00:00 PDT 2018 > $ build/tools/src/csv-import "struct<col0:timestamp>" ts.csv ts.orc # > generate orc file by c++ writer > [2018-03-14 21:41:58] Start importing Orc file... > [2018-03-14 21:41:58] Finish importing Orc file. > [2018-03-14 21:41:58] Total writer elasped time: 0.000475s. > [2018-03-14 21:41:58] Total writer CPU time: 0.000474s. > $ build/tools/src/orc-contents ts.orc # read by c++ reader, results > are not we written > {"col0": "2018-03-11 10:59:59.0"} > {"col0": "2018-03-11 10:00:00.0"} > $ java -jar java/tools/target/orc-tools-1.5.0-SNAPSHOT-uber.jar data ts.orc > # read by java reader, results are the same as c++ reader > Processing data file ts.orc [length: 257] > {"col0":"2018-03-11 10:59:59.0"} > {"col0":"2018-03-11 10:00:00.0"} > {code} > Since there're no tools importing timestamps as long, we can use java codes > to generate a correct orc file: > {code:java} > public class TimestampWriter { > public static void main(String[] args) throws IOException { > TypeDescription schema = > TypeDescription.fromString("struct<col0:timestamp>"); > OrcFile.WriterOptions options = OrcFile.writerOptions(new > Configuration()).setSchema(schema); > Writer writer = OrcFile.createWriter(new Path("./ts_java.orc"), options); > VectorizedRowBatch batch = schema.createRowBatch(); > TimestampColumnVector tsc = (TimestampColumnVector) batch.cols[0]; > tsc.time[0] = 1520762399000L; tsc.nanos[0] = 0; // time array is > milliseconds > tsc.time[1] = 1520762400000L; tsc.nanos[1] = 0; > batch.size = 2; > writer.addRowBatch(batch); > writer.close(); > } > } > {code} > Run the codes to generate `ts_java.orc` and verify this orc file: > {code:bash} > $ build/tools/src/orc-contents ts_java.orc > > > {"col0": "2018-03-11 01:59:59.0"} # The correct results > {"col0": "2018-03-11 03:00:00.0"} > $ java -jar java/tools/target/orc-tools-1.5.0-SNAPSHOT-uber.jar data > ts_java.orc > Processing data file ts_java.orc [length: 251] > {"col0":"2018-03-11 01:59:59.0"} # The correct results > {"col0":"2018-03-11 03:00:00.0"} > {code} -- This message was sent by Atlassian JIRA (v7.6.3#76005)