Quanlong Huang created ORC-322:
----------------------------------
Summary: c++ writer should not adjust gmtOffset when writing
timestamps
Key: ORC-322
URL: https://issues.apache.org/jira/browse/ORC-322
Project: ORC
Issue Type: Bug
Components: C++
Reporter: Quanlong Huang
The C++ TimestampColumnWriter adjusts each timestamp by gmtOffset:
{code:c++}
void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
......
int64_t *secs = tsBatch->data.data() + offset;
int64_t *nanos = tsBatch->nanoseconds.data() + offset;
......
bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (notNull == nullptr || notNull[i]) {
// TimestampVectorBatch already stores data in UTC
int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000;
tsStats->increase(1);
tsStats->update(millsUTC);
secs[i] -= timezone.getVariant(secs[i]).gmtOffset; <-- should not
adjust with gmtOffset here
secs[i] -= timezone.getEpoch();
nanos[i] = formatNano(nanos[i]);
} else if (!hasNull) {
hasNull = true;
}
}
tsStats->setHasNull(hasNull);
secRleEncoder->add(secs, numValues, notNull);
nanoRleEncoder->add(nanos, numValues, notNull);
}
{code}
The Java writer doesn't make this adjustment:
{code:java}
public void writeBatch(ColumnVector vector, int offset,
int length) throws IOException {
super.writeBatch(vector, offset, length);
TimestampColumnVector vec = (TimestampColumnVector) vector;
if (vector.isRepeating) {
......
} else {
for (int i = 0; i < length; ++i) {
if (vec.noNulls || !vec.isNull[i + offset]) {
// ignore the bottom three digits from the vec.time field
final long secs = vec.time[i + offset] / MILLIS_PER_SECOND;
final int newNanos = vec.nanos[i + offset];
// set the millis based on the top three digits of the nanos
long millis = secs * MILLIS_PER_SECOND + newNanos / 1_000_000;
if (millis < 0 && newNanos > 999_999) {
millis -= MILLIS_PER_SECOND;
}
long utc = SerializationUtils.convertToUtc(localTimezone, millis);
seconds.write(secs - baseEpochSecsLocalTz); <-- only adjust with
ORC epoch
nanos.write(formatNanos(newNanos));
indexStatistics.updateTimestamp(utc);
if (createBloomFilter) {
if (bloomFilter != null) {
bloomFilter.addLong(millis);
}
bloomFilterUtf8.addLong(utc);
}
}
}
}
}
{code}
This is a follow-up of ORC-320. I think the C++ code wrongly assumes that the
timestamps given to the writer's TimestampVectorBatch are equal to the timestamps
obtained from the reader's TimestampVectorBatch.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)