This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-1.7
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-1.7 by this push:
new 1740c82a6 ORC-1151: [C++] Fix ColumnWriter for non-UTC Timestamp
columns (#1088)
1740c82a6 is described below
commit 1740c82a6e669d653e507421fce8039d2bade17f
Author: noirello <[email protected]>
AuthorDate: Tue Apr 19 06:56:25 2022 +0200
ORC-1151: [C++] Fix ColumnWriter for non-UTC Timestamp columns (#1088)
### What changes were proposed in this pull request?
Fix the conversion of non-UTC timestamps when computing column statistics.
### Why are the changes needed?
Currently, the statistics for timestamp columns are incorrect when the
writer's time zone is not UTC.
### How was this patch tested?
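Ran the existing test cases and added new test cases covering UTC and non-UTC writer time zones (testTimezoneUTC, testTimezoneNonUTC).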
(cherry picked from commit 9042421d99ead64aecf6e005fdd6abec328f6301)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
c++/src/ColumnWriter.cc | 2 +-
c++/test/TestTimestampStatistics.cc | 97 +++++++++++++++++++++++++++++++++++++
2 files changed, 98 insertions(+), 1 deletion(-)
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index a25959490..22d5a9621 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -1809,7 +1809,7 @@ namespace orc {
// TimestampVectorBatch already stores data in UTC
int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000;
if (!isUTC) {
- millsUTC = timezone.convertToUTC(millsUTC);
+ millsUTC = timezone.convertToUTC(secs[i]) * 1000 + nanos[i] /
1000000;
}
++count;
if (enableBloomFilter) {
diff --git a/c++/test/TestTimestampStatistics.cc
b/c++/test/TestTimestampStatistics.cc
index 302ef9b07..ac9744363 100644
--- a/c++/test/TestTimestampStatistics.cc
+++ b/c++/test/TestTimestampStatistics.cc
@@ -21,11 +21,16 @@
#include "Adaptor.hh"
+#include "MemoryInputStream.hh"
+#include "MemoryOutputStream.hh"
+
#include "wrap/gmock.h"
#include "wrap/gtest-wrapper.h"
namespace orc {
+ static const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M
+
TEST(TestTimestampStatistics, testOldFile) {
std::stringstream ss;
@@ -57,4 +62,96 @@ namespace orc {
EXPECT_EQ("Data type: Timestamp\nValues: 12\nHas null: no\nMinimum:
1995-01-01 00:00:00.688\nLowerBound: 1995-01-01 00:00:00.688\nMaximum:
2037-01-01 00:00:00.0\nUpperBound: 2037-01-01 00:00:00.1\n",
stripeColStats->toString());
}
+ TEST(TestTimestampStatistics, testTimezoneUTC) {
+ MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+ MemoryPool *pool = getDefaultPool();
+ std::unique_ptr<Type>
type(Type::buildTypeFromString("struct<col:timestamp>"));
+ WriterOptions wOptions;
+ wOptions.setMemoryPool(pool);
+ std::unique_ptr<Writer> writer = createWriter(*type, &memStream, wOptions);
+ std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(1024);
+ StructVectorBatch *root = dynamic_cast<StructVectorBatch *>(batch.get());
+ TimestampVectorBatch *col = dynamic_cast<orc::TimestampVectorBatch
*>(root->fields[0]);
+
+ int64_t expectedMinMillis = 1650133963321; // 2022-04-16T18:32:43.321+00:00
+ int64_t expectedMaxMillis = 1650133964321; // 2022-04-16T18:32:44.321+00:00
+
+ col->data[0] = expectedMinMillis / 1000;
+ col->nanoseconds[0] = expectedMinMillis % 1000 * 1000000;
+ col->data[1] = expectedMaxMillis / 1000;
+ col->nanoseconds[1] = expectedMaxMillis % 1000 * 1000000;
+ col->numElements = 2;
+ root->numElements = 2;
+
+ writer->add(*batch);
+ writer->close();
+
+ std::unique_ptr<InputStream> inStream(new MemoryInputStream(
+ memStream.getData(), memStream.getLength()));
+ ReaderOptions rOptions;
+ rOptions.setMemoryPool(*pool);
+ std::unique_ptr<Reader> reader = createReader(std::move(inStream),
rOptions);
+
+ std::unique_ptr<StripeStatistics> stripeStats =
reader->getStripeStatistics(0);
+ const TimestampColumnStatistics* stripeColStats =
+ reinterpret_cast<const
TimestampColumnStatistics*>(stripeStats->getColumnStatistics(1));
+
+ EXPECT_TRUE(stripeColStats->hasLowerBound());
+ EXPECT_TRUE(stripeColStats->hasUpperBound());
+ EXPECT_TRUE(stripeColStats->hasMinimum());
+ EXPECT_TRUE(stripeColStats->hasMaximum());
+ EXPECT_EQ(stripeColStats->getMinimum(), expectedMinMillis);
+ EXPECT_EQ(stripeColStats->getMaximum(), expectedMaxMillis);
+ EXPECT_EQ(stripeColStats->getLowerBound(), expectedMinMillis);
+ EXPECT_EQ(stripeColStats->getUpperBound(), expectedMaxMillis + 1);
+ }
+
+ TEST(TestTimestampStatistics, testTimezoneNonUTC) {
+ MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+ MemoryPool *pool = getDefaultPool();
+ std::unique_ptr<Type>
type(Type::buildTypeFromString("struct<col:timestamp>"));
+ WriterOptions wOptions;
+ wOptions.setMemoryPool(pool);
+ wOptions.setTimezoneName("America/Los_Angeles");
+ std::unique_ptr<Writer> writer = createWriter(*type, &memStream, wOptions);
+ std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(1024);
+ StructVectorBatch *root = dynamic_cast<StructVectorBatch *>(batch.get());
+ TimestampVectorBatch *col = dynamic_cast<orc::TimestampVectorBatch
*>(root->fields[0]);
+
+ int64_t minMillis = 1650133963321; // 2022-04-16T18:32:43.321+00:00
+ int64_t maxMillis = 1650133964321; // 2022-04-16T18:32:44.321+00:00
+
+ col->data[0] = minMillis / 1000;
+ col->nanoseconds[0] = minMillis % 1000 * 1000000;
+ col->data[1] = maxMillis / 1000;
+ col->nanoseconds[1] = maxMillis % 1000 * 1000000;
+ col->numElements = 2;
+ root->numElements = 2;
+
+ writer->add(*batch);
+ writer->close();
+
+ std::unique_ptr<InputStream> inStream(new MemoryInputStream(
+ memStream.getData(), memStream.getLength()));
+ ReaderOptions rOptions;
+ rOptions.setMemoryPool(*pool);
+ std::unique_ptr<Reader> reader = createReader(std::move(inStream),
rOptions);
+
+ std::unique_ptr<StripeStatistics> stripeStats =
reader->getStripeStatistics(0);
+ const TimestampColumnStatistics* stripeColStats =
+ reinterpret_cast<const
TimestampColumnStatistics*>(stripeStats->getColumnStatistics(1));
+
+ int64_t expectedMaxMillis = 1650108764321; // 2022-04-16T11:32:44.321+00:00
+ int64_t expectedMinMillis = 1650108763321; // 2022-04-16T11:32:43.321+00:00
+
+ EXPECT_TRUE(stripeColStats->hasLowerBound());
+ EXPECT_TRUE(stripeColStats->hasUpperBound());
+ EXPECT_TRUE(stripeColStats->hasMinimum());
+ EXPECT_TRUE(stripeColStats->hasMaximum());
+ EXPECT_EQ(stripeColStats->getMinimum(), expectedMinMillis);
+ EXPECT_EQ(stripeColStats->getMaximum(), expectedMaxMillis);
+ EXPECT_EQ(stripeColStats->getLowerBound(), expectedMinMillis);
+ EXPECT_EQ(stripeColStats->getUpperBound(), expectedMaxMillis + 1);
+ }
+
} // namespace