This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-1.7
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/branch-1.7 by this push:
     new 1740c82a6 ORC-1151: [C++] Fix ColumnWriter for non-UTC Timestamp 
columns (#1088)
1740c82a6 is described below

commit 1740c82a6e669d653e507421fce8039d2bade17f
Author: noirello <[email protected]>
AuthorDate: Tue Apr 19 06:56:25 2022 +0200

    ORC-1151: [C++] Fix ColumnWriter for non-UTC Timestamp columns (#1088)
    
    ### What changes were proposed in this pull request?
    Fix the conversion of non-UTC timestamps for statistics: pass the seconds
    value (not milliseconds) to `convertToUTC`, then scale to milliseconds.
    
    ### Why are the changes needed?
    Currently, the statistics for timestamp columns are incorrect when the 
writer's time zone is not UTC.
    
    ### How was this patch tested?
    Ran the existing test cases and added `testTimezoneUTC` and
    `testTimezoneNonUTC` to `TestTimestampStatistics.cc`.
    
    (cherry picked from commit 9042421d99ead64aecf6e005fdd6abec328f6301)
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 c++/src/ColumnWriter.cc             |  2 +-
 c++/test/TestTimestampStatistics.cc | 97 +++++++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+), 1 deletion(-)

diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index a25959490..22d5a9621 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -1809,7 +1809,7 @@ namespace orc {
         // TimestampVectorBatch already stores data in UTC
         int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000;
         if (!isUTC) {
-          millsUTC = timezone.convertToUTC(millsUTC);
+          millsUTC = timezone.convertToUTC(secs[i]) * 1000 + nanos[i] / 
1000000;
         }
         ++count;
         if (enableBloomFilter) {
diff --git a/c++/test/TestTimestampStatistics.cc 
b/c++/test/TestTimestampStatistics.cc
index 302ef9b07..ac9744363 100644
--- a/c++/test/TestTimestampStatistics.cc
+++ b/c++/test/TestTimestampStatistics.cc
@@ -21,11 +21,16 @@
 
 #include "Adaptor.hh"
 
+#include "MemoryInputStream.hh"
+#include "MemoryOutputStream.hh"
+
 #include "wrap/gmock.h"
 #include "wrap/gtest-wrapper.h"
 
 namespace orc {
 
+  static const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M
+
   TEST(TestTimestampStatistics, testOldFile) {
 
     std::stringstream ss;
@@ -57,4 +62,96 @@ namespace orc {
     EXPECT_EQ("Data type: Timestamp\nValues: 12\nHas null: no\nMinimum: 
1995-01-01 00:00:00.688\nLowerBound: 1995-01-01 00:00:00.688\nMaximum: 
2037-01-01 00:00:00.0\nUpperBound: 2037-01-01 00:00:00.1\n", 
stripeColStats->toString());
   }
 
+  TEST(TestTimestampStatistics, testTimezoneUTC) {
+    MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+    MemoryPool *pool = getDefaultPool();
+    std::unique_ptr<Type> 
type(Type::buildTypeFromString("struct<col:timestamp>"));
+    WriterOptions wOptions;
+    wOptions.setMemoryPool(pool);
+    std::unique_ptr<Writer> writer = createWriter(*type, &memStream, wOptions);
+    std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(1024);
+    StructVectorBatch *root = dynamic_cast<StructVectorBatch *>(batch.get());
+    TimestampVectorBatch *col = dynamic_cast<orc::TimestampVectorBatch 
*>(root->fields[0]);
+
+    int64_t expectedMinMillis = 1650133963321; // 2022-04-16T18:32:43.321+00:00
+    int64_t expectedMaxMillis = 1650133964321; // 2022-04-16T18:32:44.321+00:00
+
+    col->data[0] = expectedMinMillis / 1000;
+    col->nanoseconds[0] = expectedMinMillis % 1000 * 1000000;
+    col->data[1] = expectedMaxMillis / 1000;
+    col->nanoseconds[1] = expectedMaxMillis % 1000 * 1000000;
+    col->numElements = 2;
+    root->numElements = 2;
+
+    writer->add(*batch);
+    writer->close();
+
+    std::unique_ptr<InputStream> inStream(new MemoryInputStream(
+      memStream.getData(), memStream.getLength()));
+    ReaderOptions rOptions;
+    rOptions.setMemoryPool(*pool);
+    std::unique_ptr<Reader> reader = createReader(std::move(inStream), 
rOptions);
+
+    std::unique_ptr<StripeStatistics> stripeStats = 
reader->getStripeStatistics(0);
+    const TimestampColumnStatistics* stripeColStats =
+      reinterpret_cast<const 
TimestampColumnStatistics*>(stripeStats->getColumnStatistics(1));
+
+    EXPECT_TRUE(stripeColStats->hasLowerBound());
+    EXPECT_TRUE(stripeColStats->hasUpperBound());
+    EXPECT_TRUE(stripeColStats->hasMinimum());
+    EXPECT_TRUE(stripeColStats->hasMaximum());
+    EXPECT_EQ(stripeColStats->getMinimum(), expectedMinMillis);
+    EXPECT_EQ(stripeColStats->getMaximum(), expectedMaxMillis);
+    EXPECT_EQ(stripeColStats->getLowerBound(), expectedMinMillis);
+    EXPECT_EQ(stripeColStats->getUpperBound(), expectedMaxMillis + 1);
+  }
+
+  TEST(TestTimestampStatistics, testTimezoneNonUTC) {
+    MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+    MemoryPool *pool = getDefaultPool();
+    std::unique_ptr<Type> 
type(Type::buildTypeFromString("struct<col:timestamp>"));
+    WriterOptions wOptions;
+    wOptions.setMemoryPool(pool);
+    wOptions.setTimezoneName("America/Los_Angeles");
+    std::unique_ptr<Writer> writer = createWriter(*type, &memStream, wOptions);
+    std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(1024);
+    StructVectorBatch *root = dynamic_cast<StructVectorBatch *>(batch.get());
+    TimestampVectorBatch *col = dynamic_cast<orc::TimestampVectorBatch 
*>(root->fields[0]);
+
+    int64_t minMillis = 1650133963321; // 2022-04-16T18:32:43.321+00:00
+    int64_t maxMillis = 1650133964321; // 2022-04-16T18:32:44.321+00:00
+
+    col->data[0] = minMillis / 1000;
+    col->nanoseconds[0] = minMillis % 1000 * 1000000;
+    col->data[1] = maxMillis / 1000;
+    col->nanoseconds[1] = maxMillis % 1000 * 1000000;
+    col->numElements = 2;
+    root->numElements = 2;
+
+    writer->add(*batch);
+    writer->close();
+
+    std::unique_ptr<InputStream> inStream(new MemoryInputStream(
+      memStream.getData(), memStream.getLength()));
+    ReaderOptions rOptions;
+    rOptions.setMemoryPool(*pool);
+    std::unique_ptr<Reader> reader = createReader(std::move(inStream), 
rOptions);
+
+    std::unique_ptr<StripeStatistics> stripeStats = 
reader->getStripeStatistics(0);
+    const TimestampColumnStatistics* stripeColStats =
+      reinterpret_cast<const 
TimestampColumnStatistics*>(stripeStats->getColumnStatistics(1));
+
+    int64_t expectedMaxMillis = 1650108764321; // 2022-04-16T11:32:44.321+00:00
+    int64_t expectedMinMillis = 1650108763321; // 2022-04-16T11:32:43.321+00:00
+
+    EXPECT_TRUE(stripeColStats->hasLowerBound());
+    EXPECT_TRUE(stripeColStats->hasUpperBound());
+    EXPECT_TRUE(stripeColStats->hasMinimum());
+    EXPECT_TRUE(stripeColStats->hasMaximum());
+    EXPECT_EQ(stripeColStats->getMinimum(), expectedMinMillis);
+    EXPECT_EQ(stripeColStats->getMaximum(), expectedMaxMillis);
+    EXPECT_EQ(stripeColStats->getLowerBound(), expectedMinMillis);
+    EXPECT_EQ(stripeColStats->getUpperBound(), expectedMaxMillis + 1);
+  }
+
 }  // namespace

Reply via email to