This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 16712afa8d GH-48246: [C++][Parquet] Fix pre-1970 INT96 timestamps 
roundtrip (#48247)
16712afa8d is described below

commit 16712afa8d44303c29d6efd0db05adcbb1783016
Author: Zehua Zou <[email protected]>
AuthorDate: Mon Dec 1 10:41:39 2025 +0800

    GH-48246: [C++][Parquet] Fix pre-1970 INT96 timestamps roundtrip (#48247)
    
    ### Rationale for this change
    Parquet should return the same timestamps as written.
    
    ### What changes are included in this PR?
    `ArrowTimestampToImpalaTimestamp` will adjust the calculation based on 
whether the timestamp is negative
    
    ### Are these changes tested?
    Yes.
    
    ### Are there any user-facing changes?
    No.
    * GitHub Issue: #48246
    
    Authored-by: Zehua Zou <[email protected]>
    Signed-off-by: Gang Wu <[email protected]>
---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 29 ++++++++++++++++++-----
 cpp/src/parquet/column_writer.h                   | 13 ++++++----
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc 
b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index cd69b2f946..18581e7260 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -28,6 +28,7 @@
 #include <functional>
 #include <set>
 #include <sstream>
+#include <utility>
 #include <vector>
 
 #include "arrow/array/builder_binary.h"
@@ -74,6 +75,7 @@
 #include "parquet/page_index.h"
 #include "parquet/properties.h"
 #include "parquet/test_util.h"
+#include "parquet/types.h"
 
 using arrow::Array;
 using arrow::ArrayData;
@@ -4149,14 +4151,29 @@ INSTANTIATE_TEST_SUITE_P(Repetition_type, 
TestNestedSchemaRead,
                          ::testing::Values(Repetition::REQUIRED, 
Repetition::OPTIONAL));
 
 TEST(TestImpalaConversion, ArrowTimestampToImpalaTimestamp) {
-  // June 20, 2017 16:32:56 and 123456789 nanoseconds
-  int64_t nanoseconds = INT64_C(1497976376123456789);
+  std::vector<std::pair<int64_t, Int96>> test_cases = {
+      // June 20, 2017 16:32:56 and 123456789 nanoseconds
+      {INT64_C(1497976376123456789),
+       {{UINT32_C(632093973), UINT32_C(13871), UINT32_C(2457925)}}},
+      // January 1, 1970 00:00:00 and 000000000 nanoseconds
+      {INT64_C(0), {{UINT32_C(0), UINT32_C(0), UINT32_C(2440588)}}},
+      // December 31, 1969 23:59:59 and 999999000 nanoseconds
+      {INT64_C(-1000), {{UINT32_C(2437872664), UINT32_C(20116), 
UINT32_C(2440587)}}},
+      // December 31, 1969 00:00:00 and 000000000 nanoseconds
+      {INT64_C(-86400000000000), {{UINT32_C(0), UINT32_C(0), 
UINT32_C(2440587)}}},
+      // January 1, 1970 00:00:00 and 000001000 nanoseconds
+      {INT64_C(1000), {{UINT32_C(1000), UINT32_C(0), UINT32_C(2440588)}}},
+      // January 2, 1970 00:00:00 and 000000000 nanoseconds
+      {INT64_C(86400000000000), {{UINT32_C(0), UINT32_C(0), 
UINT32_C(2440589)}}},
+  };
 
-  Int96 calculated;
+  for (auto& [timestamp, impala_timestamp] : test_cases) {
+    ASSERT_EQ(timestamp, ::parquet::Int96GetNanoSeconds(impala_timestamp));
 
-  Int96 expected = {{UINT32_C(632093973), UINT32_C(13871), UINT32_C(2457925)}};
-  ::parquet::internal::NanosecondsToImpalaTimestamp(nanoseconds, &calculated);
-  ASSERT_EQ(expected, calculated);
+    Int96 calculated;
+    ::parquet::internal::NanosecondsToImpalaTimestamp(timestamp, &calculated);
+    ASSERT_EQ(impala_timestamp, calculated);
+  }
 }
 
 void TryReadDataFile(const std::string& path,
diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h
index 2a046a0ca5..5b56eb010a 100644
--- a/cpp/src/parquet/column_writer.h
+++ b/cpp/src/parquet/column_writer.h
@@ -259,14 +259,17 @@ constexpr int64_t kJulianEpochOffsetDays = 
INT64_C(2440588);
 
 template <int64_t UnitPerDay, int64_t NanosecondsPerUnit>
 inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* 
impala_timestamp) {
-  int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays;
-  (*impala_timestamp).value[2] = (uint32_t)julian_days;
-
+  auto julian_days = static_cast<int32_t>(time / UnitPerDay + 
kJulianEpochOffsetDays);
   int64_t last_day_units = time % UnitPerDay;
-  auto last_day_nanos = last_day_units * NanosecondsPerUnit;
+  if (last_day_units < 0) {
+    --julian_days;
+    last_day_units += UnitPerDay;
+  }
+  impala_timestamp->value[2] = static_cast<uint32_t>(julian_days);
+  uint64_t last_day_nanos = static_cast<uint64_t>(last_day_units) * 
NanosecondsPerUnit;
   // impala_timestamp will be unaligned every other entry so do memcpy instead
   // of assign and reinterpret cast to avoid undefined behavior.
-  std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t));
+  std::memcpy(impala_timestamp, &last_day_nanos, sizeof(uint64_t));
 }
 
 constexpr int64_t kSecondsInNanos = INT64_C(1000000000);

Reply via email to