This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 16712afa8d GH-48246: [C++][Parquet] Fix pre-1970 INT96 timestamps
roundtrip (#48247)
16712afa8d is described below
commit 16712afa8d44303c29d6efd0db05adcbb1783016
Author: Zehua Zou <[email protected]>
AuthorDate: Mon Dec 1 10:41:39 2025 +0800
GH-48246: [C++][Parquet] Fix pre-1970 INT96 timestamps roundtrip (#48247)
### Rationale for this change
Parquet should return the same timestamps as written.
### What changes are included in this PR?
`ArrowTimestampToImpalaTimestamp` will adjust the calculation based on
whether the timestamp is negative
### Are these changes tested?
Yes.
### Are there any user-facing changes?
No.
* GitHub Issue: #48246
Authored-by: Zehua Zou <[email protected]>
Signed-off-by: Gang Wu <[email protected]>
---
cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 29 ++++++++++++++++++-----
cpp/src/parquet/column_writer.h | 13 ++++++----
2 files changed, 31 insertions(+), 11 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index cd69b2f946..18581e7260 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -28,6 +28,7 @@
#include <functional>
#include <set>
#include <sstream>
+#include <utility>
#include <vector>
#include "arrow/array/builder_binary.h"
@@ -74,6 +75,7 @@
#include "parquet/page_index.h"
#include "parquet/properties.h"
#include "parquet/test_util.h"
+#include "parquet/types.h"
using arrow::Array;
using arrow::ArrayData;
@@ -4149,14 +4151,29 @@ INSTANTIATE_TEST_SUITE_P(Repetition_type,
TestNestedSchemaRead,
::testing::Values(Repetition::REQUIRED,
Repetition::OPTIONAL));
TEST(TestImpalaConversion, ArrowTimestampToImpalaTimestamp) {
- // June 20, 2017 16:32:56 and 123456789 nanoseconds
- int64_t nanoseconds = INT64_C(1497976376123456789);
+ std::vector<std::pair<int64_t, Int96>> test_cases = {
+ // June 20, 2017 16:32:56 and 123456789 nanoseconds
+ {INT64_C(1497976376123456789),
+ {{UINT32_C(632093973), UINT32_C(13871), UINT32_C(2457925)}}},
+ // January 1, 1970 00:00:00 and 000000000 nanoseconds
+ {INT64_C(0), {{UINT32_C(0), UINT32_C(0), UINT32_C(2440588)}}},
+ // December 31, 1969 23:59:59 and 999999000 nanoseconds
+ {INT64_C(-1000), {{UINT32_C(2437872664), UINT32_C(20116),
UINT32_C(2440587)}}},
+ // December 31, 1969 00:00:00 and 000000000 nanoseconds
+ {INT64_C(-86400000000000), {{UINT32_C(0), UINT32_C(0),
UINT32_C(2440587)}}},
+ // January 1, 1970 00:00:00 and 000001000 nanoseconds
+ {INT64_C(1000), {{UINT32_C(1000), UINT32_C(0), UINT32_C(2440588)}}},
+ // January 2, 1970 00:00:00 and 000000000 nanoseconds
+ {INT64_C(86400000000000), {{UINT32_C(0), UINT32_C(0),
UINT32_C(2440589)}}},
+ };
- Int96 calculated;
+ for (auto& [timestamp, impala_timestamp] : test_cases) {
+ ASSERT_EQ(timestamp, ::parquet::Int96GetNanoSeconds(impala_timestamp));
- Int96 expected = {{UINT32_C(632093973), UINT32_C(13871), UINT32_C(2457925)}};
- ::parquet::internal::NanosecondsToImpalaTimestamp(nanoseconds, &calculated);
- ASSERT_EQ(expected, calculated);
+ Int96 calculated;
+ ::parquet::internal::NanosecondsToImpalaTimestamp(timestamp, &calculated);
+ ASSERT_EQ(impala_timestamp, calculated);
+ }
}
void TryReadDataFile(const std::string& path,
diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h
index 2a046a0ca5..5b56eb010a 100644
--- a/cpp/src/parquet/column_writer.h
+++ b/cpp/src/parquet/column_writer.h
@@ -259,14 +259,17 @@ constexpr int64_t kJulianEpochOffsetDays =
INT64_C(2440588);
template <int64_t UnitPerDay, int64_t NanosecondsPerUnit>
inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96*
impala_timestamp) {
- int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays;
- (*impala_timestamp).value[2] = (uint32_t)julian_days;
-
+ auto julian_days = static_cast<int32_t>(time / UnitPerDay +
kJulianEpochOffsetDays);
int64_t last_day_units = time % UnitPerDay;
- auto last_day_nanos = last_day_units * NanosecondsPerUnit;
+ if (last_day_units < 0) {
+ --julian_days;
+ last_day_units += UnitPerDay;
+ }
+ impala_timestamp->value[2] = static_cast<uint32_t>(julian_days);
+ uint64_t last_day_nanos = static_cast<uint64_t>(last_day_units) *
NanosecondsPerUnit;
// impala_timestamp will be unaligned every other entry so do memcpy instead
// of assign and reinterpret cast to avoid undefined behavior.
- std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t));
+ std::memcpy(impala_timestamp, &last_day_nanos, sizeof(uint64_t));
}
constexpr int64_t kSecondsInNanos = INT64_C(1000000000);