This is an automated email from the ASF dual-hosted git repository.
bkietz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 22df70a0ca GH-37799: [C++] Compute: CommonTemporal support time32 and
time64 casting (#37949)
22df70a0ca is described below
commit 22df70a0caf888084ae13d7f3e0178a477d1e7d7
Author: mwish <[email protected]>
AuthorDate: Mon Oct 2 22:51:50 2023 +0800
GH-37799: [C++] Compute: CommonTemporal support time32 and time64 casting
(#37949)
### Rationale for this change
The original problem in mentioned in
https://github.com/apache/arrow/issues/37799
1. `SECOND` in Parquet would always cast to `MILLS`
2. `eq` not support `eq(time32[s], time32[ms)`
This patch is as advice in
https://github.com/apache/arrow/issues/37799#issuecomment-1737747352 . We tent
to add time32 and time64 in `CommonTemporal`.
### What changes are included in this PR?
Support time32 and time64 with different time unit in
`arrow::compute::internal::CommonTemporal`.
### Are these changes tested?
Yes
### Are there any user-facing changes?
bugfix
* Closes: #37799
Authored-by: mwish <[email protected]>
Signed-off-by: Benjamin Kietzman <[email protected]>
---
cpp/src/arrow/compute/kernels/codegen_internal.cc | 49 ++++++++++++++++++----
.../arrow/compute/kernels/codegen_internal_test.cc | 12 ++++++
cpp/src/arrow/dataset/file_parquet_test.cc | 25 +++++++++++
3 files changed, 77 insertions(+), 9 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc
b/cpp/src/arrow/compute/kernels/codegen_internal.cc
index 8e2669bd3d..00a833742f 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal.cc
+++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc
@@ -251,6 +251,7 @@ TypeHolder CommonTemporal(const TypeHolder* begin, size_t
count) {
bool saw_date32 = false;
bool saw_date64 = false;
bool saw_duration = false;
+ bool saw_time_since_midnight = false;
const TypeHolder* end = begin + count;
for (auto it = begin; it != end; it++) {
auto id = it->type->id();
@@ -271,6 +272,18 @@ TypeHolder CommonTemporal(const TypeHolder* begin, size_t
count) {
finest_unit = std::max(finest_unit, ty.unit());
continue;
}
+ case Type::TIME32: {
+ const auto& type = checked_cast<const Time32Type&>(*it->type);
+ finest_unit = std::max(finest_unit, type.unit());
+ saw_time_since_midnight = true;
+ continue;
+ }
+ case Type::TIME64: {
+ const auto& type = checked_cast<const Time64Type&>(*it->type);
+ finest_unit = std::max(finest_unit, type.unit());
+ saw_time_since_midnight = true;
+ continue;
+ }
case Type::DURATION: {
const auto& ty = checked_cast<const DurationType&>(*it->type);
finest_unit = std::max(finest_unit, ty.unit());
@@ -282,15 +295,33 @@ TypeHolder CommonTemporal(const TypeHolder* begin, size_t
count) {
}
}
- if (timezone) {
- // At least one timestamp seen
- return timestamp(finest_unit, *timezone);
- } else if (saw_date64) {
- return date64();
- } else if (saw_date32) {
- return date32();
- } else if (saw_duration) {
- return duration(finest_unit);
+ bool saw_timestamp_or_date = timezone || saw_date64 || saw_date32 ||
saw_duration;
+
+ if (saw_time_since_midnight && saw_timestamp_or_date) {
+ // Cannot find common type
+ return TypeHolder(nullptr);
+ }
+ if (saw_timestamp_or_date) {
+ if (timezone) {
+ // At least one timestamp seen
+ return timestamp(finest_unit, *timezone);
+ } else if (saw_date64) {
+ return date64();
+ } else if (saw_date32) {
+ return date32();
+ } else if (saw_duration) {
+ return duration(finest_unit);
+ }
+ }
+ if (saw_time_since_midnight) {
+ switch (finest_unit) {
+ case TimeUnit::SECOND:
+ case TimeUnit::MILLI:
+ return time32(finest_unit);
+ case TimeUnit::MICRO:
+ case TimeUnit::NANO:
+ return time64(finest_unit);
+ }
}
return TypeHolder(nullptr);
}
diff --git a/cpp/src/arrow/compute/kernels/codegen_internal_test.cc
b/cpp/src/arrow/compute/kernels/codegen_internal_test.cc
index af024fb8d6..6bb5568d2f 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal_test.cc
+++ b/cpp/src/arrow/compute/kernels/codegen_internal_test.cc
@@ -159,6 +159,18 @@ TEST(TestDispatchBest, CommonTemporal) {
args = {timestamp(TimeUnit::SECOND, "America/Phoenix"),
timestamp(TimeUnit::SECOND, "UTC")};
ASSERT_EQ(CommonTemporal(args.data(), args.size()), nullptr);
+
+ args = {time32(TimeUnit::SECOND), time32(TimeUnit::MILLI)};
+ AssertTypeEqual(*time32(TimeUnit::MILLI), *CommonTemporal(args.data(),
args.size()));
+
+ args = {time32(TimeUnit::SECOND), time64(TimeUnit::NANO)};
+ AssertTypeEqual(*time64(TimeUnit::NANO), *CommonTemporal(args.data(),
args.size()));
+
+ args = {date32(), time32(TimeUnit::SECOND)};
+ ASSERT_EQ(CommonTemporal(args.data(), args.size()), nullptr);
+
+ args = {timestamp(TimeUnit::SECOND), time32(TimeUnit::SECOND)};
+ ASSERT_EQ(CommonTemporal(args.data(), args.size()), nullptr);
}
TEST(TestDispatchBest, CommonTemporalResolution) {
diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc
b/cpp/src/arrow/dataset/file_parquet_test.cc
index 177ca82417..dc9e085df3 100644
--- a/cpp/src/arrow/dataset/file_parquet_test.cc
+++ b/cpp/src/arrow/dataset/file_parquet_test.cc
@@ -730,6 +730,31 @@ TEST_P(TestParquetFileFormatScan,
PredicatePushdownRowGroupFragmentsUsingDuratio
CountRowGroupsInFragment(fragment, {0}, expr);
}
+TEST_P(TestParquetFileFormatScan,
+ PredicatePushdownRowGroupFragmentsUsingTimestampColumn) {
+ // GH-37799: Parquet arrow will change TimeUnit::SECOND to TimeUnit::MILLI
+ // because parquet LogicalType doesn't support SECOND.
+ for (auto time_unit : {TimeUnit::MILLI, TimeUnit::SECOND}) {
+ auto table = TableFromJSON(schema({field("t", time32(time_unit))}),
+ {
+ R"([{"t": 1}])",
+ R"([{"t": 2}, {"t": 3}])",
+ });
+ TableBatchReader table_reader(*table);
+ ARROW_SCOPED_TRACE("time_unit=", time_unit);
+ ASSERT_OK_AND_ASSIGN(
+ auto source,
+ ParquetFormatHelper::Write(
+ &table_reader,
ArrowWriterProperties::Builder().store_schema()->build())
+ .As<FileSource>());
+ SetSchema({field("t", time32(time_unit))});
+ ASSERT_OK_AND_ASSIGN(auto fragment, format_->MakeFragment(source));
+
+ auto expr = equal(field_ref("t"), literal(::arrow::Time32Scalar(1,
time_unit)));
+ CountRowGroupsInFragment(fragment, {0}, expr);
+ }
+}
+
// Tests projection with nested/indexed FieldRefs.
// https://github.com/apache/arrow/issues/35579
TEST_P(TestParquetFileFormatScan, ProjectWithNonNamedFieldRefs) {