This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 22df70a0ca GH-37799: [C++] Compute: CommonTemporal support time32 and 
time64 casting (#37949)
22df70a0ca is described below

commit 22df70a0caf888084ae13d7f3e0178a477d1e7d7
Author: mwish <[email protected]>
AuthorDate: Mon Oct 2 22:51:50 2023 +0800

    GH-37799: [C++] Compute: CommonTemporal support time32 and time64 casting 
(#37949)
    
    
    
    ### Rationale for this change
    
    The original problem in mentioned in 
https://github.com/apache/arrow/issues/37799
    
    1. `SECOND` in Parquet would always cast to `MILLS`
    2. `eq` not support `eq(time32[s], time32[ms)`
    
    This patch is as advice in 
https://github.com/apache/arrow/issues/37799#issuecomment-1737747352 . We tent 
to add time32 and time64 in `CommonTemporal`.
    
    ### What changes are included in this PR?
    
    Support time32 and time64 with different time unit in 
`arrow::compute::internal::CommonTemporal`.
    
    ### Are these changes tested?
    
    Yes
    
    ### Are there any user-facing changes?
    
    bugfix
    
    * Closes: #37799
    
    Authored-by: mwish <[email protected]>
    Signed-off-by: Benjamin Kietzman <[email protected]>
---
 cpp/src/arrow/compute/kernels/codegen_internal.cc  | 49 ++++++++++++++++++----
 .../arrow/compute/kernels/codegen_internal_test.cc | 12 ++++++
 cpp/src/arrow/dataset/file_parquet_test.cc         | 25 +++++++++++
 3 files changed, 77 insertions(+), 9 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc 
b/cpp/src/arrow/compute/kernels/codegen_internal.cc
index 8e2669bd3d..00a833742f 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal.cc
+++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc
@@ -251,6 +251,7 @@ TypeHolder CommonTemporal(const TypeHolder* begin, size_t 
count) {
   bool saw_date32 = false;
   bool saw_date64 = false;
   bool saw_duration = false;
+  bool saw_time_since_midnight = false;
   const TypeHolder* end = begin + count;
   for (auto it = begin; it != end; it++) {
     auto id = it->type->id();
@@ -271,6 +272,18 @@ TypeHolder CommonTemporal(const TypeHolder* begin, size_t 
count) {
         finest_unit = std::max(finest_unit, ty.unit());
         continue;
       }
+      case Type::TIME32: {
+        const auto& type = checked_cast<const Time32Type&>(*it->type);
+        finest_unit = std::max(finest_unit, type.unit());
+        saw_time_since_midnight = true;
+        continue;
+      }
+      case Type::TIME64: {
+        const auto& type = checked_cast<const Time64Type&>(*it->type);
+        finest_unit = std::max(finest_unit, type.unit());
+        saw_time_since_midnight = true;
+        continue;
+      }
       case Type::DURATION: {
         const auto& ty = checked_cast<const DurationType&>(*it->type);
         finest_unit = std::max(finest_unit, ty.unit());
@@ -282,15 +295,33 @@ TypeHolder CommonTemporal(const TypeHolder* begin, size_t 
count) {
     }
   }
 
-  if (timezone) {
-    // At least one timestamp seen
-    return timestamp(finest_unit, *timezone);
-  } else if (saw_date64) {
-    return date64();
-  } else if (saw_date32) {
-    return date32();
-  } else if (saw_duration) {
-    return duration(finest_unit);
+  bool saw_timestamp_or_date = timezone || saw_date64 || saw_date32 || 
saw_duration;
+
+  if (saw_time_since_midnight && saw_timestamp_or_date) {
+    // Cannot find common type
+    return TypeHolder(nullptr);
+  }
+  if (saw_timestamp_or_date) {
+    if (timezone) {
+      // At least one timestamp seen
+      return timestamp(finest_unit, *timezone);
+    } else if (saw_date64) {
+      return date64();
+    } else if (saw_date32) {
+      return date32();
+    } else if (saw_duration) {
+      return duration(finest_unit);
+    }
+  }
+  if (saw_time_since_midnight) {
+    switch (finest_unit) {
+      case TimeUnit::SECOND:
+      case TimeUnit::MILLI:
+        return time32(finest_unit);
+      case TimeUnit::MICRO:
+      case TimeUnit::NANO:
+        return time64(finest_unit);
+    }
   }
   return TypeHolder(nullptr);
 }
diff --git a/cpp/src/arrow/compute/kernels/codegen_internal_test.cc 
b/cpp/src/arrow/compute/kernels/codegen_internal_test.cc
index af024fb8d6..6bb5568d2f 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal_test.cc
+++ b/cpp/src/arrow/compute/kernels/codegen_internal_test.cc
@@ -159,6 +159,18 @@ TEST(TestDispatchBest, CommonTemporal) {
   args = {timestamp(TimeUnit::SECOND, "America/Phoenix"),
           timestamp(TimeUnit::SECOND, "UTC")};
   ASSERT_EQ(CommonTemporal(args.data(), args.size()), nullptr);
+
+  args = {time32(TimeUnit::SECOND), time32(TimeUnit::MILLI)};
+  AssertTypeEqual(*time32(TimeUnit::MILLI), *CommonTemporal(args.data(), 
args.size()));
+
+  args = {time32(TimeUnit::SECOND), time64(TimeUnit::NANO)};
+  AssertTypeEqual(*time64(TimeUnit::NANO), *CommonTemporal(args.data(), 
args.size()));
+
+  args = {date32(), time32(TimeUnit::SECOND)};
+  ASSERT_EQ(CommonTemporal(args.data(), args.size()), nullptr);
+
+  args = {timestamp(TimeUnit::SECOND), time32(TimeUnit::SECOND)};
+  ASSERT_EQ(CommonTemporal(args.data(), args.size()), nullptr);
 }
 
 TEST(TestDispatchBest, CommonTemporalResolution) {
diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc 
b/cpp/src/arrow/dataset/file_parquet_test.cc
index 177ca82417..dc9e085df3 100644
--- a/cpp/src/arrow/dataset/file_parquet_test.cc
+++ b/cpp/src/arrow/dataset/file_parquet_test.cc
@@ -730,6 +730,31 @@ TEST_P(TestParquetFileFormatScan, 
PredicatePushdownRowGroupFragmentsUsingDuratio
   CountRowGroupsInFragment(fragment, {0}, expr);
 }
 
+TEST_P(TestParquetFileFormatScan,
+       PredicatePushdownRowGroupFragmentsUsingTimestampColumn) {
+  // GH-37799: Parquet arrow will change TimeUnit::SECOND to TimeUnit::MILLI
+  // because parquet LogicalType doesn't support SECOND.
+  for (auto time_unit : {TimeUnit::MILLI, TimeUnit::SECOND}) {
+    auto table = TableFromJSON(schema({field("t", time32(time_unit))}),
+                               {
+                                   R"([{"t": 1}])",
+                                   R"([{"t": 2}, {"t": 3}])",
+                               });
+    TableBatchReader table_reader(*table);
+    ARROW_SCOPED_TRACE("time_unit=", time_unit);
+    ASSERT_OK_AND_ASSIGN(
+        auto source,
+        ParquetFormatHelper::Write(
+            &table_reader, 
ArrowWriterProperties::Builder().store_schema()->build())
+            .As<FileSource>());
+    SetSchema({field("t", time32(time_unit))});
+    ASSERT_OK_AND_ASSIGN(auto fragment, format_->MakeFragment(source));
+
+    auto expr = equal(field_ref("t"), literal(::arrow::Time32Scalar(1, 
time_unit)));
+    CountRowGroupsInFragment(fragment, {0}, expr);
+  }
+}
+
 // Tests projection with nested/indexed FieldRefs.
 // https://github.com/apache/arrow/issues/35579
 TEST_P(TestParquetFileFormatScan, ProjectWithNonNamedFieldRefs) {

Reply via email to