pitrou commented on code in PR #12528:
URL: https://github.com/apache/arrow/pull/12528#discussion_r820888807


##########
cpp/src/arrow/compute/api_scalar.h:
##########
@@ -104,10 +104,28 @@ enum class CalendarUnit : int8_t {
   YEAR
 };
 
+/// \brief How to interpret ambiguous local times that can be interpreted as
+/// multiple instants (normally two) due to DST shifts.
+///
+/// AMBIGUOUS_EARLIEST emits the earliest instant amongst possible 
interpretations.
+/// AMBIGUOUS_LATEST emits the latest instant amongst possible interpretations.
+enum AmbiguousTime { AMBIGUOUS_RAISE, AMBIGUOUS_EARLIEST, AMBIGUOUS_LATEST };
+
+/// \brief How to handle local times that do not exist due to DST shifts.
+///
+/// NONEXISTENT_EARLIEST emits the instant "just before" the DST shift instant
+/// in the given timestamp precision (for example, for a nanoseconds precision
+/// timestamp, this is one nanosecond before the DST shift instant).
+/// NONEXISTENT_LATEST emits the DST shift instant.
+enum NonexistentTime { NONEXISTENT_RAISE, NONEXISTENT_EARLIEST, 
NONEXISTENT_LATEST };

Review Comment:
   Same here.



##########
cpp/src/arrow/compute/kernels/temporal_internal.h:
##########
@@ -105,6 +105,39 @@ struct NonZonedLocalizer {
     return t;
   }
 
+  template <typename Duration, typename Unit>
+  Duration FloorTimePoint(int64_t t, const RoundTemporalOptions* options) 
const {

Review Comment:
   Nit, but why not pass the options by const-reference? I don't think it's 
allowed to omit them.



##########
cpp/src/arrow/compute/kernels/temporal_internal.h:
##########
@@ -119,19 +152,93 @@ struct ZonedLocalizer {
     return tz->to_local(sys_time<Duration>(Duration{t}));
   }
 
-  template <typename Duration>
-  Duration ConvertLocalToSys(Duration t, Status* st) const {
+  template <typename Duration, typename Unit>
+  Duration FloorTimePoint(const int64_t t, const RoundTemporalOptions* 
options) const {
+    local_time<Duration> lt = tz->to_local(sys_time<Duration>(Duration{t}));
+    const Unit d = floor<Unit>(lt).time_since_epoch();
+    Unit d2;
+
+    if (options->multiple == 1) {
+      d2 = d;
+    } else {
+      const Unit unit = Unit{options->multiple};
+      d2 = (d.count() >= 0) ? d / unit * unit : (d - unit + Unit{1}) / unit * 
unit;
+    }
+
     try {
-      return zoned_time<Duration>{tz, local_time<Duration>(t)}
+      return zoned_time<Duration>(tz, 
local_time<Duration>(duration_cast<Duration>(d2)))
           .get_sys_time()
           .time_since_epoch();
-    } catch (const arrow_vendored::date::nonexistent_local_time& e) {
-      *st = Status::Invalid("Local time does not exist: ", e.what());
-      return Duration{0};
-    } catch (const arrow_vendored::date::ambiguous_local_time& e) {
-      *st = Status::Invalid("Local time is ambiguous: ", e.what());
-      return Duration{0};
+    } catch (const arrow_vendored::date::ambiguous_local_time&) {
+      // In case we hit an ambiguous period we round to a time multiple just 
prior,
+      // convert to UTC and add the time unit we're rounding to.
+      const arrow_vendored::date::local_info li =
+          tz->get_info(local_time<Duration>(duration_cast<Duration>(d2)));
+
+      const Duration t2 =
+          zoned_time<Duration>(
+              tz, local_time<Duration>(duration_cast<Duration>(d2 - 
li.second.offset)))
+              .get_sys_time()
+              .time_since_epoch();
+      const Unit unit = Unit{options->multiple};
+      const Unit t3 =
+          (t2.count() >= 0) ? t2 / unit * unit : (t2 - unit + Unit{1}) / unit 
* unit;
+      const Duration t4 = duration_cast<Duration>(t3 + li.first.offset);
+      if (t < t4.count()) {
+        return duration_cast<Duration>(t3 + li.second.offset);
+      }
+      return duration_cast<Duration>(t4);
+    } catch (const arrow_vendored::date::nonexistent_local_time&) {
+      // In case we hit a nonexistent period we calculate the duration between 
the
+      // start of nonexistent period and rounded to moment in UTC 
(nonexistent_offset).
+      // We then floor the beginning of the nonexisting period in local time 
and add
+      // nonexistent_offset to that time point in UTC.
+      const arrow_vendored::date::local_info li =
+          tz->get_info(local_time<Duration>(duration_cast<Duration>(d2)));
+
+      const Duration t2 =
+          zoned_time<Duration>(tz, 
local_time<Duration>(duration_cast<Duration>(d2)),
+                               arrow_vendored::date::choose::earliest)
+              .get_sys_time()
+              .time_since_epoch();
+      const Duration nonexistent_offset = duration_cast<Duration>(t2 - d2);
+
+      const Unit unit = Unit{options->multiple};
+      const Unit t3 =
+          (t2.count() >= 0) ? t2 / unit * unit : (t2 - unit + Unit{1}) / unit 
* unit;
+      return duration_cast<Duration>(t3 + li.second.offset) + 
nonexistent_offset;
+    }
+  }
+
+  template <typename Duration, typename Unit>
+  Duration CeilTimePoint(const int64_t t, const RoundTemporalOptions* options) 
const {
+    const Duration d = FloorTimePoint<Duration, Unit>(t, options);
+    if (d.count() == t) {
+      return d;
+    }
+    return FloorTimePoint<Duration, Unit>(
+        t + duration_cast<Duration>(Unit{options->multiple}).count(), options);
+  }
+
+  template <typename Duration, typename Unit>
+  Duration RoundTimePoint(const int64_t t, const RoundTemporalOptions* 
options) const {
+    const Duration f = FloorTimePoint<Duration, Unit>(t, options);
+    Duration c;
+    if (f.count() == t) {
+      c = f;
+    } else {
+      c = FloorTimePoint<Duration, Unit>(
+          t + duration_cast<Duration>(Unit{options->multiple}).count(), 
options);

Review Comment:
   Same question here wrt. calling `floor` twice.



##########
cpp/src/arrow/compute/api_scalar.h:
##########
@@ -104,10 +104,28 @@ enum class CalendarUnit : int8_t {
   YEAR
 };
 
+/// \brief How to interpret ambiguous local times that can be interpreted as
+/// multiple instants (normally two) due to DST shifts.
+///
+/// AMBIGUOUS_EARLIEST emits the earliest instant amongst possible 
interpretations.
+/// AMBIGUOUS_LATEST emits the latest instant amongst possible interpretations.
+enum AmbiguousTime { AMBIGUOUS_RAISE, AMBIGUOUS_EARLIEST, AMBIGUOUS_LATEST };

Review Comment:
   Use a scoped num instead:
   ```suggestion
   enum class AmbiguousTime : int8_t { RAISE, EARLIEST, LATEST };
   ```



##########
cpp/src/arrow/compute/kernels/scalar_temporal_test.cc:
##########
@@ -2521,6 +2515,130 @@ TEST_F(ScalarTemporalTest, TestRoundTemporal) {
   CheckScalarUnary(op, unit, times, unit, round_15_years, &round_to_15_years);
 }
 
+TEST_F(ScalarTemporalTest, TestCeilTemporalAmbiguous) {
+  std::string timezone = "CET";
+  const char* times = R"(["2018-10-28 01:20:00"])";
+  const char* times_earliest = R"(["2018-10-28 00:30:00"])";
+  const char* times_latest = R"(["2018-10-28 01:30:00"])";
+
+  auto unit = timestamp(TimeUnit::NANO, timezone);
+
+  auto options_earliest =
+      RoundTemporalOptions(15, CalendarUnit::MINUTE, true, AMBIGUOUS_EARLIEST);
+  auto options_latest =
+      RoundTemporalOptions(15, CalendarUnit::MINUTE, true, AMBIGUOUS_LATEST);
+  auto options_raise =
+      RoundTemporalOptions(15, CalendarUnit::MINUTE, true, AMBIGUOUS_RAISE);
+
+  ASSERT_RAISES(Invalid, CeilTemporal(ArrayFromJSON(unit, times), 
options_raise));
+  CheckScalarUnary("ceil_temporal", unit, times, unit, times_earliest, 
&options_earliest);
+  CheckScalarUnary("ceil_temporal", unit, times, unit, times_latest, 
&options_latest);
+}
+
+TEST_F(ScalarTemporalTest, TestFloorTemporalAmbiguous) {
+  std::string timezone = "CET";
+  const char* times = R"(["2018-10-28 01:20:00"])";
+  const char* times_earliest = R"(["2018-10-28 00:15:00"])";
+  const char* times_latest = R"(["2018-10-28 01:15:00"])";
+
+  auto unit = timestamp(TimeUnit::NANO, timezone);
+
+  auto options_earliest =
+      RoundTemporalOptions(15, CalendarUnit::MINUTE, true, AMBIGUOUS_EARLIEST);
+  auto options_latest =
+      RoundTemporalOptions(15, CalendarUnit::MINUTE, true, AMBIGUOUS_LATEST);
+  auto options_raise =
+      RoundTemporalOptions(15, CalendarUnit::MINUTE, true, AMBIGUOUS_RAISE);
+
+  ASSERT_RAISES(Invalid, CeilTemporal(ArrayFromJSON(unit, times), 
options_raise));
+  CheckScalarUnary("floor_temporal", unit, times, unit, times_earliest,
+                   &options_earliest);
+  CheckScalarUnary("floor_temporal", unit, times, unit, times_latest, 
&options_latest);
+}
+
+TEST_F(ScalarTemporalTest, TestRoundTemporalAmbiguous) {
+  std::string timezone = "CET";
+  const char* times = R"(["2018-10-28 01:20:00"])";
+  const char* times_earliest = R"(["2018-10-28 00:30:00"])";
+  const char* times_latest = R"(["2018-10-28 01:15:00"])";
+
+  auto unit = timestamp(TimeUnit::NANO, timezone);
+
+  auto options_earliest =
+      RoundTemporalOptions(15, CalendarUnit::MINUTE, true, AMBIGUOUS_EARLIEST);
+  auto options_latest =
+      RoundTemporalOptions(15, CalendarUnit::MINUTE, true, AMBIGUOUS_LATEST);
+  auto options_raise =
+      RoundTemporalOptions(15, CalendarUnit::MINUTE, true, AMBIGUOUS_RAISE);
+
+  ASSERT_RAISES(Invalid, CeilTemporal(ArrayFromJSON(unit, times), 
options_raise));

Review Comment:
   `RoundTemporal` here. Also, see other instances below.



##########
cpp/src/arrow/compute/kernels/scalar_temporal_test.cc:
##########
@@ -2611,6 +2611,106 @@ TEST_F(ScalarTemporalTest, TestRoundTemporal) {
   CheckScalarUnary(op, unit, times, unit, round_15_years, &round_to_15_years);
 }
 
+TEST_F(ScalarTemporalTest, TestCeilFloorRoundTemporalAmbiguous1) {
+  auto unit = timestamp(TimeUnit::MILLI, "Asia/Tehran");
+  auto options = RoundTemporalOptions(1, CalendarUnit::HOUR, true);

Review Comment:
   Please spell out what `true` is for here:
   ```suggestion
     auto options = RoundTemporalOptions(1, CalendarUnit::HOUR, 
/*some_parameter=*/true);
   ```



##########
cpp/src/arrow/compute/kernels/temporal_internal.h:
##########
@@ -119,19 +152,93 @@ struct ZonedLocalizer {
     return tz->to_local(sys_time<Duration>(Duration{t}));
   }
 
-  template <typename Duration>
-  Duration ConvertLocalToSys(Duration t, Status* st) const {
+  template <typename Duration, typename Unit>
+  Duration FloorTimePoint(const int64_t t, const RoundTemporalOptions* 
options) const {
+    local_time<Duration> lt = tz->to_local(sys_time<Duration>(Duration{t}));
+    const Unit d = floor<Unit>(lt).time_since_epoch();
+    Unit d2;
+
+    if (options->multiple == 1) {
+      d2 = d;
+    } else {
+      const Unit unit = Unit{options->multiple};
+      d2 = (d.count() >= 0) ? d / unit * unit : (d - unit + Unit{1}) / unit * 
unit;
+    }
+
     try {
-      return zoned_time<Duration>{tz, local_time<Duration>(t)}
+      return zoned_time<Duration>(tz, 
local_time<Duration>(duration_cast<Duration>(d2)))
           .get_sys_time()
           .time_since_epoch();
-    } catch (const arrow_vendored::date::nonexistent_local_time& e) {
-      *st = Status::Invalid("Local time does not exist: ", e.what());
-      return Duration{0};
-    } catch (const arrow_vendored::date::ambiguous_local_time& e) {
-      *st = Status::Invalid("Local time is ambiguous: ", e.what());
-      return Duration{0};
+    } catch (const arrow_vendored::date::ambiguous_local_time&) {
+      // In case we hit an ambiguous period we round to a time multiple just 
prior,
+      // convert to UTC and add the time unit we're rounding to.
+      const arrow_vendored::date::local_info li =
+          tz->get_info(local_time<Duration>(duration_cast<Duration>(d2)));
+
+      const Duration t2 =
+          zoned_time<Duration>(
+              tz, local_time<Duration>(duration_cast<Duration>(d2 - 
li.second.offset)))
+              .get_sys_time()
+              .time_since_epoch();
+      const Unit unit = Unit{options->multiple};
+      const Unit t3 =
+          (t2.count() >= 0) ? t2 / unit * unit : (t2 - unit + Unit{1}) / unit 
* unit;
+      const Duration t4 = duration_cast<Duration>(t3 + li.first.offset);
+      if (t < t4.count()) {
+        return duration_cast<Duration>(t3 + li.second.offset);
+      }
+      return duration_cast<Duration>(t4);
+    } catch (const arrow_vendored::date::nonexistent_local_time&) {
+      // In case we hit a nonexistent period we calculate the duration between 
the
+      // start of nonexistent period and rounded to moment in UTC 
(nonexistent_offset).
+      // We then floor the beginning of the nonexisting period in local time 
and add
+      // nonexistent_offset to that time point in UTC.
+      const arrow_vendored::date::local_info li =
+          tz->get_info(local_time<Duration>(duration_cast<Duration>(d2)));
+
+      const Duration t2 =
+          zoned_time<Duration>(tz, 
local_time<Duration>(duration_cast<Duration>(d2)),
+                               arrow_vendored::date::choose::earliest)
+              .get_sys_time()
+              .time_since_epoch();
+      const Duration nonexistent_offset = duration_cast<Duration>(t2 - d2);
+
+      const Unit unit = Unit{options->multiple};
+      const Unit t3 =
+          (t2.count() >= 0) ? t2 / unit * unit : (t2 - unit + Unit{1}) / unit 
* unit;
+      return duration_cast<Duration>(t3 + li.second.offset) + 
nonexistent_offset;
+    }
+  }
+
+  template <typename Duration, typename Unit>
+  Duration CeilTimePoint(const int64_t t, const RoundTemporalOptions* options) 
const {
+    const Duration d = FloorTimePoint<Duration, Unit>(t, options);
+    if (d.count() == t) {
+      return d;
+    }
+    return FloorTimePoint<Duration, Unit>(
+        t + duration_cast<Duration>(Unit{options->multiple}).count(), options);

Review Comment:
   It's a bit unexpected to call `floor` twice in the `ceil` implementation, is 
there perhaps a more efficient way to do this?



##########
cpp/src/arrow/compute/kernels/scalar_temporal_test.cc:
##########
@@ -2521,6 +2515,130 @@ TEST_F(ScalarTemporalTest, TestRoundTemporal) {
   CheckScalarUnary(op, unit, times, unit, round_15_years, &round_to_15_years);
 }
 
+TEST_F(ScalarTemporalTest, TestCeilTemporalAmbiguous) {
+  std::string timezone = "CET";
+  const char* times = R"(["2018-10-28 01:20:00"])";
+  const char* times_earliest = R"(["2018-10-28 00:30:00"])";
+  const char* times_latest = R"(["2018-10-28 01:30:00"])";
+
+  auto unit = timestamp(TimeUnit::NANO, timezone);
+
+  auto options_earliest =
+      RoundTemporalOptions(15, CalendarUnit::MINUTE, true, AMBIGUOUS_EARLIEST);
+  auto options_latest =
+      RoundTemporalOptions(15, CalendarUnit::MINUTE, true, AMBIGUOUS_LATEST);
+  auto options_raise =
+      RoundTemporalOptions(15, CalendarUnit::MINUTE, true, AMBIGUOUS_RAISE);
+
+  ASSERT_RAISES(Invalid, CeilTemporal(ArrayFromJSON(unit, times), 
options_raise));
+  CheckScalarUnary("ceil_temporal", unit, times, unit, times_earliest, 
&options_earliest);
+  CheckScalarUnary("ceil_temporal", unit, times, unit, times_latest, 
&options_latest);
+}
+
+TEST_F(ScalarTemporalTest, TestFloorTemporalAmbiguous) {
+  std::string timezone = "CET";
+  const char* times = R"(["2018-10-28 01:20:00"])";
+  const char* times_earliest = R"(["2018-10-28 00:15:00"])";
+  const char* times_latest = R"(["2018-10-28 01:15:00"])";
+
+  auto unit = timestamp(TimeUnit::NANO, timezone);
+
+  auto options_earliest =
+      RoundTemporalOptions(15, CalendarUnit::MINUTE, true, AMBIGUOUS_EARLIEST);
+  auto options_latest =
+      RoundTemporalOptions(15, CalendarUnit::MINUTE, true, AMBIGUOUS_LATEST);
+  auto options_raise =
+      RoundTemporalOptions(15, CalendarUnit::MINUTE, true, AMBIGUOUS_RAISE);
+
+  ASSERT_RAISES(Invalid, CeilTemporal(ArrayFromJSON(unit, times), 
options_raise));

Review Comment:
   ```suggestion
     ASSERT_RAISES(Invalid, FloorTemporal(ArrayFromJSON(unit, times), 
options_raise));
   ```



##########
cpp/src/arrow/compute/kernels/scalar_temporal_test.cc:
##########
@@ -2611,6 +2611,106 @@ TEST_F(ScalarTemporalTest, TestRoundTemporal) {
   CheckScalarUnary(op, unit, times, unit, round_15_years, &round_to_15_years);
 }
 
+TEST_F(ScalarTemporalTest, TestCeilFloorRoundTemporalAmbiguous1) {
+  auto unit = timestamp(TimeUnit::MILLI, "Asia/Tehran");
+  auto options = RoundTemporalOptions(1, CalendarUnit::HOUR, true);
+  const char* times = R"([

Review Comment:
   It's a bit late to suggest this, but I think we should take the habit of 
making tests more easily understood by spelling out the context, e.g.:
   ```suggestion
     // Asia/Tehran switched from UTC+X to UTC+Y on 2022-03-31 HH:mm:ss
     const char* times = R"([
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to