rok commented on a change in pull request #10610:
URL: https://github.com/apache/arrow/pull/10610#discussion_r703576179



##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal_test.cc
##########
@@ -383,7 +384,96 @@ TEST_F(ScalarTemporalTest, DayOfWeek) {
                                                        /*week_start=*/8)));
 }
 
+// TODO: We should test on windows once ARROW-13168 is resolved.
 #ifndef _WIN32
+TEST_F(ScalarTemporalTest, TestAssumeTimezone) {
+  std::string timezone_utc = "UTC";
+  std::string timezone_kolkata = "Asia/Kolkata";
+  std::string timezone_us_central = "US/Central";
+  const char* times_utc = R"(["1970-01-01T00:00:00", null])";
+  const char* times_kolkata = R"(["1970-01-01T05:30:00", null])";
+  const char* times_us_central = R"(["1969-12-31T18:00:00", null])";
+  auto options_utc = AssumeTimezoneOptions(timezone_utc);
+  auto options_kolkata = AssumeTimezoneOptions(timezone_kolkata);
+  auto options_us_central = AssumeTimezoneOptions(timezone_us_central);
+  auto options_invalid = AssumeTimezoneOptions("Europe/Brusselsss");
+
+  for (auto u : internal::AllTimeUnits()) {
+    auto unit = timestamp(u);
+    auto unit_utc = timestamp(u, timezone_utc);
+    auto unit_kolkata = timestamp(u, timezone_kolkata);
+    auto unit_us_central = timestamp(u, timezone_us_central);
+
+    CheckScalarUnary("assume_timezone", unit, times_utc, unit_utc, times_utc,
+                     &options_utc);
+    CheckScalarUnary("assume_timezone", unit, times_kolkata, unit_kolkata, 
times_utc,
+                     &options_kolkata);
+    CheckScalarUnary("assume_timezone", unit, times_us_central, 
unit_us_central,
+                     times_utc, &options_us_central);
+    ASSERT_RAISES(Invalid,
+                  AssumeTimezone(ArrayFromJSON(unit_kolkata, times_utc), 
options_utc));
+    ASSERT_RAISES(Invalid,
+                  AssumeTimezone(ArrayFromJSON(unit, times_utc), 
options_invalid));
+  }
+}
+
+TEST_F(ScalarTemporalTest, TestAssumeTimezoneAmbiguous) {
+  std::string timezone = "CET";
+  const char* times = R"(["2018-10-28 01:20:00",
+                          "2018-10-28 02:36:00",
+                          "2018-10-28 03:46:00"])";
+  const char* times_earliest = R"(["2018-10-27 23:20:00",
+                                   "2018-10-28 00:36:00",
+                                   "2018-10-28 02:46:00"])";
+  const char* times_latest = R"(["2018-10-27 23:20:00",
+                                 "2018-10-28 01:36:00",
+                                 "2018-10-28 02:46:00"])";
+
+  auto options_earliest = AssumeTimezoneOptions(
+      timezone, AssumeTimezoneOptions::Ambiguous::AMBIGUOUS_EARLIEST);
+  auto options_latest =
+      AssumeTimezoneOptions(timezone, 
AssumeTimezoneOptions::Ambiguous::AMBIGUOUS_LATEST);
+  auto options_raise =
+      AssumeTimezoneOptions(timezone, 
AssumeTimezoneOptions::Ambiguous::AMBIGUOUS_RAISE);
+
+  for (auto u : internal::AllTimeUnits()) {
+    auto unit = timestamp(u);
+    auto unit_local = timestamp(u, timezone);
+    ASSERT_RAISES(Invalid, AssumeTimezone(ArrayFromJSON(unit, times), 
options_raise));
+    CheckScalarUnary("assume_timezone", unit, times, unit_local, 
times_earliest,
+                     &options_earliest);
+    CheckScalarUnary("assume_timezone", unit, times, unit_local, times_latest,
+                     &options_latest);
+  }
+}
+
+TEST_F(ScalarTemporalTest, TestAssumeTimezoneNonexistent) {
+  std::string timezone = "Europe/Warsaw";
+  const char* times = R"(["2015-03-29 02:30:00", "2015-03-29 03:30:00"])";
+  const char* times_earliest = R"(["2015-03-29 01:00:00", "2015-03-29 
01:30:00"])";
+  const char* times_latest = R"(["2015-03-29 01:00:00", "2015-03-29 
01:30:00"])";

Review comment:
       > Look at the results in the Pandas example: "2015-03-29 
01:59:59.999999999+01:00" and "2015-03-29 03:00:00+02:00" are separated by a 
single _nanosecond_. Is this behaviour useful? @jorisvandenbossche
   
   I'm sure there is someone out there who cares about nanoseconds :)).
   
   My thought here is: the input timestamp is half past the hour in local time. 
That local hour doesn't exist, so the half past is probably a measurement 
mistake, and what was actually measured is half past the hour either one hour 
later or one hour earlier. The third option (infer) is to ignore the half past 
and go to the moment the DST shift occurred.
   
   ```
   earliest -> 2015-03-29 02:30:00 (Warsaw) -> 2015-03-29 00:30:00 (UTC)
   infer -> 2015-03-29 02:30:00 (Warsaw) -> 2015-03-29 01:00:00 (UTC)
   latest -> 2015-03-29 02:30:00 (Warsaw) -> 2015-03-29 01:30:00 (UTC)
   ```
   
   Visual aid for the discussion (not exactly the same zones):
   ![](https://files.gitter.im/HowardHinnant/date/fM7T/EuropeMoscow.jpeg)
   
   We can also just park this into 
[ARROW-13347](https://issues.apache.org/jira/browse/ARROW-13347).




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to