rok commented on a change in pull request #10457: URL: https://github.com/apache/arrow/pull/10457#discussion_r651290261
########## File path: cpp/src/arrow/compute/kernels/scalar_temporal_test.cc ########## @@ -143,39 +142,202 @@ TEST(ScalarTemporalTest, TestTemporalComponentExtractionWithDifferentUnits) { CheckScalarUnary("quarter", unit, times, int64(), quarter); CheckScalarUnary("hour", unit, times, int64(), hour); CheckScalarUnary("minute", unit, times, int64(), minute); - CheckScalarUnary("second", unit, times, float64(), second); + CheckScalarUnary("second", unit, times, int64(), second); CheckScalarUnary("millisecond", unit, times, int64(), zeros); CheckScalarUnary("microsecond", unit, times, int64(), zeros); CheckScalarUnary("nanosecond", unit, times, int64(), zeros); CheckScalarUnary("subsecond", unit, times, float64(), zeros); } } -TEST(ScalarTemporalTest, TestZonedTemporalComponentExtraction) { - std::string timezone = "Asia/Kolkata"; - const char* times = R"(["1970-01-01T00:00:59", null])"; +TEST(ScalarTemporalTest, TestZoned1) { + const char* times = + R"(["1970-01-01T00:00:59.123456789","2000-02-29T23:23:23.999999999", + "1899-01-01T00:59:20.001001001","2033-05-18T03:33:20.000000000", + "2020-01-01T01:05:05.001", "2019-12-31T02:10:10.002", + "2019-12-30T03:15:15.003", "2009-12-31T04:20:20.004132", + "2010-01-01T05:25:25.005321", "2010-01-03T06:30:30.006163", + "2010-01-04T07:35:35", "2006-01-01T08:40:40", "2005-12-31T09:45:45", + "2008-12-28", "2008-12-29", "2012-01-01 01:02:03", null])"; + auto unit = timestamp(TimeUnit::NANO, "Pacific/Marquesas"); + auto iso_calendar_type = + struct_({field("iso_year", int64()), field("iso_week", int64()), + field("iso_day_of_week", int64())}); + auto year = + "[1969, 2000, 1898, 2033, 2019, 2019, 2019, 2009, 2009, 2010, 2010, 2005, 2005, " + "2008, 2008, 2011, null]"; + auto month = "[12, 2, 12, 5, 12, 12, 12, 12, 12, 1, 1, 12, 12, 12, 12, 12, null]"; + auto day = "[31, 29, 31, 17, 31, 30, 29, 30, 31, 2, 3, 31, 31, 27, 28, 31, null]"; + auto day_of_week = "[2, 1, 5, 1, 1, 0, 6, 2, 3, 5, 6, 5, 5, 5, 6, 5, null]"; + auto day_of_year = 
+ "[365, 60, 365, 137, 365, 364, 363, 364, 365, 2, 3, 365, 365, 362, 363, 365, null]"; + auto iso_year = + "[1970, 2000, 1898, 2033, 2020, 2020, 2019, 2009, 2009, 2009, 2009, 2005, 2005, " + "2008, 2008, 2011, null]"; + auto iso_week = "[1, 9, 52, 20, 1, 1, 52, 53, 53, 53, 53, 52, 52, 52, 52, 52, null]"; + auto iso_calendar = + ArrayFromJSON(iso_calendar_type, + R"([{"iso_year": 1970, "iso_week": 1, "iso_day_of_week": 3}, + {"iso_year": 2000, "iso_week": 9, "iso_day_of_week": 2}, + {"iso_year": 1898, "iso_week": 52, "iso_day_of_week": 6}, + {"iso_year": 2033, "iso_week": 20, "iso_day_of_week": 2}, + {"iso_year": 2020, "iso_week": 1, "iso_day_of_week": 2}, + {"iso_year": 2020, "iso_week": 1, "iso_day_of_week": 1}, + {"iso_year": 2019, "iso_week": 52, "iso_day_of_week": 7}, + {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 3}, + {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 4}, + {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 6}, + {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 7}, + {"iso_year": 2005, "iso_week": 52, "iso_day_of_week": 6}, + {"iso_year": 2005, "iso_week": 52, "iso_day_of_week": 6}, + {"iso_year": 2008, "iso_week": 52, "iso_day_of_week": 6}, + {"iso_year": 2008, "iso_week": 52, "iso_day_of_week": 7}, + {"iso_year": 2011, "iso_week": 52, "iso_day_of_week": 6}, null])"); + auto quarter = "[4, 1, 4, 2, 4, 4, 4, 4, 4, 1, 1, 4, 4, 4, 4, 4, null]"; + auto hour = "[14, 13, 15, 18, 15, 16, 17, 18, 19, 21, 22, 23, 0, 14, 14, 15, null]"; + auto minute = "[30, 53, 41, 3, 35, 40, 45, 50, 55, 0, 5, 10, 15, 30, 30, 32, null]"; + auto second = "[59, 23, 20, 20, 5, 10, 15, 20, 25, 30, 35, 40, 45, 0, 0, 3, null]"; + auto millisecond = "[123, 999, 1, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, null]"; + auto microsecond = "[456, 999, 1, 0, 0, 0, 0, 132, 321, 163, 0, 0, 0, 0, 0, 0, null]"; + auto nanosecond = "[789, 999, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, null]"; + auto subsecond = + "[0.123456789, 0.999999999, 0.001001001, 0.0, 0.001, 
0.002, 0.003, 0.004132, " + "0.005321, 0.006163, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, null]"; - for (auto u : internal::AllTimeUnits()) { - auto unit = timestamp(u, timezone); - auto timestamps = ArrayFromJSON(unit, times); - - ASSERT_RAISES(Invalid, Year(timestamps)); - ASSERT_RAISES(Invalid, Month(timestamps)); - ASSERT_RAISES(Invalid, Day(timestamps)); - ASSERT_RAISES(Invalid, DayOfWeek(timestamps)); - ASSERT_RAISES(Invalid, DayOfYear(timestamps)); - ASSERT_RAISES(Invalid, ISOYear(timestamps)); - ASSERT_RAISES(Invalid, ISOWeek(timestamps)); - ASSERT_RAISES(Invalid, ISOCalendar(timestamps)); - ASSERT_RAISES(Invalid, Quarter(timestamps)); - ASSERT_RAISES(Invalid, Hour(timestamps)); - ASSERT_RAISES(Invalid, Minute(timestamps)); - ASSERT_RAISES(Invalid, Second(timestamps)); - ASSERT_RAISES(Invalid, Millisecond(timestamps)); - ASSERT_RAISES(Invalid, Microsecond(timestamps)); - ASSERT_RAISES(Invalid, Nanosecond(timestamps)); - ASSERT_RAISES(Invalid, Subsecond(timestamps)); - } + CheckScalarUnary("year", unit, times, int64(), year); + CheckScalarUnary("month", unit, times, int64(), month); + CheckScalarUnary("day", unit, times, int64(), day); + CheckScalarUnary("day_of_week", unit, times, int64(), day_of_week); + CheckScalarUnary("day_of_year", unit, times, int64(), day_of_year); + CheckScalarUnary("iso_year", unit, times, int64(), iso_year); + CheckScalarUnary("iso_week", unit, times, int64(), iso_week); + CheckScalarUnary("iso_calendar", ArrayFromJSON(unit, times), iso_calendar); + CheckScalarUnary("quarter", unit, times, int64(), quarter); + CheckScalarUnary("hour", unit, times, int64(), hour); + CheckScalarUnary("minute", unit, times, int64(), minute); + CheckScalarUnary("second", unit, times, int64(), second); + CheckScalarUnary("millisecond", unit, times, int64(), millisecond); + CheckScalarUnary("microsecond", unit, times, int64(), microsecond); + CheckScalarUnary("nanosecond", unit, times, int64(), nanosecond); + CheckScalarUnary("subsecond", unit, times, float64(), 
subsecond); +} + +TEST(ScalarTemporalTest, TestZoned2) { + const char* times = + R"(["1970-01-01T00:00:59.123456789","2000-02-29T23:23:23.999999999", + "1899-01-01T00:59:20.001001001","2033-05-18T03:33:20.000000000", + "2020-01-01T01:05:05.001", "2019-12-31T02:10:10.002", + "2019-12-30T03:15:15.003", "2009-12-31T04:20:20.004132", + "2010-01-01T05:25:25.005321", "2010-01-03T06:30:30.006163", + "2010-01-04T07:35:35", "2006-01-01T08:40:40", "2005-12-31T09:45:45", + "2008-12-28", "2008-12-29", "2012-01-01 01:02:03", null])"; + auto unit = timestamp(TimeUnit::NANO, "Australia/Broken_Hill"); + auto iso_calendar_type = + struct_({field("iso_year", int64()), field("iso_week", int64()), + field("iso_day_of_week", int64())}); + auto year = + "[1970, 2000, 1899, 2033, 2020, 2019, 2019, 2009, 2010, 2010, 2010, 2006, 2005, " + "2008, 2008, 2012, null]"; + auto month = "[1, 3, 1, 5, 1, 12, 12, 12, 1, 1, 1, 1, 12, 12, 12, 1, null]"; + auto day = "[1, 1, 1, 18, 1, 31, 30, 31, 1, 3, 4, 1, 31, 28, 29, 1, null]"; + auto day_of_week = "[3, 2, 6, 2, 2, 1, 0, 3, 4, 6, 0, 6, 5, 6, 0, 6, null]"; + auto day_of_year = + "[1, 61, 1, 138, 1, 365, 364, 365, 1, 3, 4, 1, 365, 363, 364, 1, null]"; + auto iso_year = + "[1970, 2000, 1898, 2033, 2020, 2020, 2020, 2009, 2009, 2009, 2010, 2005, 2005, " + "2008, 2009, 2011, null]"; + auto iso_week = "[1, 9, 52, 20, 1, 1, 1, 53, 53, 53, 1, 52, 52, 52, 1, 52, null]"; + auto iso_calendar = + ArrayFromJSON(iso_calendar_type, + R"([{"iso_year": 1970, "iso_week": 1, "iso_day_of_week": 4}, + {"iso_year": 2000, "iso_week": 9, "iso_day_of_week": 3}, + {"iso_year": 1898, "iso_week": 52, "iso_day_of_week": 7}, + {"iso_year": 2033, "iso_week": 20, "iso_day_of_week": 3}, + {"iso_year": 2020, "iso_week": 1, "iso_day_of_week": 3}, + {"iso_year": 2020, "iso_week": 1, "iso_day_of_week": 2}, + {"iso_year": 2020, "iso_week": 1, "iso_day_of_week": 1}, + {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 4}, + {"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 5}, + 
{"iso_year": 2009, "iso_week": 53, "iso_day_of_week": 7}, + {"iso_year": 2010, "iso_week": 1, "iso_day_of_week": 1}, + {"iso_year": 2005, "iso_week": 52, "iso_day_of_week": 7}, + {"iso_year": 2005, "iso_week": 52, "iso_day_of_week": 6}, + {"iso_year": 2008, "iso_week": 52, "iso_day_of_week": 7}, + {"iso_year": 2009, "iso_week": 1, "iso_day_of_week": 1}, + {"iso_year": 2011, "iso_week": 52, "iso_day_of_week": 7}, null])"); + auto quarter = "[1, 1, 1, 2, 1, 4, 4, 4, 1, 1, 1, 1, 4, 4, 4, 1, null]"; + auto hour = "[9, 9, 9, 13, 11, 12, 13, 14, 15, 17, 18, 19, 20, 10, 10, 11, null]"; + auto minute = "[30, 53, 59, 3, 35, 40, 45, 50, 55, 0, 5, 10, 15, 30, 30, 32, null]"; + auto second = "[59, 23, 20, 20, 5, 10, 15, 20, 25, 30, 35, 40, 45, 0, 0, 3, null]"; + auto millisecond = "[123, 999, 1, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, null]"; + auto microsecond = "[456, 999, 1, 0, 0, 0, 0, 132, 321, 163, 0, 0, 0, 0, 0, 0, null]"; + auto nanosecond = "[789, 999, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, null]"; + auto subsecond = + "[0.123456789, 0.999999999, 0.001001001, 0.0, 0.001, 0.002, 0.003, 0.004132, " + "0.005321, 0.006163, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, null]"; + + CheckScalarUnary("year", unit, times, int64(), year); + CheckScalarUnary("month", unit, times, int64(), month); + CheckScalarUnary("day", unit, times, int64(), day); + CheckScalarUnary("day_of_week", unit, times, int64(), day_of_week); + CheckScalarUnary("day_of_year", unit, times, int64(), day_of_year); + CheckScalarUnary("iso_year", unit, times, int64(), iso_year); + CheckScalarUnary("iso_week", unit, times, int64(), iso_week); + CheckScalarUnary("iso_calendar", ArrayFromJSON(unit, times), iso_calendar); + CheckScalarUnary("quarter", unit, times, int64(), quarter); + CheckScalarUnary("hour", unit, times, int64(), hour); + CheckScalarUnary("minute", unit, times, int64(), minute); + CheckScalarUnary("second", unit, times, int64(), second); + CheckScalarUnary("millisecond", unit, times, int64(), millisecond); + 
CheckScalarUnary("microsecond", unit, times, int64(), microsecond); + CheckScalarUnary("nanosecond", unit, times, int64(), nanosecond); + CheckScalarUnary("subsecond", unit, times, float64(), subsecond); +} + +TEST(ScalarTemporalTest, TestOverflow) { + const char* times = R"(["1677-09-20T00:00:59.123456789", "2262-04-13T23:23:23.999999999"])"; + + auto unit = timestamp(TimeUnit::NANO); + auto iso_calendar_type = + struct_({field("iso_year", int64()), field("iso_week", int64()), + field("iso_day_of_week", int64())}); + + auto year = "[2262, 1677]"; + auto month = "[4, 9]"; + auto day = "[10, 22]"; + auto day_of_week = "[3, 2]"; + auto day_of_year = "[100, 265]"; + auto iso_year = "[2262, 1677]"; + auto iso_week = "[15, 38]"; + auto iso_calendar = + ArrayFromJSON(iso_calendar_type, + R"([{"iso_year": 2262, "iso_week": 15, "iso_day_of_week": 4}, + {"iso_year": 1677, "iso_week": 38, "iso_day_of_week": 3}])"); + auto quarter = "[2, 3]"; + auto hour = "[23, 23]"; + auto minute = "[35, 48]"; + auto second = "[32, 50]"; + auto millisecond = "[833, 290]"; + auto microsecond = "[8, 448]"; + auto nanosecond = "[405, 383]"; + auto subsecond = "[0.833008405, 0.290448383]"; + + CheckScalarUnary("year", unit, times, int64(), year); + CheckScalarUnary("month", unit, times, int64(), month); + CheckScalarUnary("day", unit, times, int64(), day); + CheckScalarUnary("day_of_week", unit, times, int64(), day_of_week); + CheckScalarUnary("day_of_year", unit, times, int64(), day_of_year); + CheckScalarUnary("iso_year", unit, times, int64(), iso_year); + CheckScalarUnary("iso_week", unit, times, int64(), iso_week); + CheckScalarUnary("iso_calendar", ArrayFromJSON(unit, times), iso_calendar); + CheckScalarUnary("quarter", unit, times, int64(), quarter); + CheckScalarUnary("hour", unit, times, int64(), hour); + CheckScalarUnary("minute", unit, times, int64(), minute); + CheckScalarUnary("second", unit, times, int64(), second); + CheckScalarUnary("millisecond", unit, times, int64(), 
millisecond); + CheckScalarUnary("microsecond", unit, times, int64(), microsecond); + CheckScalarUnary("nanosecond", unit, times, int64(), nanosecond); + CheckScalarUnary("subsecond", unit, times, float64(), subsecond); } Review comment: Is this what you had in mind [here](https://github.com/apache/arrow/pull/10176#issuecomment-858392898)? ########## File path: python/pyarrow/tests/test_compute.py ########## @@ -1255,6 +1255,78 @@ def test_strptime(): assert got == expected +def _check_datetime_components(timestamps, timezone=None): + import pandas as pd + + if timezone: + ts = pd.to_datetime(timestamps).tz_localize(timezone).to_series() + else: + ts = pd.to_datetime(timestamps).to_series() + + tsa = pa.array(ts) + + subseconds = ((ts.dt.microsecond * 10**3 + + ts.dt.nanosecond) * 10**-9).round(9) + iso_calendar_fields = [ + pa.field('iso_year', pa.int64()), + pa.field('iso_week', pa.int64()), + pa.field('iso_day_of_week', pa.int64()) + ] + + iso_year = ts.dt.isocalendar()["year"].astype(int) + iso_week = ts.dt.isocalendar()["week"].astype(int) + iso_day = ts.dt.isocalendar()["day"].astype(int) + iso_calendar = pa.StructArray.from_arrays( + [iso_year, iso_week, iso_day], fields=iso_calendar_fields) + + assert pc.year(tsa).equals(pa.array(ts.dt.year)) + assert pc.month(tsa).equals(pa.array(ts.dt.month)) + assert pc.day(tsa).equals(pa.array(ts.dt.day)) + assert pc.day_of_week(tsa).equals(pa.array(ts.dt.day_of_week)) + assert pc.day_of_year(tsa).equals(pa.array(ts.dt.day_of_year)) + assert pc.iso_year(tsa).equals(pa.array(iso_year)) + assert pc.iso_week(tsa).equals(pa.array(iso_week)) + assert pc.iso_calendar(tsa).equals(iso_calendar) + assert pc.quarter(tsa).equals(pa.array(ts.dt.quarter)) + assert pc.hour(tsa).equals(pa.array(ts.dt.hour)) + assert pc.minute(tsa).equals(pa.array(ts.dt.minute)) + assert pc.second(tsa).equals(pa.array(ts.dt.second.values)) + assert pc.millisecond(tsa).equals(pa.array(ts.dt.microsecond // 10**3)) + assert 
pc.microsecond(tsa).equals(pa.array(ts.dt.microsecond % 10**3)) + assert pc.nanosecond(tsa).equals(pa.array(ts.dt.nanosecond)) + assert pc.subsecond(tsa).equals(pa.array(subseconds)) + + +@pytest.mark.pandas +def test_extract_datetime_components(): + # TODO: see https://github.com/pandas-dev/pandas/issues/41834 + # "1899-01-01T00:59:20.001001001" Review comment: Probably best to remove this since I believe pandas is wrong and it's not really our TODO. It's kinda hard to confirm who is right. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org