jorisvandenbossche commented on a change in pull request #10176: URL: https://github.com/apache/arrow/pull/10176#discussion_r645350623
########## File path: cpp/src/arrow/compute/api_scalar.h ########## @@ -481,5 +481,184 @@ ARROW_EXPORT Result<Datum> IfElse(const Datum& cond, const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); +/// \brief Year returns year for each element of `values` +/// +/// \param[in] values input to extract year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Year(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Month returns month for each element of `values`. +/// Month is encoded as January=1, December=12 +/// +/// \param[in] values input to extract month from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Month(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Day returns day number for each element of `values` +/// +/// \param[in] values input to extract day from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Day(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DayOfWeek returns number of the day of the week value for each element of +/// `values`. Week starts on Monday denoted by 0 and ends on Sunday denoted by 6. +/// +/// \param[in] values input to extract number of the day of the week from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> DayOfWeek(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DayOfYear returns number of day of the year for each element of `values`. +/// January 1st maps to day number 1, February 1st to 32, etc. 
+/// +/// \param[in] values input to extract number of day of the year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> DayOfYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOYear returns ISO year number for each element of `values`. +/// First week of an ISO year has the majority (4 or more) of it's days in January. Review comment: ```suggestion /// First week of an ISO year has the majority (4 or more) of its days in January. ``` ########## File path: cpp/src/arrow/compute/api_scalar.h ########## @@ -481,5 +481,184 @@ ARROW_EXPORT Result<Datum> IfElse(const Datum& cond, const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); +/// \brief Year returns year for each element of `values` +/// +/// \param[in] values input to extract year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Year(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Month returns month for each element of `values`. 
+/// Month is encoded as January=1, December=12 +/// +/// \param[in] values input to extract month from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Month(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Day returns day number for each element of `values` +/// +/// \param[in] values input to extract day from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Day(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DayOfWeek returns number of the day of the week value for each element of +/// `values`. Week starts on Monday denoted by 0 and ends on Sunday denoted by 6. +/// +/// \param[in] values input to extract number of the day of the week from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> DayOfWeek(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DayOfYear returns number of day of the year for each element of `values`. +/// January 1st maps to day number 1, February 1st to 32, etc. +/// +/// \param[in] values input to extract number of day of the year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> DayOfYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOYear returns ISO year number for each element of `values`. +/// First week of an ISO year has the majority (4 or more) of it's days in January. 
+/// +/// \param[in] values input to extract ISO year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> ISOYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOWeek returns ISO week of year number for each element of `values`. +/// First ISO week has the majority (4 or more) of it's days in January. Review comment: ```suggestion /// First ISO week has the majority (4 or more) of its days in January. ``` ########## File path: cpp/src/arrow/compute/kernels/scalar_temporal_test.cc ########## @@ -0,0 +1,151 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <gtest/gtest.h> +#include "arrow/compute/api_scalar.h" +#include "arrow/compute/kernels/common.h" +#include "arrow/compute/kernels/test_util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/formatting.h" + +namespace arrow { + +using internal::StringFormatter; + +class ScalarTemporalTest : public ::testing::Test {}; + +namespace compute { + +TEST(ScalarTemporalTest, TestTemporalComponentExtraction) { + const char* times = + R"(["1970-01-01T00:00:59.123456789","2000-02-29T23:23:23.999999999", + "1899-01-01T00:59:20.001001001","2033-05-18T03:33:20.000000000", null])"; + auto unit = timestamp(TimeUnit::NANO); + auto iso_calendar_type = + struct_({field("iso_year", int64()), field("iso_week", int64()), + field("day_of_week", int64())}); + + auto year = "[1970, 2000, 1899, 2033, null]"; + auto month = "[1, 2, 1, 5, null]"; + auto day = "[1, 29, 1, 18, null]"; + auto day_of_week = "[4, 2, 7, 3, null]"; + auto day_of_year = "[1, 60, 1, 138, null]"; + auto iso_year = "[1970, 2000, 1899, 2033, null]"; + auto iso_week = "[1, 9, 52, 20, null]"; + auto iso_calendar = + ArrayFromJSON(iso_calendar_type, + R"([{"iso_year": 1970, "iso_week": 1, "day_of_week": 4}, + {"iso_year": 2000, "iso_week": 9, "day_of_week": 2}, + {"iso_year": 1899, "iso_week": 52, "day_of_week": 7}, + {"iso_year": 2033, "iso_week": 20, "day_of_week": 3}, null])"); + auto quarter = "[1, 1, 1, 2, null]"; + auto hour = "[0, 23, 0, 3, null]"; + auto minute = "[0, 23, 59, 33, null]"; + auto second = "[59.123456789, 23.999999999, 20.001001001, 20.0, null]"; + auto millisecond = "[123, 999, 1, 0, null]"; + auto microsecond = "[456, 999, 1, 0, null]"; + auto nanosecond = "[789, 999, 1, 0, null]"; + auto subsecond = "[123456789, 999999999, 1001001, 0, null]"; Review comment: I would have expected a float here, but of course an integer is possible as well (see my question above on the docstring of this function for asking to clarify it, but so my inline suggestion there was assuming it would 
be a float). Now, if we return an integer, what is this number exactly? Does it then depend on the unit of the Timestamp, or is it always nanoseconds (even when the unit is e.g. milliseconds)? Returning it as a float would avoid the ambiguity (but of course for certain applications an integer might be more useful). ########## File path: docs/source/cpp/compute.rst ########## @@ -637,6 +637,55 @@ String extraction e.g. 'letter' and 'digit' for the regular expression ``(?P<letter>[ab])(?P<digit>\\d)``. +Temporal component extraction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +These functions extract datetime components (year, month, day, etc) from timestamp type. +Note: timezone information is currently ignored if present. + ++--------------------+------------+-------------------+-----------------+--------+ +| Function name | Arity | Input types | Output type | Notes | ++====================+============+===================+=================+========+ +| year | Unary | Temporal | Numeric | | ++--------------------+------------+-------------------+-----------------+--------+ +| month | Unary | Temporal | Numeric | | ++--------------------+------------+-------------------+-----------------+--------+ +| day | Unary | Temporal | Numeric | | ++--------------------+------------+-------------------+-----------------+--------+ +| day_of_week | Unary | Temporal | Numeric | \(1) | ++--------------------+------------+-------------------+-----------------+--------+ +| day_of_year | Unary | Temporal | Numeric | | ++--------------------+------------+-------------------+-----------------+--------+ +| iso_year | Unary | Temporal | Numeric | \(2) | ++--------------------+------------+-------------------+-----------------+--------+ +| iso_week | Unary | Temporal | Numeric | \(2) | ++--------------------+------------+-------------------+-----------------+--------+ +| iso_calendar | Unary | Temporal | Scalar Struct | \(3) | 
++--------------------+------------+-------------------+-----------------+--------+ +| quarter | Unary | Temporal | Numeric | | ++--------------------+------------+-------------------+-----------------+--------+ +| hour | Unary | Temporal | Numeric | | ++--------------------+------------+-------------------+-----------------+--------+ +| minute | Unary | Temporal | Numeric | | ++--------------------+------------+-------------------+-----------------+--------+ +| second | Unary | Temporal | Numeric | | ++--------------------+------------+-------------------+-----------------+--------+ +| millisecond | Unary | Temporal | Numeric | | ++--------------------+------------+-------------------+-----------------+--------+ +| microsecond | Unary | Temporal | Numeric | | ++--------------------+------------+-------------------+-----------------+--------+ +| nanosecond | Unary | Temporal | Numeric | | ++--------------------+------------+-------------------+-----------------+--------+ +| subsecond | Unary | Temporal | Numeric | | ++--------------------+------------+-------------------+-----------------+--------+ + +* \(1) Outputs the number of the day of the week. Week begins on Monday and is denoted + by 0 and ends on Sunday denoted by 6. +* \(2) First ISO week has the majority (4 or more) of it's days in January. ISO year + starts with the first ISO week. + See `ISO 8601 week date definition`_ for more details. +* \(3) Output is a ``{"iso_year": output type, "iso_week": output type, "day_of_week": output type}`` Struct. +.. _ISO 8601 week date definition: https://en.wikipedia.org/wiki/ISO_week_date#First_week Review comment: Can you leave a blank line above this line? 
(to separate the list from the link target definition; in general rst often requires blank lines to separate different blocks, although not fully sure it's needed here) ########## File path: cpp/src/arrow/compute/api_scalar.h ########## @@ -481,5 +481,184 @@ ARROW_EXPORT Result<Datum> IfElse(const Datum& cond, const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); +/// \brief Year returns year for each element of `values` +/// +/// \param[in] values input to extract year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Year(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Month returns month for each element of `values`. +/// Month is encoded as January=1, December=12 +/// +/// \param[in] values input to extract month from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Month(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Day returns day number for each element of `values` +/// +/// \param[in] values input to extract day from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Day(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DayOfWeek returns number of the day of the week value for each element of +/// `values`. Week starts on Monday denoted by 0 and ends on Sunday denoted by 6. 
+/// +/// \param[in] values input to extract number of the day of the week from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> DayOfWeek(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DayOfYear returns number of day of the year for each element of `values`. +/// January 1st maps to day number 1, February 1st to 32, etc. +/// +/// \param[in] values input to extract number of day of the year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> DayOfYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOYear returns ISO year number for each element of `values`. +/// First week of an ISO year has the majority (4 or more) of it's days in January. +/// +/// \param[in] values input to extract ISO year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> ISOYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOWeek returns ISO week of year number for each element of `values`. +/// First ISO week has the majority (4 or more) of it's days in January. +/// Week of the year starts with 1 and can run up to 53. 
+/// +/// \param[in] values input to extract ISO week of year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> ISOWeek(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOCalendar returns a (ISO year, ISO week, weekday) struct for each element of +/// `values` +/// +/// \param[in] values input to ISO calendar struct from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> ISOCalendar(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Quarter returns the quarter of year number for each element of `values` +/// First quarter maps to 1 and forth quarter maps to 4. Review comment: ```suggestion /// First quarter maps to 1 and fourth quarter maps to 4. ``` ########## File path: cpp/src/arrow/compute/api_scalar.h ########## @@ -481,5 +481,184 @@ ARROW_EXPORT Result<Datum> IfElse(const Datum& cond, const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); +/// \brief Year returns year for each element of `values` +/// +/// \param[in] values input to extract year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 Review comment: All those "since 4.0.0" will have to be search/replaced with "since 5.0.0" now ########## File path: cpp/src/arrow/compute/kernels/scalar_temporal_test.cc ########## @@ -0,0 +1,151 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> +#include "arrow/compute/api_scalar.h" +#include "arrow/compute/kernels/common.h" +#include "arrow/compute/kernels/test_util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/formatting.h" + +namespace arrow { + +using internal::StringFormatter; + +class ScalarTemporalTest : public ::testing::Test {}; + +namespace compute { + +TEST(ScalarTemporalTest, TestTemporalComponentExtraction) { + const char* times = + R"(["1970-01-01T00:00:59.123456789","2000-02-29T23:23:23.999999999", + "1899-01-01T00:59:20.001001001","2033-05-18T03:33:20.000000000", null])"; + auto unit = timestamp(TimeUnit::NANO); + auto iso_calendar_type = + struct_({field("iso_year", int64()), field("iso_week", int64()), + field("day_of_week", int64())}); + + auto year = "[1970, 2000, 1899, 2033, null]"; + auto month = "[1, 2, 1, 5, null]"; + auto day = "[1, 29, 1, 18, null]"; + auto day_of_week = "[4, 2, 7, 3, null]"; + auto day_of_year = "[1, 60, 1, 138, null]"; + auto iso_year = "[1970, 2000, 1899, 2033, null]"; + auto iso_week = "[1, 9, 52, 20, null]"; + auto iso_calendar = + ArrayFromJSON(iso_calendar_type, + R"([{"iso_year": 1970, "iso_week": 1, "day_of_week": 4}, + {"iso_year": 2000, "iso_week": 9, "day_of_week": 2}, + {"iso_year": 1899, "iso_week": 52, "day_of_week": 7}, + {"iso_year": 2033, "iso_week": 20, "day_of_week": 3}, null])"); + auto quarter = "[1, 1, 1, 2, null]"; + auto hour = "[0, 23, 0, 3, null]"; + auto minute = "[0, 23, 59, 33, null]"; + auto second = "[59.123456789, 23.999999999, 20.001001001, 20.0, null]"; + auto 
millisecond = "[123, 999, 1, 0, null]"; + auto microsecond = "[456, 999, 1, 0, null]"; + auto nanosecond = "[789, 999, 1, 0, null]"; + auto subsecond = "[123456789, 999999999, 1001001, 0, null]"; + + CheckScalarUnary("year", unit, times, int64(), year); + CheckScalarUnary("month", unit, times, int64(), month); + CheckScalarUnary("day", unit, times, int64(), day); + CheckScalarUnary("day_of_week", unit, times, int64(), day_of_week); + CheckScalarUnary("day_of_year", unit, times, int64(), day_of_year); + CheckScalarUnary("iso_year", unit, times, int64(), iso_year); + CheckScalarUnary("iso_week", unit, times, int64(), iso_week); + CheckScalarUnary("iso_calendar", ArrayFromJSON(unit, times), iso_calendar); + CheckScalarUnary("quarter", unit, times, int64(), quarter); + CheckScalarUnary("hour", unit, times, int64(), hour); + CheckScalarUnary("minute", unit, times, int64(), minute); + CheckScalarUnary("second", unit, times, float64(), second); + CheckScalarUnary("millisecond", unit, times, int64(), millisecond); + CheckScalarUnary("microsecond", unit, times, int64(), microsecond); + CheckScalarUnary("nanosecond", unit, times, int64(), nanosecond); + CheckScalarUnary("subsecond", unit, times, int64(), subsecond); +} + +TEST(ScalarTemporalTest, TestTemporalComponentExtractionWithDifferentUnits) { + auto iso_calendar_type = + struct_({field("iso_year", int64()), field("iso_week", int64()), + field("day_of_week", int64())}); + const char* times = + R"(["1970-01-01T00:00:59","2000-02-29T23:23:23", + "1899-01-01T00:59:20","2033-05-18T03:33:20", null])"; + auto year = "[1970, 2000, 1899, 2033, null]"; + auto month = "[1, 2, 1, 5, null]"; + auto day = "[1, 29, 1, 18, null]"; + auto day_of_week = "[4, 2, 7, 3, null]"; + auto day_of_year = "[1, 60, 1, 138, null]"; + auto iso_year = "[1970, 2000, 1899, 2033, null]"; + auto iso_week = "[1, 9, 52, 20, null]"; + auto iso_calendar = + ArrayFromJSON(iso_calendar_type, + R"([{"iso_year": 1970, "iso_week": 1, "day_of_week": 4}, + 
{"iso_year": 2000, "iso_week": 9, "day_of_week": 2}, + {"iso_year": 1899, "iso_week": 52, "day_of_week": 7}, + {"iso_year": 2033, "iso_week": 20, "day_of_week": 3}, null])"); + auto quarter = "[1, 1, 1, 2, null]"; + auto hour = "[0, 23, 0, 3, null]"; + auto minute = "[0, 23, 59, 33, null]"; + auto second = "[59, 23, 20, 20, null]"; + + // for (auto u : internal::AllTimeUnits()) { + for (auto u : {TimeUnit::NANO}) { Review comment: This is not yet testing the different units? ########## File path: cpp/src/arrow/compute/kernels/scalar_temporal_test.cc ########## @@ -0,0 +1,151 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <gtest/gtest.h> +#include "arrow/compute/api_scalar.h" +#include "arrow/compute/kernels/common.h" +#include "arrow/compute/kernels/test_util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/formatting.h" + +namespace arrow { + +using internal::StringFormatter; + +class ScalarTemporalTest : public ::testing::Test {}; + +namespace compute { + +TEST(ScalarTemporalTest, TestTemporalComponentExtraction) { + const char* times = + R"(["1970-01-01T00:00:59.123456789","2000-02-29T23:23:23.999999999", + "1899-01-01T00:59:20.001001001","2033-05-18T03:33:20.000000000", null])"; + auto unit = timestamp(TimeUnit::NANO); + auto iso_calendar_type = + struct_({field("iso_year", int64()), field("iso_week", int64()), + field("day_of_week", int64())}); + + auto year = "[1970, 2000, 1899, 2033, null]"; + auto month = "[1, 2, 1, 5, null]"; + auto day = "[1, 29, 1, 18, null]"; + auto day_of_week = "[4, 2, 7, 3, null]"; + auto day_of_year = "[1, 60, 1, 138, null]"; + auto iso_year = "[1970, 2000, 1899, 2033, null]"; + auto iso_week = "[1, 9, 52, 20, null]"; + auto iso_calendar = + ArrayFromJSON(iso_calendar_type, + R"([{"iso_year": 1970, "iso_week": 1, "day_of_week": 4}, + {"iso_year": 2000, "iso_week": 9, "day_of_week": 2}, + {"iso_year": 1899, "iso_week": 52, "day_of_week": 7}, + {"iso_year": 2033, "iso_week": 20, "day_of_week": 3}, null])"); + auto quarter = "[1, 1, 1, 2, null]"; + auto hour = "[0, 23, 0, 3, null]"; + auto minute = "[0, 23, 59, 33, null]"; + auto second = "[59.123456789, 23.999999999, 20.001001001, 20.0, null]"; Review comment: I would have expected an integer here, which is the number for second in standard ISO string representation (just like the hour and minute etc) (so at least this should be documented more clearly, if we keep this behaviour) ########## File path: cpp/src/arrow/compute/api_scalar.h ########## @@ -481,5 +481,184 @@ ARROW_EXPORT Result<Datum> IfElse(const Datum& cond, const Datum& left, const Datum& right, ExecContext* ctx 
= NULLPTR); +/// \brief Year returns year for each element of `values` +/// +/// \param[in] values input to extract year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Year(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Month returns month for each element of `values`. +/// Month is encoded as January=1, December=12 +/// +/// \param[in] values input to extract month from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Month(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Day returns day number for each element of `values` +/// +/// \param[in] values input to extract day from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Day(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DayOfWeek returns number of the day of the week value for each element of +/// `values`. Week starts on Monday denoted by 0 and ends on Sunday denoted by 6. +/// +/// \param[in] values input to extract number of the day of the week from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> DayOfWeek(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DayOfYear returns number of day of the year for each element of `values`. +/// January 1st maps to day number 1, February 1st to 32, etc. 
+/// +/// \param[in] values input to extract number of day of the year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> DayOfYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOYear returns ISO year number for each element of `values`. +/// First week of an ISO year has the majority (4 or more) of it's days in January. +/// +/// \param[in] values input to extract ISO year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> ISOYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOWeek returns ISO week of year number for each element of `values`. +/// First ISO week has the majority (4 or more) of it's days in January. +/// Week of the year starts with 1 and can run up to 53. +/// +/// \param[in] values input to extract ISO week of year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> ISOWeek(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOCalendar returns a (ISO year, ISO week, weekday) struct for each element of +/// `values` +/// +/// \param[in] values input to ISO calendar struct from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> ISOCalendar(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Quarter returns the quarter of year number for each element of `values` +/// First quarter maps to 1 and forth quarter maps to 4. 
+/// +/// \param[in] values input to extract quarter of year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> Quarter(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Hour returns hour value for each element of `values` +/// +/// \param[in] values input to extract hour from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Hour(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Minute returns minutes value for each element of `values` +/// +/// \param[in] values input to extract minutes from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Minute(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Second returns seconds value for each element of `values` +/// +/// \param[in] values input to extract seconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Second(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Millisecond returns milliseconds value for each element of `values` +/// +/// \param[in] values input to extract milliseconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Millisecond(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Microsecond returns microseconds value for each element of `values` Review comment: I think here (and for nanoseconds below as well), we should clarify what this "microseconds" value is exactly ########## File 
path: cpp/src/arrow/compute/kernels/scalar_temporal_test.cc ########## @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> +#include "arrow/compute/api_scalar.h" +#include "arrow/compute/kernels/test_util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/formatting.h" + +namespace arrow { + +using internal::StringFormatter; + +class ScalarTemporalTest : public ::testing::Test {}; + +namespace compute { + +TEST(ScalarTemporalTest, TestSimpleTemporalComponentExtraction) { + const char* times = + R"(["1970-01-01T00:00:59.123456789","2000-02-29T23:23:23.999999999", + "1899-01-01T00:59:20.001001001","2033-05-18T03:33:20.000000000", null])"; Review comment: It would also be good to add a case where `iso_year` is not equal to `year` to cover that boundary case ########## File path: cpp/src/arrow/compute/kernels/scalar_temporal_test.cc ########## @@ -0,0 +1,151 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> +#include "arrow/compute/api_scalar.h" +#include "arrow/compute/kernels/common.h" +#include "arrow/compute/kernels/test_util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/formatting.h" + +namespace arrow { + +using internal::StringFormatter; + +class ScalarTemporalTest : public ::testing::Test {}; + +namespace compute { + +TEST(ScalarTemporalTest, TestTemporalComponentExtraction) { + const char* times = + R"(["1970-01-01T00:00:59.123456789","2000-02-29T23:23:23.999999999", + "1899-01-01T00:59:20.001001001","2033-05-18T03:33:20.000000000", null])"; + auto unit = timestamp(TimeUnit::NANO); + auto iso_calendar_type = + struct_({field("iso_year", int64()), field("iso_week", int64()), + field("day_of_week", int64())}); + + auto year = "[1970, 2000, 1899, 2033, null]"; + auto month = "[1, 2, 1, 5, null]"; + auto day = "[1, 29, 1, 18, null]"; + auto day_of_week = "[4, 2, 7, 3, null]"; + auto day_of_year = "[1, 60, 1, 138, null]"; + auto iso_year = "[1970, 2000, 1899, 2033, null]"; + auto iso_week = "[1, 9, 52, 20, null]"; + auto iso_calendar = + ArrayFromJSON(iso_calendar_type, + R"([{"iso_year": 1970, "iso_week": 1, "day_of_week": 4}, + {"iso_year": 2000, "iso_week": 9, "day_of_week": 2}, + {"iso_year": 1899, "iso_week": 52, "day_of_week": 7}, + {"iso_year": 2033, "iso_week": 20, "day_of_week": 3}, null])"); + auto quarter = "[1, 1, 1, 2, null]"; 
+ auto hour = "[0, 23, 0, 3, null]"; + auto minute = "[0, 23, 59, 33, null]"; + auto second = "[59.123456789, 23.999999999, 20.001001001, 20.0, null]"; + auto millisecond = "[123, 999, 1, 0, null]"; + auto microsecond = "[456, 999, 1, 0, null]"; + auto nanosecond = "[789, 999, 1, 0, null]"; + auto subsecond = "[123456789, 999999999, 1001001, 0, null]"; + + CheckScalarUnary("year", unit, times, int64(), year); + CheckScalarUnary("month", unit, times, int64(), month); + CheckScalarUnary("day", unit, times, int64(), day); + CheckScalarUnary("day_of_week", unit, times, int64(), day_of_week); + CheckScalarUnary("day_of_year", unit, times, int64(), day_of_year); + CheckScalarUnary("iso_year", unit, times, int64(), iso_year); + CheckScalarUnary("iso_week", unit, times, int64(), iso_week); + CheckScalarUnary("iso_calendar", ArrayFromJSON(unit, times), iso_calendar); + CheckScalarUnary("quarter", unit, times, int64(), quarter); + CheckScalarUnary("hour", unit, times, int64(), hour); + CheckScalarUnary("minute", unit, times, int64(), minute); + CheckScalarUnary("second", unit, times, float64(), second); + CheckScalarUnary("millisecond", unit, times, int64(), millisecond); + CheckScalarUnary("microsecond", unit, times, int64(), microsecond); + CheckScalarUnary("nanosecond", unit, times, int64(), nanosecond); + CheckScalarUnary("subsecond", unit, times, int64(), subsecond); +} + +TEST(ScalarTemporalTest, TestTemporalComponentExtractionWithDifferentUnits) { + auto iso_calendar_type = + struct_({field("iso_year", int64()), field("iso_week", int64()), + field("day_of_week", int64())}); + const char* times = + R"(["1970-01-01T00:00:59","2000-02-29T23:23:23", + "1899-01-01T00:59:20","2033-05-18T03:33:20", null])"; + auto year = "[1970, 2000, 1899, 2033, null]"; + auto month = "[1, 2, 1, 5, null]"; + auto day = "[1, 29, 1, 18, null]"; + auto day_of_week = "[4, 2, 7, 3, null]"; + auto day_of_year = "[1, 60, 1, 138, null]"; + auto iso_year = "[1970, 2000, 1899, 2033, null]"; + auto 
iso_week = "[1, 9, 52, 20, null]"; + auto iso_calendar = + ArrayFromJSON(iso_calendar_type, + R"([{"iso_year": 1970, "iso_week": 1, "day_of_week": 4}, + {"iso_year": 2000, "iso_week": 9, "day_of_week": 2}, + {"iso_year": 1899, "iso_week": 52, "day_of_week": 7}, + {"iso_year": 2033, "iso_week": 20, "day_of_week": 3}, null])"); + auto quarter = "[1, 1, 1, 2, null]"; + auto hour = "[0, 23, 0, 3, null]"; + auto minute = "[0, 23, 59, 33, null]"; + auto second = "[59, 23, 20, 20, null]"; Review comment: Can you also test the other sub-second components here? (even for a unit of "second", those components should be defined I think?) ########## File path: cpp/src/arrow/compute/api_scalar.h ########## @@ -481,5 +481,184 @@ ARROW_EXPORT Result<Datum> IfElse(const Datum& cond, const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); +/// \brief Year returns year for each element of `values` +/// +/// \param[in] values input to extract year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Year(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Month returns month for each element of `values`. 
+/// Month is encoded as January=1, December=12 +/// +/// \param[in] values input to extract month from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Month(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Day returns day number for each element of `values` +/// +/// \param[in] values input to extract day from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Day(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DayOfWeek returns number of the day of the week value for each element of +/// `values`. Week starts on Monday denoted by 0 and ends on Sunday denoted by 6. +/// +/// \param[in] values input to extract number of the day of the week from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> DayOfWeek(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DayOfYear returns number of day of the year for each element of `values`. +/// January 1st maps to day number 1, February 1st to 32, etc. +/// +/// \param[in] values input to extract number of day of the year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> DayOfYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOYear returns ISO year number for each element of `values`. +/// First week of an ISO year has the majority (4 or more) of it's days in January. 
+/// +/// \param[in] values input to extract ISO year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> ISOYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOWeek returns ISO week of year number for each element of `values`. +/// First ISO week has the majority (4 or more) of it's days in January. +/// Week of the year starts with 1 and can run up to 53. +/// +/// \param[in] values input to extract ISO week of year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> ISOWeek(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOCalendar returns a (ISO year, ISO week, weekday) struct for each element of Review comment: ```suggestion /// \brief ISOCalendar returns a (ISO year, ISO week, dayofweek) struct for each element of ``` (both are kind of synonyms, but I think our kernel is called DayOfWeek, so best to be consistent?) ########## File path: docs/source/cpp/compute.rst ########## @@ -637,6 +637,55 @@ String extraction e.g. 'letter' and 'digit' for the regular expression ``(?P<letter>[ab])(?P<digit>\\d)``. +Temporal component extraction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +These functions extract datetime components (year, month, day, etc) from timestamp type. +Note: timezone information is currently ignored if present. 
Review comment: This can be updated to indicate that if there is a timezone present, this is currently not yet supported ########## File path: cpp/src/arrow/compute/api_scalar.h ########## @@ -481,5 +481,184 @@ ARROW_EXPORT Result<Datum> IfElse(const Datum& cond, const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); +/// \brief Year returns year for each element of `values` +/// +/// \param[in] values input to extract year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Year(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Month returns month for each element of `values`. +/// Month is encoded as January=1, December=12 +/// +/// \param[in] values input to extract month from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Month(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Day returns day number for each element of `values` +/// +/// \param[in] values input to extract day from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Day(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DayOfWeek returns number of the day of the week value for each element of +/// `values`. Week starts on Monday denoted by 0 and ends on Sunday denoted by 6. 
+/// +/// \param[in] values input to extract number of the day of the week from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> DayOfWeek(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DayOfYear returns number of day of the year for each element of `values`. +/// January 1st maps to day number 1, February 1st to 32, etc. +/// +/// \param[in] values input to extract number of day of the year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> DayOfYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOYear returns ISO year number for each element of `values`. +/// First week of an ISO year has the majority (4 or more) of it's days in January. +/// +/// \param[in] values input to extract ISO year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> ISOYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOWeek returns ISO week of year number for each element of `values`. +/// First ISO week has the majority (4 or more) of it's days in January. +/// Week of the year starts with 1 and can run up to 53. 
+/// +/// \param[in] values input to extract ISO week of year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> ISOWeek(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOCalendar returns a (ISO year, ISO week, weekday) struct for each element of +/// `values` +/// +/// \param[in] values input to ISO calendar struct from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> ISOCalendar(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Quarter returns the quarter of year number for each element of `values` +/// First quarter maps to 1 and forth quarter maps to 4. +/// +/// \param[in] values input to extract quarter of year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result<Datum> Quarter(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Hour returns hour value for each element of `values` +/// +/// \param[in] values input to extract hour from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Hour(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Minute returns minutes value for each element of `values` +/// +/// \param[in] values input to extract minutes from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Minute(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Second returns seconds value for each element of `values` +/// +/// \param[in] values input to extract 
seconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Second(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Millisecond returns milliseconds value for each element of `values` +/// +/// \param[in] values input to extract milliseconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Millisecond(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Microsecond returns microseconds value for each element of `values` +/// +/// \param[in] values input to extract microseconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Microsecond(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Nanosecond returns nanoseconds value for each element of `values` +/// +/// \param[in] values input to extract nanoseconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 4.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result<Datum> Nanosecond(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Subsecond returns subsecond time fraction since last second for each element of Review comment: ```suggestion /// \brief Subsecond returns subsecond time fraction since last second as a float for each element of ``` (to be explicit, since it could also have been returned as an integer) ########## File path: docs/source/cpp/compute.rst ########## @@ -637,6 +637,54 @@ String extraction e.g. 'letter' and 'digit' for the regular expression ``(?P<letter>[ab])(?P<digit>\\d)``. 
+Temporal component extraction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +These functions extract datetime components (year, month, day, etc) from timestamp type. +Note: timezone information is currently ignored if present. + ++--------------------+------------+-------------------+-----------------+--------+ +| Function name | Arity | Input types | Output type | Notes | ++====================+============+===================+=================+========+ +| year | Unary | Temporal | Numeric | | Review comment: (this isn't done yet? Or you didn't yet push your latest edits?) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org