pitrou commented on a change in pull request #10869: URL: https://github.com/apache/arrow/pull/10869#discussion_r698400302
########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -527,6 +517,55 @@ struct Utf8CapitalizeTransform : public StringTransformBase { template <typename Type> using Utf8Capitalize = StringTransformExec<Type, Utf8CapitalizeTransform>; +struct Utf8TitleTransform : public StringTransformCodepointBase { + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output) { + uint8_t* output_start = output; + const uint8_t* end = input + input_string_ncodeunits; + const uint8_t* curr = NULLPTR; Review comment: Nit, but can use `nullptr` in `.cc` files. ########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -496,29 +493,22 @@ template <typename Type> using UTF8SwapCase = StringTransformExec<Type, StringTransformCodepoint<UTF8SwapCaseTransform>>; -struct Utf8CapitalizeTransform : public StringTransformBase { +struct Utf8CapitalizeTransform : public StringTransformCodepointBase { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { uint8_t* output_start = output; - if (input_string_ncodeunits > 0) { Review comment: Is it ok to remove this condition? ########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -677,6 +716,39 @@ struct AsciiCapitalizeTransform : public StringTransformBase { template <typename Type> using AsciiCapitalize = StringTransformExec<Type, AsciiCapitalizeTransform>; +struct AsciiTitleTransform : public StringTransformBase { + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output) { + const uint8_t* end = input + input_string_ncodeunits; + const uint8_t* curr = NULLPTR; + + do { + // Uppercase first alpha character of current word + while ((curr = input++) < end) { + if (IsCasedCharacterAscii(*curr)) { + *output++ = ascii_toupper(*curr); + break; + } + *output++ = *curr; + } + + // Lowercase characters until a whitespace is found + while ((curr = input++) < end) { + if (IsSpaceCharacterAscii(*curr)) { Review comment: Same remark here concerning non-cased vs. whitespace. ########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -4127,6 +4199,14 @@ const FunctionDoc ascii_capitalize_doc( "non-ASCII characters, use \"utf8_capitalize\" instead."), {"strings"}); +const FunctionDoc ascii_title_doc( + "Transform ASCII input to a string where the first alpha character in each word is " + "uppercase and all other characters are lowercase", + ("For each string in `strings`, return a title version.\n\n" Review comment: `s/title/titlecased/`. Also add a sentence explaining better what it means, e.g.: "Each word in the output will start with an uppercase character and its remaining characters will be lowercase." ########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -4153,6 +4233,11 @@ const FunctionDoc utf8_capitalize_doc( "with the first character uppercased and the others lowercased."), {"strings"}); +const FunctionDoc utf8_title_doc( + "Transform input to a string where the first alpha character in each word is " Review comment: Same remarks here. ########## File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc ########## @@ -413,6 +413,16 @@ TYPED_TEST(TestStringKernels, AsciiCapitalize) { "\"!hello, world!\"]"); } +TYPED_TEST(TestStringKernels, AsciiTitle) { + this->CheckUnary("ascii_title", "[]", this->type(), "[]"); + this->CheckUnary("ascii_title", + "[\"aAazZæÆ&\", null, \"\", \"bBB\", \"hEllO, WoRld!\", \"$. A3\", " + "\"!hELlo, wORLd!\", \" a b cd\"]", + this->type(), + "[\"AaazzæÆ&\", null, \"\", \"Bbb\", \"Hello, World!\", \"$. A3\", " + "\"!Hello, World!\", \" A B Cd\"]"); Review comment: Can you make sure some of these values exercise non-whitespace word separators? ########## File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc ########## @@ -541,6 +551,16 @@ TYPED_TEST(TestStringKernels, Utf8Capitalize) { "\"Hello, world!\", \"$. a3\", \"!ɑɽɽow\"]"); } +TYPED_TEST(TestStringKernels, Utf8Title) { + this->CheckUnary("utf8_title", "[]", this->type(), "[]"); + this->CheckUnary("utf8_title", + "[\"aAazZæÆ&\", null, \"\", \"b\", \"ɑɽⱤoW\", \"ıI\", \"ⱥⱥⱥȺ\", " + "\"hEllO, WoRld!\", \"$. A3\", \"!ɑⱤⱤow\"]", + this->type(), + "[\"Aaazzææ&\", null, \"\", \"B\", \"Ɑɽɽow\", \"Ii\", \"Ⱥⱥⱥⱥ\", " + "\"Hello, World!\", \"$. A3\", \"!Ɑɽɽow\"]"); Review comment: Same here. ########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -4127,6 +4199,14 @@ const FunctionDoc ascii_capitalize_doc( "non-ASCII characters, use \"utf8_capitalize\" instead."), {"strings"}); +const FunctionDoc ascii_title_doc( + "Transform ASCII input to a string where the first alpha character in each word is " Review comment: I would condense this summary line into: "Titlecase each word of ASCII input". ########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -527,6 +517,55 @@ struct Utf8CapitalizeTransform : public StringTransformBase { template <typename Type> using Utf8Capitalize = StringTransformExec<Type, Utf8CapitalizeTransform>; +struct Utf8TitleTransform : public StringTransformCodepointBase { + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output) { + uint8_t* output_start = output; + const uint8_t* end = input + input_string_ncodeunits; + const uint8_t* curr = NULLPTR; + uint32_t codepoint = 0; + + do { + // Uppercase first alpha character of current word + while (input < end) { + curr = input; + if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&curr, &codepoint))) { + return kTransformError; + } + if (IsCasedCharacterUnicode(codepoint)) { + output = + util::UTF8Encode(output, UTF8UpperTransform::TransformCodepoint(codepoint)); + input = curr; + break; + } + output = std::copy(input, curr, output); + input = curr; + } + + // Lowercase characters until a whitespace is found Review comment: I don't think that's the expected semantics. In Python I get: ```python >>> "foo bar;héhé0zop".title() 'Foo Bar;Héhé0Zop' ``` which means that any non-cased character should break this loop, not only whitespace characters. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org