[GitHub] [arrow] pitrou commented on a change in pull request #10869: ARROW-12714: [C++] String title case kernel

GitBox Mon, 30 Aug 2021 04:24:26 -0700


pitrou commented on a change in pull request #10869:
URL: https://github.com/apache/arrow/pull/10869#discussion_r698400302




##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -527,6 +517,55 @@ struct Utf8CapitalizeTransform : public 
StringTransformBase {
 template <typename Type>
 using Utf8Capitalize = StringTransformExec<Type, Utf8CapitalizeTransform>;
 
+struct Utf8TitleTransform : public StringTransformCodepointBase {
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    uint8_t* output) {
+    uint8_t* output_start = output;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* curr = NULLPTR;

Review comment:
       Nit, but can use `nullptr` in `.cc` files.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -496,29 +493,22 @@ template <typename Type>
 using UTF8SwapCase =
     StringTransformExec<Type, StringTransformCodepoint<UTF8SwapCaseTransform>>;
 
-struct Utf8CapitalizeTransform : public StringTransformBase {
+struct Utf8CapitalizeTransform : public StringTransformCodepointBase {
   int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                     uint8_t* output) {
     uint8_t* output_start = output;
-    if (input_string_ncodeunits > 0) {

Review comment:
       Is it ok to remove this condition?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -677,6 +716,39 @@ struct AsciiCapitalizeTransform : public 
StringTransformBase {
 template <typename Type>
 using AsciiCapitalize = StringTransformExec<Type, AsciiCapitalizeTransform>;
 
+struct AsciiTitleTransform : public StringTransformBase {
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    uint8_t* output) {
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* curr = NULLPTR;
+
+    do {
+      // Uppercase first alpha character of current word
+      while ((curr = input++) < end) {
+        if (IsCasedCharacterAscii(*curr)) {
+          *output++ = ascii_toupper(*curr);
+          break;
+        }
+        *output++ = *curr;
+      }
+
+      // Lowercase characters until a whitespace is found
+      while ((curr = input++) < end) {
+        if (IsSpaceCharacterAscii(*curr)) {

Review comment:
       Same remark here concerning non-cased vs. whitespace.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -4127,6 +4199,14 @@ const FunctionDoc ascii_capitalize_doc(
      "non-ASCII characters, use \"utf8_capitalize\" instead."),
     {"strings"});
 
+const FunctionDoc ascii_title_doc(
+    "Transform ASCII input to a string where the first alpha character in each 
word is "
+    "uppercase and all other characters are lowercase",
+    ("For each string in `strings`, return a title version.\n\n"

Review comment:
       `s/title/titlecased/`. Also add a sentence explaining better what it 
means, e.g.:
   "Each word in the output will start with an uppercase character and its 
remaining characters will be lowercase."

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -4153,6 +4233,11 @@ const FunctionDoc utf8_capitalize_doc(
      "with the first character uppercased and the others lowercased."),
     {"strings"});
 
+const FunctionDoc utf8_title_doc(
+    "Transform input to a string where the first alpha character in each word 
is "

Review comment:
       Same remarks here.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -413,6 +413,16 @@ TYPED_TEST(TestStringKernels, AsciiCapitalize) {
                    "\"!hello, world!\"]");
 }
 
+TYPED_TEST(TestStringKernels, AsciiTitle) {
+  this->CheckUnary("ascii_title", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_title",
+                   "[\"aAazZæÆ&\", null, \"\", \"bBB\", \"hEllO, WoRld!\", 
\"$. A3\", "
+                   "\"!hELlo, wORLd!\", \"  a b cd\"]",
+                   this->type(),
+                   "[\"AaazzæÆ&\", null, \"\", \"Bbb\", \"Hello, World!\", 
\"$. A3\", "
+                   "\"!Hello, World!\", \"  A B Cd\"]");

Review comment:
       Can you make sure some of these values exercise non-whitespace word 
separators?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -541,6 +551,16 @@ TYPED_TEST(TestStringKernels, Utf8Capitalize) {
                    "\"Hello, world!\", \"$. a3\", \"!ɑɽɽow\"]");
 }
 
+TYPED_TEST(TestStringKernels, Utf8Title) {
+  this->CheckUnary("utf8_title", "[]", this->type(), "[]");
+  this->CheckUnary("utf8_title",
+                   "[\"aAazZæÆ&\", null, \"\", \"b\", \"ɑɽⱤoW\", \"ıI\", 
\"ⱥⱥⱥȺ\", "
+                   "\"hEllO, WoRld!\", \"$. A3\", \"!ɑⱤⱤow\"]",
+                   this->type(),
+                   "[\"Aaazzææ&\", null, \"\", \"B\", \"Ɑɽɽow\", \"Ii\", 
\"Ⱥⱥⱥⱥ\", "
+                   "\"Hello, World!\", \"$. A3\", \"!Ɑɽɽow\"]");

Review comment:
       Same here.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -4127,6 +4199,14 @@ const FunctionDoc ascii_capitalize_doc(
      "non-ASCII characters, use \"utf8_capitalize\" instead."),
     {"strings"});
 
+const FunctionDoc ascii_title_doc(
+    "Transform ASCII input to a string where the first alpha character in each 
word is "

Review comment:
       I would condense this summary line into:
   "Titlecase each word of ASCII input".
   

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -527,6 +517,55 @@ struct Utf8CapitalizeTransform : public 
StringTransformBase {
 template <typename Type>
 using Utf8Capitalize = StringTransformExec<Type, Utf8CapitalizeTransform>;
 
+struct Utf8TitleTransform : public StringTransformCodepointBase {
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    uint8_t* output) {
+    uint8_t* output_start = output;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* curr = NULLPTR;
+    uint32_t codepoint = 0;
+
+    do {
+      // Uppercase first alpha character of current word
+      while (input < end) {
+        curr = input;
+        if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&curr, &codepoint))) {
+          return kTransformError;
+        }
+        if (IsCasedCharacterUnicode(codepoint)) {
+          output =
+              util::UTF8Encode(output, 
UTF8UpperTransform::TransformCodepoint(codepoint));
+          input = curr;
+          break;
+        }
+        output = std::copy(input, curr, output);
+        input = curr;
+      }
+
+      // Lowercase characters until a whitespace is found

Review comment:
       I don't think that's the expected semantics. In Python I get:
   ```python
   >>> "foo bar;héhé0zop".title()
   'Foo Bar;Héhé0Zop'
   ```
   
   which means that any non-cased character should break this loop, not only 
whitespace characters.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #10869: ARROW-12714: [C++] String title case kernel

Reply via email to