[GitHub] [arrow] nirandaperera commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

GitBox Mon, 17 May 2021 11:43:08 -0700


nirandaperera commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632648238




##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of 
BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];  // right shift 
leading 4 bits
+      std::copy(input + i, input + (i + offset),
+                output + (input_string_ncodeunits - i - offset));

Review comment:
       We can also use this bit twiddling hack for `min`. 
   ```c
   r = y ^ ((x ^ y) & -(x < y)); // min(x, y)
   ```
   ref: https://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
   
   Any preference?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,54 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * size of a valid UTF8 can be determined by looking at leading 4 bits of BYTE1
+ * UTF8_BYTE_SIZE_LUT[0..7] --> pure ascii chars --> 1B length
+ * UTF8_BYTE_SIZE_LUT[8..11] --> internal bytes --> 1B length
+ * UTF8_BYTE_SIZE_LUT[12,13] --> 2B long UTF8 chars
+ * UTF8_BYTE_SIZE_LUT[14] --> 3B long UTF8 chars
+ * UTF8_BYTE_SIZE_LUT[15] --> 4B long UTF8 chars
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 1, 1, 1, 1, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];
+      offset_type stride = std::min(i + offset, input_string_ncodeunits);
+      std::copy(input + i, input + stride, output + input_string_ncodeunits - 
stride);
+      i += offset;
+    }
+    *output_written = input_string_ncodeunits;
+    return true;

Review comment:
       As of this implementation, yes, I am returning `true` on both cases. 
   
   Based on @pitrou 's comment, what I understood was, if there are invalid 
utf8 chars, we'll guarantee that the program runs correctly, but the output 
would be garbage. 
   
   I can actually change the logic to validate utf8 bytes etc, but then AFAIU, 
I'll have to drop the lookup table and count the valid utf8 bytes when ever I 
encounter a non-ascii byte. 
   
   If everyone agrees, I'll add that :slightly_smiling_face: 

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of 
BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};

Review comment:
       I found a problem in the LUT tbh :smile: `xFF` is a malformed utf8 byte, 
but that would mess everything up! :confused: So, looks like I'll have the 5th 
bit also into the LUT. 

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -91,6 +91,30 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TYPED_TEST(TestStringKernels, AsciiReverse) {
+  this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", 
this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+
+  Datum invalid_input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", 
"bbb"])");
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 
sequence"),

Review comment:
       +1. I also agree that this is a weird error str to throw from an ascii 
kernel. 

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of 
BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};

Review comment:
       Final LUT would look like this. 
   |    | bit7 | bit6 | bit5 | bit4 | bit3 | char byte size | notes    |
   |----|------|------|------|------|------|----------------|----------|
   | 0  | 0    | 0    | 0    | 0    | 0    | 1              | ascii    |
   | 1  | 0    | 0    | 0    | 0    | 1    | 1              | ascii    |
   | 2  | 0    | 0    | 0    | 1    | 0    | 1              | ascii    |
   | 3  | 0    | 0    | 0    | 1    | 1    | 1              | ascii    |
   | 4  | 0    | 0    | 1    | 0    | 0    | 1              | ascii    |
   | 5  | 0    | 0    | 1    | 0    | 1    | 1              | ascii    |
   | 6  | 0    | 0    | 1    | 1    | 0    | 1              | ascii    |
   | 7  | 0    | 0    | 1    | 1    | 1    | 1              | ascii    |
   | 8  | 0    | 1    | 0    | 0    | 0    | 1              | ascii    |
   | 9  | 0    | 1    | 0    | 0    | 1    | 1              | ascii    |
   | 10 | 0    | 1    | 0    | 1    | 0    | 1              | ascii    |
   | 11 | 0    | 1    | 0    | 1    | 1    | 1              | ascii    |
   | 12 | 0    | 1    | 1    | 0    | 0    | 1              | ascii    |
   | 13 | 0    | 1    | 1    | 0    | 1    | 1              | ascii    |
   | 14 | 0    | 1    | 1    | 1    | 0    | 1              | ascii    |
   | 15 | 0    | 1    | 1    | 1    | 1    | 1              | ascii    |
   | 16 | 1    | 0    | 0    | 0    | 0    | 1              | internal |
   | 17 | 1    | 0    | 0    | 0    | 1    | 1              | internal |
   | 18 | 1    | 0    | 0    | 1    | 0    | 1              | internal |
   | 19 | 1    | 0    | 0    | 1    | 1    | 1              | internal |
   | 20 | 1    | 0    | 1    | 0    | 0    | 1              | internal |
   | 21 | 1    | 0    | 1    | 0    | 1    | 1              | internal |
   | 22 | 1    | 0    | 1    | 1    | 0    | 1              | internal |
   | 23 | 1    | 0    | 1    | 1    | 1    | 1              | internal |
   | 24 | 1    | 1    | 0    | 0    | 0    | 2              |          |
   | 25 | 1    | 1    | 0    | 0    | 1    | 2              |          |
   | 26 | 1    | 1    | 0    | 1    | 0    | 2              |          |
   | 27 | 1    | 1    | 0    | 1    | 1    | 2              |          |
   | 28 | 1    | 1    | 1    | 0    | 0    | 3              |          |
   | 29 | 1    | 1    | 1    | 0    | 1    | 3              |          |
   | 30 | 1    | 1    | 1    | 1    | 0    | 4              |          |
   | 31 | 1    | 1    | 1    | 1    | 1    | 1              | invalid  |




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] nirandaperera commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Reply via email to