[GitHub] [arrow] pprudhvi commented on a change in pull request #9450: ARROW-11565: [C++][Gandiva] Modify upper()/lower() logic to make them work for utf8 strings

GitBox Tue, 23 Feb 2021 22:16:20 -0800


pprudhvi commented on a change in pull request #9450:
URL: https://github.com/apache/arrow/pull/9450#discussion_r581658737




##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -249,25 +297,73 @@ const char* lower_utf8(gdv_int64 context, const char* 
data, gdv_int32 data_len,
     return "";
   }
 
-  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
data_len));
-  if (ret == nullptr) {
+  // If it is a single-byte character (ASCII), corresponding lowercase is 
always 1-byte
+  // long; if it is >= 2 bytes long, lowercase can be at most 4 bytes long, so 
length of
+  // the output can be at most twice the length of the input
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * 
data_len));
+  if (out == nullptr) {
     gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
     *out_len = 0;
     return "";
   }
-  for (gdv_int32 i = 0; i < data_len; ++i) {
-    char cur = data[i];
 
-    // 'A' - 'Z' : 0x41 - 0x5a
-    // 'a' - 'z' : 0x61 - 0x7a
-    if (cur >= 0x41 && cur <= 0x5a) {
-      cur = static_cast<char>(cur + 0x20);
+  gdv_int32 char_len, out_char_len, out_idx = 0;
+  gdv_uint32 char_codepoint;
+
+  for (gdv_int32 i = 0; i < data_len; i += char_len) {
+    char_len = utf8_char_length(data[i]);
+    // For single byte characters:
+    // If it is an uppercase ASCII character, set the output to its 
corresponding
+    // lowercase character; else, set the output to the read character
+    if (char_len == 1) {
+      char cur = data[i];
+      // 'A' - 'Z' : 0x41 - 0x5a
+      // 'a' - 'z' : 0x61 - 0x7a
+      if (cur >= 0x41 && cur <= 0x5a) {
+        out[out_idx++] = static_cast<char>(cur + 0x20);
+      } else {
+        out[out_idx++] = cur;
+      }
+      continue;
+    }
+
+    // Control reaches here when we encounter a multibyte character
+    gdv_uint8* in_char =
+        reinterpret_cast<gdv_uint8*>(gdv_fn_context_arena_malloc(context, 
char_len));

Review comment:
       this can be avoided

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -249,25 +297,73 @@ const char* lower_utf8(gdv_int64 context, const char* 
data, gdv_int32 data_len,
     return "";
   }
 
-  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
data_len));
-  if (ret == nullptr) {
+  // If it is a single-byte character (ASCII), corresponding lowercase is 
always 1-byte
+  // long; if it is >= 2 bytes long, lowercase can be at most 4 bytes long, so 
length of
+  // the output can be at most twice the length of the input
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * 
data_len));
+  if (out == nullptr) {
     gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
     *out_len = 0;
     return "";
   }
-  for (gdv_int32 i = 0; i < data_len; ++i) {
-    char cur = data[i];
 
-    // 'A' - 'Z' : 0x41 - 0x5a
-    // 'a' - 'z' : 0x61 - 0x7a
-    if (cur >= 0x41 && cur <= 0x5a) {
-      cur = static_cast<char>(cur + 0x20);
+  gdv_int32 char_len, out_char_len, out_idx = 0;
+  gdv_uint32 char_codepoint;
+
+  for (gdv_int32 i = 0; i < data_len; i += char_len) {
+    char_len = utf8_char_length(data[i]);
+    // For single byte characters:
+    // If it is an uppercase ASCII character, set the output to its 
corresponding
+    // lowercase character; else, set the output to the read character
+    if (char_len == 1) {
+      char cur = data[i];
+      // 'A' - 'Z' : 0x41 - 0x5a
+      // 'a' - 'z' : 0x61 - 0x7a
+      if (cur >= 0x41 && cur <= 0x5a) {
+        out[out_idx++] = static_cast<char>(cur + 0x20);
+      } else {
+        out[out_idx++] = cur;
+      }
+      continue;
+    }
+
+    // Control reaches here when we encounter a multibyte character
+    gdv_uint8* in_char =
+        reinterpret_cast<gdv_uint8*>(gdv_fn_context_arena_malloc(context, 
char_len));
+    memcpy(in_char, data + i, char_len);
+
+    // Decode the multibyte character
+    bool is_valid_utf8_char =
+        arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint);
+
+    // If it is an invalid utf8 character, UTF8Decode evaluates to false
+    if (!is_valid_utf8_char) {
+      set_error_for_invalid_utf(context, data[i]);
+      *out_len = 0;
+      return "";
     }
-    ret[i] = cur;
+
+    // Convert the encoded codepoint to its lowercase codepoint
+    gdv_int32 lower_codepoint = utf8proc_tolower(char_codepoint);
+
+    // UTF8Encode advances the pointer by the number of bytes present in the 
lowercase
+    // character
+    gdv_uint8* out_char =
+        reinterpret_cast<gdv_uint8*>(gdv_fn_context_arena_malloc(context, 4));

Review comment:
       this too




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

[GitHub] [arrow] pprudhvi commented on a change in pull request #9450: ARROW-11565: [C++][Gandiva] Modify upper()/lower() logic to make them work for utf8 strings

Reply via email to