sagnikc-dremio commented on a change in pull request #9450:
URL: https://github.com/apache/arrow/pull/9450#discussion_r581661888



##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -249,25 +297,73 @@ const char* lower_utf8(gdv_int64 context, const char* 
data, gdv_int32 data_len,
     return "";
   }
 
-  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
data_len));
-  if (ret == nullptr) {
+  // If it is a single-byte character (ASCII), corresponding lowercase is 
always 1-byte
+  // long; if it is >= 2 bytes long, lowercase can be at most 4 bytes long, so 
length of
+  // the output can be at most twice the length of the input
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * 
data_len));
+  if (out == nullptr) {
     gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
     *out_len = 0;
     return "";
   }
-  for (gdv_int32 i = 0; i < data_len; ++i) {
-    char cur = data[i];
 
-    // 'A' - 'Z' : 0x41 - 0x5a
-    // 'a' - 'z' : 0x61 - 0x7a
-    if (cur >= 0x41 && cur <= 0x5a) {
-      cur = static_cast<char>(cur + 0x20);
+  gdv_int32 char_len, out_char_len, out_idx = 0;
+  gdv_uint32 char_codepoint;
+
+  for (gdv_int32 i = 0; i < data_len; i += char_len) {
+    char_len = utf8_char_length(data[i]);
+    // For single byte characters:
+    // If it is an uppercase ASCII character, set the output to its 
corresponding
+    // lowercase character; else, set the output to the read character
+    if (char_len == 1) {
+      char cur = data[i];
+      // 'A' - 'Z' : 0x41 - 0x5a
+      // 'a' - 'z' : 0x61 - 0x7a
+      if (cur >= 0x41 && cur <= 0x5a) {
+        out[out_idx++] = static_cast<char>(cur + 0x20);
+      } else {
+        out[out_idx++] = cur;
+      }
+      continue;
+    }
+
+    // Control reaches here when we encounter a multibyte character
+    gdv_uint8* in_char =
+        reinterpret_cast<gdv_uint8*>(gdv_fn_context_arena_malloc(context, 
char_len));

Review comment:
       Yeah, I'm sorry, I missed that one. Made the changes.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to