pprudhvi commented on a change in pull request #9450: URL: https://github.com/apache/arrow/pull/9450#discussion_r581000055
########## File path: cpp/src/gandiva/precompiled/string_ops.cc ########## @@ -219,28 +225,74 @@ const char* upper_utf8(gdv_int64 context, const char* data, gdv_int32 data_len, return ""; } - char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, data_len)); - if (ret == nullptr) { + // If it is a single-byte character (ASCII), corresponding uppercase is always 1-byte + // long; if it is >= 2 bytes long, uppercase can be at most 4 bytes long, so length of + // the output can be at most twice the length of the input + char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len)); + if (out == nullptr) { gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); *out_len = 0; return ""; } - for (gdv_int32 i = 0; i < data_len; ++i) { - char cur = data[i]; - // 'A- - 'Z' : 0x41 - 0x5a - // 'a' - 'z' : 0x61 - 0x7a - if (cur >= 0x61 && cur <= 0x7a) { - cur = static_cast<char>(cur - 0x20); + gdv_int32 char_len, out_char_len, out_idx = 0; + gdv_uint32 char_codepoint; + + for (gdv_int32 i = 0; i < data_len; i += char_len) { + char_len = utf8_char_length(data[i]); + // For single byte characters: + // If it is a lowercase ASCII character, set the output to its corresponding uppercase + // character; else, set the output to the read character + if (char_len == 1) { + char cur = data[i]; + // 'A' - 'Z' : 0x41 - 0x5a + // 'a' - 'z' : 0x61 - 0x7a + if (cur >= 0x61 && cur <= 0x7a) { + out[out_idx++] = static_cast<char>(cur - 0x20); + } else { + out[out_idx++] = cur; + } + continue; + } + + // Control reaches here when we encounter a multibyte character + gdv_uint8* in_char = + reinterpret_cast<gdv_uint8*>(gdv_fn_context_arena_malloc(context, char_len)); + memcpy(in_char, data + i, char_len); Review comment: can we avoid this by passing reference to data+i to UTF8Decode? similarily while encoding ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org