pprudhvi commented on a change in pull request #9450: URL: https://github.com/apache/arrow/pull/9450#discussion_r581658737
########## File path: cpp/src/gandiva/precompiled/string_ops.cc ########## @@ -249,25 +297,73 @@ const char* lower_utf8(gdv_int64 context, const char* data, gdv_int32 data_len, return ""; } - char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, data_len)); - if (ret == nullptr) { + // If it is a single-byte character (ASCII), corresponding lowercase is always 1-byte + // long; if it is >= 2 bytes long, lowercase can be at most 4 bytes long, so length of + // the output can be at most twice the length of the input + char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len)); + if (out == nullptr) { gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); *out_len = 0; return ""; } - for (gdv_int32 i = 0; i < data_len; ++i) { - char cur = data[i]; - // 'A' - 'Z' : 0x41 - 0x5a - // 'a' - 'z' : 0x61 - 0x7a - if (cur >= 0x41 && cur <= 0x5a) { - cur = static_cast<char>(cur + 0x20); + gdv_int32 char_len, out_char_len, out_idx = 0; + gdv_uint32 char_codepoint; + + for (gdv_int32 i = 0; i < data_len; i += char_len) { + char_len = utf8_char_length(data[i]); + // For single byte characters: + // If it is an uppercase ASCII character, set the output to its corresponding + // lowercase character; else, set the output to the read character + if (char_len == 1) { + char cur = data[i]; + // 'A' - 'Z' : 0x41 - 0x5a + // 'a' - 'z' : 0x61 - 0x7a + if (cur >= 0x41 && cur <= 0x5a) { + out[out_idx++] = static_cast<char>(cur + 0x20); + } else { + out[out_idx++] = cur; + } + continue; + } + + // Control reaches here when we encounter a multibyte character + gdv_uint8* in_char = + reinterpret_cast<gdv_uint8*>(gdv_fn_context_arena_malloc(context, char_len)); Review comment: this can be avoided ########## File path: cpp/src/gandiva/precompiled/string_ops.cc ########## @@ -249,25 +297,73 @@ const char* lower_utf8(gdv_int64 context, const char* data, gdv_int32 data_len, return ""; } - char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, data_len)); - if (ret == nullptr) { + // If it is a single-byte character (ASCII), corresponding lowercase is always 1-byte + // long; if it is >= 2 bytes long, lowercase can be at most 4 bytes long, so length of + // the output can be at most twice the length of the input + char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len)); + if (out == nullptr) { gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); *out_len = 0; return ""; } - for (gdv_int32 i = 0; i < data_len; ++i) { - char cur = data[i]; - // 'A' - 'Z' : 0x41 - 0x5a - // 'a' - 'z' : 0x61 - 0x7a - if (cur >= 0x41 && cur <= 0x5a) { - cur = static_cast<char>(cur + 0x20); + gdv_int32 char_len, out_char_len, out_idx = 0; + gdv_uint32 char_codepoint; + + for (gdv_int32 i = 0; i < data_len; i += char_len) { + char_len = utf8_char_length(data[i]); + // For single byte characters: + // If it is an uppercase ASCII character, set the output to its corresponding + // lowercase character; else, set the output to the read character + if (char_len == 1) { + char cur = data[i]; + // 'A' - 'Z' : 0x41 - 0x5a + // 'a' - 'z' : 0x61 - 0x7a + if (cur >= 0x41 && cur <= 0x5a) { + out[out_idx++] = static_cast<char>(cur + 0x20); + } else { + out[out_idx++] = cur; + } + continue; + } + + // Control reaches here when we encounter a multibyte character + gdv_uint8* in_char = + reinterpret_cast<gdv_uint8*>(gdv_fn_context_arena_malloc(context, char_len)); + memcpy(in_char, data + i, char_len); + + // Decode the multibyte character + bool is_valid_utf8_char = + arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint); + + // If it is an invalid utf8 character, UTF8Decode evaluates to false + if (!is_valid_utf8_char) { + set_error_for_invalid_utf(context, data[i]); + *out_len = 0; + return ""; } - ret[i] = cur; + + // Convert the encoded codepoint to its lowercase codepoint + gdv_int32 lower_codepoint = utf8proc_tolower(char_codepoint); + + // UTF8Encode advances the pointer by the number of bytes present in the lowercase + // character + gdv_uint8* out_char = + reinterpret_cast<gdv_uint8*>(gdv_fn_context_arena_malloc(context, 4)); Review comment: this too ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org