projjal commented on a change in pull request #10023: URL: https://github.com/apache/arrow/pull/10023#discussion_r630996233
########## File path: cpp/src/gandiva/precompiled/string_ops.cc ########## @@ -533,89 +533,102 @@ const char* castVARCHAR_bool_int64(gdv_int64 context, gdv_boolean value, return out; } -// Truncates the string to given length -FORCE_INLINE -const char* castVARCHAR_utf8_int64(gdv_int64 context, const char* data, - gdv_int32 data_len, int64_t out_len, - int32_t* out_length) { - int32_t len = static_cast<int32_t>(out_len); - - if (len < 0) { - gdv_fn_context_set_error_msg(context, "Output buffer length can't be negative"); - *out_length = 0; - return ""; - } - - if (len >= data_len || len == 0) { - *out_length = data_len; - return data; - } - - int32_t remaining = len; - int32_t index = 0; - bool is_multibyte = false; - do { - // In utf8, MSB of a single byte unicode char is always 0, - // whereas for a multibyte character the MSB of each byte is 1. - // So for a single byte char, a bitwise-and with x80 (10000000) will be 0 - // and it won't be 0 for bytes of a multibyte char - char* data_ptr = const_cast<char*>(data); - - // we advance byte by byte till the 8 byte boundary then advance 8 bytes at a time - auto num_bytes = reinterpret_cast<uintptr_t>(data_ptr) & 0x07; - num_bytes = (8 - num_bytes) & 0x07; - while (num_bytes > 0) { - uint8_t* ptr = reinterpret_cast<uint8_t*>(data_ptr + index); - if ((*ptr & 0x80) != 0) { - is_multibyte = true; - break; - } - index++; - remaining--; - num_bytes--; - } - if (is_multibyte) break; - while (remaining >= 8) { - uint64_t* ptr = reinterpret_cast<uint64_t*>(data_ptr + index); - if ((*ptr & 0x8080808080808080) != 0) { - is_multibyte = true; - break; - } - index += 8; - remaining -= 8; - } - if (is_multibyte) break; - if (remaining >= 4) { - uint32_t* ptr = reinterpret_cast<uint32_t*>(data_ptr + index); - if ((*ptr & 0x80808080) != 0) break; - index += 4; - remaining -= 4; - } - while (remaining > 0) { - uint8_t* ptr = reinterpret_cast<uint8_t*>(data_ptr + index); - if ((*ptr & 0x80) != 0) { - is_multibyte = true; - break; - } - 
index++; - remaining--; - } - if (is_multibyte) break; - // reached here; all are single byte characters - *out_length = len; - return data; - } while (false); - - // detected multibyte utf8 characters; slow path - int32_t byte_pos = utf8_byte_pos(context, data + index, data_len - index, len - index); - if (byte_pos < 0) { - *out_length = 0; - return ""; - } - - *out_length = index + byte_pos; - return data; -} +// Add functions for castVARBINARY for utf8 and binary +#define CAST_FROM_STRING_AND_BINARY(OUTPUT, TYPE_NAME) \ + FORCE_INLINE \ + const char* cast##OUTPUT##_##TYPE_NAME##_int64(gdv_int64 context, const char* data, \ + gdv_int32 data_len, int64_t out_len, \ + int32_t* out_length) { \ + int32_t len = static_cast<int32_t>(out_len); \ + \ + if (len < 0) { \ + gdv_fn_context_set_error_msg(context, "Output buffer length can't be negative"); \ + *out_length = 0; \ + return ""; \ + } \ + \ + if (len >= data_len || len == 0) { \ + *out_length = data_len; \ + return data; \ + } \ + \ + int32_t remaining = len; \ + int32_t index = 0; \ + bool is_multibyte = false; \ + do { \ + /* In utf8, MSB of a single byte unicode char is always 0, */ \ + /* whereas for a multibyte character the MSB of each byte is 1. 
*/ \ + /* So for a single byte char, a bitwise-and with x80 (10000000) will be 0 */ \ + /* and it won't be 0 for bytes of a multibyte char */ \ + char* data_ptr = const_cast<char*>(data); \ + \ + /* advance byte by byte till the 8 byte boundary, advance 8 bytes at a time */ \ + auto num_bytes = reinterpret_cast<uintptr_t>(data_ptr) & 0x07; \ + num_bytes = (8 - num_bytes) & 0x07; \ + while (num_bytes > 0) { \ + uint8_t* ptr = reinterpret_cast<uint8_t*>(data_ptr + index); \ + if ((*ptr & 0x80) != 0) { \ + is_multibyte = true; \ + break; \ + } \ + index++; \ + remaining--; \ + num_bytes--; \ + } \ + if (is_multibyte) break; \ + while (remaining >= 8) { \ + uint64_t* ptr = reinterpret_cast<uint64_t*>(data_ptr + index); \ + if ((*ptr & 0x8080808080808080) != 0) { \ + is_multibyte = true; \ + break; \ + } \ + index += 8; \ + remaining -= 8; \ + } \ + if (is_multibyte) { \ + break; \ + } \ + if (remaining >= 4) { \ + uint32_t* ptr = reinterpret_cast<uint32_t*>(data_ptr + index); \ + if ((*ptr & 0x80808080) != 0) break; \ + index += 4; \ + remaining -= 4; \ + } \ + while (remaining > 0) { \ + uint8_t* ptr = reinterpret_cast<uint8_t*>(data_ptr + index); \ + if ((*ptr & 0x80) != 0) { \ + is_multibyte = true; \ + break; \ + } \ + index++; \ + remaining--; \ + } \ + if (is_multibyte) { \ + break; \ + } \ + /* reached here; all are single byte characters */ \ + *out_length = len; \ + return data; \ + } while (false); \ + \ + /* detected multibyte utf8 characters; slow path */ \ + int32_t byte_pos = \ + utf8_byte_pos(context, data + index, data_len - index, len - index); \ + if (byte_pos < 0) { \ + *out_length = 0; \ + return ""; \ + } \ + \ + *out_length = index + byte_pos; \ + return data; \ + } + +CAST_FROM_STRING_AND_BINARY(VARCHAR, utf8) Review comment: It looks like you can't reuse the same function here: the VARCHAR one takes a character count, which requires UTF-8 decoding. -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org