This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 570771343b GH-49438: [C++][Gandiva] Optimize LPAD/RPAD functions
(#49439)
570771343b is described below
commit 570771343b2666e801e5d289e33cca22243a6bf1
Author: Dmitry Chirkov <[email protected]>
AuthorDate: Thu Mar 12 21:40:29 2026 -0700
GH-49438: [C++][Gandiva] Optimize LPAD/RPAD functions (#49439)
### Rationale for this change
The `lpad_utf8_int32_utf8` and `rpad_utf8_int32_utf8` functions have
performance inefficiency and a potential memory safety issue:
1. **Performance**: Single-byte fills iterate character-by-character when
`memset` would suffice. Multi-byte fills use O(n) iterations instead of O(log
n) with a doubling strategy.
2. **Memory safety**: When the fill string is longer than the padding space
needed, the code could write more bytes than allocated. Fixed preventatively.
### What changes are included in this PR?
1. **Memory safety fix**: Use `std::min(fill_text_len, total_fill_bytes)`
for the initial copy to prevent overflow
2. **Fast path**: Add single-byte fill optimization using `memset`
3. **General path**: Replace character-by-character loop with doubling
strategy for multi-byte fills
4. **Tests**: Add comprehensive tests for the new code paths
### Are these changes tested?
Yes. Added tests covering:
- Large UTF-8 fill characters (4-byte emoji, 3-byte Chinese)
- Single-byte fill boundaries (1 char and 65536 char padding)
- Content verification for fill patterns
- Doubling strategy boundaries
- Partial fill scenarios (fill text longer than padding needed)
### Are there any user-facing changes?
No.
* GitHub Issue: #49438
Authored-by: Dmitry Chirkov <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/gandiva/precompiled/string_ops.cc | 140 +++++++++++--------
cpp/src/gandiva/precompiled/string_ops_test.cc | 186 +++++++++++++++++++++++++
2 files changed, 266 insertions(+), 60 deletions(-)
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc
b/cpp/src/gandiva/precompiled/string_ops.cc
index 0b787f461c..54243085f7 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -1966,6 +1966,23 @@ gdv_int32 evaluate_return_char_length(gdv_int32
text_len, gdv_int32 actual_text_
return return_char_length;
}
+// Fill a buffer with repeated fill_text using O(log n) doubling strategy
+static FORCE_INLINE void fill_buffer_with_pattern(gdv_binary dest,
+ gdv_int32 total_fill_bytes,
+ const char* fill_text,
+ gdv_int32 fill_text_len) {
+ gdv_int32 initial_copy = std::min(fill_text_len, total_fill_bytes);
+ memcpy(dest, fill_text, initial_copy);
+ gdv_int32 written = initial_copy;
+ while (written * 2 <= total_fill_bytes) {
+ memcpy(dest + written, dest, written);
+ written *= 2;
+ }
+ if (written < total_fill_bytes) {
+ memcpy(dest + written, dest, total_fill_bytes - written);
+ }
+}
+
FORCE_INLINE
const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text,
gdv_int32 text_len,
gdv_int32 return_length, const char*
fill_text,
@@ -1988,48 +2005,49 @@ const char* lpad_utf8_int32_utf8(gdv_int64 context,
const char* text, gdv_int32
// fill into text but "fill_text" is empty, then return text directly.
*out_len = text_len;
return text;
- } else if (return_length < actual_text_len) {
+ }
+ if (return_length < actual_text_len) {
// case where it truncates the result on return length.
*out_len = utf8_byte_pos(context, text, text_len, return_length);
return text;
- } else {
- // case (return_length > actual_text_len)
- // case where it needs to copy "fill_text" on the string left. The total
number
- // of chars to copy is given by (return_length - actual_text_len)
- gdv_int32 return_char_length = evaluate_return_char_length(
- text_len, actual_text_len, return_length, fill_text, fill_text_len);
- char* ret = reinterpret_cast<gdv_binary>(
- gdv_fn_context_arena_malloc(context, return_char_length));
+ }
+
+ gdv_int32 chars_to_pad = return_length - actual_text_len;
+
+ // FAST PATH: Single-byte fill (most common - space padding)
+ if (fill_text_len == 1) {
+ gdv_int32 out_len_bytes = chars_to_pad + text_len;
+ gdv_binary ret =
+ reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context,
out_len_bytes));
if (ret == nullptr) {
gdv_fn_context_set_error_msg(context,
"Could not allocate memory for output
string");
*out_len = 0;
return "";
}
- // try to fulfill the return string with the "fill_text" continuously
- int32_t copied_chars_count = 0;
- int32_t copied_chars_position = 0;
- while (copied_chars_count < return_length - actual_text_len) {
- int32_t char_len;
- int32_t fill_index;
- // for each char, evaluate its length to consider it when mem copying
- for (fill_index = 0; fill_index < fill_text_len; fill_index += char_len)
{
- if (copied_chars_count >= return_length - actual_text_len) {
- break;
- }
- char_len = utf8_char_length(fill_text[fill_index]);
- // ignore invalid char on the fill text, considering it as size 1
- if (char_len == 0) char_len += 1;
- copied_chars_count++;
- }
- memcpy(ret + copied_chars_position, fill_text, fill_index);
- copied_chars_position += fill_index;
- }
- // after fulfilling the text, copy the main string
- memcpy(ret + copied_chars_position, text, text_len);
- *out_len = copied_chars_position + text_len;
+ memset(ret, fill_text[0], chars_to_pad);
+ memcpy(ret + chars_to_pad, text, text_len);
+ *out_len = out_len_bytes;
return ret;
}
+
+ // GENERAL PATH: Multi-byte fill - use evaluate_return_char_length for
buffer size
+ gdv_int32 return_char_length = evaluate_return_char_length(
+ text_len, actual_text_len, return_length, fill_text, fill_text_len);
+ gdv_binary ret = reinterpret_cast<gdv_binary>(
+ gdv_fn_context_arena_malloc(context, return_char_length));
+ if (ret == nullptr) {
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for
output string");
+ *out_len = 0;
+ return "";
+ }
+
+ // Fill padding region using doubling strategy, then append text
+ gdv_int32 total_fill_bytes = return_char_length - text_len;
+ fill_buffer_with_pattern(ret, total_fill_bytes, fill_text, fill_text_len);
+ memcpy(ret + total_fill_bytes, text, text_len);
+ *out_len = return_char_length;
+ return ret;
}
FORCE_INLINE
@@ -2054,47 +2072,49 @@ const char* rpad_utf8_int32_utf8(gdv_int64 context,
const char* text, gdv_int32
// fill into text but "fill_text" is empty, then return text directly.
*out_len = text_len;
return text;
- } else if (return_length < actual_text_len) {
+ }
+ if (return_length < actual_text_len) {
// case where it truncates the result on return length.
*out_len = utf8_byte_pos(context, text, text_len, return_length);
return text;
- } else {
- // case (return_length > actual_text_len)
- // case where it needs to copy "fill_text" on the string right
- gdv_int32 return_char_length = evaluate_return_char_length(
- text_len, actual_text_len, return_length, fill_text, fill_text_len);
- char* ret = reinterpret_cast<gdv_binary>(
- gdv_fn_context_arena_malloc(context, return_char_length));
+ }
+
+ gdv_int32 chars_to_pad = return_length - actual_text_len;
+
+ // FAST PATH: Single-byte fill (most common - space padding)
+ if (fill_text_len == 1) {
+ gdv_int32 out_len_bytes = chars_to_pad + text_len;
+ gdv_binary ret =
+ reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context,
out_len_bytes));
if (ret == nullptr) {
gdv_fn_context_set_error_msg(context,
"Could not allocate memory for output
string");
*out_len = 0;
return "";
}
- // fulfill the initial text copying the main input string
memcpy(ret, text, text_len);
- // try to fulfill the return string with the "fill_text" continuously
- int32_t copied_chars_count = 0;
- int32_t copied_chars_position = 0;
- while (actual_text_len + copied_chars_count < return_length) {
- int32_t char_len;
- int32_t fill_length;
- // for each char, evaluate its length to consider it when mem copying
- for (fill_length = 0; fill_length < fill_text_len; fill_length +=
char_len) {
- if (actual_text_len + copied_chars_count >= return_length) {
- break;
- }
- char_len = utf8_char_length(fill_text[fill_length]);
- // ignore invalid char on the fill text, considering it as size 1
- if (char_len == 0) char_len += 1;
- copied_chars_count++;
- }
- memcpy(ret + text_len + copied_chars_position, fill_text, fill_length);
- copied_chars_position += fill_length;
- }
- *out_len = copied_chars_position + text_len;
+ memset(ret + text_len, fill_text[0], chars_to_pad);
+ *out_len = out_len_bytes;
return ret;
}
+
+ // GENERAL PATH: Multi-byte fill - use evaluate_return_char_length for
buffer size
+ gdv_int32 return_char_length = evaluate_return_char_length(
+ text_len, actual_text_len, return_length, fill_text, fill_text_len);
+ gdv_binary ret = reinterpret_cast<gdv_binary>(
+ gdv_fn_context_arena_malloc(context, return_char_length));
+ if (ret == nullptr) {
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for
output string");
+ *out_len = 0;
+ return "";
+ }
+
+ // Copy text first, then fill padding region using doubling strategy
+ memcpy(ret, text, text_len);
+ gdv_int32 total_fill_bytes = return_char_length - text_len;
+ fill_buffer_with_pattern(ret + text_len, total_fill_bytes, fill_text,
fill_text_len);
+ *out_len = return_char_length;
+ return ret;
}
FORCE_INLINE
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc
b/cpp/src/gandiva/precompiled/string_ops_test.cc
index e0248667e3..51174c1113 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -1318,6 +1318,99 @@ TEST(TestStringOps, TestLpadString) {
out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, -1, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "x", 1, 65536, "😀", 4, &out_len);
+ EXPECT_EQ(out_len, 65535 * 4 + 1);
+ EXPECT_FALSE(ctx.has_error());
+ EXPECT_EQ(out_str[out_len - 1], 'x');
+ EXPECT_EQ(std::string_view(out_str, 4), "😀");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "A", 1, 65536, "哈", 3, &out_len);
+ EXPECT_EQ(out_len, 65535 * 3 + 1);
+ EXPECT_FALSE(ctx.has_error());
+ EXPECT_EQ(out_str[out_len - 1], 'A');
+ EXPECT_EQ(std::string_view(out_str, 3), "哈");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 2, ".", 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), ".X");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "Z", 1, 65536, "@", 1, &out_len);
+ EXPECT_EQ(out_len, 65536);
+ for (int i = 0; i < 100; i++) {
+ EXPECT_EQ(out_str[i], '@') << "Mismatch at position " << i;
+ }
+ EXPECT_EQ(out_str[out_len - 1], 'Z');
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "END", 3, 11, "ab", 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "ababababEND");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "END", 3, 10, "abc", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "abcabcaEND");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 5, "αβ", 4, &out_len);
+ EXPECT_EQ(out_len, 9);
+ EXPECT_EQ(std::string(out_str, out_len), "αβαβX");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "Y", 1, 4, "䏿–‡", 6, &out_len);
+ EXPECT_EQ(out_len, 10);
+ EXPECT_EQ(std::string(out_str, out_len), "䏿–‡ä¸Y");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 4, "abc", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "abcX");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 7, "abc", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "abcabcX");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 13, "abc", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "abcabcabcabcX");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 10, "abc", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "abcabcabcX");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "E", 1, 129, "ab", 2, &out_len);
+ EXPECT_EQ(out_len, 129);
+ EXPECT_EQ(out_str[0], 'a');
+ EXPECT_EQ(out_str[1], 'b');
+ EXPECT_EQ(out_str[126], 'a');
+ EXPECT_EQ(out_str[127], 'b');
+ EXPECT_EQ(out_str[128], 'E');
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "E", 1, 127, "ab", 2, &out_len);
+ EXPECT_EQ(out_len, 127);
+ EXPECT_EQ(out_str[0], 'a');
+ EXPECT_EQ(out_str[125], 'b');
+ EXPECT_EQ(out_str[126], 'E');
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 2, "abc", 3, &out_len);
+ EXPECT_EQ(out_len, 2);
+ EXPECT_EQ(std::string(out_str, out_len), "aX");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "Y", 1, 3, "abcde", 5, &out_len);
+ EXPECT_EQ(out_len, 3);
+ EXPECT_EQ(std::string(out_str, out_len), "abY");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "Z", 1, 2, "αβ", 4, &out_len);
+ EXPECT_EQ(out_len, 3);
+ EXPECT_EQ(std::string(out_str, out_len), "αZ");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "A", 1, 2, "䏿–‡å—", 9, &out_len);
+ EXPECT_EQ(out_len, 4);
+ EXPECT_EQ(std::string(out_str, out_len), "ä¸A");
+
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, "B", 1, 3, "䏿–‡å—", 9, &out_len);
+ EXPECT_EQ(out_len, 7);
+ EXPECT_EQ(std::string(out_str, out_len), "䏿–‡B");
+
+ std::string large_text(5000, 'X');
+ std::string large_fill;
+ for (int i = 0; i < 50; ++i) {
+ large_fill += "α";
+ }
+ out_str = lpad_utf8_int32_utf8(ctx_ptr, large_text.c_str(), 5000, 5001,
+ large_fill.c_str(), 100, &out_len);
+ EXPECT_EQ(out_len, 5002);
+ EXPECT_EQ(std::string(out_str, 2), "α");
+ EXPECT_EQ(std::string(out_str + 2, 5000), large_text);
}
TEST(TestStringOps, TestRpadString) {
@@ -1396,6 +1489,99 @@ TEST(TestStringOps, TestRpadString) {
out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, -1, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "x", 1, 65536, "😀", 4, &out_len);
+ EXPECT_EQ(out_len, 1 + 65535 * 4);
+ EXPECT_FALSE(ctx.has_error());
+ EXPECT_EQ(out_str[0], 'x');
+ EXPECT_EQ(std::string_view(out_str + out_len - 4, 4), "😀");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "A", 1, 65536, "哈", 3, &out_len);
+ EXPECT_EQ(out_len, 1 + 65535 * 3);
+ EXPECT_FALSE(ctx.has_error());
+ EXPECT_EQ(out_str[0], 'A');
+ EXPECT_EQ(std::string_view(out_str + out_len - 3, 3), "哈");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 2, ".", 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "X.");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "Z", 1, 65536, "@", 1, &out_len);
+ EXPECT_EQ(out_len, 65536);
+ EXPECT_EQ(out_str[0], 'Z');
+ for (int i = 1; i < 100; i++) {
+ EXPECT_EQ(out_str[i], '@') << "Mismatch at position " << i;
+ }
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "BEG", 3, 11, "ab", 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "BEGabababab");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "BEG", 3, 10, "abc", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "BEGabcabca");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 5, "αβ", 4, &out_len);
+ EXPECT_EQ(out_len, 9);
+ EXPECT_EQ(std::string(out_str, out_len), "Xαβαβ");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "Y", 1, 4, "䏿–‡", 6, &out_len);
+ EXPECT_EQ(out_len, 10);
+ EXPECT_EQ(std::string(out_str, out_len), "Y䏿–‡ä¸");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 4, "abc", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Xabc");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 7, "abc", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Xabcabc");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 13, "abc", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Xabcabcabcabc");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 10, "abc", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Xabcabcabc");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "S", 1, 129, "ab", 2, &out_len);
+ EXPECT_EQ(out_len, 129);
+ EXPECT_EQ(out_str[0], 'S');
+ EXPECT_EQ(out_str[1], 'a');
+ EXPECT_EQ(out_str[2], 'b');
+ EXPECT_EQ(out_str[127], 'a');
+ EXPECT_EQ(out_str[128], 'b');
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "S", 1, 127, "ab", 2, &out_len);
+ EXPECT_EQ(out_len, 127);
+ EXPECT_EQ(out_str[0], 'S');
+ EXPECT_EQ(out_str[125], 'a');
+ EXPECT_EQ(out_str[126], 'b');
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 2, "abc", 3, &out_len);
+ EXPECT_EQ(out_len, 2);
+ EXPECT_EQ(std::string(out_str, out_len), "Xa");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "Y", 1, 3, "abcde", 5, &out_len);
+ EXPECT_EQ(out_len, 3);
+ EXPECT_EQ(std::string(out_str, out_len), "Yab");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "Z", 1, 2, "αβ", 4, &out_len);
+ EXPECT_EQ(out_len, 3);
+ EXPECT_EQ(std::string(out_str, out_len), "Zα");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "A", 1, 2, "䏿–‡å—", 9, &out_len);
+ EXPECT_EQ(out_len, 4);
+ EXPECT_EQ(std::string(out_str, out_len), "Aä¸");
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, "B", 1, 3, "䏿–‡å—", 9, &out_len);
+ EXPECT_EQ(out_len, 7);
+ EXPECT_EQ(std::string(out_str, out_len), "B䏿–‡");
+
+ std::string large_text(5000, 'X');
+ std::string large_fill;
+ for (int i = 0; i < 50; ++i) {
+ large_fill += "α";
+ }
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, large_text.c_str(), 5000, 5001,
+ large_fill.c_str(), 100, &out_len);
+ EXPECT_EQ(out_len, 5002);
+ EXPECT_EQ(std::string(out_str, 5000), large_text);
+ EXPECT_EQ(std::string(out_str + 5000, 2), "α");
}
TEST(TestStringOps, TestRtrim) {