kou commented on code in PR #49439:
URL: https://github.com/apache/arrow/pull/49439#discussion_r2915402535


##########
cpp/src/gandiva/precompiled/string_ops_test.cc:
##########
@@ -1396,6 +1491,101 @@ TEST(TestStringOps, TestRpadString) {
 
   out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, -1, &out_len);
   EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "x", 1, 65536, "😀", 4, &out_len);
+  EXPECT_EQ(out_len, 1 + 65535 * 4);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_str[0], 'x');
+  EXPECT_EQ(static_cast<unsigned char>(out_str[out_len - 4]), 0xF0);
+  EXPECT_EQ(static_cast<unsigned char>(out_str[out_len - 3]), 0x9F);
+  EXPECT_EQ(static_cast<unsigned char>(out_str[out_len - 2]), 0x98);
+  EXPECT_EQ(static_cast<unsigned char>(out_str[out_len - 1]), 0x80);
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "A", 1, 65536, "哈", 3, &out_len);
+  EXPECT_EQ(out_len, 1 + 65535 * 3);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_str[0], 'A');

Review Comment:
   Should we check the last character too?



##########
cpp/src/gandiva/precompiled/string_ops.cc:
##########
@@ -1988,48 +1988,60 @@ const char* lpad_utf8_int32_utf8(gdv_int64 context, 
const char* text, gdv_int32
     // fill into text but "fill_text" is empty, then return text directly.
     *out_len = text_len;
     return text;
-  } else if (return_length < actual_text_len) {
+  }
+  if (return_length < actual_text_len) {
     // case where it truncates the result on return length.
     *out_len = utf8_byte_pos(context, text, text_len, return_length);
     return text;
-  } else {
-    // case (return_length > actual_text_len)
-    // case where it needs to copy "fill_text" on the string left. The total 
number
-    // of chars to copy is given by (return_length -  actual_text_len)
-    gdv_int32 return_char_length = evaluate_return_char_length(
-        text_len, actual_text_len, return_length, fill_text, fill_text_len);
-    char* ret = reinterpret_cast<gdv_binary>(
-        gdv_fn_context_arena_malloc(context, return_char_length));
+  }
+
+  gdv_int32 chars_to_pad = return_length - actual_text_len;
+
+  // FAST PATH: Single-byte fill (most common - space padding)
+  if (fill_text_len == 1) {
+    gdv_int32 out_len_bytes = chars_to_pad + text_len;
+    char* ret =
+        reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, 
out_len_bytes));
     if (ret == nullptr) {
       gdv_fn_context_set_error_msg(context,
                                    "Could not allocate memory for output 
string");
       *out_len = 0;
       return "";
     }
-    // try to fulfill the return string with the "fill_text" continuously
-    int32_t copied_chars_count = 0;
-    int32_t copied_chars_position = 0;
-    while (copied_chars_count < return_length - actual_text_len) {
-      int32_t char_len;
-      int32_t fill_index;
-      // for each char, evaluate its length to consider it when mem copying
-      for (fill_index = 0; fill_index < fill_text_len; fill_index += char_len) 
{
-        if (copied_chars_count >= return_length - actual_text_len) {
-          break;
-        }
-        char_len = utf8_char_length(fill_text[fill_index]);
-        // ignore invalid char on the fill text, considering it as size 1
-        if (char_len == 0) char_len += 1;
-        copied_chars_count++;
-      }
-      memcpy(ret + copied_chars_position, fill_text, fill_index);
-      copied_chars_position += fill_index;
-    }
-    // after fulfilling the text, copy the main string
-    memcpy(ret + copied_chars_position, text, text_len);
-    *out_len = copied_chars_position + text_len;
+    memset(ret, fill_text[0], chars_to_pad);
+    memcpy(ret + chars_to_pad, text, text_len);
+    *out_len = out_len_bytes;
     return ret;
   }
+
+  // GENERAL PATH: Multi-byte fill - use evaluate_return_char_length for 
buffer size
+  gdv_int32 return_char_length = evaluate_return_char_length(
+      text_len, actual_text_len, return_length, fill_text, fill_text_len);
+  char* ret = reinterpret_cast<gdv_binary>(

Review Comment:
   Should we use `gdv_binary` not `char*` here?
   (I know that `gdv_binary = char*`.)
   
   ```suggestion
     gdv_binary ret = reinterpret_cast<gdv_binary>(
   ```



##########
cpp/src/gandiva/precompiled/string_ops_test.cc:
##########
@@ -1396,6 +1491,101 @@ TEST(TestStringOps, TestRpadString) {
 
   out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, -1, &out_len);
   EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "x", 1, 65536, "😀", 4, &out_len);
+  EXPECT_EQ(out_len, 1 + 65535 * 4);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_str[0], 'x');
+  EXPECT_EQ(static_cast<unsigned char>(out_str[out_len - 4]), 0xF0);
+  EXPECT_EQ(static_cast<unsigned char>(out_str[out_len - 3]), 0x9F);
+  EXPECT_EQ(static_cast<unsigned char>(out_str[out_len - 2]), 0x98);
+  EXPECT_EQ(static_cast<unsigned char>(out_str[out_len - 1]), 0x80);

Review Comment:
   Can we use one `EXPECT_EQ()` instead of 4 `EXPECT_EQ()`?
   
   
   ```suggestion
     EXPECT_EQ(std::string_view(out_str + out_len - 4, 4), "😀");
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to