kou commented on code in PR #50356:
URL: https://github.com/apache/arrow/pull/50356#discussion_r3521932346
##########
cpp/src/gandiva/precompiled/string_ops_test.cc:
##########
@@ -1608,6 +1609,29 @@ TEST(TestStringOps, TestRpadString) {
EXPECT_EQ(std::string(out_str + 5000, 2), "α");
}
+TEST(TestStringOps, TestPadMalformedUtf8NoOverread) {
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+
+ // A 4-byte utf8 lead byte followed by non-continuation bytes and no trailing
+ // space. utf8_length_ignore_invalid() used to extend the glyph length past
+ // the end of the buffer while scanning the continuation bytes. The input is
+ // held in an exactly-sized heap buffer so any over-read trips
AddressSanitizer.
+ std::vector<char> text = {'\xF0', 'a', 'a', 'a'};
+ const auto text_len = static_cast<gdv_int32>(text.size());
+
+ const char* out_str =
+ lpad_utf8_int32_utf8(ctx_ptr, text.data(), text_len, 6, " ", 1,
&out_len);
+ EXPECT_EQ(out_len, 9);
+ EXPECT_EQ(std::string(out_str + out_len - text_len, text_len),
+ std::string(text.begin(), text.end()));
+
+ out_str = rpad_utf8_int32_utf8(ctx_ptr, text.data(), text_len, 6, " ", 1,
&out_len);
+ EXPECT_EQ(out_len, 9);
+ EXPECT_EQ(std::string(out_str, text_len), std::string(text.begin(),
text.end()));
Review Comment:
ditto.
##########
cpp/src/gandiva/precompiled/string_ops.cc:
##########
@@ -206,7 +206,7 @@ gdv_int32 utf8_length_ignore_invalid(const char* data,
gdv_int32 data_len) {
// if invalid byte or incomplete glyph, ignore it
char_len = 1;
}
- for (int j = 1; j < char_len; ++j) {
+ for (int j = 1; j < char_len && i + j < data_len; ++j) {
if ((data[i + j] & 0xC0) != 0x80) { // bytes following head-byte of
glyph
char_len += 1;
Review Comment:
Hmm, should we `break` here instead?
```suggestion
break;
```
##########
cpp/src/gandiva/precompiled/string_ops_test.cc:
##########
@@ -1608,6 +1609,29 @@ TEST(TestStringOps, TestRpadString) {
EXPECT_EQ(std::string(out_str + 5000, 2), "α");
}
+TEST(TestStringOps, TestPadMalformedUtf8NoOverread) {
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+
+ // A 4-byte utf8 lead byte followed by non-continuation bytes and no trailing
+ // space. utf8_length_ignore_invalid() used to extend the glyph length past
+ // the end of the buffer while scanning the continuation bytes. The input is
+ // held in an exactly-sized heap buffer so any over-read trips
AddressSanitizer.
+ std::vector<char> text = {'\xF0', 'a', 'a', 'a'};
+ const auto text_len = static_cast<gdv_int32>(text.size());
+
+ const char* out_str =
+ lpad_utf8_int32_utf8(ctx_ptr, text.data(), text_len, 6, " ", 1,
&out_len);
+ EXPECT_EQ(out_len, 9);
+ EXPECT_EQ(std::string(out_str + out_len - text_len, text_len),
+ std::string(text.begin(), text.end()));
Review Comment:
Could you check `out_str` data entirely instead of checking only part of
`out_str`?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]