projjal commented on a change in pull request #9844:
URL: https://github.com/apache/arrow/pull/9844#discussion_r605128545
##########
File path: cpp/src/gandiva/precompiled/string_ops_test.cc
##########
@@ -115,6 +115,55 @@ TEST(TestStringOps, TestCharLength) {
ctx.Reset();
}
+TEST(TestStringOps, TestConvertReplaceInvalidUtf8Char) {
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+
+ // invalid utf8 (xf8 is invalid but x28 is not - x28 = '(')
+ std::string a(
+ "ok-\xf8\x28"
+ "-a");
+ auto a_in_out_len = static_cast<int>(a.length());
+ const char* a_str = convert_replace_invalid_fromUTF8_binary(
+ ctx_ptr, a.data(), a_in_out_len, "a", 1, &a_in_out_len);
+ EXPECT_EQ(std::string(a_str, a_in_out_len), "ok-a(-a");
+ EXPECT_FALSE(ctx.has_error());
+
+ // invalid utf8 (xa0 and xa1 are invalid)
+ std::string b("ok-\xa0\xa1-valid");
+ auto b_in_out_len = static_cast<int>(b.length());
+ const char* b_str = convert_replace_invalid_fromUTF8_binary(
+ ctx_ptr, b.data(), b_in_out_len, "b", 1, &b_in_out_len);
+ EXPECT_EQ(std::string(b_str, b_in_out_len), "ok-bb-valid");
+ EXPECT_FALSE(ctx.has_error());
+
+ // full valid utf8
+ std::string c("all-valid");
+ auto c_in_out_len = static_cast<int>(c.length());
+ const char* c_str = convert_replace_invalid_fromUTF8_binary(
+ ctx_ptr, c.data(), c_in_out_len, "c", 1, &c_in_out_len);
+ EXPECT_EQ(std::string(c_str, c_in_out_len), "all-valid");
+ EXPECT_FALSE(ctx.has_error());
+
+ // valid utf8 (महसुस is a 4-byte utf-8 char)
Review comment:
महसुस is a four character string, each character of which is likely a
multibyte char
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]