lidavidm commented on a change in pull request #11233:
URL: https://github.com/apache/arrow/pull/11233#discussion_r731077620



##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -88,19 +102,316 @@ class BaseTestStringKernels : public ::testing::Test {
   std::shared_ptr<DataType> offset_type() {
     return TypeTraits<OffsetType>::type_singleton();
   }
+
+  template <typename CType = const char*>
+  std::shared_ptr<Array> MakeArray(const std::vector<CType>& values,
+                                   const std::vector<bool>& is_valid = {}) {
+    return _MakeArray<TestType, CType>(type(), values, is_valid);
+  }
 };
 
+template <typename TestType>
+class TestBaseBinaryKernels : public BaseTestStringKernels<TestType> {};
+
+TYPED_TEST_SUITE(TestBaseBinaryKernels, BaseBinaryArrowTypes);
+
 template <typename TestType>
 class TestBinaryKernels : public BaseTestStringKernels<TestType> {};
 
 TYPED_TEST_SUITE(TestBinaryKernels, BinaryArrowTypes);
 
-TYPED_TEST(TestBinaryKernels, BinaryLength) {
+template <typename TestType>
+class TestStringKernels : public BaseTestStringKernels<TestType> {};
+
+TYPED_TEST_SUITE(TestStringKernels, StringArrowTypes);
+
+TYPED_TEST(TestBaseBinaryKernels, BinaryLength) {
   this->CheckUnary("binary_length", R"(["aaa", null, "áéíóú", "", "b"])",
                    this->offset_type(), "[3, null, 10, 0, 1]");
+
+  // Invalid UTF-8 inputs
+  this->CheckUnary("binary_length", this->MakeArray({"\xf7\x0f\xab", 
"\xff\x9b\xc3\xbb"}),
+                   this->offset_type(), "[3, 4]");
+
+  // Invalid UTF-8 inputs with null bytes
+  this->CheckUnary("binary_length",
+                   this->template MakeArray<std::string>(
+                       {{"\xf7\x00\xab", 3}, {"\x00\x9b\x00\xbb", 4}, 
{"\x00\x00", 2}}),
+                   this->offset_type(), "[3, 4, 2]");
+}
+
+// The NonUtf8XXX tests use kernels that do not accept invalid UTF-8 when
+// processing [Large]StringType data. These tests use invalid UTF-8 inputs.
+TYPED_TEST(TestBinaryKernels, NonUtf8) {
+#ifdef ARROW_WITH_RE2
+  for (auto ignore_case : {true, false}) {
+#else
+  for (auto ignore_case : {false}) {
+#endif
+    MatchSubstringOptions options("\xfc\x40", ignore_case);
+    this->CheckUnary(
+        "find_substring",
+        this->MakeArray({"\xfc\x40\xab", "\xff\x9b\xfc\x40\xab", 
"\x01\xfc\x41"}),
+        this->offset_type(), "[0, 2, -1]", &options);
+
+    // @ = \x40

Review comment:
       What I meant was that it doesn't make any difference after compilation, 
so we can just pick one way of writing it and stick with it




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to