[arrow] branch master updated: ARROW-9163: [C++] Validate UTF8 contents of a StringArray

wesm Mon, 06 Jul 2020 17:41:20 -0700

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/master by this push:
     new 231924c  ARROW-9163: [C++] Validate UTF8 contents of a StringArray
231924c is described below

commit 231924c1e17115283cf5d5dcfb913a78dedd2036
Author: Antoine Pitrou <[email protected]>
AuthorDate: Mon Jul 6 19:39:40 2020 -0500

    ARROW-9163: [C++] Validate UTF8 contents of a StringArray
    
    * Add a ValidateUTF8() method to StringArray and LargeStringArray
    * Automatically call ValidateUTF8() when ValidateFull() is called on
      one of those types
    
    Closes #7596 from pitrou/ARROW-9163-validate-utf8
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Wes McKinney <[email protected]>
---
 cpp/src/arrow/array/array_binary.cc               |  20 ++++
 cpp/src/arrow/array/array_binary.h                |  10 ++
 cpp/src/arrow/array/array_binary_test.cc          | 106 ++++++++++++++++++----
 cpp/src/arrow/array/validate.cc                   |  12 ++-
 cpp/src/arrow/compute/kernels/scalar_cast_test.cc |  22 +++--
 cpp/src/arrow/compute/kernels/scalar_string.cc    |  14 +--
 cpp/src/arrow/csv/column_builder_test.cc          |  34 +++++--
 cpp/src/arrow/csv/converter_test.cc               |  24 +++--
 cpp/src/arrow/util/utf8.h                         |  10 +-
 python/pyarrow/tests/test_csv.py                  |  11 ++-
 10 files changed, 201 insertions(+), 62 deletions(-)

diff --git a/cpp/src/arrow/array/array_binary.cc b/cpp/src/arrow/array/array_binary.cc
index b54e796..53cc709 100644
--- a/cpp/src/arrow/array/array_binary.cc
+++ b/cpp/src/arrow/array/array_binary.cc
@@ -24,11 +24,27 @@
 #include "arrow/type.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/logging.h"
+#include "arrow/util/utf8.h"
 
 namespace arrow {
 
 using internal::checked_cast;
 
+namespace {
+
+template <typename StringArrayType>
+Status ValidateStringData(const StringArrayType& array) {
+  util::InitializeUTF8();
+  for (int64_t i = 0; i < array.length(); ++i) {
+    if (!array.IsNull(i) && !util::ValidateUTF8(array.GetView(i))) {
+      return Status::Invalid("Invalid UTF8 sequence at string index ", i);
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
 BinaryArray::BinaryArray(const std::shared_ptr<ArrayData>& data) {
   ARROW_CHECK_EQ(data->type->id(), Type::BINARY);
   SetData(data);
@@ -69,6 +85,8 @@ StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_of
                           offset));
 }
 
+Status StringArray::ValidateUTF8() const { return ValidateStringData(*this); }
+
 LargeStringArray::LargeStringArray(const std::shared_ptr<ArrayData>& data) {
   ARROW_CHECK_EQ(data->type->id(), Type::LARGE_STRING);
   SetData(data);
@@ -83,6 +101,8 @@ LargeStringArray::LargeStringArray(int64_t length,
                           null_count, offset));
 }
 
+Status LargeStringArray::ValidateUTF8() const { return ValidateStringData(*this); }
+
 FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data) {
   SetData(data);
 }
diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h
index c54e504..7b3e75e 100644
--- a/cpp/src/arrow/array/array_binary.h
+++ b/cpp/src/arrow/array/array_binary.h
@@ -161,6 +161,11 @@ class ARROW_EXPORT StringArray : public BinaryArray {
               const std::shared_ptr<Buffer>& data,
               const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
               int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  /// \brief Validate that this array contains only valid UTF8 entries
+  ///
+  /// This check is also implied by ValidateFull()
+  Status ValidateUTF8() const;
 };
 
 /// Concrete Array class for large variable-size binary data
@@ -189,6 +194,11 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
                    const std::shared_ptr<Buffer>& data,
                    const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                    int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  /// \brief Validate that this array contains only valid UTF8 entries
+  ///
+  /// This check is also implied by ValidateFull()
+  Status ValidateUTF8() const;
 };
 
 // ----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc
index 42ddb33..9c2cd88 100644
--- a/cpp/src/arrow/array/array_binary_test.cc
+++ b/cpp/src/arrow/array/array_binary_test.cc
@@ -46,6 +46,8 @@ using internal::checked_cast;
 using StringTypes =
     ::testing::Types<StringType, LargeStringType, BinaryType, LargeBinaryType>;
 
+using UTF8Types = ::testing::Types<StringType, LargeStringType>;
+
 // ----------------------------------------------------------------------
 // String / Binary tests
 
@@ -245,38 +247,69 @@ class TestStringArray : public ::testing::Test {
     ASSERT_EQ(arr->GetString(0), "b");
   }
 
-  Status ValidateOffsets(int64_t length, std::vector<offset_type> offsets,
-                         util::string_view data, int64_t offset = 0) {
+  Status ValidateFull(int64_t length, std::vector<offset_type> offsets,
+                      util::string_view data, int64_t offset = 0) {
     ArrayType arr(length, Buffer::Wrap(offsets), std::make_shared<Buffer>(data),
                   /*null_bitmap=*/nullptr, /*null_count=*/0, offset);
     return arr.ValidateFull();
   }
 
+  Status ValidateFull(const std::string& json) {
+    auto ty = TypeTraits<T>::type_singleton();
+    auto arr = ArrayFromJSON(ty, json);
+    return arr->ValidateFull();
+  }
+
   void TestValidateOffsets() {
-    ASSERT_OK(ValidateOffsets(0, {0}, ""));
-    ASSERT_OK(ValidateOffsets(1, {0, 4}, "data"));
-    ASSERT_OK(ValidateOffsets(2, {0, 4, 4}, "data"));
-    ASSERT_OK(ValidateOffsets(2, {0, 5, 9}, "some data"));
+    ASSERT_OK(ValidateFull(0, {0}, ""));
+    ASSERT_OK(ValidateFull(1, {0, 4}, "data"));
+    ASSERT_OK(ValidateFull(2, {0, 4, 4}, "data"));
+    ASSERT_OK(ValidateFull(2, {0, 5, 9}, "some data"));
 
     // Non-zero array offset
-    ASSERT_OK(ValidateOffsets(0, {0, 4}, "data", 1));
-    ASSERT_OK(ValidateOffsets(1, {0, 5, 9}, "some data", 1));
-    ASSERT_OK(ValidateOffsets(0, {0, 5, 9}, "some data", 2));
+    ASSERT_OK(ValidateFull(0, {0, 4}, "data", 1));
+    ASSERT_OK(ValidateFull(1, {0, 5, 9}, "some data", 1));
+    ASSERT_OK(ValidateFull(0, {0, 5, 9}, "some data", 2));
 
     // Not enough offsets
-    ASSERT_RAISES(Invalid, ValidateOffsets(1, {}, ""));
-    ASSERT_RAISES(Invalid, ValidateOffsets(1, {0}, ""));
-    ASSERT_RAISES(Invalid, ValidateOffsets(2, {0, 4}, "data"));
-    ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 4}, "data", 1));
+    ASSERT_RAISES(Invalid, ValidateFull(1, {}, ""));
+    ASSERT_RAISES(Invalid, ValidateFull(1, {0}, ""));
+    ASSERT_RAISES(Invalid, ValidateFull(2, {0, 4}, "data"));
+    ASSERT_RAISES(Invalid, ValidateFull(1, {0, 4}, "data", 1));
 
     // Offset out of bounds
-    ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 5}, "data"));
+    ASSERT_RAISES(Invalid, ValidateFull(1, {0, 5}, "data"));
     // Negative offset
-    ASSERT_RAISES(Invalid, ValidateOffsets(1, {-1, 0}, "data"));
-    ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, -1}, "data"));
-    ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, -1, -1}, "data", 1));
+    ASSERT_RAISES(Invalid, ValidateFull(1, {-1, 0}, "data"));
+    ASSERT_RAISES(Invalid, ValidateFull(1, {0, -1}, "data"));
+    ASSERT_RAISES(Invalid, ValidateFull(1, {0, -1, -1}, "data", 1));
     // Offsets non-monotonic
-    ASSERT_RAISES(Invalid, ValidateOffsets(2, {0, 5, 4}, "some data"));
+    ASSERT_RAISES(Invalid, ValidateFull(2, {0, 5, 4}, "some data"));
+  }
+
+  void TestValidateData() {
+    // Valid UTF8
+    ASSERT_OK(ValidateFull(R"(["Voix", "ambiguë", "d’un", "cœur"])"));
+    ASSERT_OK(ValidateFull(R"(["いろはにほへと", "ちりぬるを", "わかよたれそ"])"));
+    ASSERT_OK(ValidateFull(R"(["😀", "😄"])"));
+    ASSERT_OK(ValidateFull(1, {0, 4}, "\xf4\x8f\xbf\xbf"));  // \U0010ffff
+
+    // Invalid UTF8
+    auto ty = TypeTraits<T>::type_singleton();
+    auto st1 = ValidateFull(3, {0, 4, 6, 9}, "abc \xff def");
+    // Hypothetical \U00110000
+    auto st2 = ValidateFull(1, {0, 4}, "\xf4\x90\x80\x80");
+    // Single UTF8 character straddles two entries
+    auto st3 = ValidateFull(2, {0, 1, 2}, "\xc3\xa9");
+    if (T::is_utf8) {
+      ASSERT_RAISES(Invalid, st1);
+      ASSERT_RAISES(Invalid, st2);
+      ASSERT_RAISES(Invalid, st3);
+    } else {
+      ASSERT_OK(st1);
+      ASSERT_OK(st2);
+      ASSERT_OK(st3);
+    }
   }
 
  protected:
@@ -320,6 +353,43 @@ TYPED_TEST(TestStringArray, TestSliceGetString) { this->TestSliceGetString(); }
 
 TYPED_TEST(TestStringArray, TestValidateOffsets) { this->TestValidateOffsets(); }
 
+TYPED_TEST(TestStringArray, TestValidateData) { this->TestValidateData(); }
+
+template <typename T>
+class TestUTF8Array : public ::testing::Test {
+ public:
+  using TypeClass = T;
+  using offset_type = typename TypeClass::offset_type;
+  using ArrayType = typename TypeTraits<TypeClass>::ArrayType;
+
+  Status ValidateUTF8(int64_t length, std::vector<offset_type> offsets,
+                      util::string_view data, int64_t offset = 0) {
+    ArrayType arr(length, Buffer::Wrap(offsets), std::make_shared<Buffer>(data),
+                  /*null_bitmap=*/nullptr, /*null_count=*/0, offset);
+    return arr.ValidateUTF8();
+  }
+
+  Status ValidateUTF8(const std::string& json) {
+    auto ty = TypeTraits<T>::type_singleton();
+    auto arr = ArrayFromJSON(ty, json);
+    return checked_cast<const ArrayType&>(*arr).ValidateUTF8();
+  }
+
+  void TestValidateUTF8() {
+    ASSERT_OK(ValidateUTF8(R"(["Voix", "ambiguë", "d’un", "cœur"])"));
+    ASSERT_OK(ValidateUTF8(1, {0, 4}, "\xf4\x8f\xbf\xbf"));  // \U0010ffff
+
+    ASSERT_RAISES(Invalid, ValidateUTF8(1, {0, 1}, "\xf4"));
+
+    // More tests in TestValidateData() above
+    // (ValidateFull() calls ValidateUTF8() internally)
+  }
+};
+
+TYPED_TEST_SUITE(TestUTF8Array, UTF8Types);
+
+TYPED_TEST(TestUTF8Array, TestValidateUTF8) { this->TestValidateUTF8(); }
+
 // ----------------------------------------------------------------------
 // String builder tests
 
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index 0562e8e..3dd0ffd 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -430,11 +430,17 @@ struct ValidateArrayDataVisitor {
   // Fallback
   Status Visit(const Array& array) { return Status::OK(); }
 
-  Status Visit(const StringArray& array) { return ValidateBinaryArray(array); }
+  Status Visit(const StringArray& array) {
+    RETURN_NOT_OK(ValidateBinaryArray(array));
+    return array.ValidateUTF8();
+  }
 
-  Status Visit(const BinaryArray& array) { return ValidateBinaryArray(array); }
+  Status Visit(const LargeStringArray& array) {
+    RETURN_NOT_OK(ValidateBinaryArray(array));
+    return array.ValidateUTF8();
+  }
 
-  Status Visit(const LargeStringArray& array) { return ValidateBinaryArray(array); }
+  Status Visit(const BinaryArray& array) { return ValidateBinaryArray(array); }
 
   Status Visit(const LargeBinaryArray& array) { return ValidateBinaryArray(array); }
 
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
index 7384a67..2b53dd6 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
@@ -83,9 +83,13 @@ class TestCast : public TestBase {
  public:
   void CheckPass(const Array& input, const Array& expected,
                  const std::shared_ptr<DataType>& out_type, const CastOptions& options,
-                 bool check_scalar = true) {
+                 bool check_scalar = true, bool validate_full = true) {
     ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> result, Cast(input, out_type, options));
-    ASSERT_OK(result->ValidateFull());
+    if (validate_full) {
+      ASSERT_OK(result->ValidateFull());
+    } else {
+      ASSERT_OK(result->Validate());
+    }
     AssertArraysEqual(expected, *result, /*verbose=*/true);
 
     if (input.type_id() == Type::DECIMAL || out_type->id() == Type::DECIMAL) {
@@ -161,7 +165,7 @@ class TestCast : public TestBase {
                  const std::vector<I_TYPE>& in_values, const std::vector<bool>& is_valid,
                  const std::shared_ptr<DataType>& out_type,
                  const std::vector<O_TYPE>& out_values, const CastOptions& options,
-                 bool check_scalar = true) {
+                 bool check_scalar = true, bool validate_full = true) {
     ASSERT_EQ(in_values.size(), out_values.size());
     std::shared_ptr<Array> input, expected;
     if (is_valid.size() > 0) {
@@ -172,11 +176,12 @@ class TestCast : public TestBase {
       ArrayFromVector<InType, I_TYPE>(in_type, in_values, &input);
       ArrayFromVector<OutType, O_TYPE>(out_type, out_values, &expected);
     }
-    CheckPass(*input, *expected, out_type, options, check_scalar);
+    CheckPass(*input, *expected, out_type, options, check_scalar, validate_full);
 
     // Check a sliced variant
     if (input->length() > 1) {
-      CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options, check_scalar);
+      CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options, check_scalar,
+                validate_full);
     }
   }
 
@@ -184,10 +189,11 @@ class TestCast : public TestBase {
             typename O_TYPE = typename OutType::c_type>
   void CheckCase(const std::vector<I_TYPE>& in_values, const std::vector<bool>& is_valid,
                  const std::vector<O_TYPE>& out_values, const CastOptions& options,
-                 bool check_scalar = true) {
+                 bool check_scalar = true, bool validate_full = true) {
     CheckCase<InType, OutType, I_TYPE, O_TYPE>(
         TypeTraits<InType>::type_singleton(), in_values, is_valid,
-        TypeTraits<OutType>::type_singleton(), out_values, options, check_scalar);
+        TypeTraits<OutType>::type_singleton(), out_values, options, check_scalar,
+        validate_full);
   }
 
   void CheckCaseJSON(const std::shared_ptr<DataType>& in_type,
@@ -239,7 +245,7 @@ class TestCast : public TestBase {
     // Should accept due to option override
     options.allow_invalid_utf8 = true;
     CheckCase<SourceType, DestType>(strings, all, strings, options,
-                                    /*check_scalar=*/false);
+                                    /*check_scalar=*/false, /*validate_full=*/false);
   }
 
   template <typename SourceType, typename DestType>
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index b7d2fee..d496ab6 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -80,7 +80,7 @@ void EnsureLookupTablesFilled() {
 }
 
 template <typename Type, typename Derived>
-struct Utf8Transform {
+struct UTF8Transform {
   using offset_type = typename Type::offset_type;
   using ArrayType = typename TypeTraits<Type>::ArrayType;
 
@@ -88,7 +88,7 @@ struct Utf8Transform {
                         uint8_t* output, offset_type* output_written) {
     uint8_t* output_start = output;
     if (ARROW_PREDICT_FALSE(
-            !arrow::util::Utf8Transform(input, input + input_string_ncodeunits, &output,
+            !arrow::util::UTF8Transform(input, input + input_string_ncodeunits, &output,
                                         Derived::TransformCodepoint))) {
       return false;
     }
@@ -184,7 +184,7 @@ struct Utf8Transform {
 };
 
 template <typename Type>
-struct Utf8Upper : Utf8Transform<Type, Utf8Upper<Type>> {
+struct UTF8Upper : UTF8Transform<Type, UTF8Upper<Type>> {
   inline static uint32_t TransformCodepoint(uint32_t codepoint) {
     return codepoint <= kMaxCodepointLookup ? lut_upper_codepoint[codepoint]
                                             : utf8proc_toupper(codepoint);
@@ -192,7 +192,7 @@ struct Utf8Upper : Utf8Transform<Type, Utf8Upper<Type>> {
 };
 
 template <typename Type>
-struct Utf8Lower : Utf8Transform<Type, Utf8Lower<Type>> {
+struct UTF8Lower : UTF8Transform<Type, UTF8Lower<Type>> {
   static uint32_t TransformCodepoint(uint32_t codepoint) {
     return codepoint <= kMaxCodepointLookup ? lut_lower_codepoint[codepoint]
                                             : utf8proc_tolower(codepoint);
@@ -468,7 +468,7 @@ void MakeUnaryStringBatchKernel(std::string name, FunctionRegistry* registry) {
 #ifdef ARROW_WITH_UTF8PROC
 
 template <template <typename> class Transformer>
-void MakeUnaryStringUtf8TransformKernel(std::string name, FunctionRegistry* registry) {
+void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* registry) {
   auto func = std::make_shared<ScalarFunction>(name, Arity::Unary());
   ArrayKernelExec exec_32 = Transformer<StringType>::Exec;
   ArrayKernelExec exec_64 = Transformer<LargeStringType>::Exec;
@@ -485,8 +485,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
   MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry);
   MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry);
 #ifdef ARROW_WITH_UTF8PROC
-  MakeUnaryStringUtf8TransformKernel<Utf8Upper>("utf8_upper", registry);
-  MakeUnaryStringUtf8TransformKernel<Utf8Lower>("utf8_lower", registry);
+  MakeUnaryStringUTF8TransformKernel<UTF8Upper>("utf8_upper", registry);
+  MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry);
 #endif
   AddAsciiLength(registry);
   AddBinaryContainsExact(registry);
diff --git a/cpp/src/arrow/csv/column_builder_test.cc b/cpp/src/arrow/csv/column_builder_test.cc
index c0867ba..0ec3ed0 100644
--- a/cpp/src/arrow/csv/column_builder_test.cc
+++ b/cpp/src/arrow/csv/column_builder_test.cc
@@ -47,7 +47,8 @@ using ChunkData = std::vector<std::vector<std::string>>;
 class ColumnBuilderTest : public ::testing::Test {
  public:
   void AssertBuilding(const std::shared_ptr<ColumnBuilder>& builder,
-                      const ChunkData& chunks, std::shared_ptr<ChunkedArray>* out) {
+                      const ChunkData& chunks, bool validate_full,
+                      std::shared_ptr<ChunkedArray>* out) {
     for (const auto& chunk : chunks) {
       std::shared_ptr<BlockParser> parser;
       MakeColumnParser(chunk, &parser);
@@ -55,24 +56,35 @@ class ColumnBuilderTest : public ::testing::Test {
     }
     ASSERT_OK(builder->task_group()->Finish());
     ASSERT_OK_AND_ASSIGN(*out, builder->Finish());
-    ASSERT_OK((*out)->ValidateFull());
+    if (validate_full) {
+      ASSERT_OK((*out)->ValidateFull());
+    } else {
+      ASSERT_OK((*out)->Validate());
+    }
+  }
+
+  void AssertBuilding(const std::shared_ptr<ColumnBuilder>& builder,
+                      const ChunkData& chunks, std::shared_ptr<ChunkedArray>* out) {
+    AssertBuilding(builder, chunks, /*validate_full=*/true, out);
   }
 
   void CheckInferred(const std::shared_ptr<TaskGroup>& tg, const ChunkData& csv_data,
                      const ConvertOptions& options,
-                     std::shared_ptr<ChunkedArray> expected) {
+                     std::shared_ptr<ChunkedArray> expected, bool validate_full = true) {
     std::shared_ptr<ColumnBuilder> builder;
     std::shared_ptr<ChunkedArray> actual;
     ASSERT_OK_AND_ASSIGN(builder,
                          ColumnBuilder::Make(default_memory_pool(), 0, options, tg));
-    AssertBuilding(builder, csv_data, &actual);
+    AssertBuilding(builder, csv_data, validate_full, &actual);
     AssertChunkedEqual(*actual, *expected);
   }
 
   void CheckInferred(const std::shared_ptr<TaskGroup>& tg, const ChunkData& csv_data,
                      const ConvertOptions& options,
-                     std::vector<std::shared_ptr<Array>> expected_chunks) {
-    CheckInferred(tg, csv_data, options, std::make_shared<ChunkedArray>(expected_chunks));
+                     std::vector<std::shared_ptr<Array>> expected_chunks,
+                     bool validate_full = true) {
+    CheckInferred(tg, csv_data, options, std::make_shared<ChunkedArray>(expected_chunks),
+                  validate_full);
   }
 
   void CheckFixedType(const std::shared_ptr<TaskGroup>& tg,
@@ -279,12 +291,13 @@ class InferringColumnBuilderTest : public ColumnBuilderTest {
   void CheckAutoDictEncoded(const std::shared_ptr<TaskGroup>& tg,
                             const ChunkData& csv_data, const ConvertOptions& options,
                             std::vector<std::shared_ptr<Array>> expected_indices,
-                            std::vector<std::shared_ptr<Array>> expected_dictionaries) {
+                            std::vector<std::shared_ptr<Array>> expected_dictionaries,
+                            bool validate_full = true) {
     std::shared_ptr<ColumnBuilder> builder;
     std::shared_ptr<ChunkedArray> actual;
     ASSERT_OK_AND_ASSIGN(builder,
                          ColumnBuilder::Make(default_memory_pool(), 0, options, tg));
-    AssertBuilding(builder, csv_data, &actual);
+    AssertBuilding(builder, csv_data, validate_full, &actual);
     ASSERT_EQ(actual->num_chunks(), static_cast<int>(csv_data.size()));
     for (int i = 0; i < actual->num_chunks(); ++i) {
       ASSERT_EQ(actual->chunk(i)->type_id(), Type::DICTIONARY);
@@ -405,7 +418,8 @@ TEST_F(InferringColumnBuilderTest, SingleChunkString) {
   tg = TaskGroup::MakeSerial();
   ChunkedArrayFromVector<StringType, std::string>({{true, true, true}},
                                                   {{"", "foo\xff", "baré"}}, &expected);
-  CheckInferred(tg, {{"", "foo\xff", "baré"}}, options, expected);
+  CheckInferred(tg, {{"", "foo\xff", "baré"}}, options, expected,
+                /*validate_full=*/false);
 }
 
 TEST_F(InferringColumnBuilderTest, SingleChunkBinary) {
@@ -473,7 +487,7 @@ TEST_F(InferringColumnBuilderTest, SingleChunkBinaryAutoDict) {
   ArrayFromVector<StringType, std::string>({"ab", "cd\xff"}, &expected_dictionary);
 
   CheckAutoDictEncoded(TaskGroup::MakeSerial(), csv_data, options, {expected_indices},
-                       {expected_dictionary});
+                       {expected_dictionary}, /*validate_full=*/false);
 
   // With invalid UTF8, checking
   options.check_utf8 = true;
diff --git a/cpp/src/arrow/csv/converter_test.cc b/cpp/src/arrow/csv/converter_test.cc
index f8f9baf..2e5b773 100644
--- a/cpp/src/arrow/csv/converter_test.cc
+++ b/cpp/src/arrow/csv/converter_test.cc
@@ -52,7 +52,8 @@ template <typename DATA_TYPE, typename C_TYPE>
 void AssertConversion(const std::shared_ptr<DataType>& type,
                       const std::vector<std::string>& csv_string,
                       const std::vector<std::vector<C_TYPE>>& expected,
-                      ConvertOptions options = ConvertOptions::Defaults()) {
+                      ConvertOptions options = ConvertOptions::Defaults(),
+                      bool validate_full = true) {
   std::shared_ptr<BlockParser> parser;
   std::shared_ptr<Converter> converter;
   std::shared_ptr<Array> array, expected_array;
@@ -63,7 +64,11 @@ void AssertConversion(const std::shared_ptr<DataType>& type,
   for (int32_t col_index = 0; col_index < static_cast<int32_t>(expected.size());
        ++col_index) {
     ASSERT_OK_AND_ASSIGN(array, converter->Convert(*parser, col_index));
-    ASSERT_OK(array->ValidateFull());
+    if (validate_full) {
+      ASSERT_OK(array->ValidateFull());
+    } else {
+      ASSERT_OK(array->Validate());
+    }
     ArrayFromVector<DATA_TYPE, C_TYPE>(type, expected[col_index], &expected_array);
     AssertArraysEqual(*expected_array, *array);
   }
@@ -115,7 +120,8 @@ void AssertDictConversion(const std::string& csv_string,
                           const std::shared_ptr<Array>& expected_indices,
                           const std::shared_ptr<Array>& expected_dict,
                           int32_t max_cardinality = -1,
-                          ConvertOptions options = ConvertOptions::Defaults()) {
+                          ConvertOptions options = ConvertOptions::Defaults(),
+                          bool validate_full = true) {
   std::shared_ptr<BlockParser> parser;
   std::shared_ptr<DictionaryConverter> converter;
   std::shared_ptr<Array> array, expected_array;
@@ -123,7 +129,11 @@ void AssertDictConversion(const std::string& csv_string,
 
   ASSERT_OK_AND_ASSIGN(
       array, DictConversion(expected_dict->type(), csv_string, max_cardinality, options));
-  ASSERT_OK(array->ValidateFull());
+  if (validate_full) {
+    ASSERT_OK(array->ValidateFull());
+  } else {
+    ASSERT_OK(array->Validate());
+  }
   expected_type = dictionary(expected_indices->type(), expected_dict->type());
   ASSERT_TRUE(array->type()->Equals(*expected_type));
   const auto& dict_array = internal::checked_cast<const DictionaryArray&>(*array);
@@ -193,7 +203,8 @@ static void TestStringConversionBasics() {
   auto options = ConvertOptions::Defaults();
   options.check_utf8 = false;
   AssertConversion<T, std::string>(type, {"ab,cdé\n", ",\xffgh\n"},
-                                   {{"ab", ""}, {"cdé", "\xffgh"}}, options);
+                                   {{"ab", ""}, {"cdé", "\xffgh"}}, options,
+                                   /*validate_full=*/false);
 }
 
 TEST(StringConversion, Basics) { TestStringConversionBasics<StringType>(); }
@@ -485,7 +496,8 @@ TYPED_TEST(TestDictConverter, NonUTF8) {
 
     auto options = ConvertOptions::Defaults();
     options.check_utf8 = false;
-    AssertDictConversion(csv_string, expected_indices, expected_dict, -1, options);
+    AssertDictConversion(csv_string, expected_indices, expected_dict, -1, options,
+                         /*validate_full=*/false);
   } else {
     AssertDictConversion(csv_string, expected_indices, expected_dict);
   }
diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h
index 50b6cca..1775b19 100644
--- a/cpp/src/arrow/util/utf8.h
+++ b/cpp/src/arrow/util/utf8.h
@@ -282,7 +282,7 @@ static inline bool Utf8IsContinuation(const uint8_t codeunit) {
   return (codeunit & 0xC0) == 0x80;  // upper two bits should be 10
 }
 
-static inline uint8_t* Utf8Encode(uint8_t* str, uint32_t codepoint) {
+static inline uint8_t* UTF8Encode(uint8_t* str, uint32_t codepoint) {
   if (codepoint < 0x80) {
     *str++ = codepoint;
   } else if (codepoint < 0x800) {
@@ -303,7 +303,7 @@ static inline uint8_t* Utf8Encode(uint8_t* str, uint32_t codepoint) {
   return str;
 }
 
-static inline bool Utf8Decode(const uint8_t** data, uint32_t* codepoint) {
+static inline bool UTF8Decode(const uint8_t** data, uint32_t* codepoint) {
   const uint8_t* str = *data;
   if (*str < 0x80) {  // ascci
     *codepoint = *str++;
@@ -351,16 +351,16 @@ static inline bool Utf8Decode(const uint8_t** data, uint32_t* codepoint) {
 }
 
 template <class UnaryOperation>
-static inline bool Utf8Transform(const uint8_t* first, const uint8_t* last,
+static inline bool UTF8Transform(const uint8_t* first, const uint8_t* last,
                                  uint8_t** destination, UnaryOperation&& unary_op) {
   const uint8_t* i = first;
   uint8_t* out = *destination;
   while (i < last) {
     uint32_t codepoint = 0;
-    if (ARROW_PREDICT_FALSE(!Utf8Decode(&i, &codepoint))) {
+    if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
       return false;
     }
-    out = Utf8Encode(out, unary_op(codepoint));
+    out = UTF8Encode(out, unary_op(codepoint));
   }
   *destination = out;
   return true;
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index 0fd10e3..582f04b 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -563,7 +563,8 @@ class BaseTestCSVRead:
         opts.auto_dict_max_cardinality = 50
         opts.check_utf8 = False
         rows = b"a,b\nab,1\ncd\xff,2\nab,3"
-        table = self.read_bytes(rows, convert_options=opts)
+        table = self.read_bytes(rows, convert_options=opts,
+                                validate_full=False)
         assert table.schema == schema
         dict_values = table['a'].chunk(0).dictionary
         assert len(dict_values) == 2
@@ -809,21 +810,21 @@ class BaseTestCSVRead:
 
 class TestSerialCSVRead(BaseTestCSVRead, unittest.TestCase):
 
-    def read_csv(self, *args, **kwargs):
+    def read_csv(self, *args, validate_full=True, **kwargs):
         read_options = kwargs.setdefault('read_options', ReadOptions())
         read_options.use_threads = False
         table = read_csv(*args, **kwargs)
-        table.validate(full=True)
+        table.validate(full=validate_full)
         return table
 
 
 class TestParallelCSVRead(BaseTestCSVRead, unittest.TestCase):
 
-    def read_csv(self, *args, **kwargs):
+    def read_csv(self, *args, validate_full=True, **kwargs):
         read_options = kwargs.setdefault('read_options', ReadOptions())
         read_options.use_threads = True
         table = read_csv(*args, **kwargs)
-        table.validate(full=True)
+        table.validate(full=validate_full)
         return table

[arrow] branch master updated: ARROW-9163: [C++] Validate UTF8 contents of a StringArray

Reply via email to