This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 231924c ARROW-9163: [C++] Validate UTF8 contents of a StringArray
231924c is described below
commit 231924c1e17115283cf5d5dcfb913a78dedd2036
Author: Antoine Pitrou <[email protected]>
AuthorDate: Mon Jul 6 19:39:40 2020 -0500
ARROW-9163: [C++] Validate UTF8 contents of a StringArray
* Add a ValidateUTF8() method to StringArray and LargeStringArray
* Automatically call ValidateUTF8() when ValidateFull() is called on
one of those types
Closes #7596 from pitrou/ARROW-9163-validate-utf8
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Wes McKinney <[email protected]>
---
cpp/src/arrow/array/array_binary.cc | 20 ++++
cpp/src/arrow/array/array_binary.h | 10 ++
cpp/src/arrow/array/array_binary_test.cc | 106 ++++++++++++++++++----
cpp/src/arrow/array/validate.cc | 12 ++-
cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 22 +++--
cpp/src/arrow/compute/kernels/scalar_string.cc | 14 +--
cpp/src/arrow/csv/column_builder_test.cc | 34 +++++--
cpp/src/arrow/csv/converter_test.cc | 24 +++--
cpp/src/arrow/util/utf8.h | 10 +-
python/pyarrow/tests/test_csv.py | 11 ++-
10 files changed, 201 insertions(+), 62 deletions(-)
diff --git a/cpp/src/arrow/array/array_binary.cc b/cpp/src/arrow/array/array_binary.cc
index b54e796..53cc709 100644
--- a/cpp/src/arrow/array/array_binary.cc
+++ b/cpp/src/arrow/array/array_binary.cc
@@ -24,11 +24,27 @@
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
+#include "arrow/util/utf8.h"
namespace arrow {
using internal::checked_cast;
+namespace {
+
+template <typename StringArrayType>
+Status ValidateStringData(const StringArrayType& array) {
+ util::InitializeUTF8();
+ for (int64_t i = 0; i < array.length(); ++i) {
+ if (!array.IsNull(i) && !util::ValidateUTF8(array.GetView(i))) {
+ return Status::Invalid("Invalid UTF8 sequence at string index ", i);
+ }
+ }
+ return Status::OK();
+}
+
+} // namespace
+
BinaryArray::BinaryArray(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK_EQ(data->type->id(), Type::BINARY);
SetData(data);
@@ -69,6 +85,8 @@ StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_of
offset));
}
+Status StringArray::ValidateUTF8() const { return ValidateStringData(*this); }
+
LargeStringArray::LargeStringArray(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK_EQ(data->type->id(), Type::LARGE_STRING);
SetData(data);
@@ -83,6 +101,8 @@ LargeStringArray::LargeStringArray(int64_t length,
null_count, offset));
}
+Status LargeStringArray::ValidateUTF8() const { return ValidateStringData(*this); }
+
FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data) {
SetData(data);
}
diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h
index c54e504..7b3e75e 100644
--- a/cpp/src/arrow/array/array_binary.h
+++ b/cpp/src/arrow/array/array_binary.h
@@ -161,6 +161,11 @@ class ARROW_EXPORT StringArray : public BinaryArray {
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ /// \brief Validate that this array contains only valid UTF8 entries
+ ///
+ /// This check is also implied by ValidateFull()
+ Status ValidateUTF8() const;
};
/// Concrete Array class for large variable-size binary data
@@ -189,6 +194,11 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ /// \brief Validate that this array contains only valid UTF8 entries
+ ///
+ /// This check is also implied by ValidateFull()
+ Status ValidateUTF8() const;
};
// ----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc
index 42ddb33..9c2cd88 100644
--- a/cpp/src/arrow/array/array_binary_test.cc
+++ b/cpp/src/arrow/array/array_binary_test.cc
@@ -46,6 +46,8 @@ using internal::checked_cast;
using StringTypes =
::testing::Types<StringType, LargeStringType, BinaryType, LargeBinaryType>;
+using UTF8Types = ::testing::Types<StringType, LargeStringType>;
+
// ----------------------------------------------------------------------
// String / Binary tests
@@ -245,38 +247,69 @@ class TestStringArray : public ::testing::Test {
ASSERT_EQ(arr->GetString(0), "b");
}
- Status ValidateOffsets(int64_t length, std::vector<offset_type> offsets,
- util::string_view data, int64_t offset = 0) {
+ Status ValidateFull(int64_t length, std::vector<offset_type> offsets,
+ util::string_view data, int64_t offset = 0) {
ArrayType arr(length, Buffer::Wrap(offsets), std::make_shared<Buffer>(data),
/*null_bitmap=*/nullptr, /*null_count=*/0, offset);
return arr.ValidateFull();
}
+ Status ValidateFull(const std::string& json) {
+ auto ty = TypeTraits<T>::type_singleton();
+ auto arr = ArrayFromJSON(ty, json);
+ return arr->ValidateFull();
+ }
+
void TestValidateOffsets() {
- ASSERT_OK(ValidateOffsets(0, {0}, ""));
- ASSERT_OK(ValidateOffsets(1, {0, 4}, "data"));
- ASSERT_OK(ValidateOffsets(2, {0, 4, 4}, "data"));
- ASSERT_OK(ValidateOffsets(2, {0, 5, 9}, "some data"));
+ ASSERT_OK(ValidateFull(0, {0}, ""));
+ ASSERT_OK(ValidateFull(1, {0, 4}, "data"));
+ ASSERT_OK(ValidateFull(2, {0, 4, 4}, "data"));
+ ASSERT_OK(ValidateFull(2, {0, 5, 9}, "some data"));
// Non-zero array offset
- ASSERT_OK(ValidateOffsets(0, {0, 4}, "data", 1));
- ASSERT_OK(ValidateOffsets(1, {0, 5, 9}, "some data", 1));
- ASSERT_OK(ValidateOffsets(0, {0, 5, 9}, "some data", 2));
+ ASSERT_OK(ValidateFull(0, {0, 4}, "data", 1));
+ ASSERT_OK(ValidateFull(1, {0, 5, 9}, "some data", 1));
+ ASSERT_OK(ValidateFull(0, {0, 5, 9}, "some data", 2));
// Not enough offsets
- ASSERT_RAISES(Invalid, ValidateOffsets(1, {}, ""));
- ASSERT_RAISES(Invalid, ValidateOffsets(1, {0}, ""));
- ASSERT_RAISES(Invalid, ValidateOffsets(2, {0, 4}, "data"));
- ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 4}, "data", 1));
+ ASSERT_RAISES(Invalid, ValidateFull(1, {}, ""));
+ ASSERT_RAISES(Invalid, ValidateFull(1, {0}, ""));
+ ASSERT_RAISES(Invalid, ValidateFull(2, {0, 4}, "data"));
+ ASSERT_RAISES(Invalid, ValidateFull(1, {0, 4}, "data", 1));
// Offset out of bounds
- ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 5}, "data"));
+ ASSERT_RAISES(Invalid, ValidateFull(1, {0, 5}, "data"));
// Negative offset
- ASSERT_RAISES(Invalid, ValidateOffsets(1, {-1, 0}, "data"));
- ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, -1}, "data"));
- ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, -1, -1}, "data", 1));
+ ASSERT_RAISES(Invalid, ValidateFull(1, {-1, 0}, "data"));
+ ASSERT_RAISES(Invalid, ValidateFull(1, {0, -1}, "data"));
+ ASSERT_RAISES(Invalid, ValidateFull(1, {0, -1, -1}, "data", 1));
// Offsets non-monotonic
- ASSERT_RAISES(Invalid, ValidateOffsets(2, {0, 5, 4}, "some data"));
+ ASSERT_RAISES(Invalid, ValidateFull(2, {0, 5, 4}, "some data"));
+ }
+
+ void TestValidateData() {
+ // Valid UTF8
+ ASSERT_OK(ValidateFull(R"(["Voix", "ambiguë", "d’un", "cœur"])"));
+ ASSERT_OK(ValidateFull(R"(["いろはにほへと", "ちりぬるを", "わかよたれそ"])"));
+ ASSERT_OK(ValidateFull(R"(["😀", "😄"])"));
+ ASSERT_OK(ValidateFull(1, {0, 4}, "\xf4\x8f\xbf\xbf")); // \U0010ffff
+
+ // Invalid UTF8
+ auto ty = TypeTraits<T>::type_singleton();
+ auto st1 = ValidateFull(3, {0, 4, 6, 9}, "abc \xff def");
+ // Hypothetical \U00110000
+ auto st2 = ValidateFull(1, {0, 4}, "\xf4\x90\x80\x80");
+ // Single UTF8 character straddles two entries
+ auto st3 = ValidateFull(2, {0, 1, 2}, "\xc3\xa9");
+ if (T::is_utf8) {
+ ASSERT_RAISES(Invalid, st1);
+ ASSERT_RAISES(Invalid, st2);
+ ASSERT_RAISES(Invalid, st3);
+ } else {
+ ASSERT_OK(st1);
+ ASSERT_OK(st2);
+ ASSERT_OK(st3);
+ }
}
protected:
@@ -320,6 +353,43 @@ TYPED_TEST(TestStringArray, TestSliceGetString) { this->TestSliceGetString(); }
TYPED_TEST(TestStringArray, TestValidateOffsets) { this->TestValidateOffsets(); }
+TYPED_TEST(TestStringArray, TestValidateData) { this->TestValidateData(); }
+
+template <typename T>
+class TestUTF8Array : public ::testing::Test {
+ public:
+ using TypeClass = T;
+ using offset_type = typename TypeClass::offset_type;
+ using ArrayType = typename TypeTraits<TypeClass>::ArrayType;
+
+ Status ValidateUTF8(int64_t length, std::vector<offset_type> offsets,
+ util::string_view data, int64_t offset = 0) {
+ ArrayType arr(length, Buffer::Wrap(offsets), std::make_shared<Buffer>(data),
+ /*null_bitmap=*/nullptr, /*null_count=*/0, offset);
+ return arr.ValidateUTF8();
+ }
+
+ Status ValidateUTF8(const std::string& json) {
+ auto ty = TypeTraits<T>::type_singleton();
+ auto arr = ArrayFromJSON(ty, json);
+ return checked_cast<const ArrayType&>(*arr).ValidateUTF8();
+ }
+
+ void TestValidateUTF8() {
+ ASSERT_OK(ValidateUTF8(R"(["Voix", "ambiguë", "d’un", "cœur"])"));
+ ASSERT_OK(ValidateUTF8(1, {0, 4}, "\xf4\x8f\xbf\xbf")); // \U0010ffff
+
+ ASSERT_RAISES(Invalid, ValidateUTF8(1, {0, 1}, "\xf4"));
+
+ // More tests in TestValidateData() above
+ // (ValidateFull() calls ValidateUTF8() internally)
+ }
+};
+
+TYPED_TEST_SUITE(TestUTF8Array, UTF8Types);
+
+TYPED_TEST(TestUTF8Array, TestValidateUTF8) { this->TestValidateUTF8(); }
+
// ----------------------------------------------------------------------
// String builder tests
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index 0562e8e..3dd0ffd 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -430,11 +430,17 @@ struct ValidateArrayDataVisitor {
// Fallback
Status Visit(const Array& array) { return Status::OK(); }
- Status Visit(const StringArray& array) { return ValidateBinaryArray(array); }
+ Status Visit(const StringArray& array) {
+ RETURN_NOT_OK(ValidateBinaryArray(array));
+ return array.ValidateUTF8();
+ }
- Status Visit(const BinaryArray& array) { return ValidateBinaryArray(array); }
+ Status Visit(const LargeStringArray& array) {
+ RETURN_NOT_OK(ValidateBinaryArray(array));
+ return array.ValidateUTF8();
+ }
- Status Visit(const LargeStringArray& array) { return ValidateBinaryArray(array); }
+ Status Visit(const BinaryArray& array) { return ValidateBinaryArray(array); }
Status Visit(const LargeBinaryArray& array) { return ValidateBinaryArray(array); }
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
index 7384a67..2b53dd6 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
@@ -83,9 +83,13 @@ class TestCast : public TestBase {
public:
void CheckPass(const Array& input, const Array& expected,
const std::shared_ptr<DataType>& out_type, const CastOptions& options,
- bool check_scalar = true) {
+ bool check_scalar = true, bool validate_full = true) {
ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> result, Cast(input, out_type, options));
- ASSERT_OK(result->ValidateFull());
+ if (validate_full) {
+ ASSERT_OK(result->ValidateFull());
+ } else {
+ ASSERT_OK(result->Validate());
+ }
AssertArraysEqual(expected, *result, /*verbose=*/true);
if (input.type_id() == Type::DECIMAL || out_type->id() == Type::DECIMAL) {
@@ -161,7 +165,7 @@ class TestCast : public TestBase {
const std::vector<I_TYPE>& in_values, const std::vector<bool>& is_valid,
const std::shared_ptr<DataType>& out_type,
const std::vector<O_TYPE>& out_values, const CastOptions& options,
- bool check_scalar = true) {
+ bool check_scalar = true, bool validate_full = true) {
ASSERT_EQ(in_values.size(), out_values.size());
std::shared_ptr<Array> input, expected;
if (is_valid.size() > 0) {
@@ -172,11 +176,12 @@ class TestCast : public TestBase {
ArrayFromVector<InType, I_TYPE>(in_type, in_values, &input);
ArrayFromVector<OutType, O_TYPE>(out_type, out_values, &expected);
}
- CheckPass(*input, *expected, out_type, options, check_scalar);
+ CheckPass(*input, *expected, out_type, options, check_scalar, validate_full);
// Check a sliced variant
if (input->length() > 1) {
- CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options, check_scalar);
+ CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options, check_scalar,
+ validate_full);
}
}
@@ -184,10 +189,11 @@ class TestCast : public TestBase {
typename O_TYPE = typename OutType::c_type>
void CheckCase(const std::vector<I_TYPE>& in_values, const std::vector<bool>& is_valid,
const std::vector<O_TYPE>& out_values, const CastOptions& options,
- bool check_scalar = true) {
+ bool check_scalar = true, bool validate_full = true) {
CheckCase<InType, OutType, I_TYPE, O_TYPE>(
TypeTraits<InType>::type_singleton(), in_values, is_valid,
- TypeTraits<OutType>::type_singleton(), out_values, options, check_scalar);
+ TypeTraits<OutType>::type_singleton(), out_values, options, check_scalar,
+ validate_full);
}
void CheckCaseJSON(const std::shared_ptr<DataType>& in_type,
@@ -239,7 +245,7 @@ class TestCast : public TestBase {
// Should accept due to option override
options.allow_invalid_utf8 = true;
CheckCase<SourceType, DestType>(strings, all, strings, options,
- /*check_scalar=*/false);
+ /*check_scalar=*/false, /*validate_full=*/false);
}
template <typename SourceType, typename DestType>
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index b7d2fee..d496ab6 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -80,7 +80,7 @@ void EnsureLookupTablesFilled() {
}
template <typename Type, typename Derived>
-struct Utf8Transform {
+struct UTF8Transform {
using offset_type = typename Type::offset_type;
using ArrayType = typename TypeTraits<Type>::ArrayType;
@@ -88,7 +88,7 @@ struct Utf8Transform {
uint8_t* output, offset_type* output_written) {
uint8_t* output_start = output;
if (ARROW_PREDICT_FALSE(
- !arrow::util::Utf8Transform(input, input + input_string_ncodeunits, &output,
+ !arrow::util::UTF8Transform(input, input + input_string_ncodeunits, &output,
Derived::TransformCodepoint))) {
return false;
}
@@ -184,7 +184,7 @@ struct Utf8Transform {
};
template <typename Type>
-struct Utf8Upper : Utf8Transform<Type, Utf8Upper<Type>> {
+struct UTF8Upper : UTF8Transform<Type, UTF8Upper<Type>> {
inline static uint32_t TransformCodepoint(uint32_t codepoint) {
return codepoint <= kMaxCodepointLookup ? lut_upper_codepoint[codepoint]
: utf8proc_toupper(codepoint);
@@ -192,7 +192,7 @@ struct Utf8Upper : Utf8Transform<Type, Utf8Upper<Type>> {
};
template <typename Type>
-struct Utf8Lower : Utf8Transform<Type, Utf8Lower<Type>> {
+struct UTF8Lower : UTF8Transform<Type, UTF8Lower<Type>> {
static uint32_t TransformCodepoint(uint32_t codepoint) {
return codepoint <= kMaxCodepointLookup ? lut_lower_codepoint[codepoint]
: utf8proc_tolower(codepoint);
@@ -468,7 +468,7 @@ void MakeUnaryStringBatchKernel(std::string name, FunctionRegistry* registry) {
#ifdef ARROW_WITH_UTF8PROC
template <template <typename> class Transformer>
-void MakeUnaryStringUtf8TransformKernel(std::string name, FunctionRegistry* registry) {
+void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* registry) {
auto func = std::make_shared<ScalarFunction>(name, Arity::Unary());
ArrayKernelExec exec_32 = Transformer<StringType>::Exec;
ArrayKernelExec exec_64 = Transformer<LargeStringType>::Exec;
@@ -485,8 +485,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry);
MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry);
#ifdef ARROW_WITH_UTF8PROC
- MakeUnaryStringUtf8TransformKernel<Utf8Upper>("utf8_upper", registry);
- MakeUnaryStringUtf8TransformKernel<Utf8Lower>("utf8_lower", registry);
+ MakeUnaryStringUTF8TransformKernel<UTF8Upper>("utf8_upper", registry);
+ MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry);
#endif
AddAsciiLength(registry);
AddBinaryContainsExact(registry);
diff --git a/cpp/src/arrow/csv/column_builder_test.cc b/cpp/src/arrow/csv/column_builder_test.cc
index c0867ba..0ec3ed0 100644
--- a/cpp/src/arrow/csv/column_builder_test.cc
+++ b/cpp/src/arrow/csv/column_builder_test.cc
@@ -47,7 +47,8 @@ using ChunkData = std::vector<std::vector<std::string>>;
class ColumnBuilderTest : public ::testing::Test {
public:
void AssertBuilding(const std::shared_ptr<ColumnBuilder>& builder,
- const ChunkData& chunks, std::shared_ptr<ChunkedArray>* out) {
+ const ChunkData& chunks, bool validate_full,
+ std::shared_ptr<ChunkedArray>* out) {
for (const auto& chunk : chunks) {
std::shared_ptr<BlockParser> parser;
MakeColumnParser(chunk, &parser);
@@ -55,24 +56,35 @@ class ColumnBuilderTest : public ::testing::Test {
}
ASSERT_OK(builder->task_group()->Finish());
ASSERT_OK_AND_ASSIGN(*out, builder->Finish());
- ASSERT_OK((*out)->ValidateFull());
+ if (validate_full) {
+ ASSERT_OK((*out)->ValidateFull());
+ } else {
+ ASSERT_OK((*out)->Validate());
+ }
+ }
+
+ void AssertBuilding(const std::shared_ptr<ColumnBuilder>& builder,
+ const ChunkData& chunks, std::shared_ptr<ChunkedArray>* out) {
+ AssertBuilding(builder, chunks, /*validate_full=*/true, out);
}
void CheckInferred(const std::shared_ptr<TaskGroup>& tg, const ChunkData& csv_data,
const ConvertOptions& options,
- std::shared_ptr<ChunkedArray> expected) {
+ std::shared_ptr<ChunkedArray> expected, bool validate_full = true) {
std::shared_ptr<ColumnBuilder> builder;
std::shared_ptr<ChunkedArray> actual;
ASSERT_OK_AND_ASSIGN(builder,
ColumnBuilder::Make(default_memory_pool(), 0, options, tg));
- AssertBuilding(builder, csv_data, &actual);
+ AssertBuilding(builder, csv_data, validate_full, &actual);
AssertChunkedEqual(*actual, *expected);
}
void CheckInferred(const std::shared_ptr<TaskGroup>& tg, const ChunkData& csv_data,
const ConvertOptions& options,
- std::vector<std::shared_ptr<Array>> expected_chunks) {
- CheckInferred(tg, csv_data, options, std::make_shared<ChunkedArray>(expected_chunks));
+ std::vector<std::shared_ptr<Array>> expected_chunks,
+ bool validate_full = true) {
+ CheckInferred(tg, csv_data, options, std::make_shared<ChunkedArray>(expected_chunks),
+ validate_full);
}
void CheckFixedType(const std::shared_ptr<TaskGroup>& tg,
@@ -279,12 +291,13 @@ class InferringColumnBuilderTest : public ColumnBuilderTest {
void CheckAutoDictEncoded(const std::shared_ptr<TaskGroup>& tg,
const ChunkData& csv_data, const ConvertOptions& options,
std::vector<std::shared_ptr<Array>> expected_indices,
- std::vector<std::shared_ptr<Array>> expected_dictionaries) {
+ std::vector<std::shared_ptr<Array>> expected_dictionaries,
+ bool validate_full = true) {
std::shared_ptr<ColumnBuilder> builder;
std::shared_ptr<ChunkedArray> actual;
ASSERT_OK_AND_ASSIGN(builder,
ColumnBuilder::Make(default_memory_pool(), 0, options, tg));
- AssertBuilding(builder, csv_data, &actual);
+ AssertBuilding(builder, csv_data, validate_full, &actual);
ASSERT_EQ(actual->num_chunks(), static_cast<int>(csv_data.size()));
for (int i = 0; i < actual->num_chunks(); ++i) {
ASSERT_EQ(actual->chunk(i)->type_id(), Type::DICTIONARY);
@@ -405,7 +418,8 @@ TEST_F(InferringColumnBuilderTest, SingleChunkString) {
tg = TaskGroup::MakeSerial();
ChunkedArrayFromVector<StringType, std::string>({{true, true, true}},
{{"", "foo\xff", "baré"}}, &expected);
- CheckInferred(tg, {{"", "foo\xff", "baré"}}, options, expected);
+ CheckInferred(tg, {{"", "foo\xff", "baré"}}, options, expected,
+ /*validate_full=*/false);
}
TEST_F(InferringColumnBuilderTest, SingleChunkBinary) {
@@ -473,7 +487,7 @@ TEST_F(InferringColumnBuilderTest, SingleChunkBinaryAutoDict) {
ArrayFromVector<StringType, std::string>({"ab", "cd\xff"}, &expected_dictionary);
CheckAutoDictEncoded(TaskGroup::MakeSerial(), csv_data, options, {expected_indices},
- {expected_dictionary});
+ {expected_dictionary}, /*validate_full=*/false);
// With invalid UTF8, checking
options.check_utf8 = true;
diff --git a/cpp/src/arrow/csv/converter_test.cc b/cpp/src/arrow/csv/converter_test.cc
index f8f9baf..2e5b773 100644
--- a/cpp/src/arrow/csv/converter_test.cc
+++ b/cpp/src/arrow/csv/converter_test.cc
@@ -52,7 +52,8 @@ template <typename DATA_TYPE, typename C_TYPE>
void AssertConversion(const std::shared_ptr<DataType>& type,
const std::vector<std::string>& csv_string,
const std::vector<std::vector<C_TYPE>>& expected,
- ConvertOptions options = ConvertOptions::Defaults()) {
+ ConvertOptions options = ConvertOptions::Defaults(),
+ bool validate_full = true) {
std::shared_ptr<BlockParser> parser;
std::shared_ptr<Converter> converter;
std::shared_ptr<Array> array, expected_array;
@@ -63,7 +64,11 @@ void AssertConversion(const std::shared_ptr<DataType>& type,
for (int32_t col_index = 0; col_index < static_cast<int32_t>(expected.size());
++col_index) {
ASSERT_OK_AND_ASSIGN(array, converter->Convert(*parser, col_index));
- ASSERT_OK(array->ValidateFull());
+ if (validate_full) {
+ ASSERT_OK(array->ValidateFull());
+ } else {
+ ASSERT_OK(array->Validate());
+ }
ArrayFromVector<DATA_TYPE, C_TYPE>(type, expected[col_index], &expected_array);
AssertArraysEqual(*expected_array, *array);
}
@@ -115,7 +120,8 @@ void AssertDictConversion(const std::string& csv_string,
const std::shared_ptr<Array>& expected_indices,
const std::shared_ptr<Array>& expected_dict,
int32_t max_cardinality = -1,
- ConvertOptions options = ConvertOptions::Defaults()) {
+ ConvertOptions options = ConvertOptions::Defaults(),
+ bool validate_full = true) {
std::shared_ptr<BlockParser> parser;
std::shared_ptr<DictionaryConverter> converter;
std::shared_ptr<Array> array, expected_array;
@@ -123,7 +129,11 @@ void AssertDictConversion(const std::string& csv_string,
ASSERT_OK_AND_ASSIGN(
array, DictConversion(expected_dict->type(), csv_string, max_cardinality, options));
- ASSERT_OK(array->ValidateFull());
+ if (validate_full) {
+ ASSERT_OK(array->ValidateFull());
+ } else {
+ ASSERT_OK(array->Validate());
+ }
expected_type = dictionary(expected_indices->type(), expected_dict->type());
ASSERT_TRUE(array->type()->Equals(*expected_type));
const auto& dict_array = internal::checked_cast<const DictionaryArray&>(*array);
@@ -193,7 +203,8 @@ static void TestStringConversionBasics() {
auto options = ConvertOptions::Defaults();
options.check_utf8 = false;
AssertConversion<T, std::string>(type, {"ab,cdé\n", ",\xffgh\n"},
- {{"ab", ""}, {"cdé", "\xffgh"}}, options);
+ {{"ab", ""}, {"cdé", "\xffgh"}}, options,
+ /*validate_full=*/false);
}
TEST(StringConversion, Basics) { TestStringConversionBasics<StringType>(); }
@@ -485,7 +496,8 @@ TYPED_TEST(TestDictConverter, NonUTF8) {
auto options = ConvertOptions::Defaults();
options.check_utf8 = false;
- AssertDictConversion(csv_string, expected_indices, expected_dict, -1, options);
+ AssertDictConversion(csv_string, expected_indices, expected_dict, -1, options,
+ /*validate_full=*/false);
} else {
AssertDictConversion(csv_string, expected_indices, expected_dict);
}
diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h
index 50b6cca..1775b19 100644
--- a/cpp/src/arrow/util/utf8.h
+++ b/cpp/src/arrow/util/utf8.h
@@ -282,7 +282,7 @@ static inline bool Utf8IsContinuation(const uint8_t codeunit) {
return (codeunit & 0xC0) == 0x80; // upper two bits should be 10
}
-static inline uint8_t* Utf8Encode(uint8_t* str, uint32_t codepoint) {
+static inline uint8_t* UTF8Encode(uint8_t* str, uint32_t codepoint) {
if (codepoint < 0x80) {
*str++ = codepoint;
} else if (codepoint < 0x800) {
@@ -303,7 +303,7 @@ static inline uint8_t* Utf8Encode(uint8_t* str, uint32_t codepoint) {
return str;
}
-static inline bool Utf8Decode(const uint8_t** data, uint32_t* codepoint) {
+static inline bool UTF8Decode(const uint8_t** data, uint32_t* codepoint) {
const uint8_t* str = *data;
if (*str < 0x80) { // ascci
*codepoint = *str++;
@@ -351,16 +351,16 @@ static inline bool Utf8Decode(const uint8_t** data, uint32_t* codepoint) {
}
template <class UnaryOperation>
-static inline bool Utf8Transform(const uint8_t* first, const uint8_t* last,
+static inline bool UTF8Transform(const uint8_t* first, const uint8_t* last,
uint8_t** destination, UnaryOperation&& unary_op) {
const uint8_t* i = first;
uint8_t* out = *destination;
while (i < last) {
uint32_t codepoint = 0;
- if (ARROW_PREDICT_FALSE(!Utf8Decode(&i, &codepoint))) {
+ if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
return false;
}
- out = Utf8Encode(out, unary_op(codepoint));
+ out = UTF8Encode(out, unary_op(codepoint));
}
*destination = out;
return true;
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index 0fd10e3..582f04b 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -563,7 +563,8 @@ class BaseTestCSVRead:
opts.auto_dict_max_cardinality = 50
opts.check_utf8 = False
rows = b"a,b\nab,1\ncd\xff,2\nab,3"
- table = self.read_bytes(rows, convert_options=opts)
+ table = self.read_bytes(rows, convert_options=opts,
+ validate_full=False)
assert table.schema == schema
dict_values = table['a'].chunk(0).dictionary
assert len(dict_values) == 2
@@ -809,21 +810,21 @@ class BaseTestCSVRead:
class TestSerialCSVRead(BaseTestCSVRead, unittest.TestCase):
- def read_csv(self, *args, **kwargs):
+ def read_csv(self, *args, validate_full=True, **kwargs):
read_options = kwargs.setdefault('read_options', ReadOptions())
read_options.use_threads = False
table = read_csv(*args, **kwargs)
- table.validate(full=True)
+ table.validate(full=validate_full)
return table
class TestParallelCSVRead(BaseTestCSVRead, unittest.TestCase):
- def read_csv(self, *args, **kwargs):
+ def read_csv(self, *args, validate_full=True, **kwargs):
read_options = kwargs.setdefault('read_options', ReadOptions())
read_options.use_threads = True
table = read_csv(*args, **kwargs)
- table.validate(full=True)
+ table.validate(full=validate_full)
return table