This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new ded148c2ef GH-41667: [C++][Parquet] Refuse writing non-nullable column
that contains nulls (#44921)
ded148c2ef is described below
commit ded148c2ef5dd441a3b6ab9496d0ed0aeb940f71
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Dec 4 13:28:11 2024 +0100
GH-41667: [C++][Parquet] Refuse writing non-nullable column that contains
nulls (#44921)
### Rationale for this change
A non-nullable column that contains nulls would result in an invalid
Parquet file, so we'd rather raise an error when writing.
This detection is only implemented for leaf columns. Implementing it for
non-leaf columns would be more involved, and also doesn't actually seem
necessary.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
Raising a clear error when trying to write invalid data to Parquet, instead
of letting the Parquet writer silently generate an invalid file.
* GitHub Issue: #41667
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 21 +++++++++++++++++++++
cpp/src/parquet/column_writer.cc | 4 ++++
.../java/org/apache/arrow/dataset/TestAllTypes.java | 12 +++++++-----
3 files changed, 32 insertions(+), 5 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index f8e639176a..856c032c35 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -3349,6 +3349,27 @@ TEST(TestArrowWrite, CheckChunkSize) {
       WriteTable(*table, ::arrow::default_memory_pool(), sink, chunk_size));
 }
 
+void CheckWritingNonNullableColumnWithNulls(std::shared_ptr<::arrow::Field> field,
+                                            std::string json_batch) {
+ ARROW_SCOPED_TRACE("field = ", field, ", json_batch = ", json_batch);
+ auto schema = ::arrow::schema({field});
+ auto table = ::arrow::TableFromJSON(schema, {json_batch});
+ auto sink = CreateOutputStream();
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid, ::testing::HasSubstr("is declared non-nullable but contains nulls"),
+      WriteTable(*table, ::arrow::default_memory_pool(), sink));
+}
+
+TEST(TestArrowWrite, InvalidSchema) {
+ // GH-41667: nulls in non-nullable column
+ CheckWritingNonNullableColumnWithNulls(
+ ::arrow::field("a", ::arrow::int32(), /*nullable=*/false),
+ R"([{"a": 456}, {"a": null}])");
+ CheckWritingNonNullableColumnWithNulls(
+ ::arrow::field("a", ::arrow::utf8(), /*nullable=*/false),
+ R"([{"a": "foo"}, {"a": null}])");
+}
+
void DoNestedValidate(const std::shared_ptr<::arrow::DataType>& inner_type,
const std::shared_ptr<::arrow::Field>& outer_field,
const std::shared_ptr<Buffer>& buffer,
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 94c301f918..d3e0fdfa81 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -1301,6 +1301,10 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
     bool single_nullable_element =
         (level_info_.def_level == level_info_.repeated_ancestor_def_level + 1) &&
         leaf_field_nullable;
+ if (!leaf_field_nullable && leaf_array.null_count() != 0) {
+ return Status::Invalid("Column '", descr_->name(),
+ "' is declared non-nullable but contains nulls");
+ }
     bool maybe_parent_nulls = level_info_.HasNullableValues() && !single_nullable_element;
     if (maybe_parent_nulls) {
       ARROW_ASSIGN_OR_RAISE(
diff --git a/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java b/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java
index eb73663191..935edcd8b3 100644
--- a/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java
+++ b/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java
@@ -95,11 +95,9 @@ public class TestAllTypes extends TestDataset {
// DenseUnion
List<Field> childFields = new ArrayList<>();
     childFields.add(
-        new Field(
-            "int-child", new FieldType(false, new ArrowType.Int(32, true), null, null), null));
+        new Field("int-child", FieldType.notNullable(new ArrowType.Int(32, true)), null));
     Field structField =
-        new Field(
-            "struct", new FieldType(true, ArrowType.Struct.INSTANCE, null, null), childFields);
+        new Field("struct", FieldType.nullable(ArrowType.Struct.INSTANCE), childFields);
Field[] fields =
new Field[] {
Field.nullablePrimitive("null", ArrowType.Null.INSTANCE),
@@ -239,7 +237,11 @@ public class TestAllTypes extends TestDataset {
largeListWriter.integer().writeInt(1);
largeListWriter.endList();
-    ((StructVector) root.getVector("struct")).getChild("int-child", IntVector.class).set(1, 1);
+    var intChildVector =
+        ((StructVector) root.getVector("struct")).getChild("int-child", IntVector.class);
+    // set a non-null value at index 0 since the field is not nullable
+    intChildVector.set(0, 0);
+    intChildVector.set(1, 1);
return root;
}