This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new ded148c2ef GH-41667: [C++][Parquet] Refuse writing non-nullable column 
that contains nulls (#44921)
ded148c2ef is described below

commit ded148c2ef5dd441a3b6ab9496d0ed0aeb940f71
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Dec 4 13:28:11 2024 +0100

    GH-41667: [C++][Parquet] Refuse writing non-nullable column that contains 
nulls (#44921)
    
    ### Rationale for this change
    
    A non-nullable column that contains nulls would result in an invalid 
Parquet file, so we'd rather raise an error when writing.
    
    This detection is only implemented for leaf columns. Implementing it for 
non-leaf columns would be more involved, and also doesn't actually seem 
necessary.
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    Raising a clear error when trying to write invalid data to Parquet, instead 
of letting the Parquet writer silently generate an invalid file.
    
    * GitHub Issue: #41667
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc   | 21 +++++++++++++++++++++
 cpp/src/parquet/column_writer.cc                    |  4 ++++
 .../java/org/apache/arrow/dataset/TestAllTypes.java | 12 +++++++-----
 3 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc 
b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index f8e639176a..856c032c35 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -3349,6 +3349,27 @@ TEST(TestArrowWrite, CheckChunkSize) {
                 WriteTable(*table, ::arrow::default_memory_pool(), sink, 
chunk_size));
 }
 
+void CheckWritingNonNullableColumnWithNulls(std::shared_ptr<::arrow::Field> 
field,
+                                            std::string json_batch) {
+  ARROW_SCOPED_TRACE("field = ", field, ", json_batch = ", json_batch);
+  auto schema = ::arrow::schema({field});
+  auto table = ::arrow::TableFromJSON(schema, {json_batch});
+  auto sink = CreateOutputStream();
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid, ::testing::HasSubstr("is declared non-nullable but contains 
nulls"),
+      WriteTable(*table, ::arrow::default_memory_pool(), sink));
+}
+
+TEST(TestArrowWrite, InvalidSchema) {
+  // GH-41667: nulls in non-nullable column
+  CheckWritingNonNullableColumnWithNulls(
+      ::arrow::field("a", ::arrow::int32(), /*nullable=*/false),
+      R"([{"a": 456}, {"a": null}])");
+  CheckWritingNonNullableColumnWithNulls(
+      ::arrow::field("a", ::arrow::utf8(), /*nullable=*/false),
+      R"([{"a": "foo"}, {"a": null}])");
+}
+
 void DoNestedValidate(const std::shared_ptr<::arrow::DataType>& inner_type,
                       const std::shared_ptr<::arrow::Field>& outer_field,
                       const std::shared_ptr<Buffer>& buffer,
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 94c301f918..d3e0fdfa81 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -1301,6 +1301,10 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, 
public TypedColumnWriter<
     bool single_nullable_element =
         (level_info_.def_level == level_info_.repeated_ancestor_def_level + 1) 
&&
         leaf_field_nullable;
+    if (!leaf_field_nullable && leaf_array.null_count() != 0) {
+      return Status::Invalid("Column '", descr_->name(),
+                             "' is declared non-nullable but contains nulls");
+    }
     bool maybe_parent_nulls = level_info_.HasNullableValues() && 
!single_nullable_element;
     if (maybe_parent_nulls) {
       ARROW_ASSIGN_OR_RAISE(
diff --git 
a/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java 
b/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java
index eb73663191..935edcd8b3 100644
--- a/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java
+++ b/java/dataset/src/test/java/org/apache/arrow/dataset/TestAllTypes.java
@@ -95,11 +95,9 @@ public class TestAllTypes extends TestDataset {
     // DenseUnion
     List<Field> childFields = new ArrayList<>();
     childFields.add(
-        new Field(
-            "int-child", new FieldType(false, new ArrowType.Int(32, true), 
null, null), null));
+        new Field("int-child", FieldType.notNullable(new ArrowType.Int(32, 
true)), null));
     Field structField =
-        new Field(
-            "struct", new FieldType(true, ArrowType.Struct.INSTANCE, null, 
null), childFields);
+        new Field("struct", FieldType.nullable(ArrowType.Struct.INSTANCE), 
childFields);
     Field[] fields =
         new Field[] {
           Field.nullablePrimitive("null", ArrowType.Null.INSTANCE),
@@ -239,7 +237,11 @@ public class TestAllTypes extends TestDataset {
     largeListWriter.integer().writeInt(1);
     largeListWriter.endList();
 
-    ((StructVector) root.getVector("struct")).getChild("int-child", 
IntVector.class).set(1, 1);
+    var intChildVector =
+        ((StructVector) root.getVector("struct")).getChild("int-child", 
IntVector.class);
+    // set a non-null value at index 0 since the field is not nullable
+    intChildVector.set(0, 0);
+    intChildVector.set(1, 1);
     return root;
   }
 

Reply via email to