This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 2b179b805f feat(parquet): relax type compatibility check in parquet ArrowWriter (#9099)
2b179b805f is described below

commit 2b179b805f62e927771f764f8ae92151249e9edd
Author: Marko Grujic <[email protected]>
AuthorDate: Wed Jan 7 16:33:04 2026 +0100

    feat(parquet): relax type compatibility check in parquet ArrowWriter (#9099)
    
    # Which issue does this PR close?
    - Closes #9098.
    
    # Rationale for this change
    Don't require strict equality for nested fields (including the inner field
    name/metadata); only require that the nested data types are logically
    equivalent.
    
    # What changes are included in this PR?
    Use `a.equals_datatype(b)` instead of `a == b` at the start of
    `LevelInfoBuilder::types_compatible`.
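
    For illustration (not part of the original PR text), a minimal sketch of the
    difference, using two hypothetical list types whose inner fields differ only
    in name:

    ```rust
    use std::sync::Arc;
    use arrow_schema::{DataType, Field};

    // Inner fields carry different names but the same logical type.
    let a = DataType::List(Arc::new(Field::new("item", DataType::Int32, false)));
    let b = DataType::List(Arc::new(Field::new("element", DataType::Int32, false)));

    assert_ne!(a, b); // strict equality also compares the inner field name/metadata
    assert!(a.equals_datatype(&b)); // logical equivalence ignores them
    ```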
    
    # Are these changes tested?
    Yes.
    
    # Are there any user-facing changes?
---
 parquet/src/arrow/arrow_writer/levels.rs |  4 +--
 parquet/src/arrow/arrow_writer/mod.rs    | 51 ++++++++++++++++++++++++++++++--
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs
index 3c283bcbe3..59bf6c6024 100644
--- a/parquet/src/arrow/arrow_writer/levels.rs
+++ b/parquet/src/arrow/arrow_writer/levels.rs
@@ -550,8 +550,8 @@ impl LevelInfoBuilder {
     /// and the other is a native array, the dictionary values must have the same type as the
     /// native array
     fn types_compatible(a: &DataType, b: &DataType) -> bool {
-        // if the Arrow data types are the same, the types are clearly compatible
-        if a == b {
+        // if the Arrow data types are equal, the types are deemed compatible
+        if a.equals_datatype(b) {
             return true;
         }
 
diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 3e3c9108d5..6b1566a681 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -1522,11 +1522,12 @@ fn get_fsb_array_slice(
 #[cfg(test)]
 mod tests {
     use super::*;
+    use std::collections::HashMap;
 
     use std::fs::File;
 
-    use crate::arrow::ARROW_SCHEMA_META_KEY;
     use crate::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
+    use crate::arrow::{ARROW_SCHEMA_META_KEY, PARQUET_FIELD_ID_META_KEY};
     use crate::column::page::{Page, PageReader};
     use crate::file::metadata::thrift::PageHeader;
     use crate::file::page_index::column_index::ColumnIndexMetaData;
@@ -1539,7 +1540,7 @@ mod tests {
     use arrow::util::data_gen::create_random_array;
     use arrow::util::pretty::pretty_format_batches;
     use arrow::{array::*, buffer::Buffer};
-    use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, NullBuffer, i256};
+    use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, NullBuffer, OffsetBuffer, i256};
     use arrow_schema::Fields;
     use half::f16;
     use num_traits::{FromPrimitive, ToPrimitive};
@@ -3323,6 +3324,52 @@ mod tests {
             BinaryViewArray::from_iter_values(vec![b"barquet"]),
             LargeBinaryArray::from_iter_values(vec![b"parquet", b"barquet"]),
         );
+
+        // check compatibility for list types
+
+        let list_field_metadata = HashMap::from_iter(vec![(
+            PARQUET_FIELD_ID_META_KEY.to_string(),
+            "1".to_string(),
+        )]);
+        let list_field = Field::new_list_field(DataType::Int32, false);
+
+        let values1 = Arc::new(Int32Array::from(vec![0, 1, 2, 3, 4]));
+        let offsets1 = OffsetBuffer::new(vec![0, 2, 5].into());
+
+        let values2 = Arc::new(Int32Array::from(vec![5, 6, 7, 8, 9]));
+        let offsets2 = OffsetBuffer::new(vec![0, 3, 5].into());
+
+        let values_expected = Arc::new(Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]));
+        let offsets_expected = OffsetBuffer::new(vec![0, 2, 5, 8, 10].into());
+
+        ensure_compatible_write(
+            // when the initial schema has the metadata ...
+            ListArray::try_new(
+                Arc::new(
+                    list_field
+                        .clone()
+                        .with_metadata(list_field_metadata.clone()),
+                ),
+                offsets1,
+                values1,
+                None,
+            )
+            .unwrap(),
+            // ... and some intermediate schema doesn't have the metadata
+            ListArray::try_new(Arc::new(list_field.clone()), offsets2, values2, None).unwrap(),
+            // ... the write will still go through, and the resulting schema will inherit the initial metadata
+            ListArray::try_new(
+                Arc::new(
+                    list_field
+                        .clone()
+                        .with_metadata(list_field_metadata.clone()),
+                ),
+                offsets_expected,
+                values_expected,
+                None,
+            )
+            .unwrap(),
+        );
     }
 
     #[test]

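For context beyond the diff, here is a minimal end-to-end sketch of the user-facing effect. It is not part of the commit; the column name "ints", the in-memory Vec<u8> sink, and the field id value "1" are illustrative assumptions. It mirrors the scenario exercised by the new test: a batch whose inner list field lacks metadata present in the writer schema should now be accepted, and the resulting file keeps the writer schema's metadata.

```rust
use std::collections::HashMap;
use std::error::Error;
use std::sync::Arc;

use arrow_array::{ArrayRef, Int32Array, ListArray, RecordBatch};
use arrow_buffer::OffsetBuffer;
use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY};

fn main() -> Result<(), Box<dyn Error>> {
    // Writer schema: the inner list field carries a parquet field id in its metadata.
    let inner_with_meta = Field::new_list_field(DataType::Int32, false).with_metadata(
        HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "1".to_string())]),
    );
    let schema = Arc::new(Schema::new(vec![Field::new(
        "ints",
        DataType::List(Arc::new(inner_with_meta)),
        false,
    )]));
    let mut writer = ArrowWriter::try_new(Vec::new(), schema, None)?;

    // Batch schema: same logical type, but the inner field has no metadata.
    // Before this change the level computation rejected it as incompatible.
    let plain_inner = Arc::new(Field::new_list_field(DataType::Int32, false));
    let list = ListArray::try_new(
        plain_inner.clone(),
        OffsetBuffer::new(vec![0, 2, 5].into()),
        Arc::new(Int32Array::from(vec![0, 1, 2, 3, 4])),
        None,
    )?;
    let batch_schema = Arc::new(Schema::new(vec![Field::new(
        "ints",
        DataType::List(plain_inner),
        false,
    )]));
    let batch = RecordBatch::try_new(batch_schema, vec![Arc::new(list) as ArrayRef])?;

    writer.write(&batch)?;
    let bytes = writer.into_inner()?; // finalize the file and recover the Vec<u8> sink
    println!("wrote {} bytes of parquet", bytes.len());
    Ok(())
}
```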