This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 8c495b6002 parquet: Read field IDs from Parquet Schema (#4878)
8c495b6002 is described below

commit 8c495b60021df1e32e1ff0616dec2979fd66b467
Author: Samrose <[email protected]>
AuthorDate: Sun Oct 1 03:09:18 2023 -0700

    parquet: Read field IDs from Parquet Schema (#4878)
    
    Currently, field ids are only read from the serialized arrow schema
    and not the actual parquet file. This PR adds reading the field ids
    from a Parquet file that doesn't contain the serialized arrow schema.
    
    Signed-off-by: 🐼 Samrose Ahmed 🐼 <[email protected]>
---
 parquet/src/arrow/schema/complex.rs | 13 ++++++--
 parquet/src/arrow/schema/mod.rs     | 63 +++++++++++++++++++++++++++++++++++--
 2 files changed, 71 insertions(+), 5 deletions(-)

diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs
index 0d19875d97..9f85b2c284 100644
--- a/parquet/src/arrow/schema/complex.rs
+++ b/parquet/src/arrow/schema/complex.rs
@@ -19,7 +19,7 @@ use std::collections::HashMap;
 use std::sync::Arc;
 
 use crate::arrow::schema::primitive::convert_primitive;
-use crate::arrow::ProjectionMask;
+use crate::arrow::{ProjectionMask, PARQUET_FIELD_ID_META_KEY};
 use crate::basic::{ConvertedType, Repetition};
 use crate::errors::ParquetError;
 use crate::errors::Result;
@@ -550,7 +550,16 @@ fn convert_field(
 
             field.with_metadata(hint.metadata().clone())
         }
-        None => Field::new(name, data_type, nullable),
+        None => {
+            let mut ret = Field::new(name, data_type, nullable);
+            let basic_info = parquet_type.get_basic_info();
+            if basic_info.has_id() {
+                let mut meta = HashMap::with_capacity(1);
+                meta.insert(PARQUET_FIELD_ID_META_KEY.to_string(), basic_info.id().to_string());
+                ret.set_metadata(meta);
+            }
+            ret
+        },
     }
 }
 
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index 3f1994d108..d56cc42d43 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -45,6 +45,8 @@ mod primitive;
 use crate::arrow::ProjectionMask;
 pub(crate) use complex::{ParquetField, ParquetFieldType};
 
+use super::PARQUET_FIELD_ID_META_KEY;
+
 /// Convert Parquet schema to Arrow schema including optional metadata
 ///
 /// Attempts to decode any existing Arrow schema metadata, falling back
@@ -268,12 +270,20 @@ fn parse_key_value_metadata(
 /// Convert parquet column schema to arrow field.
 pub fn parquet_to_arrow_field(parquet_column: &ColumnDescriptor) -> Result<Field> {
     let field = complex::convert_type(&parquet_column.self_type_ptr())?;
-
-    Ok(Field::new(
+    let mut ret = Field::new(
         parquet_column.name(),
         field.arrow_type,
         field.nullable,
-    ))
+    );
+
+    let basic_info = parquet_column.self_type().get_basic_info();
+    if basic_info.has_id() {
+        let mut meta = HashMap::with_capacity(1);
+        meta.insert(PARQUET_FIELD_ID_META_KEY.to_string(), basic_info.id().to_string());
+        ret.set_metadata(meta);
+    }
+
+    Ok(ret)
 }
 
 pub fn decimal_length_from_precision(precision: u8) -> usize {
@@ -578,6 +588,7 @@ mod tests {
 
     use crate::arrow::PARQUET_FIELD_ID_META_KEY;
     use crate::file::metadata::KeyValue;
+    use crate::file::reader::FileReader;
     use crate::{
         arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter},
         schema::{parser::parse_message_type, types::SchemaDescriptor},
@@ -1811,6 +1822,52 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_read_parquet_field_ids_raw() -> Result<()> {
+        let meta = |a: &[(&str, &str)]| -> HashMap<String, String> {
+            a.iter()
+                .map(|(a, b)| (a.to_string(), b.to_string()))
+                .collect()
+        };
+        let schema = Schema::new_with_metadata(
+            vec![
+                Field::new("c1", DataType::Utf8, true).with_metadata(meta(&[
+                    (PARQUET_FIELD_ID_META_KEY, "1"),
+                ])),
+                Field::new("c2", DataType::Utf8, true).with_metadata(meta(&[
+                    (PARQUET_FIELD_ID_META_KEY, "2"),
+                ])),
+            ],
+            HashMap::new(),
+        );
+
+        let writer = ArrowWriter::try_new(
+            vec![],
+            Arc::new(schema.clone()),
+            None,
+        )?;
+        let parquet_bytes = writer.into_inner()?;
+
+        let reader = crate::file::reader::SerializedFileReader::new(
+            bytes::Bytes::from(parquet_bytes),
+        )?;
+        let schema_descriptor = reader.metadata().file_metadata().schema_descr_ptr();
+
+        // don't pass metadata so field ids are read from Parquet and not from serialized Arrow schema
+        let arrow_schema = crate::arrow::parquet_to_arrow_schema(
+            &schema_descriptor,
+            None,
+        )?;
+
+        let parq_schema_descr = crate::arrow::arrow_to_parquet_schema(&arrow_schema)?;
+        let parq_fields = parq_schema_descr.root_schema().get_fields();
+        assert_eq!(parq_fields.len(), 2);
+        assert_eq!(parq_fields[0].get_basic_info().id(), 1);
+        assert_eq!(parq_fields[1].get_basic_info().id(), 2);
+
+        Ok(())
+    }
+
     #[test]
     fn test_arrow_schema_roundtrip_lists() -> Result<()> {
         let metadata: HashMap<String, String> =

Reply via email to