This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 8c495b6002 parquet: Read field IDs from Parquet Schema (#4878)
8c495b6002 is described below
commit 8c495b60021df1e32e1ff0616dec2979fd66b467
Author: Samrose <[email protected]>
AuthorDate: Sun Oct 1 03:09:18 2023 -0700
parquet: Read field IDs from Parquet Schema (#4878)
Currently, field IDs are only read from the serialized Arrow schema
and not from the actual Parquet file. This PR adds reading the field IDs
from a Parquet file that doesn't contain the serialized Arrow schema.
Signed-off-by: 🐼 Samrose Ahmed 🐼 <[email protected]>
---
parquet/src/arrow/schema/complex.rs | 13 ++++++--
parquet/src/arrow/schema/mod.rs | 63 +++++++++++++++++++++++++++++++++++--
2 files changed, 71 insertions(+), 5 deletions(-)
diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs
index 0d19875d97..9f85b2c284 100644
--- a/parquet/src/arrow/schema/complex.rs
+++ b/parquet/src/arrow/schema/complex.rs
@@ -19,7 +19,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use crate::arrow::schema::primitive::convert_primitive;
-use crate::arrow::ProjectionMask;
+use crate::arrow::{ProjectionMask, PARQUET_FIELD_ID_META_KEY};
use crate::basic::{ConvertedType, Repetition};
use crate::errors::ParquetError;
use crate::errors::Result;
@@ -550,7 +550,16 @@ fn convert_field(
field.with_metadata(hint.metadata().clone())
}
- None => Field::new(name, data_type, nullable),
+ None => {
+ let mut ret = Field::new(name, data_type, nullable);
+ let basic_info = parquet_type.get_basic_info();
+ if basic_info.has_id() {
+ let mut meta = HashMap::with_capacity(1);
+ meta.insert(PARQUET_FIELD_ID_META_KEY.to_string(), basic_info.id().to_string());
+ ret.set_metadata(meta);
+ }
+ ret
+ },
}
}
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index 3f1994d108..d56cc42d43 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -45,6 +45,8 @@ mod primitive;
use crate::arrow::ProjectionMask;
pub(crate) use complex::{ParquetField, ParquetFieldType};
+use super::PARQUET_FIELD_ID_META_KEY;
+
/// Convert Parquet schema to Arrow schema including optional metadata
///
/// Attempts to decode any existing Arrow schema metadata, falling back
@@ -268,12 +270,20 @@ fn parse_key_value_metadata(
/// Convert parquet column schema to arrow field.
pub fn parquet_to_arrow_field(parquet_column: &ColumnDescriptor) -> Result<Field> {
let field = complex::convert_type(&parquet_column.self_type_ptr())?;
-
- Ok(Field::new(
+ let mut ret = Field::new(
parquet_column.name(),
field.arrow_type,
field.nullable,
- ))
+ );
+
+ let basic_info = parquet_column.self_type().get_basic_info();
+ if basic_info.has_id() {
+ let mut meta = HashMap::with_capacity(1);
+ meta.insert(PARQUET_FIELD_ID_META_KEY.to_string(), basic_info.id().to_string());
+ ret.set_metadata(meta);
+ }
+
+ Ok(ret)
}
pub fn decimal_length_from_precision(precision: u8) -> usize {
@@ -578,6 +588,7 @@ mod tests {
use crate::arrow::PARQUET_FIELD_ID_META_KEY;
use crate::file::metadata::KeyValue;
+ use crate::file::reader::FileReader;
use crate::{
arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter},
schema::{parser::parse_message_type, types::SchemaDescriptor},
@@ -1811,6 +1822,52 @@ mod tests {
Ok(())
}
+ #[test]
+ fn test_read_parquet_field_ids_raw() -> Result<()> {
+ let meta = |a: &[(&str, &str)]| -> HashMap<String, String> {
+ a.iter()
+ .map(|(a, b)| (a.to_string(), b.to_string()))
+ .collect()
+ };
+ let schema = Schema::new_with_metadata(
+ vec![
+ Field::new("c1", DataType::Utf8, true).with_metadata(meta(&[
+ (PARQUET_FIELD_ID_META_KEY, "1"),
+ ])),
+ Field::new("c2", DataType::Utf8, true).with_metadata(meta(&[
+ (PARQUET_FIELD_ID_META_KEY, "2"),
+ ])),
+ ],
+ HashMap::new(),
+ );
+
+ let writer = ArrowWriter::try_new(
+ vec![],
+ Arc::new(schema.clone()),
+ None,
+ )?;
+ let parquet_bytes = writer.into_inner()?;
+
+ let reader = crate::file::reader::SerializedFileReader::new(
+ bytes::Bytes::from(parquet_bytes),
+ )?;
+ let schema_descriptor = reader.metadata().file_metadata().schema_descr_ptr();
+
+ // don't pass metadata so field ids are read from Parquet and not from serialized Arrow schema
+ let arrow_schema = crate::arrow::parquet_to_arrow_schema(
+ &schema_descriptor,
+ None,
+ )?;
+
+ let parq_schema_descr = crate::arrow::arrow_to_parquet_schema(&arrow_schema)?;
+ let parq_fields = parq_schema_descr.root_schema().get_fields();
+ assert_eq!(parq_fields.len(), 2);
+ assert_eq!(parq_fields[0].get_basic_info().id(), 1);
+ assert_eq!(parq_fields[1].get_basic_info().id(), 2);
+
+ Ok(())
+ }
+
#[test]
fn test_arrow_schema_roundtrip_lists() -> Result<()> {
let metadata: HashMap<String, String> =