This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 4e342035bf Print Parquet BasicTypeInfo id when present (#7094)
4e342035bf is described below
commit 4e342035bfeea702be5664dbbbbb0be13f0b6ba9
Author: Devin Smith <[email protected]>
AuthorDate: Sat Feb 8 06:34:38 2025 -0800
Print Parquet BasicTypeInfo id when present (#7094)
* Print Parquet BasicTypeInfo id when present
* Improve print_schema documentation
* tiny cleanup
---
parquet/src/schema/printer.rs | 342 +++++++++++++++++++++++++++++++++++++-----
1 file changed, 305 insertions(+), 37 deletions(-)
diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs
index 4681c1a970..44c742fca6 100644
--- a/parquet/src/schema/printer.rs
+++ b/parquet/src/schema/printer.rs
@@ -89,6 +89,57 @@ pub fn print_file_metadata(out: &mut dyn io::Write, file_metadata: &FileMetaData
}
/// Prints Parquet [`Type`] information.
+///
+/// # Example
+///
+/// ```rust
+/// use parquet::{
+/// basic::{ConvertedType, Repetition, Type as PhysicalType},
+/// schema::{printer::print_schema, types::Type},
+/// };
+/// use std::sync::Arc;
+///
+/// let field_a = Type::primitive_type_builder("a", PhysicalType::BYTE_ARRAY)
+/// .with_id(Some(42))
+/// .with_converted_type(ConvertedType::UTF8)
+/// .build()
+/// .unwrap();
+///
+/// let field_b = Type::primitive_type_builder("b", PhysicalType::INT32)
+/// .with_repetition(Repetition::REQUIRED)
+/// .build()
+/// .unwrap();
+///
+/// let field_d = Type::primitive_type_builder("d", PhysicalType::INT64)
+/// .with_id(Some(99))
+/// .build()
+/// .unwrap();
+///
+/// let field_c = Type::group_type_builder("c")
+/// .with_id(Some(43))
+/// .with_fields(vec![Arc::new(field_d)])
+/// .build()
+/// .unwrap();
+///
+/// let schema = Type::group_type_builder("schema")
+/// .with_fields(vec![Arc::new(field_a), Arc::new(field_b), Arc::new(field_c)])
+/// .build()
+/// .unwrap();
+///
+/// print_schema(&mut std::io::stdout(), &schema);
+/// ```
+///
+/// outputs
+///
+/// ```text
+/// message schema {
+/// OPTIONAL BYTE_ARRAY a [42] (UTF8);
+/// REQUIRED INT32 b;
+/// message c [43] {
+/// OPTIONAL INT64 d [99];
+/// }
+/// }
+/// ```
#[allow(unused_must_use)]
pub fn print_schema(out: &mut dyn io::Write, tp: &Type) {
// TODO: better if we can pass fmt::Write to Printer.
@@ -321,6 +372,16 @@ impl Printer<'_> {
}
_ => format!("{physical_type}"),
};
+ write!(
+ self.output,
+ "{} {} {}",
+ basic_info.repetition(),
+ phys_type_str,
+ basic_info.name()
+ );
+ if basic_info.has_id() {
+ write!(self.output, " [{}]", basic_info.id());
+ }
// Also print logical type if it is available
// If there is a logical type, do not print converted type
let logical_type_str = print_logical_and_converted(
@@ -329,23 +390,10 @@ impl Printer<'_> {
precision,
scale,
);
- if logical_type_str.is_empty() {
- write!(
- self.output,
- "{} {} {};",
- basic_info.repetition(),
- phys_type_str,
- basic_info.name()
- );
+ if !logical_type_str.is_empty() {
+ write!(self.output, " ({});", logical_type_str);
} else {
- write!(
- self.output,
- "{} {} {} ({});",
- basic_info.repetition(),
- phys_type_str,
- basic_info.name(),
- logical_type_str
- );
+ write!(self.output, ";");
}
}
Type::GroupType {
@@ -353,8 +401,15 @@ impl Printer<'_> {
ref fields,
} => {
if basic_info.has_repetition() {
- let r = basic_info.repetition();
- write!(self.output, "{} group {} ", r, basic_info.name());
+ write!(
+ self.output,
+ "{} group {} ",
+ basic_info.repetition(),
+ basic_info.name()
+ );
+ if basic_info.has_id() {
+ write!(self.output, "[{}] ", basic_info.id());
+ }
let logical_str = print_logical_and_converted(
basic_info.logical_type().as_ref(),
basic_info.converted_type(),
@@ -364,10 +419,13 @@ impl Printer<'_> {
if !logical_str.is_empty() {
write!(self.output, "({logical_str}) ");
}
- writeln!(self.output, "{{");
} else {
- writeln!(self.output, "message {} {{", basic_info.name());
+ write!(self.output, "message {} ", basic_info.name());
+ if basic_info.has_id() {
+ write!(self.output, "[{}] ", basic_info.id());
+ }
}
+ writeln!(self.output, "{{");
self.indent += INDENT_WIDTH;
for c in fields {
@@ -405,28 +463,61 @@ mod tests {
#[test]
fn test_print_primitive_type() {
- let mut s = String::new();
- {
- let mut p = Printer::new(&mut s);
- let field = Type::primitive_type_builder("field", PhysicalType::INT32)
- .with_repetition(Repetition::REQUIRED)
- .with_converted_type(ConvertedType::INT_32)
- .build()
- .unwrap();
- p.print(&field);
- }
- assert_eq!(&mut s, "REQUIRED INT32 field (INT_32);");
+ let types_and_strings = vec![
+ (
+ Type::primitive_type_builder("field", PhysicalType::INT32)
+ .with_repetition(Repetition::REQUIRED)
+ .with_converted_type(ConvertedType::INT_32)
+ .build()
+ .unwrap(),
+ "REQUIRED INT32 field (INT_32);",
+ ),
+ (
+ Type::primitive_type_builder("field", PhysicalType::INT32)
+ .with_repetition(Repetition::REQUIRED)
+ .with_converted_type(ConvertedType::INT_32)
+ .with_id(Some(42))
+ .build()
+ .unwrap(),
+ "REQUIRED INT32 field [42] (INT_32);",
+ ),
+ (
+ Type::primitive_type_builder("field", PhysicalType::INT32)
+ .with_repetition(Repetition::REQUIRED)
+ .build()
+ .unwrap(),
+ "REQUIRED INT32 field;",
+ ),
+ (
+ Type::primitive_type_builder("field", PhysicalType::INT32)
+ .with_repetition(Repetition::REQUIRED)
+ .with_id(Some(42))
+ .build()
+ .unwrap(),
+ "REQUIRED INT32 field [42];",
+ ),
+ ];
+ types_and_strings.into_iter().for_each(|(field, expected)| {
+ let mut s = String::new();
+ {
+ let mut p = Printer::new(&mut s);
+ p.print(&field);
+ }
+ assert_eq!(&s, expected)
+ });
}
#[inline]
fn build_primitive_type(
name: &str,
+ id: Option<i32>,
physical_type: PhysicalType,
logical_type: Option<LogicalType>,
converted_type: ConvertedType,
repetition: Repetition,
) -> Result<Type> {
Type::primitive_type_builder(name, physical_type)
+ .with_id(id)
.with_repetition(repetition)
.with_logical_type(logical_type)
.with_converted_type(converted_type)
@@ -439,6 +530,7 @@ mod tests {
(
build_primitive_type(
"field",
+ None,
PhysicalType::INT32,
Some(LogicalType::Integer {
bit_width: 32,
@@ -453,6 +545,7 @@ mod tests {
(
build_primitive_type(
"field",
+ None,
PhysicalType::INT32,
Some(LogicalType::Integer {
bit_width: 8,
@@ -467,6 +560,7 @@ mod tests {
(
build_primitive_type(
"field",
+ None,
PhysicalType::INT32,
Some(LogicalType::Integer {
bit_width: 16,
@@ -481,6 +575,22 @@ mod tests {
(
build_primitive_type(
"field",
+ Some(42),
+ PhysicalType::INT32,
+ Some(LogicalType::Integer {
+ bit_width: 16,
+ is_signed: true,
+ }),
+ ConvertedType::INT_16,
+ Repetition::REPEATED,
+ )
+ .unwrap(),
+ "REPEATED INT32 field [42] (INTEGER(16,true));",
+ ),
+ (
+ build_primitive_type(
+ "field",
+ None,
PhysicalType::INT64,
None,
ConvertedType::NONE,
@@ -492,6 +602,7 @@ mod tests {
(
build_primitive_type(
"field",
+ None,
PhysicalType::FLOAT,
None,
ConvertedType::NONE,
@@ -503,6 +614,7 @@ mod tests {
(
build_primitive_type(
"booleans",
+ None,
PhysicalType::BOOLEAN,
None,
ConvertedType::NONE,
@@ -511,9 +623,22 @@ mod tests {
.unwrap(),
"OPTIONAL BOOLEAN booleans;",
),
+ (
+ build_primitive_type(
+ "booleans",
+ Some(42),
+ PhysicalType::BOOLEAN,
+ None,
+ ConvertedType::NONE,
+ Repetition::OPTIONAL,
+ )
+ .unwrap(),
+ "OPTIONAL BOOLEAN booleans [42];",
+ ),
(
build_primitive_type(
"field",
+ None,
PhysicalType::INT64,
Some(LogicalType::Timestamp {
is_adjusted_to_u_t_c: true,
@@ -528,6 +653,7 @@ mod tests {
(
build_primitive_type(
"field",
+ None,
PhysicalType::INT32,
Some(LogicalType::Date),
ConvertedType::NONE,
@@ -539,6 +665,7 @@ mod tests {
(
build_primitive_type(
"field",
+ None,
PhysicalType::INT32,
Some(LogicalType::Time {
unit: TimeUnit::MILLIS(Default::default()),
@@ -553,6 +680,22 @@ mod tests {
(
build_primitive_type(
"field",
+ Some(42),
+ PhysicalType::INT32,
+ Some(LogicalType::Time {
+ unit: TimeUnit::MILLIS(Default::default()),
+ is_adjusted_to_u_t_c: false,
+ }),
+ ConvertedType::TIME_MILLIS,
+ Repetition::REQUIRED,
+ )
+ .unwrap(),
+ "REQUIRED INT32 field [42] (TIME(MILLIS,false));",
+ ),
+ (
+ build_primitive_type(
+ "field",
+ None,
PhysicalType::BYTE_ARRAY,
None,
ConvertedType::NONE,
@@ -564,6 +707,19 @@ mod tests {
(
build_primitive_type(
"field",
+ Some(42),
+ PhysicalType::BYTE_ARRAY,
+ None,
+ ConvertedType::NONE,
+ Repetition::REQUIRED,
+ )
+ .unwrap(),
+ "REQUIRED BYTE_ARRAY field [42];",
+ ),
+ (
+ build_primitive_type(
+ "field",
+ None,
PhysicalType::BYTE_ARRAY,
None,
ConvertedType::UTF8,
@@ -575,6 +731,7 @@ mod tests {
(
build_primitive_type(
"field",
+ None,
PhysicalType::BYTE_ARRAY,
Some(LogicalType::Json),
ConvertedType::JSON,
@@ -586,6 +743,7 @@ mod tests {
(
build_primitive_type(
"field",
+ None,
PhysicalType::BYTE_ARRAY,
Some(LogicalType::Bson),
ConvertedType::BSON,
@@ -597,6 +755,7 @@ mod tests {
(
build_primitive_type(
"field",
+ None,
PhysicalType::BYTE_ARRAY,
Some(LogicalType::String),
ConvertedType::NONE,
@@ -605,6 +764,18 @@ mod tests {
.unwrap(),
"REQUIRED BYTE_ARRAY field (STRING);",
),
+ (
+ build_primitive_type(
+ "field",
+ Some(42),
+ PhysicalType::BYTE_ARRAY,
+ Some(LogicalType::String),
+ ConvertedType::NONE,
+ Repetition::REQUIRED,
+ )
+ .unwrap(),
+ "REQUIRED BYTE_ARRAY field [42] (STRING);",
+ ),
];
types_and_strings.into_iter().for_each(|(field, expected)| {
@@ -693,6 +864,53 @@ mod tests {
});
}
+ #[test]
+ fn test_print_schema_documentation() {
+ let mut s = String::new();
+ {
+ let mut p = Printer::new(&mut s);
+ let field_a = Type::primitive_type_builder("a", PhysicalType::BYTE_ARRAY)
+ .with_id(Some(42))
+ .with_converted_type(ConvertedType::UTF8)
+ .build()
+ .unwrap();
+
+ let field_b = Type::primitive_type_builder("b", PhysicalType::INT32)
+ .with_repetition(Repetition::REQUIRED)
+ .build()
+ .unwrap();
+
+ let field_d = Type::primitive_type_builder("d", PhysicalType::INT64)
+ .with_id(Some(99))
+ .build()
+ .unwrap();
+
+ let field_c = Type::group_type_builder("c")
+ .with_id(Some(43))
+ .with_fields(vec![Arc::new(field_d)])
+ .build()
+ .unwrap();
+
+ let schema = Type::group_type_builder("schema")
+ .with_fields(vec![
+ Arc::new(field_a),
+ Arc::new(field_b),
+ Arc::new(field_c),
+ ])
+ .build()
+ .unwrap();
+ p.print(&schema);
+ }
+ let expected = "message schema {
+ OPTIONAL BYTE_ARRAY a [42] (UTF8);
+ REQUIRED INT32 b;
+ message c [43] {
+ OPTIONAL INT64 d [99];
+ }
+}";
+ assert_eq!(&mut s, expected);
+ }
+
#[test]
fn test_print_group_type() {
let mut s = String::new();
@@ -701,21 +919,17 @@ mod tests {
let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32)
.with_repetition(Repetition::REQUIRED)
.with_converted_type(ConvertedType::INT_32)
- .with_id(Some(0))
.build();
let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY)
.with_converted_type(ConvertedType::UTF8)
- .with_id(Some(1))
.build();
let f3 = Type::primitive_type_builder("f3", PhysicalType::BYTE_ARRAY)
.with_logical_type(Some(LogicalType::String))
- .with_id(Some(1))
.build();
let f4 = Type::primitive_type_builder("f4", PhysicalType::FIXED_LEN_BYTE_ARRAY)
.with_repetition(Repetition::REPEATED)
.with_converted_type(ConvertedType::INTERVAL)
.with_length(12)
- .with_id(Some(2))
.build();
let struct_fields = vec![
@@ -726,14 +940,12 @@ mod tests {
let field = Type::group_type_builder("field")
.with_repetition(Repetition::OPTIONAL)
.with_fields(struct_fields)
- .with_id(Some(1))
.build()
.unwrap();
let fields = vec![Arc::new(field), Arc::new(f4.unwrap())];
let message = Type::group_type_builder("schema")
.with_fields(fields)
- .with_id(Some(2))
.build()
.unwrap();
p.print(&message);
@@ -749,6 +961,62 @@ mod tests {
assert_eq!(&mut s, expected);
}
+ #[test]
+ fn test_print_group_type_with_ids() {
+ let mut s = String::new();
+ {
+ let mut p = Printer::new(&mut s);
+ let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32)
+ .with_repetition(Repetition::REQUIRED)
+ .with_converted_type(ConvertedType::INT_32)
+ .with_id(Some(0))
+ .build();
+ let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY)
+ .with_converted_type(ConvertedType::UTF8)
+ .with_id(Some(1))
+ .build();
+ let f3 = Type::primitive_type_builder("f3", PhysicalType::BYTE_ARRAY)
+ .with_logical_type(Some(LogicalType::String))
+ .with_id(Some(1))
+ .build();
+ let f4 = Type::primitive_type_builder("f4", PhysicalType::FIXED_LEN_BYTE_ARRAY)
+ .with_repetition(Repetition::REPEATED)
+ .with_converted_type(ConvertedType::INTERVAL)
+ .with_length(12)
+ .with_id(Some(2))
+ .build();
+
+ let struct_fields = vec![
+ Arc::new(f1.unwrap()),
+ Arc::new(f2.unwrap()),
+ Arc::new(f3.unwrap()),
+ ];
+ let field = Type::group_type_builder("field")
+ .with_repetition(Repetition::OPTIONAL)
+ .with_fields(struct_fields)
+ .with_id(Some(1))
+ .build()
+ .unwrap();
+
+ let fields = vec![Arc::new(field), Arc::new(f4.unwrap())];
+ let message = Type::group_type_builder("schema")
+ .with_fields(fields)
+ .with_id(Some(2))
+ .build()
+ .unwrap();
+ p.print(&message);
+ }
+ let expected = "message schema [2] {
+ OPTIONAL group field [1] {
+ REQUIRED INT32 f1 [0] (INT_32);
+ OPTIONAL BYTE_ARRAY f2 [1] (UTF8);
+ OPTIONAL BYTE_ARRAY f3 [1] (STRING);
+ }
+ REPEATED FIXED_LEN_BYTE_ARRAY (12) f4 [2] (INTERVAL);
+}";
+ assert_eq!(&mut s, expected);
+ }
+
#[test]
fn test_print_and_parse_primitive() {
let a2 = Type::primitive_type_builder("a2", PhysicalType::BYTE_ARRAY)