This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new caeb4d2e88 feat: Improve DataType display for `RunEndEncoded` (#8596)
caeb4d2e88 is described below
commit caeb4d2e88fb0fea57f1d30b139be1fd6195f174
Author: Alex Huang <[email protected]>
AuthorDate: Thu Oct 16 18:24:21 2025 +0300
feat: Improve DataType display for `RunEndEncoded` (#8596)
## Which issue does this PR close?
- Closes #8351
## Rationale for this change
## What changes are included in this PR?
This PR refactors and improves the `Display` formatting for `DataType`
by:
- **Improving Union type display** - now shows field information with
parentheses for clarity: `Union(Sparse, 0: ("a": Int32), 1: ("b":
nullable Utf8))`
- **Improving RunEndEncoded type display** - now properly formats both
run_ends and values fields: `RunEndEncoded("run_ends": UInt32, "values":
nullable Int32)`
## Are these changes tested?
Yes
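## Are there any user-facing changes?
Yes. The `Display` output of `DataType::Union` and `DataType::RunEndEncoded`
changes: Union entries now include the quoted field name in parentheses, and
RunEndEncoded fields are printed as `"name": type` rather than via the
fields' own `Display` implementation.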
---
arrow-schema/src/datatype_display.rs | 128 ++++++++++++++++++++++++++++-------
1 file changed, 104 insertions(+), 24 deletions(-)
diff --git a/arrow-schema/src/datatype_display.rs
b/arrow-schema/src/datatype_display.rs
index 80f214606c..2d54e76dd8 100644
--- a/arrow-schema/src/datatype_display.rs
+++ b/arrow-schema/src/datatype_display.rs
@@ -29,6 +29,14 @@ impl fmt::Display for DataType {
}
}
+ fn format_field(field: &crate::Field) -> String {
+ let name = field.name();
+ let maybe_nullable = if field.is_nullable() { "nullable " } else {
"" };
+ let data_type = field.data_type();
+ let metadata_str = format_metadata(field.metadata());
+ format!("{name:?}: {maybe_nullable}{data_type}{metadata_str}")
+ }
+
// A lot of these can still be improved a lot.
// _Some_ of these can be parsed with `FromStr`, but not all (YET!).
// The goal is that the formatting should always be
@@ -122,13 +130,7 @@ impl fmt::Display for DataType {
if !fields.is_empty() {
let fields_str = fields
.iter()
- .map(|field| {
- let name = field.name();
- let maybe_nullable = if field.is_nullable() {
"nullable " } else { "" };
- let data_type = field.data_type();
- let metadata_str =
format_metadata(field.metadata());
- format!("{name:?}:
{maybe_nullable}{data_type}{metadata_str}")
- })
+ .map(|field| format_field(field))
.collect::<Vec<_>>()
.join(", ");
write!(f, "{fields_str}")?;
@@ -143,11 +145,8 @@ impl fmt::Display for DataType {
.iter()
.map(|v| {
let type_id = v.0;
- let field = v.1;
- let maybe_nullable = if field.is_nullable() {
"nullable " } else { "" };
- let data_type = field.data_type();
- let metadata_str =
format_metadata(field.metadata());
- format!("{type_id:?}:
{maybe_nullable}{data_type}{metadata_str}")
+ let field_str = format_field(v.1);
+ format!("{type_id:?}: ({field_str})")
})
.collect::<Vec<_>>()
.join(", ");
@@ -165,20 +164,19 @@ impl fmt::Display for DataType {
Self::Decimal256(precision, scale) => write!(f,
"Decimal256({precision}, {scale})"),
Self::Map(field, sorted) => {
write!(f, "Map(")?;
- let name = field.name();
- let maybe_nullable = if field.is_nullable() { "nullable " }
else { "" };
- let data_type = field.data_type();
- let metadata_str = format_metadata(field.metadata());
+ let map_field_str = format_field(field);
let keys_are_sorted = if *sorted { "sorted" } else {
"unsorted" };
- write!(
- f,
- "\"{name}\": {maybe_nullable}{data_type}{metadata_str},
{keys_are_sorted})"
- )?;
+ write!(f, "{map_field_str}, {keys_are_sorted})")?;
Ok(())
}
Self::RunEndEncoded(run_ends_field, values_field) => {
- write!(f, "RunEndEncoded({run_ends_field}, {values_field})")
+ write!(f, "RunEndEncoded(")?;
+ let run_ends_str = format_field(run_ends_field);
+ let values_str = format_field(values_field);
+
+ write!(f, "{run_ends_str}, {values_str})")?;
+ Ok(())
}
}
}
@@ -391,7 +389,7 @@ mod tests {
let union_data_type = DataType::Union(union_fields,
crate::UnionMode::Sparse);
let union_data_type_string = union_data_type.to_string();
- let expected_string = "Union(Sparse, 0: Int32, 1: nullable Utf8)";
+ let expected_string = "Union(Sparse, 0: (\"a\": Int32), 1: (\"b\":
nullable Utf8))";
assert_eq!(union_data_type_string, expected_string);
// Test with metadata
@@ -407,8 +405,7 @@ mod tests {
let union_data_type_with_metadata =
DataType::Union(union_fields_with_metadata,
crate::UnionMode::Sparse);
let union_data_type_with_metadata_string =
union_data_type_with_metadata.to_string();
- let expected_string_with_metadata =
- "Union(Sparse, 0: Int32, 1: nullable Utf8, metadata: {\"key\":
\"value\"})";
+ let expected_string_with_metadata = "Union(Sparse, 0: (\"a\": Int32),
1: (\"b\": nullable Utf8, metadata: {\"key\": \"value\"}))";
assert_eq!(
union_data_type_with_metadata_string,
expected_string_with_metadata
@@ -456,4 +453,87 @@ mod tests {
expected_string_with_metadata
);
}
+
+ #[test]
+ fn test_display_run_end_encoded() {
+ let run_ends_field = Arc::new(Field::new("run_ends", DataType::UInt32,
false));
+ let values_field = Arc::new(Field::new("values", DataType::Int32,
true));
+ let ree_data_type = DataType::RunEndEncoded(run_ends_field.clone(),
values_field.clone());
+ let ree_data_type_string = ree_data_type.to_string();
+ let expected_string = "RunEndEncoded(\"run_ends\": UInt32, \"values\":
nullable Int32)";
+ assert_eq!(ree_data_type_string, expected_string);
+
+ // Test with metadata
+ let mut run_ends_field_with_metadata = Field::new("run_ends",
DataType::UInt32, false);
+ let metadata = HashMap::from([("key".to_string(),
"value".to_string())]);
+ run_ends_field_with_metadata.set_metadata(metadata);
+ let ree_data_type_with_metadata =
+ DataType::RunEndEncoded(Arc::new(run_ends_field_with_metadata),
values_field.clone());
+ let ree_data_type_with_metadata_string =
ree_data_type_with_metadata.to_string();
+ let expected_string_with_metadata = "RunEndEncoded(\"run_ends\":
UInt32, metadata: {\"key\": \"value\"}, \"values\": nullable Int32)";
+ assert_eq!(
+ ree_data_type_with_metadata_string,
+ expected_string_with_metadata
+ );
+ }
+
+ #[test]
+ fn test_display_dictionary() {
+ let dict_data_type =
+ DataType::Dictionary(Box::new(DataType::Int8),
Box::new(DataType::Utf8));
+ let dict_data_type_string = dict_data_type.to_string();
+ let expected_string = "Dictionary(Int8, Utf8)";
+ assert_eq!(dict_data_type_string, expected_string);
+
+ // Test with complex index and value types
+ let complex_dict_data_type = DataType::Dictionary(
+ Box::new(DataType::Int16),
+ Box::new(DataType::Struct(
+ vec![
+ Field::new("a", DataType::Int32, false),
+ Field::new("b", DataType::Utf8, true),
+ ]
+ .into(),
+ )),
+ );
+ let complex_dict_data_type_string = complex_dict_data_type.to_string();
+ let expected_complex_string =
+ "Dictionary(Int16, Struct(\"a\": Int32, \"b\": nullable Utf8))";
+ assert_eq!(complex_dict_data_type_string, expected_complex_string);
+ }
+
+ #[test]
+ fn test_display_interval() {
+ let interval_year_month =
DataType::Interval(crate::IntervalUnit::YearMonth);
+ let interval_year_month_string = interval_year_month.to_string();
+ let expected_year_month_string = "Interval(YearMonth)";
+ assert_eq!(interval_year_month_string, expected_year_month_string);
+
+ let interval_day_time =
DataType::Interval(crate::IntervalUnit::DayTime);
+ let interval_day_time_string = interval_day_time.to_string();
+ let expected_day_time_string = "Interval(DayTime)";
+ assert_eq!(interval_day_time_string, expected_day_time_string);
+
+ let interval_month_day_nano =
DataType::Interval(crate::IntervalUnit::MonthDayNano);
+ let interval_month_day_nano_string =
interval_month_day_nano.to_string();
+ let expected_month_day_nano_string = "Interval(MonthDayNano)";
+ assert_eq!(
+ interval_month_day_nano_string,
+ expected_month_day_nano_string
+ );
+ }
+
+ #[test]
+ fn test_display_timestamp() {
+ let timestamp_without_tz =
DataType::Timestamp(crate::TimeUnit::Microsecond, None);
+ let timestamp_without_tz_string = timestamp_without_tz.to_string();
+ let expected_without_tz_string = "Timestamp(µs)";
+ assert_eq!(timestamp_without_tz_string, expected_without_tz_string);
+
+ let timestamp_with_tz =
+ DataType::Timestamp(crate::TimeUnit::Nanosecond,
Some(Arc::from("UTC")));
+ let timestamp_with_tz_string = timestamp_with_tz.to_string();
+ let expected_with_tz_string = "Timestamp(ns, \"UTC\")";
+ assert_eq!(timestamp_with_tz_string, expected_with_tz_string);
+ }
}