This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new d48bf0d128 feat: parse `DataType::Union`, `DataType::Map`,
`DataType::RunEndEncoded` (#8765)
d48bf0d128 is described below
commit d48bf0d12823c1152c39eef8a7a374c234ceecb4
Author: Khanh Duong <[email protected]>
AuthorDate: Wed Nov 5 04:04:26 2025 +0900
feat: parse `DataType::Union`, `DataType::Map`, `DataType::RunEndEncoded`
(#8765)
# Which issue does this PR close?
- Closes #8648
# Rationale for this change
Parse `DataType::Union`, `DataType::Map` and `DataType::RunEndEncoded`.
# What changes are included in this PR?
- Add `parse_union`, `parse_map`, and `parse_run_end_encoded`.
- Refactor `parse_list_field_name` -> `parse_list_field` to remove
duplicated code.
# Are these changes tested?
Yes, new test cases were added.
# Are there any user-facing changes?
Yes. Related to #8351.
---
arrow-schema/src/datatype_display.rs | 3 +-
arrow-schema/src/datatype_parse.rs | 298 +++++++++++++++++++++++++++--------
2 files changed, 238 insertions(+), 63 deletions(-)
diff --git a/arrow-schema/src/datatype_display.rs
b/arrow-schema/src/datatype_display.rs
index 6c89e3cdae..af36c0cb2c 100644
--- a/arrow-schema/src/datatype_display.rs
+++ b/arrow-schema/src/datatype_display.rs
@@ -135,8 +135,9 @@ impl fmt::Display for DataType {
Ok(())
}
Self::Union(union_fields, union_mode) => {
- write!(f, "Union({union_mode:?}, ")?;
+ write!(f, "Union({union_mode:?}")?;
if !union_fields.is_empty() {
+ write!(f, ", ")?;
let fields_str = union_fields
.iter()
.map(|v| {
diff --git a/arrow-schema/src/datatype_parse.rs
b/arrow-schema/src/datatype_parse.rs
index 56d8fb56a5..4ad32f59aa 100644
--- a/arrow-schema/src/datatype_parse.rs
+++ b/arrow-schema/src/datatype_parse.rs
@@ -17,7 +17,7 @@
use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
-use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit};
+use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit,
UnionFields, UnionMode};
/// Parses a DataType from a string representation
///
@@ -88,6 +88,9 @@ impl<'a> Parser<'a> {
Token::LargeListView => self.parse_large_list_view(),
Token::FixedSizeList => self.parse_fixed_size_list(),
Token::Struct => self.parse_struct(),
+ Token::Union => self.parse_union(),
+ Token::Map => self.parse_map(),
+ Token::RunEndEncoded => self.parse_run_end_encoded(),
tok => Err(make_error(
self.val,
&format!("finding next type, got unexpected '{tok}'"),
@@ -95,73 +98,78 @@ impl<'a> Parser<'a> {
}
}
- /// Parses list field name. Returns default field name if not found.
- fn parse_list_field_name(&mut self, context: &str) -> ArrowResult<String> {
- // field must be after a comma
- if self
+ /// Parses a Field; this is the inverse of `format_field` in
`datatype_display.rs`.
+ /// E.g: "a": nullable Int64
+ ///
+ /// TODO: support metadata: `"a": nullable Int64 metadata: {"foo":
"value"}`
+ fn parse_field(&mut self) -> ArrowResult<Field> {
+ let name = self.parse_double_quoted_string("Field")?;
+ self.expect_token(Token::Colon)?;
+ let nullable = self.parse_opt_nullable();
+ let data_type = self.parse_next_type()?;
+ Ok(Field::new(name, data_type, nullable))
+ }
+
+ /// Parses field inside a list. Use `Field::LIST_FIELD_DEFAULT_NAME`
+ /// if no field name is specified.
+ /// E.g: `nullable Int64, field: 'foo'` or `nullable Int64`
+ ///
+ /// TODO: support metadata: `nullable Int64, metadata: {"foo2": "value"}`
+ fn parse_list_field(&mut self, context: &str) -> ArrowResult<Field> {
+ let nullable = self.parse_opt_nullable();
+ let data_type = self.parse_next_type()?;
+
+ // the field name (if exists) must be after a comma
+ let field_name = if self
.tokenizer
.next_if(|next| matches!(next, Ok(Token::Comma)))
.is_none()
{
- return Ok(Field::LIST_FIELD_DEFAULT_NAME.into());
- }
+ Field::LIST_FIELD_DEFAULT_NAME.into()
+ } else {
+ // expects: `field: 'field_name'`.
+ self.expect_token(Token::Field)?;
+ self.expect_token(Token::Colon)?;
+ self.parse_single_quoted_string(context)?
+ };
- // expects: `field: 'field_name'`.
- self.expect_token(Token::Field)?;
- self.expect_token(Token::Colon)?;
- self.parse_single_quoted_string(context)
+ Ok(Field::new(field_name, data_type, nullable))
}
/// Parses the List type (called after `List` has been consumed)
/// E.g: List(nullable Int64, field: 'foo')
fn parse_list(&mut self) -> ArrowResult<DataType> {
self.expect_token(Token::LParen)?;
- let nullable = self.parse_opt_nullable();
- let data_type = self.parse_next_type()?;
- let field = self.parse_list_field_name("List")?;
+ let field = self.parse_list_field("List")?;
self.expect_token(Token::RParen)?;
- Ok(DataType::List(Arc::new(Field::new(
- field, data_type, nullable,
- ))))
+ Ok(DataType::List(Arc::new(field)))
}
/// Parses the ListView type (called after `ListView` has been consumed)
/// E.g: ListView(nullable Int64, field: 'foo')
fn parse_list_view(&mut self) -> ArrowResult<DataType> {
self.expect_token(Token::LParen)?;
- let nullable = self.parse_opt_nullable();
- let data_type = self.parse_next_type()?;
- let field = self.parse_list_field_name("ListView")?;
+ let field = self.parse_list_field("ListView")?;
self.expect_token(Token::RParen)?;
- Ok(DataType::ListView(Arc::new(Field::new(
- field, data_type, nullable,
- ))))
+ Ok(DataType::ListView(Arc::new(field)))
}
/// Parses the LargeList type (called after `LargeList` has been consumed)
/// E.g: LargeList(nullable Int64, field: 'foo')
fn parse_large_list(&mut self) -> ArrowResult<DataType> {
self.expect_token(Token::LParen)?;
- let nullable = self.parse_opt_nullable();
- let data_type = self.parse_next_type()?;
- let field = self.parse_list_field_name("LargeList")?;
+ let field = self.parse_list_field("LargeList")?;
self.expect_token(Token::RParen)?;
- Ok(DataType::LargeList(Arc::new(Field::new(
- field, data_type, nullable,
- ))))
+ Ok(DataType::LargeList(Arc::new(field)))
}
/// Parses the LargeListView type (called after `LargeListView` has been
consumed)
/// E.g: LargeListView(nullable Int64, field: 'foo')
fn parse_large_list_view(&mut self) -> ArrowResult<DataType> {
self.expect_token(Token::LParen)?;
- let nullable = self.parse_opt_nullable();
- let data_type = self.parse_next_type()?;
- let field = self.parse_list_field_name("LargeListView")?;
+ let field = self.parse_list_field("LargeListView")?;
self.expect_token(Token::RParen)?;
- Ok(DataType::LargeListView(Arc::new(Field::new(
- field, data_type, nullable,
- ))))
+ Ok(DataType::LargeListView(Arc::new(field)))
}
/// Parses the FixedSizeList type (called after `FixedSizeList` has been
consumed)
@@ -170,14 +178,9 @@ impl<'a> Parser<'a> {
self.expect_token(Token::LParen)?;
let length = self.parse_i32("FixedSizeList")?;
self.expect_token(Token::X)?;
- let nullable = self.parse_opt_nullable();
- let data_type = self.parse_next_type()?;
- let field = self.parse_list_field_name("FixedSizeList")?;
+ let field = self.parse_list_field("FixedSizeList")?;
self.expect_token(Token::RParen)?;
- Ok(DataType::FixedSizeList(
- Arc::new(Field::new(field, data_type, nullable)),
- length,
- ))
+ Ok(DataType::FixedSizeList(Arc::new(field), length))
}
/// Parses the next timeunit
@@ -413,25 +416,16 @@ impl<'a> Parser<'a> {
self.expect_token(Token::LParen)?;
let mut fields = Vec::new();
loop {
- // expects: "field name": [nullable] #datatype
-
- let field_name = match self.next_token()? {
- Token::RParen => {
- break;
- }
- Token::DoubleQuotedString(field_name) => field_name,
- tok => {
- return Err(make_error(
- self.val,
- &format!("Expected a double quoted string for a field
name; got {tok:?}"),
- ));
- }
- };
- self.expect_token(Token::Colon)?;
+ if self
+ .tokenizer
+ .next_if(|next| matches!(next, Ok(Token::RParen)))
+ .is_some()
+ {
+ break;
+ }
- let nullable = self.parse_opt_nullable();
- let field_type = self.parse_next_type()?;
- fields.push(Arc::new(Field::new(field_name, field_type,
nullable)));
+ let field = self.parse_field()?;
+ fields.push(Arc::new(field));
match self.next_token()? {
Token::Comma => continue,
Token::RParen => break,
@@ -448,6 +442,90 @@ impl<'a> Parser<'a> {
Ok(DataType::Struct(Fields::from(fields)))
}
+ /// Parses the next Union (called after `Union` has been consumed)
+ /// E.g: Union(Sparse, 0: ("a": Int32), 1: ("b": nullable Utf8))
+ fn parse_union(&mut self) -> ArrowResult<DataType> {
+ self.expect_token(Token::LParen)?;
+ let union_mode = self.parse_union_mode()?;
+ let mut type_ids = vec![];
+ let mut fields = vec![];
+ loop {
+ if self
+ .tokenizer
+ .next_if(|next| matches!(next, Ok(Token::RParen)))
+ .is_some()
+ {
+ break;
+ }
+ self.expect_token(Token::Comma)?;
+ let (type_id, field) = self.parse_union_field()?;
+ type_ids.push(type_id);
+ fields.push(field);
+ }
+ Ok(DataType::Union(
+ UnionFields::new(type_ids, fields),
+ union_mode,
+ ))
+ }
+
+ /// Parses the next UnionMode
+ fn parse_union_mode(&mut self) -> ArrowResult<UnionMode> {
+ match self.next_token()? {
+ Token::UnionMode(union_mode) => Ok(union_mode),
+ tok => Err(make_error(
+ self.val,
+ &format!("finding UnionMode for Union, got {tok}"),
+ )),
+ }
+ }
+
+ /// Parses the next UnionField
+ /// 0: ("a": nullable Int32)
+ fn parse_union_field(&mut self) -> ArrowResult<(i8, Field)> {
+ let type_id = self.parse_i8("UnionField")?;
+ self.expect_token(Token::Colon)?;
+ self.expect_token(Token::LParen)?;
+ let field = self.parse_field()?;
+ self.expect_token(Token::RParen)?;
+ Ok((type_id, field))
+ }
+
+ /// Parses the next Map (called after `Map` has been consumed)
+ /// E.g: Map("entries": Struct("key": Utf8, "value": nullable Int32),
sorted)
+ fn parse_map(&mut self) -> ArrowResult<DataType> {
+ self.expect_token(Token::LParen)?;
+ let field = self.parse_field()?;
+ self.expect_token(Token::Comma)?;
+ let sorted = self.parse_map_sorted()?;
+ self.expect_token(Token::RParen)?;
+ Ok(DataType::Map(Arc::new(field), sorted))
+ }
+
+ /// Parses map's sorted
+ fn parse_map_sorted(&mut self) -> ArrowResult<bool> {
+ match self.next_token()? {
+ Token::MapSorted(sorted) => Ok(sorted),
+ tok => Err(make_error(
+ self.val,
+ &format!("Expected sorted or unsorted for a map; got {tok:?}"),
+ )),
+ }
+ }
+
+ /// Parses the next RunEndEncoded (called after `RunEndEncoded` has been
consumed)
+ /// E.g: RunEndEncoded("run_ends": UInt32, "values": nullable Int32)
+ fn parse_run_end_encoded(&mut self) -> ArrowResult<DataType> {
+ self.expect_token(Token::LParen)?;
+ let run_ends = self.parse_field()?;
+ self.expect_token(Token::Comma)?;
+ let values = self.parse_field()?;
+ self.expect_token(Token::RParen)?;
+ Ok(DataType::RunEndEncoded(
+ Arc::new(run_ends),
+ Arc::new(values),
+ ))
+ }
+
/// return and consume if the next token is `Token::Nullable`
fn parse_opt_nullable(&mut self) -> bool {
self.tokenizer
@@ -485,7 +563,7 @@ enum QuoteType {
}
#[derive(Debug)]
-/// Splits a strings like Dictionary(Int32, Int64) into tokens sutable for
parsing
+/// Splits a strings like Dictionary(Int32, Int64) into tokens suitable for
parsing
///
/// For example the string "Timestamp(ns)" would be parsed into:
///
@@ -612,6 +690,16 @@ impl<'a> Tokenizer<'a> {
"Struct" => Token::Struct,
+ "Union" => Token::Union,
+ "Sparse" => Token::UnionMode(UnionMode::Sparse),
+ "Dense" => Token::UnionMode(UnionMode::Dense),
+
+ "Map" => Token::Map,
+ "sorted" => Token::MapSorted(true),
+ "unsorted" => Token::MapSorted(false),
+
+ "RunEndEncoded" => Token::RunEndEncoded,
+
token => {
return Err(make_error(self.val, &format!("unknown token:
{token}")));
}
@@ -752,6 +840,11 @@ enum Token {
LargeListView,
FixedSizeList,
Struct,
+ Union,
+ UnionMode(UnionMode),
+ Map,
+ MapSorted(bool),
+ RunEndEncoded,
Nullable,
Field,
X,
@@ -789,6 +882,13 @@ impl Display for Token {
Token::DoubleQuotedString(s) => write!(f,
"DoubleQuotedString({s})"),
Token::SingleQuotedString(s) => write!(f,
"SingleQuotedString({s})"),
Token::Struct => write!(f, "Struct"),
+ Token::Union => write!(f, "Union"),
+ Token::UnionMode(m) => write!(f, "{m:?}"),
+ Token::Map => write!(f, "Map"),
+ Token::MapSorted(sorted) => {
+ write!(f, "{}", if *sorted { "sorted" } else { "unsorted" })
+ }
+ Token::RunEndEncoded => write!(f, "RunEndEncoded"),
Token::Nullable => write!(f, "nullable"),
Token::Field => write!(f, "field"),
Token::X => write!(f, "x"),
@@ -930,6 +1030,7 @@ mod test {
true,
),
])),
+ DataType::Struct(Fields::from(vec![Field::new("f1",
DataType::Int64, true)])),
DataType::Struct(Fields::empty()),
DataType::List(Arc::new(Field::new_list_field(DataType::Int64,
true))),
DataType::List(Arc::new(Field::new_list_field(DataType::Int64,
false))),
@@ -982,7 +1083,80 @@ mod test {
)),
2,
),
- // TODO support more structured types (Union, Map, RunEndEncoded,
etc)
+ DataType::Union(
+ UnionFields::new(
+ vec![0, 1],
+ vec![
+ Field::new("Int32", DataType::Int32, false),
+ Field::new("Utf8", DataType::Utf8, true),
+ ],
+ ),
+ UnionMode::Sparse,
+ ),
+ DataType::Union(
+ UnionFields::new(
+ vec![0, 1],
+ vec![
+ Field::new("Int32", DataType::Int32, false),
+ Field::new("Utf8", DataType::Utf8, true),
+ ],
+ ),
+ UnionMode::Dense,
+ ),
+ DataType::Union(
+ UnionFields::new(
+ vec![0, 1],
+ vec![
+ Field::new_union(
+ "nested_union",
+ vec![0, 1],
+ vec![
+ Field::new("Int32", DataType::Int32, false),
+ Field::new("Utf8", DataType::Utf8, true),
+ ],
+ UnionMode::Dense,
+ ),
+ Field::new("Utf8", DataType::Utf8, true),
+ ],
+ ),
+ UnionMode::Sparse,
+ ),
+ DataType::Union(
+ UnionFields::new(vec![0], vec![Field::new("Int32",
DataType::Int32, false)]),
+ UnionMode::Dense,
+ ),
+ DataType::Union(
+ UnionFields::new(Vec::<i8>::new(), Vec::<Field>::new()),
+ UnionMode::Sparse,
+ ),
+ DataType::Map(Arc::new(Field::new("Int64", DataType::Int64,
true)), true),
+ DataType::Map(Arc::new(Field::new("Int64", DataType::Int64,
true)), false),
+ DataType::Map(
+ Arc::new(Field::new_map(
+ "nested_map",
+ "entries",
+ Field::new("key", DataType::Utf8, false),
+ Field::new("value", DataType::Int32, true),
+ false,
+ true,
+ )),
+ true,
+ ),
+ DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::UInt32, false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ ),
+ DataType::RunEndEncoded(
+ Arc::new(Field::new(
+ "nested_run_end_encoded",
+ DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::UInt32,
false)),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ ),
+ true,
+ )),
+ Arc::new(Field::new("values", DataType::Int32, true)),
+ ),
]
}