This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new cd61ead4f8 feat: parse DataType `List`, `ListView`, `LargeList`,
`LargeListView`, `FixedSizeList` (#8649)
cd61ead4f8 is described below
commit cd61ead4f8f4365e9406f7f3d1056d195159e748
Author: Khanh Duong <[email protected]>
AuthorDate: Fri Oct 31 03:59:42 2025 +0900
feat: parse DataType `List`, `ListView`, `LargeList`, `LargeListView`,
`FixedSizeList` (#8649)
# Which issue does this PR close?
- Part of #8648.
This PR only implements parsing for the list types, to make review easier.
# Rationale for this change
The format for `DataType::List` includes:
- [x] `List(Int64)`: list with a non-nullable item field.
- [x] `List(nullable Int64)`: list with a nullable item field.
- [x] `List(nullable Int64, field: 'foo')`: list with a nullable item field named `foo`.
~`List(nullable Int64, metadata: {"foo1": "value1"})`: list with
metadata.~
(... The list goes on for `ListView`, `LargeList`, `LargeListView`,
`FixedSizeList`)
`parse_data_type` either fails on, or incorrectly parses, the data types
listed above.
# What changes are included in this PR?
- Add `Token::...` variants to support the new `Display` format for list types
introduced in #8351
(e.g. `FixedSizeList(5 x nullable Int64, field: 'foo')`).
- Add `fn parse_opt_nullable` to check whether the nested data type is nullable.
- Add `parse_single_quoted_string` and `parse_list_field_name` to handle
`field: 'foo'`.
# Are these changes tested?
Yes. Added round trip tests.
# Are there any user-facing changes?
Yes. This is related to #8351
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-schema/src/datatype_parse.rs | 196 +++++++++++++++++++++++++++++++++----
1 file changed, 175 insertions(+), 21 deletions(-)
diff --git a/arrow-schema/src/datatype_parse.rs
b/arrow-schema/src/datatype_parse.rs
index 48b7089e8e..56d8fb56a5 100644
--- a/arrow-schema/src/datatype_parse.rs
+++ b/arrow-schema/src/datatype_parse.rs
@@ -83,7 +83,9 @@ impl<'a> Parser<'a> {
Token::Decimal256 => self.parse_decimal_256(),
Token::Dictionary => self.parse_dictionary(),
Token::List => self.parse_list(),
+ Token::ListView => self.parse_list_view(),
Token::LargeList => self.parse_large_list(),
+ Token::LargeListView => self.parse_large_list_view(),
Token::FixedSizeList => self.parse_fixed_size_list(),
Token::Struct => self.parse_struct(),
tok => Err(make_error(
@@ -93,35 +95,87 @@ impl<'a> Parser<'a> {
}
}
- /// Parses the List type
+ /// Parses list field name. Returns default field name if not found.
+ fn parse_list_field_name(&mut self, context: &str) -> ArrowResult<String> {
+ // field must be after a comma
+ if self
+ .tokenizer
+ .next_if(|next| matches!(next, Ok(Token::Comma)))
+ .is_none()
+ {
+ return Ok(Field::LIST_FIELD_DEFAULT_NAME.into());
+ }
+
+ // expects: `field: 'field_name'`.
+ self.expect_token(Token::Field)?;
+ self.expect_token(Token::Colon)?;
+ self.parse_single_quoted_string(context)
+ }
+
+ /// Parses the List type (called after `List` has been consumed)
+ /// E.g: List(nullable Int64, field: 'foo')
fn parse_list(&mut self) -> ArrowResult<DataType> {
self.expect_token(Token::LParen)?;
+ let nullable = self.parse_opt_nullable();
+ let data_type = self.parse_next_type()?;
+ let field = self.parse_list_field_name("List")?;
+ self.expect_token(Token::RParen)?;
+ Ok(DataType::List(Arc::new(Field::new(
+ field, data_type, nullable,
+ ))))
+ }
+
+ /// Parses the ListView type (called after `ListView` has been consumed)
+ /// E.g: ListView(nullable Int64, field: 'foo')
+ fn parse_list_view(&mut self) -> ArrowResult<DataType> {
+ self.expect_token(Token::LParen)?;
+ let nullable = self.parse_opt_nullable();
let data_type = self.parse_next_type()?;
+ let field = self.parse_list_field_name("ListView")?;
self.expect_token(Token::RParen)?;
- Ok(DataType::List(Arc::new(Field::new_list_field(
- data_type, true,
+ Ok(DataType::ListView(Arc::new(Field::new(
+ field, data_type, nullable,
))))
}
- /// Parses the LargeList type
+ /// Parses the LargeList type (called after `LargeList` has been consumed)
+ /// E.g: LargeList(nullable Int64, field: 'foo')
fn parse_large_list(&mut self) -> ArrowResult<DataType> {
self.expect_token(Token::LParen)?;
+ let nullable = self.parse_opt_nullable();
let data_type = self.parse_next_type()?;
+ let field = self.parse_list_field_name("LargeList")?;
self.expect_token(Token::RParen)?;
- Ok(DataType::LargeList(Arc::new(Field::new_list_field(
- data_type, true,
+ Ok(DataType::LargeList(Arc::new(Field::new(
+ field, data_type, nullable,
))))
}
- /// Parses the FixedSizeList type
+ /// Parses the LargeListView type (called after `LargeListView` has been
consumed)
+ /// E.g: LargeListView(nullable Int64, field: 'foo')
+ fn parse_large_list_view(&mut self) -> ArrowResult<DataType> {
+ self.expect_token(Token::LParen)?;
+ let nullable = self.parse_opt_nullable();
+ let data_type = self.parse_next_type()?;
+ let field = self.parse_list_field_name("LargeListView")?;
+ self.expect_token(Token::RParen)?;
+ Ok(DataType::LargeListView(Arc::new(Field::new(
+ field, data_type, nullable,
+ ))))
+ }
+
+ /// Parses the FixedSizeList type (called after `FixedSizeList` has been
consumed)
+ /// E.g: FixedSizeList(5 x nullable Int64, field: 'foo')
fn parse_fixed_size_list(&mut self) -> ArrowResult<DataType> {
self.expect_token(Token::LParen)?;
let length = self.parse_i32("FixedSizeList")?;
- self.expect_token(Token::Comma)?;
+ self.expect_token(Token::X)?;
+ let nullable = self.parse_opt_nullable();
let data_type = self.parse_next_type()?;
+ let field = self.parse_list_field_name("FixedSizeList")?;
self.expect_token(Token::RParen)?;
Ok(DataType::FixedSizeList(
- Arc::new(Field::new_list_field(data_type, true)),
+ Arc::new(Field::new(field, data_type, nullable)),
length,
))
}
@@ -150,6 +204,19 @@ impl<'a> Parser<'a> {
}
}
+ /// Parses the next single quoted string
+ fn parse_single_quoted_string(&mut self, context: &str) ->
ArrowResult<String> {
+ let token = self.next_token()?;
+ if let Token::SingleQuotedString(string) = token {
+ Ok(string)
+ } else {
+ Err(make_error(
+ self.val,
+ &format!("expected single quoted string for {context}, got
'{token}'"),
+ ))
+ }
+ }
+
/// Parses the next integer value
fn parse_i64(&mut self, context: &str) -> ArrowResult<i64> {
match self.next_token()? {
@@ -340,6 +407,8 @@ impl<'a> Parser<'a> {
Box::new(value_type),
))
}
+
+ /// Parses the next Struct (called after `Struct` has been consumed)
fn parse_struct(&mut self) -> ArrowResult<DataType> {
self.expect_token(Token::LParen)?;
let mut fields = Vec::new();
@@ -354,16 +423,13 @@ impl<'a> Parser<'a> {
tok => {
return Err(make_error(
self.val,
- &format!("Expected a quoted string for a field name;
got {tok:?}"),
+ &format!("Expected a double quoted string for a field
name; got {tok:?}"),
));
}
};
self.expect_token(Token::Colon)?;
- let nullable = self
- .tokenizer
- .next_if(|next| matches!(next, Ok(Token::Nullable)))
- .is_some();
+ let nullable = self.parse_opt_nullable();
let field_type = self.parse_next_type()?;
fields.push(Arc::new(Field::new(field_name, field_type,
nullable)));
match self.next_token()? {
@@ -382,6 +448,13 @@ impl<'a> Parser<'a> {
Ok(DataType::Struct(Fields::from(fields)))
}
+ /// return and consume if the next token is `Token::Nullable`
+ fn parse_opt_nullable(&mut self) -> bool {
+ self.tokenizer
+ .next_if(|next| matches!(next, Ok(Token::Nullable)))
+ .is_some()
+ }
+
/// return the next token, or an error if there are none left
fn next_token(&mut self) -> ArrowResult<Token> {
match self.tokenizer.next() {
@@ -406,6 +479,11 @@ fn is_separator(c: char) -> bool {
c == '(' || c == ')' || c == ',' || c == ':' || c == ' '
}
+enum QuoteType {
+ Double,
+ Single,
+}
+
#[derive(Debug)]
/// Splits a strings like Dictionary(Int32, Int64) into tokens sutable for
parsing
///
@@ -497,7 +575,9 @@ impl<'a> Tokenizer<'a> {
"Date64" => Token::SimpleType(DataType::Date64),
"List" => Token::List,
+ "ListView" => Token::ListView,
"LargeList" => Token::LargeList,
+ "LargeListView" => Token::LargeListView,
"FixedSizeList" => Token::FixedSizeList,
"s" | "Second" => Token::TimeUnit(TimeUnit::Second),
@@ -527,6 +607,8 @@ impl<'a> Tokenizer<'a> {
"None" => Token::None,
"nullable" => Token::Nullable,
+ "field" => Token::Field,
+ "x" => Token::X,
"Struct" => Token::Struct,
@@ -537,9 +619,14 @@ impl<'a> Tokenizer<'a> {
Ok(token)
}
- /// Parses e.g. `"foo bar"`
- fn parse_quoted_string(&mut self) -> ArrowResult<Token> {
- if self.next_char() != Some('\"') {
+ /// Parses e.g. `"foo bar"`, `'foo bar'`
+ fn parse_quoted_string(&mut self, quote_type: QuoteType) ->
ArrowResult<Token> {
+ let quote = match quote_type {
+ QuoteType::Double => '\"',
+ QuoteType::Single => '\'',
+ };
+
+ if self.next_char() != Some(quote) {
return Err(make_error(self.val, "Expected \""));
}
@@ -561,7 +648,7 @@ impl<'a> Tokenizer<'a> {
is_escaped = true;
self.word.push(c);
}
- '"' => {
+ c if c == quote => {
if is_escaped {
self.word.push(c);
is_escaped = false;
@@ -585,7 +672,10 @@ impl<'a> Tokenizer<'a> {
return Err(make_error(self.val, "empty strings aren't allowed"));
}
- Ok(Token::DoubleQuotedString(val))
+ match quote_type {
+ QuoteType::Double => Ok(Token::DoubleQuotedString(val)),
+ QuoteType::Single => Ok(Token::SingleQuotedString(val)),
+ }
}
}
@@ -601,7 +691,10 @@ impl Iterator for Tokenizer<'_> {
continue;
}
'"' => {
- return Some(self.parse_quoted_string());
+ return Some(self.parse_quoted_string(QuoteType::Double));
+ }
+ '\'' => {
+ return Some(self.parse_quoted_string(QuoteType::Single));
}
'(' => {
self.next_char();
@@ -652,11 +745,16 @@ enum Token {
None,
Integer(i64),
DoubleQuotedString(String),
+ SingleQuotedString(String),
List,
+ ListView,
LargeList,
+ LargeListView,
FixedSizeList,
Struct,
Nullable,
+ Field,
+ X,
}
impl Display for Token {
@@ -664,7 +762,9 @@ impl Display for Token {
match self {
Token::SimpleType(t) => write!(f, "{t}"),
Token::List => write!(f, "List"),
+ Token::ListView => write!(f, "ListView"),
Token::LargeList => write!(f, "LargeList"),
+ Token::LargeListView => write!(f, "LargeListView"),
Token::FixedSizeList => write!(f, "FixedSizeList"),
Token::Timestamp => write!(f, "Timestamp"),
Token::Time32 => write!(f, "Time32"),
@@ -687,8 +787,11 @@ impl Display for Token {
Token::Dictionary => write!(f, "Dictionary"),
Token::Integer(v) => write!(f, "Integer({v})"),
Token::DoubleQuotedString(s) => write!(f,
"DoubleQuotedString({s})"),
+ Token::SingleQuotedString(s) => write!(f,
"SingleQuotedString({s})"),
Token::Struct => write!(f, "Struct"),
Token::Nullable => write!(f, "nullable"),
+ Token::Field => write!(f, "field"),
+ Token::X => write!(f, "x"),
}
}
}
@@ -828,7 +931,58 @@ mod test {
),
])),
DataType::Struct(Fields::empty()),
- // TODO support more structured types (List, LargeList, Union,
Map, RunEndEncoded, etc)
+ DataType::List(Arc::new(Field::new_list_field(DataType::Int64,
true))),
+ DataType::List(Arc::new(Field::new_list_field(DataType::Int64,
false))),
+ DataType::List(Arc::new(Field::new("Int64", DataType::Int64,
true))),
+ DataType::List(Arc::new(Field::new("Int64", DataType::Int64,
false))),
+ DataType::List(Arc::new(Field::new(
+ "nested_list",
+ DataType::List(Arc::new(Field::new("Int64", DataType::Int64,
true))),
+ true,
+ ))),
+ DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64,
true))),
+ DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64,
false))),
+ DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64,
true))),
+ DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64,
false))),
+ DataType::ListView(Arc::new(Field::new(
+ "nested_list_view",
+ DataType::ListView(Arc::new(Field::new("Int64",
DataType::Int64, true))),
+ true,
+ ))),
+
DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, true))),
+
DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, false))),
+ DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64,
true))),
+ DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64,
false))),
+ DataType::LargeList(Arc::new(Field::new(
+ "nested_large_list",
+ DataType::LargeList(Arc::new(Field::new("Int64",
DataType::Int64, true))),
+ true,
+ ))),
+
DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, true))),
+
DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64,
false))),
+ DataType::LargeListView(Arc::new(Field::new("Int64",
DataType::Int64, true))),
+ DataType::LargeListView(Arc::new(Field::new("Int64",
DataType::Int64, false))),
+ DataType::LargeListView(Arc::new(Field::new(
+ "nested_large_list_view",
+ DataType::LargeListView(Arc::new(Field::new("Int64",
DataType::Int64, true))),
+ true,
+ ))),
+
DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, true)),
2),
+
DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64,
false)), 2),
+ DataType::FixedSizeList(Arc::new(Field::new("Int64",
DataType::Int64, true)), 2),
+ DataType::FixedSizeList(Arc::new(Field::new("Int64",
DataType::Int64, false)), 2),
+ DataType::FixedSizeList(
+ Arc::new(Field::new(
+ "nested_fixed_size_list",
+ DataType::FixedSizeList(
+ Arc::new(Field::new("Int64", DataType::Int64, true)),
+ 2,
+ ),
+ true,
+ )),
+ 2,
+ ),
+ // TODO support more structured types (Union, Map, RunEndEncoded,
etc)
]
}