This is an automated email from the ASF dual-hosted git repository.
mbrobbel pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 1fd3aaebb7 Quote `DataType::Struct` field names in `Display`
formatting (#8291)
1fd3aaebb7 is described below
commit 1fd3aaebb7395b9bf3f4d745858adc723ae2d8dc
Author: Emil Ernerfeldt <[email protected]>
AuthorDate: Thu Sep 25 10:04:34 2025 +0200
Quote `DataType::Struct` field names in `Display` formatting (#8291)
# Which issue does this PR close?
* Follows https://github.com/apache/arrow-rs/pull/8290 (merge that
first, and the diff of this PR will drop)
* https://github.com/apache/arrow-rs/pull/7469
* Part of https://github.com/apache/arrow-rs/issues/8351
# Rationale for this change
We would previously format structs like so:
`Struct(name1 type1, name2 nullable type2)`
This will break badly whenever the field name is anything but a simple
identifier. In other words: it allows [string
injection](https://xkcd.com/327/) if the field name contains an
end-paranthesis.
Except for that, it is also difficult to debug mistakingly bad field
names like " " or "\n".
# What changes are included in this PR?
We change the `Display` and `Debug` formatting of `Struct`
**Before**: `Struct(name1 type1, name2 nullable type2)`
**After**: `Struct("name1": type1, "name2": nullable type2)`
# Are these changes tested?
Yes - I've updated the existing tests.
# Are there any user-facing changes?
Yes, changing the `Display` formatting is a **breaking change**
---
arrow-cast/src/cast/mod.rs | 8 +-
arrow-json/src/lib.rs | 2 +-
arrow-row/src/lib.rs | 4 +-
arrow-schema/src/datatype_display.rs | 2 +-
arrow-schema/src/datatype_parse.rs | 177 ++++++++++++++++++----------------
arrow-schema/src/schema.rs | 2 +-
parquet/src/arrow/arrow_reader/mod.rs | 4 +-
7 files changed, 103 insertions(+), 96 deletions(-)
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 0330ce9138..71de8f9f18 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -8665,7 +8665,7 @@ mod tests {
};
assert_eq!(
t,
- r#"Casting from Map(Field { "entries": Struct(key Utf8, value
nullable Utf8) }, false) to Map(Field { "entries": Struct(key Utf8, value Utf8)
}, true) not supported"#
+ r#"Casting from Map(Field { "entries": Struct("key": Utf8,
"value": nullable Utf8) }, false) to Map(Field { "entries": Struct("key": Utf8,
"value": Utf8) }, true) not supported"#
);
}
@@ -8716,7 +8716,7 @@ mod tests {
};
assert_eq!(
t,
- r#"Casting from Map(Field { "entries": Struct(key Utf8, value
nullable Interval(DayTime)) }, false) to Map(Field { "entries": Struct(key
Utf8, value Duration(s)) }, true) not supported"#
+ r#"Casting from Map(Field { "entries": Struct("key": Utf8,
"value": nullable Interval(DayTime)) }, false) to Map(Field { "entries":
Struct("key": Utf8, "value": Duration(s)) }, true) not supported"#
);
}
@@ -10805,7 +10805,7 @@ mod tests {
let to_type = DataType::Utf8;
let result = cast(&struct_array, &to_type);
assert_eq!(
- r#"Cast error: Casting from Struct(a Boolean) to Utf8 not
supported"#,
+ r#"Cast error: Casting from Struct("a": Boolean) to Utf8 not
supported"#,
result.unwrap_err().to_string()
);
}
@@ -10816,7 +10816,7 @@ mod tests {
let to_type = DataType::Struct(vec![Field::new("a", DataType::Boolean,
false)].into());
let result = cast(&array, &to_type);
assert_eq!(
- r#"Cast error: Casting from Utf8 to Struct(a Boolean) not
supported"#,
+ r#"Cast error: Casting from Utf8 to Struct("a": Boolean) not
supported"#,
result.unwrap_err().to_string()
);
}
diff --git a/arrow-json/src/lib.rs b/arrow-json/src/lib.rs
index 5a5430fef9..12ad5efa37 100644
--- a/arrow-json/src/lib.rs
+++ b/arrow-json/src/lib.rs
@@ -87,7 +87,7 @@ use serde_json::{Number, Value};
///
/// This enum controls which form(s) the Reader will accept and which form the
/// Writer will produce. For example, if the RecordBatch Schema is
-/// `[("a", Int32), ("r", Struct(b Boolean, c Utf8))]`
+/// `[("a", Int32), ("r", Struct("b": Boolean, "c" Utf8))]`
/// then a Reader with [`StructMode::ObjectOnly`] would read rows of the form
/// `{"a": 1, "r": {"b": true, "c": "cat"}}` while with
['StructMode::ListOnly']
/// would read rows of the form `[1, [true, "cat"]]`. A Writer would produce
diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs
index cdb52a8ee7..69b8a24cc6 100644
--- a/arrow-row/src/lib.rs
+++ b/arrow-row/src/lib.rs
@@ -2292,7 +2292,7 @@ mod tests {
let [s2] = back.try_into().unwrap();
// RowConverter flattens Dictionary
- // s.ty = Struct(foo Dictionary(Int32, Utf8)), s2.ty = Struct(foo Utf8)
+ // s.ty = Struct("foo": Dictionary(Int32, Utf8)), s2.ty =
Struct("foo": Utf8)
assert_ne!(&s.data_type(), &s2.data_type());
s2.to_data().validate_full().unwrap();
@@ -2340,7 +2340,7 @@ mod tests {
let [s2] = back.try_into().unwrap();
// RowConverter flattens Dictionary
- // s.ty = Struct(foo Dictionary(Int32, Int32)), s2.ty = Struct(foo
Int32)
+ // s.ty = Struct("foo": Dictionary(Int32, Int32)), s2.ty =
Struct("foo": Int32)
assert_ne!(&s.data_type(), &s2.data_type());
s2.to_data().validate_full().unwrap();
assert_eq!(s.len(), 0);
diff --git a/arrow-schema/src/datatype_display.rs
b/arrow-schema/src/datatype_display.rs
index f23beb489d..73ceb3f680 100644
--- a/arrow-schema/src/datatype_display.rs
+++ b/arrow-schema/src/datatype_display.rs
@@ -122,7 +122,7 @@ impl fmt::Display for DataType {
let maybe_nullable = if field.is_nullable() {
"nullable " } else { "" };
let data_type = field.data_type();
let metadata_str =
format_metadata(field.metadata());
- format!("{name}
{maybe_nullable}{data_type}{metadata_str}")
+ format!("{name:?}:
{maybe_nullable}{data_type}{metadata_str}")
})
.collect::<Vec<_>>()
.join(", ");
diff --git a/arrow-schema/src/datatype_parse.rs
b/arrow-schema/src/datatype_parse.rs
index f465871ad0..1042784c30 100644
--- a/arrow-schema/src/datatype_parse.rs
+++ b/arrow-schema/src/datatype_parse.rs
@@ -81,9 +81,6 @@ impl<'a> Parser<'a> {
Token::LargeList => self.parse_large_list(),
Token::FixedSizeList => self.parse_fixed_size_list(),
Token::Struct => self.parse_struct(),
- Token::FieldName(word) => {
- Err(make_error(self.val, &format!("unrecognized word:
{word}")))
- }
tok => Err(make_error(
self.val,
&format!("finding next type, got unexpected '{tok}'"),
@@ -137,15 +134,14 @@ impl<'a> Parser<'a> {
/// Parses the next double quoted string
fn parse_double_quoted_string(&mut self, context: &str) ->
ArrowResult<String> {
- match self.next_token()? {
- Token::DoubleQuotedString(s) => Ok(s),
- Token::FieldName(word) => {
- Err(make_error(self.val, &format!("unrecognized word:
{word}")))
- }
- tok => Err(make_error(
+ let token = self.next_token()?;
+ if let Token::DoubleQuotedString(string) = token {
+ Ok(string)
+ } else {
+ Err(make_error(
self.val,
- &format!("finding double quoted string for {context}, got
'{tok}'"),
- )),
+ &format!("expected double quoted string for {context}, got
'{token}'"),
+ ))
}
}
@@ -321,27 +317,22 @@ impl<'a> Parser<'a> {
self.expect_token(Token::LParen)?;
let mut fields = Vec::new();
loop {
+ // expects: "field name": [nullable] #datatype
+
let field_name = match self.next_token()? {
- // It's valid to have a name that is a type name
- Token::SimpleType(data_type) => data_type.to_string(),
- Token::FieldName(name) => name,
Token::RParen => {
- if fields.is_empty() {
- break;
- } else {
- return Err(make_error(
- self.val,
- "Unexpected token while parsing Struct fields.
Expected a word for the name of Struct, but got trailing comma",
- ));
- }
+ break;
}
+ Token::DoubleQuotedString(field_name) => field_name,
tok => {
return Err(make_error(
self.val,
- &format!("Expected a word for the name of Struct, but
got {tok}"),
+ &format!("Expected a quoted string for a field name;
got {tok:?}"),
))
}
};
+ self.expect_token(Token::Colon)?;
+
let nullable = self
.tokenizer
.next_if(|next| matches!(next, Ok(Token::Nullable)))
@@ -383,7 +374,7 @@ impl<'a> Parser<'a> {
/// returns true if this character is a separator
fn is_separator(c: char) -> bool {
- c == '(' || c == ')' || c == ',' || c == ' '
+ c == '(' || c == ')' || c == ',' || c == ':' || c == ' '
}
#[derive(Debug)]
@@ -445,50 +436,6 @@ impl<'a> Tokenizer<'a> {
})?;
return Ok(Token::Integer(val));
}
- // if it started with a double quote `"`, try parsing it as a
double quoted string
- else if c == '"' {
- let len = self.word.chars().count();
-
- // to verify it's double quoted
- if let Some(last_c) = self.word.chars().last() {
- if last_c != '"' || len < 2 {
- return Err(make_error(
- self.val,
- &format!(
- "parsing {} as double quoted string: last char
must be \"",
- self.word
- ),
- ));
- }
- }
-
- if len == 2 {
- return Err(make_error(
- self.val,
- &format!(
- "parsing {} as double quoted string: empty string
isn't supported",
- self.word
- ),
- ));
- }
-
- let val: String = self.word.parse().map_err(|e| {
- make_error(
- self.val,
- &format!("parsing {} as double quoted string: {e}",
self.word),
- )
- })?;
-
- let s = val[1..len - 1].to_string();
- if s.contains('"') {
- return Err(make_error(
- self.val,
- &format!("parsing {} as double quoted string: escaped
double quote isn't supported", self.word),
- ));
- }
-
- return Ok(Token::DoubleQuotedString(s));
- }
}
// figure out what the word was
@@ -554,11 +501,63 @@ impl<'a> Tokenizer<'a> {
"Struct" => Token::Struct,
- // If we don't recognize the word, treat it as a field name
- word => Token::FieldName(word.to_string()),
+ token => {
+ return Err(make_error(self.val, &format!("unknown token:
{token}")));
+ }
};
Ok(token)
}
+
+ /// Parses e.g. `"foo bar"`
+ fn parse_quoted_string(&mut self) -> ArrowResult<Token> {
+ if self.next_char() != Some('\"') {
+ return Err(make_error(self.val, "Expected \""));
+ }
+
+ // reset temp space
+ self.word.clear();
+
+ let mut is_escaped = false;
+
+ loop {
+ match self.next_char() {
+ None => {
+ return Err(ArrowError::ParseError(format!(
+ "Unterminated string at: \"{}",
+ self.word
+ )));
+ }
+ Some(c) => match c {
+ '\\' => {
+ is_escaped = true;
+ self.word.push(c);
+ }
+ '"' => {
+ if is_escaped {
+ self.word.push(c);
+ is_escaped = false;
+ } else {
+ break;
+ }
+ }
+ c => {
+ self.word.push(c);
+ }
+ },
+ }
+ }
+
+ let val: String = self.word.parse().map_err(|err| {
+ ArrowError::ParseError(format!("Failed to parse string: \"{}\":
{err}", self.word))
+ })?;
+
+ if val.is_empty() {
+ // Using empty strings as field names is just asking for trouble
+ return Err(make_error(self.val, "empty strings aren't allowed"));
+ }
+
+ Ok(Token::DoubleQuotedString(val))
+ }
}
impl Iterator for Tokenizer<'_> {
@@ -572,6 +571,9 @@ impl Iterator for Tokenizer<'_> {
self.next_char();
continue;
}
+ '"' => {
+ return Some(self.parse_quoted_string());
+ }
'(' => {
self.next_char();
return Some(Ok(Token::LParen));
@@ -584,6 +586,10 @@ impl Iterator for Tokenizer<'_> {
self.next_char();
return Some(Ok(Token::Comma));
}
+ ':' => {
+ self.next_char();
+ return Some(Ok(Token::Colon));
+ }
_ => return Some(self.parse_word()),
}
}
@@ -612,6 +618,7 @@ enum Token {
LParen,
RParen,
Comma,
+ Colon,
Some,
None,
Integer(i64),
@@ -621,7 +628,6 @@ enum Token {
FixedSizeList,
Struct,
Nullable,
- FieldName(String),
}
impl Display for Token {
@@ -641,6 +647,7 @@ impl Display for Token {
Token::LParen => write!(f, "("),
Token::RParen => write!(f, ")"),
Token::Comma => write!(f, ","),
+ Token::Colon => write!(f, ":"),
Token::Some => write!(f, "Some"),
Token::None => write!(f, "None"),
Token::FixedSizeBinary => write!(f, "FixedSizeBinary"),
@@ -653,7 +660,6 @@ impl Display for Token {
Token::DoubleQuotedString(s) => write!(f,
"DoubleQuotedString({s})"),
Token::Struct => write!(f, "Struct"),
Token::Nullable => write!(f, "nullable"),
- Token::FieldName(s) => write!(f, "FieldName({s})"),
}
}
}
@@ -837,19 +843,19 @@ mod test {
("Nu", "Unsupported type 'Nu'"),
(
r#"Timestamp(ns, +00:00)"#,
- "Error unrecognized word: +00:00",
+ "Error unknown token: +00",
),
(
r#"Timestamp(ns, "+00:00)"#,
- r#"parsing "+00:00 as double quoted string: last char must be
""#,
+ r#"Unterminated string at: "+00:00)"#,
),
(
r#"Timestamp(ns, "")"#,
- r#"parsing "" as double quoted string: empty string isn't
supported"#,
+ r#"empty strings aren't allowed"#,
),
(
r#"Timestamp(ns, "+00:00"")"#,
- r#"parsing "+00:00"" as double quoted string: escaped double
quote isn't supported"#,
+ r#"Parser error: Unterminated string at: ")"#,
),
("Timestamp(ns, ", "Error finding next token"),
(
@@ -871,9 +877,9 @@ mod test {
("Decimal64(3, 500)", "Error converting 500 into i8 for Decimal64:
out of range integral type conversion attempted"),
("Decimal128(3, 500)", "Error converting 500 into i8 for
Decimal128: out of range integral type conversion attempted"),
("Decimal256(3, 500)", "Error converting 500 into i8 for
Decimal256: out of range integral type conversion attempted"),
- ("Struct(f1, Int64)", "Error finding next type, got unexpected
','"),
- ("Struct(f1 Int64,)", "Expected a word for the name of Struct, but
got trailing comma"),
- ("Struct(f1)", "Error finding next type, got unexpected ')'"),
+ ("Struct(f1 Int64)", "Error unknown token: f1"),
+ ("Struct(\"f1\" Int64)", "Expected ':'"),
+ ("Struct(\"f1\": )", "Error finding next type, got unexpected
')'"),
];
for (data_type_string, expected_message) in cases {
@@ -884,12 +890,13 @@ mod test {
let message = e.to_string();
assert!(
message.contains(expected_message),
- "\n\ndid not find expected in actual.\n\nexpected:
{expected_message}\nactual:{message}\n"
+ "\n\ndid not find expected in actual.\n\nexpected:
{expected_message}\nactual: {message}\n"
);
- // errors should also contain a help message
- assert!(message.contains(
- "Must be a supported arrow type name such as 'Int32'
or 'Timestamp(ns)'"
- ));
+
+ if !message.contains("Unterminated string") {
+ // errors should also contain a help message
+ assert!(message.contains("Must be a supported arrow
type name such as 'Int32' or 'Timestamp(ns)'"), "message: {message}");
+ }
}
}
}
@@ -899,6 +906,6 @@ mod test {
fn parse_error_type() {
let err = parse_data_type("foobar").unwrap_err();
assert!(matches!(err, ArrowError::ParseError(_)));
- assert_eq!(err.to_string(), "Parser error: Unsupported type 'foobar'.
Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error
unrecognized word: foobar");
+ assert_eq!(err.to_string(), "Parser error: Unsupported type 'foobar'.
Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error
unknown token: foobar");
}
}
diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs
index dcb1b6183b..37545a8eed 100644
--- a/arrow-schema/src/schema.rs
+++ b/arrow-schema/src/schema.rs
@@ -701,7 +701,7 @@ mod tests {
schema.to_string(),
"Field { \"first_name\": Utf8, metadata: {\"k\": \"v\"} }, \
Field { \"last_name\": Utf8 }, \
- Field { \"address\": Struct(street Utf8, zip UInt16) }, \
+ Field { \"address\": Struct(\"street\": Utf8, \"zip\": UInt16) },
\
Field { \"interests\": nullable Dictionary(Int32, Utf8), dict_id:
123, dict_is_ordered }"
)
}
diff --git a/parquet/src/arrow/arrow_reader/mod.rs
b/parquet/src/arrow/arrow_reader/mod.rs
index 0a5a7d0969..8a7e2ef709 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -3677,8 +3677,8 @@ mod tests {
),
])),
"Arrow: Incompatible supplied Arrow schema: data type mismatch for
field nested: \
- requested Struct(nested1_valid Utf8, nested1_invalid Int32) \
- but found Struct(nested1_valid Utf8, nested1_invalid Int64)",
+ requested Struct(\"nested1_valid\": Utf8, \"nested1_invalid\":
Int32) \
+ but found Struct(\"nested1_valid\": Utf8, \"nested1_invalid\":
Int64)",
);
}