(arrow-rs) branch main updated: Quote `DataType::Struct` field names in `Display` formatting (#8291)

mbrobbel Sat, 18 Oct 2025 05:50:14 -0700

This is an automated email from the ASF dual-hosted git repository.

mbrobbel pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/main by this push:
     new 1fd3aaebb7 Quote `DataType::Struct` field names in `Display` 
formatting (#8291)
1fd3aaebb7 is described below

commit 1fd3aaebb7395b9bf3f4d745858adc723ae2d8dc
Author: Emil Ernerfeldt <[email protected]>
AuthorDate: Thu Sep 25 10:04:34 2025 +0200

    Quote `DataType::Struct` field names in `Display` formatting (#8291)
    
    # Which issue does this PR close?
    * Follows https://github.com/apache/arrow-rs/pull/8290 (merge that
    first, and the diff of this PR will drop)
    * https://github.com/apache/arrow-rs/pull/7469
    * Part of https://github.com/apache/arrow-rs/issues/8351
    
    # Rationale for this change
    We would previously format structs like so:
    
    `Struct(name1 type1, name2 nullable type2)`
    
    This will break badly whenever the field name is anything but a simple
    identifier. In other words: it allows [string
    injection](https://xkcd.com/327/) if the field name contains an
    end-paranthesis.
    
    Except for that, it is also difficult to debug mistakingly bad field
    names like " " or "\n".
    
    # What changes are included in this PR?
    We change the `Display` and `Debug` formatting of `Struct`
    
    **Before**: `Struct(name1 type1, name2 nullable type2)`
    **After**: `Struct("name1": type1, "name2": nullable type2)`
    
    # Are these changes tested?
    Yes - I've updated the existing tests.
    
    # Are there any user-facing changes?
    Yes, changing the `Display` formatting is a **breaking change**
---
 arrow-cast/src/cast/mod.rs            |   8 +-
 arrow-json/src/lib.rs                 |   2 +-
 arrow-row/src/lib.rs                  |   4 +-
 arrow-schema/src/datatype_display.rs  |   2 +-
 arrow-schema/src/datatype_parse.rs    | 177 ++++++++++++++++++----------------
 arrow-schema/src/schema.rs            |   2 +-
 parquet/src/arrow/arrow_reader/mod.rs |   4 +-
 7 files changed, 103 insertions(+), 96 deletions(-)

diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 0330ce9138..71de8f9f18 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -8665,7 +8665,7 @@ mod tests {
         };
         assert_eq!(
             t,
-            r#"Casting from Map(Field { "entries": Struct(key Utf8, value 
nullable Utf8) }, false) to Map(Field { "entries": Struct(key Utf8, value Utf8) 
}, true) not supported"#
+            r#"Casting from Map(Field { "entries": Struct("key": Utf8, 
"value": nullable Utf8) }, false) to Map(Field { "entries": Struct("key": Utf8, 
"value": Utf8) }, true) not supported"#
         );
     }
 
@@ -8716,7 +8716,7 @@ mod tests {
         };
         assert_eq!(
             t,
-            r#"Casting from Map(Field { "entries": Struct(key Utf8, value 
nullable Interval(DayTime)) }, false) to Map(Field { "entries": Struct(key 
Utf8, value Duration(s)) }, true) not supported"#
+            r#"Casting from Map(Field { "entries": Struct("key": Utf8, 
"value": nullable Interval(DayTime)) }, false) to Map(Field { "entries": 
Struct("key": Utf8, "value": Duration(s)) }, true) not supported"#
         );
     }
 
@@ -10805,7 +10805,7 @@ mod tests {
         let to_type = DataType::Utf8;
         let result = cast(&struct_array, &to_type);
         assert_eq!(
-            r#"Cast error: Casting from Struct(a Boolean) to Utf8 not 
supported"#,
+            r#"Cast error: Casting from Struct("a": Boolean) to Utf8 not 
supported"#,
             result.unwrap_err().to_string()
         );
     }
@@ -10816,7 +10816,7 @@ mod tests {
         let to_type = DataType::Struct(vec![Field::new("a", DataType::Boolean, 
false)].into());
         let result = cast(&array, &to_type);
         assert_eq!(
-            r#"Cast error: Casting from Utf8 to Struct(a Boolean) not 
supported"#,
+            r#"Cast error: Casting from Utf8 to Struct("a": Boolean) not 
supported"#,
             result.unwrap_err().to_string()
         );
     }
diff --git a/arrow-json/src/lib.rs b/arrow-json/src/lib.rs
index 5a5430fef9..12ad5efa37 100644
--- a/arrow-json/src/lib.rs
+++ b/arrow-json/src/lib.rs
@@ -87,7 +87,7 @@ use serde_json::{Number, Value};
 ///
 /// This enum controls which form(s) the Reader will accept and which form the
 /// Writer will produce. For example, if the RecordBatch Schema is
-/// `[("a", Int32), ("r", Struct(b Boolean, c Utf8))]`
+/// `[("a", Int32), ("r", Struct("b": Boolean, "c" Utf8))]`
 /// then a Reader with [`StructMode::ObjectOnly`] would read rows of the form
 /// `{"a": 1, "r": {"b": true, "c": "cat"}}` while with 
['StructMode::ListOnly']
 /// would read rows of the form `[1, [true, "cat"]]`. A Writer would produce
diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs
index cdb52a8ee7..69b8a24cc6 100644
--- a/arrow-row/src/lib.rs
+++ b/arrow-row/src/lib.rs
@@ -2292,7 +2292,7 @@ mod tests {
         let [s2] = back.try_into().unwrap();
 
         // RowConverter flattens Dictionary
-        // s.ty = Struct(foo Dictionary(Int32, Utf8)), s2.ty = Struct(foo Utf8)
+        // s.ty = Struct("foo": Dictionary(Int32, Utf8)), s2.ty = 
Struct("foo": Utf8)
         assert_ne!(&s.data_type(), &s2.data_type());
         s2.to_data().validate_full().unwrap();
 
@@ -2340,7 +2340,7 @@ mod tests {
         let [s2] = back.try_into().unwrap();
 
         // RowConverter flattens Dictionary
-        // s.ty = Struct(foo Dictionary(Int32, Int32)), s2.ty = Struct(foo 
Int32)
+        // s.ty = Struct("foo": Dictionary(Int32, Int32)), s2.ty = 
Struct("foo": Int32)
         assert_ne!(&s.data_type(), &s2.data_type());
         s2.to_data().validate_full().unwrap();
         assert_eq!(s.len(), 0);
diff --git a/arrow-schema/src/datatype_display.rs 
b/arrow-schema/src/datatype_display.rs
index f23beb489d..73ceb3f680 100644
--- a/arrow-schema/src/datatype_display.rs
+++ b/arrow-schema/src/datatype_display.rs
@@ -122,7 +122,7 @@ impl fmt::Display for DataType {
                             let maybe_nullable = if field.is_nullable() { 
"nullable " } else { "" };
                             let data_type = field.data_type();
                             let metadata_str = 
format_metadata(field.metadata());
-                            format!("{name} 
{maybe_nullable}{data_type}{metadata_str}")
+                            format!("{name:?}: 
{maybe_nullable}{data_type}{metadata_str}")
                         })
                         .collect::<Vec<_>>()
                         .join(", ");
diff --git a/arrow-schema/src/datatype_parse.rs 
b/arrow-schema/src/datatype_parse.rs
index f465871ad0..1042784c30 100644
--- a/arrow-schema/src/datatype_parse.rs
+++ b/arrow-schema/src/datatype_parse.rs
@@ -81,9 +81,6 @@ impl<'a> Parser<'a> {
             Token::LargeList => self.parse_large_list(),
             Token::FixedSizeList => self.parse_fixed_size_list(),
             Token::Struct => self.parse_struct(),
-            Token::FieldName(word) => {
-                Err(make_error(self.val, &format!("unrecognized word: 
{word}")))
-            }
             tok => Err(make_error(
                 self.val,
                 &format!("finding next type, got unexpected '{tok}'"),
@@ -137,15 +134,14 @@ impl<'a> Parser<'a> {
 
     /// Parses the next double quoted string
     fn parse_double_quoted_string(&mut self, context: &str) -> 
ArrowResult<String> {
-        match self.next_token()? {
-            Token::DoubleQuotedString(s) => Ok(s),
-            Token::FieldName(word) => {
-                Err(make_error(self.val, &format!("unrecognized word: 
{word}")))
-            }
-            tok => Err(make_error(
+        let token = self.next_token()?;
+        if let Token::DoubleQuotedString(string) = token {
+            Ok(string)
+        } else {
+            Err(make_error(
                 self.val,
-                &format!("finding double quoted string for {context}, got 
'{tok}'"),
-            )),
+                &format!("expected double quoted string for {context}, got 
'{token}'"),
+            ))
         }
     }
 
@@ -321,27 +317,22 @@ impl<'a> Parser<'a> {
         self.expect_token(Token::LParen)?;
         let mut fields = Vec::new();
         loop {
+            // expects:   "field name": [nullable] #datatype
+
             let field_name = match self.next_token()? {
-                // It's valid to have a name that is a type name
-                Token::SimpleType(data_type) => data_type.to_string(),
-                Token::FieldName(name) => name,
                 Token::RParen => {
-                    if fields.is_empty() {
-                        break;
-                    } else {
-                        return Err(make_error(
-                            self.val,
-                            "Unexpected token while parsing Struct fields. 
Expected a word for the name of Struct, but got trailing comma",
-                        ));
-                    }
+                    break;
                 }
+                Token::DoubleQuotedString(field_name) => field_name,
                 tok => {
                     return Err(make_error(
                         self.val,
-                        &format!("Expected a word for the name of Struct, but 
got {tok}"),
+                        &format!("Expected a quoted string for a field name; 
got {tok:?}"),
                     ))
                 }
             };
+            self.expect_token(Token::Colon)?;
+
             let nullable = self
                 .tokenizer
                 .next_if(|next| matches!(next, Ok(Token::Nullable)))
@@ -383,7 +374,7 @@ impl<'a> Parser<'a> {
 
 /// returns true if this character is a separator
 fn is_separator(c: char) -> bool {
-    c == '(' || c == ')' || c == ',' || c == ' '
+    c == '(' || c == ')' || c == ',' || c == ':' || c == ' '
 }
 
 #[derive(Debug)]
@@ -445,50 +436,6 @@ impl<'a> Tokenizer<'a> {
                 })?;
                 return Ok(Token::Integer(val));
             }
-            // if it started with a double quote `"`, try parsing it as a 
double quoted string
-            else if c == '"' {
-                let len = self.word.chars().count();
-
-                // to verify it's double quoted
-                if let Some(last_c) = self.word.chars().last() {
-                    if last_c != '"' || len < 2 {
-                        return Err(make_error(
-                            self.val,
-                            &format!(
-                                "parsing {} as double quoted string: last char 
must be \"",
-                                self.word
-                            ),
-                        ));
-                    }
-                }
-
-                if len == 2 {
-                    return Err(make_error(
-                        self.val,
-                        &format!(
-                            "parsing {} as double quoted string: empty string 
isn't supported",
-                            self.word
-                        ),
-                    ));
-                }
-
-                let val: String = self.word.parse().map_err(|e| {
-                    make_error(
-                        self.val,
-                        &format!("parsing {} as double quoted string: {e}", 
self.word),
-                    )
-                })?;
-
-                let s = val[1..len - 1].to_string();
-                if s.contains('"') {
-                    return Err(make_error(
-                        self.val,
-                        &format!("parsing {} as double quoted string: escaped 
double quote isn't supported", self.word),
-                    ));
-                }
-
-                return Ok(Token::DoubleQuotedString(s));
-            }
         }
 
         // figure out what the word was
@@ -554,11 +501,63 @@ impl<'a> Tokenizer<'a> {
 
             "Struct" => Token::Struct,
 
-            // If we don't recognize the word, treat it as a field name
-            word => Token::FieldName(word.to_string()),
+            token => {
+                return Err(make_error(self.val, &format!("unknown token: 
{token}")));
+            }
         };
         Ok(token)
     }
+
+    /// Parses e.g. `"foo bar"`
+    fn parse_quoted_string(&mut self) -> ArrowResult<Token> {
+        if self.next_char() != Some('\"') {
+            return Err(make_error(self.val, "Expected \""));
+        }
+
+        // reset temp space
+        self.word.clear();
+
+        let mut is_escaped = false;
+
+        loop {
+            match self.next_char() {
+                None => {
+                    return Err(ArrowError::ParseError(format!(
+                        "Unterminated string at: \"{}",
+                        self.word
+                    )));
+                }
+                Some(c) => match c {
+                    '\\' => {
+                        is_escaped = true;
+                        self.word.push(c);
+                    }
+                    '"' => {
+                        if is_escaped {
+                            self.word.push(c);
+                            is_escaped = false;
+                        } else {
+                            break;
+                        }
+                    }
+                    c => {
+                        self.word.push(c);
+                    }
+                },
+            }
+        }
+
+        let val: String = self.word.parse().map_err(|err| {
+            ArrowError::ParseError(format!("Failed to parse string: \"{}\": 
{err}", self.word))
+        })?;
+
+        if val.is_empty() {
+            // Using empty strings as field names is just asking for trouble
+            return Err(make_error(self.val, "empty strings aren't allowed"));
+        }
+
+        Ok(Token::DoubleQuotedString(val))
+    }
 }
 
 impl Iterator for Tokenizer<'_> {
@@ -572,6 +571,9 @@ impl Iterator for Tokenizer<'_> {
                     self.next_char();
                     continue;
                 }
+                '"' => {
+                    return Some(self.parse_quoted_string());
+                }
                 '(' => {
                     self.next_char();
                     return Some(Ok(Token::LParen));
@@ -584,6 +586,10 @@ impl Iterator for Tokenizer<'_> {
                     self.next_char();
                     return Some(Ok(Token::Comma));
                 }
+                ':' => {
+                    self.next_char();
+                    return Some(Ok(Token::Colon));
+                }
                 _ => return Some(self.parse_word()),
             }
         }
@@ -612,6 +618,7 @@ enum Token {
     LParen,
     RParen,
     Comma,
+    Colon,
     Some,
     None,
     Integer(i64),
@@ -621,7 +628,6 @@ enum Token {
     FixedSizeList,
     Struct,
     Nullable,
-    FieldName(String),
 }
 
 impl Display for Token {
@@ -641,6 +647,7 @@ impl Display for Token {
             Token::LParen => write!(f, "("),
             Token::RParen => write!(f, ")"),
             Token::Comma => write!(f, ","),
+            Token::Colon => write!(f, ":"),
             Token::Some => write!(f, "Some"),
             Token::None => write!(f, "None"),
             Token::FixedSizeBinary => write!(f, "FixedSizeBinary"),
@@ -653,7 +660,6 @@ impl Display for Token {
             Token::DoubleQuotedString(s) => write!(f, 
"DoubleQuotedString({s})"),
             Token::Struct => write!(f, "Struct"),
             Token::Nullable => write!(f, "nullable"),
-            Token::FieldName(s) => write!(f, "FieldName({s})"),
         }
     }
 }
@@ -837,19 +843,19 @@ mod test {
             ("Nu", "Unsupported type 'Nu'"),
             (
                 r#"Timestamp(ns, +00:00)"#,
-                "Error unrecognized word: +00:00",
+                "Error unknown token: +00",
             ),
             (
                 r#"Timestamp(ns, "+00:00)"#,
-                r#"parsing "+00:00 as double quoted string: last char must be 
""#,
+                r#"Unterminated string at: "+00:00)"#,
             ),
             (
                 r#"Timestamp(ns, "")"#,
-                r#"parsing "" as double quoted string: empty string isn't 
supported"#,
+                r#"empty strings aren't allowed"#,
             ),
             (
                 r#"Timestamp(ns, "+00:00"")"#,
-                r#"parsing "+00:00"" as double quoted string: escaped double 
quote isn't supported"#,
+                r#"Parser error: Unterminated string at: ")"#,
             ),
             ("Timestamp(ns, ", "Error finding next token"),
             (
@@ -871,9 +877,9 @@ mod test {
             ("Decimal64(3, 500)", "Error converting 500 into i8 for Decimal64: 
out of range integral type conversion attempted"),
             ("Decimal128(3, 500)", "Error converting 500 into i8 for 
Decimal128: out of range integral type conversion attempted"),
             ("Decimal256(3, 500)", "Error converting 500 into i8 for 
Decimal256: out of range integral type conversion attempted"),
-            ("Struct(f1, Int64)", "Error finding next type, got unexpected 
','"),
-            ("Struct(f1 Int64,)", "Expected a word for the name of Struct, but 
got trailing comma"),
-            ("Struct(f1)", "Error finding next type, got unexpected ')'"),
+            ("Struct(f1 Int64)", "Error unknown token: f1"),
+            ("Struct(\"f1\" Int64)", "Expected ':'"),
+            ("Struct(\"f1\": )", "Error finding next type, got unexpected 
')'"),
         ];
 
         for (data_type_string, expected_message) in cases {
@@ -884,12 +890,13 @@ mod test {
                     let message = e.to_string();
                     assert!(
                         message.contains(expected_message),
-                        "\n\ndid not find expected in actual.\n\nexpected: 
{expected_message}\nactual:{message}\n"
+                        "\n\ndid not find expected in actual.\n\nexpected: 
{expected_message}\nactual: {message}\n"
                     );
-                    // errors should also contain  a help message
-                    assert!(message.contains(
-                        "Must be a supported arrow type name such as 'Int32' 
or 'Timestamp(ns)'"
-                    ));
+
+                    if !message.contains("Unterminated string") {
+                        // errors should also contain a help message
+                        assert!(message.contains("Must be a supported arrow 
type name such as 'Int32' or 'Timestamp(ns)'"), "message: {message}");
+                    }
                 }
             }
         }
@@ -899,6 +906,6 @@ mod test {
     fn parse_error_type() {
         let err = parse_data_type("foobar").unwrap_err();
         assert!(matches!(err, ArrowError::ParseError(_)));
-        assert_eq!(err.to_string(), "Parser error: Unsupported type 'foobar'. 
Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error 
unrecognized word: foobar");
+        assert_eq!(err.to_string(), "Parser error: Unsupported type 'foobar'. 
Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error 
unknown token: foobar");
     }
 }
diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs
index dcb1b6183b..37545a8eed 100644
--- a/arrow-schema/src/schema.rs
+++ b/arrow-schema/src/schema.rs
@@ -701,7 +701,7 @@ mod tests {
             schema.to_string(),
             "Field { \"first_name\": Utf8, metadata: {\"k\": \"v\"} }, \
              Field { \"last_name\": Utf8 }, \
-             Field { \"address\": Struct(street Utf8, zip UInt16) }, \
+             Field { \"address\": Struct(\"street\": Utf8, \"zip\": UInt16) }, 
\
              Field { \"interests\": nullable Dictionary(Int32, Utf8), dict_id: 
123, dict_is_ordered }"
         )
     }
diff --git a/parquet/src/arrow/arrow_reader/mod.rs 
b/parquet/src/arrow/arrow_reader/mod.rs
index 0a5a7d0969..8a7e2ef709 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -3677,8 +3677,8 @@ mod tests {
                 ),
             ])),
             "Arrow: Incompatible supplied Arrow schema: data type mismatch for 
field nested: \
-            requested Struct(nested1_valid Utf8, nested1_invalid Int32) \
-            but found Struct(nested1_valid Utf8, nested1_invalid Int64)",
+            requested Struct(\"nested1_valid\": Utf8, \"nested1_invalid\": 
Int32) \
+            but found Struct(\"nested1_valid\": Utf8, \"nested1_invalid\": 
Int64)",
         );
     }

(arrow-rs) branch main updated: Quote `DataType::Struct` field names in `Display` formatting (#8291)

Reply via email to