This is an automated email from the ASF dual-hosted git repository. kriskras99 pushed a commit to branch fix/stricter_schema_parsing in repository https://gitbox.apache.org/repos/asf/avro-rs.git
commit 9ca98bb6e2a5181a0c9b072f5ecb7e6fb2741caa Author: Kriskras99 <[email protected]> AuthorDate: Fri Feb 20 12:17:48 2026 +0000 fix!: Stricter schema parsing --- avro/src/bigdecimal.rs | 12 +- avro/src/error.rs | 7 +- avro/src/schema/mod.rs | 261 ++++++++++++++++++--------------------- avro/src/schema/parser.rs | 19 +-- avro/src/schema/record/field.rs | 18 ++- avro/src/schema/record/mod.rs | 1 - avro/src/schema/record/schema.rs | 10 -- avro/src/schema_compatibility.rs | 4 +- avro/src/serde/ser_schema.rs | 2 +- avro/tests/avro-3787.rs | 34 ++--- avro/tests/schema.rs | 12 +- avro/tests/validators.rs | 9 +- avro_derive/tests/derive.rs | 12 +- avro_test_helper/src/data.rs | 4 +- 14 files changed, 193 insertions(+), 212 deletions(-) diff --git a/avro/src/bigdecimal.rs b/avro/src/bigdecimal.rs index 28f395d..b887535 100644 --- a/avro/src/bigdecimal.rs +++ b/avro/src/bigdecimal.rs @@ -137,8 +137,10 @@ mod tests { "fields": [ { "name": "field_name", - "type": "bytes", - "logicalType": "big-decimal" + "type": { + "type": "bytes", + "logicalType": "big-decimal" + } } ] } @@ -196,8 +198,10 @@ mod tests { "fields": [ { "name": "big_decimal", - "type": "bytes", - "logicalType": "big-decimal" + "type": { + "type": "bytes", + "logicalType": "big-decimal" + } } ] } diff --git a/avro/src/error.rs b/avro/src/error.rs index 17c7497..6b951c1 100644 --- a/avro/src/error.rs +++ b/avro/src/error.rs @@ -309,8 +309,8 @@ pub enum Details { #[error("One union type {0:?} must match the `default`'s value type {1:?}")] GetDefaultUnion(SchemaKind, ValueKind), - #[error("`default`'s value type of field {0:?} in {1:?} must be {2:?}")] - GetDefaultRecordField(String, String, String), + #[error("`default`'s value type of field `{0}` in `{1}` must be a `{2:#}`. Got: {3:?}")] + GetDefaultRecordField(String, String, String, serde_json::Value), #[error("JSON number {0} could not be converted into an Avro value as it's too large")] JsonNumberTooLarge(serde_json::Number), @@ -405,6 +405,9 @@ pub enum Details { #[error("No `type` in complex type")] GetComplexTypeField, + #[error("No `type` in in record field")] + GetRecordFieldTypeField, + #[error("No `fields` in record")] GetRecordFieldsJson, diff --git a/avro/src/schema/mod.rs b/avro/src/schema/mod.rs index 50f7db8..a5166d8 100644 --- a/avro/src/schema/mod.rs +++ b/avro/src/schema/mod.rs @@ -27,7 +27,7 @@ mod union; use crate::{ AvroResult, error::{Details, Error}, - schema::{parser::Parser, record::RecordSchemaParseLocation}, + schema::parser::Parser, schema_equality, types::{self, Value}, }; @@ -1836,9 +1836,12 @@ mod tests { "name" : "record", "fields" : [ { - "type" : "enum", "name" : "enum", - "symbols": ["one", "two", "three"] + "type": { + "name" : "enum", + "type" : "enum", + "symbols": ["one", "two", "three"] + } }, { "name" : "next", "type" : "enum" } ] @@ -1882,17 +1885,9 @@ mod tests { doc: None, default: None, aliases: None, - schema: Schema::Enum(EnumSchema { - name: Name { - name: "enum".to_owned(), - namespace: None, - }, - aliases: None, - doc: None, - symbols: vec!["one".to_string(), "two".to_string(), "three".to_string()], - default: None, - attributes: Default::default(), - }), + schema: Schema::Ref { + name: Name::new("enum")?, + }, order: RecordFieldOrder::Ascending, position: 1, custom_attributes: Default::default(), @@ -1919,9 +1914,12 @@ mod tests { "name" : "record", "fields" : [ { - "type" : "fixed", - "name" : "fixed", - "size": 456 + "name": "fixed", + "type": { + "type" : "fixed", + "name" : "fixed", + "size": 456 + } }, { "name" : "next", "type" : "fixed" } ] @@ -1965,16 +1963,9 @@ mod tests { doc: None, default: None, aliases: None, - schema: Schema::Fixed(FixedSchema { - name: Name { - name: "fixed".to_owned(), - namespace: None, - }, - aliases: None, - doc: None, - size: 456, - attributes: Default::default(), - }), + schema: Schema::Ref { + name: Name::new("fixed")?, + }, order: RecordFieldOrder::Ascending, position: 1, custom_attributes: Default::default(), @@ -2139,7 +2130,7 @@ mod tests { "fields": [ {"name": "a", "type": "long", "default": 42}, {"name": "b", "type": "string"}, - {"name": "c", "type": "long", "logicalType": "timestamp-micros"} + {"name": "c", "type": {"type": "long", "logicalType": "timestamp-micros"}} ] } "#; @@ -3069,11 +3060,13 @@ mod tests { "fields": [ { "name": "decimal", - "type": "fixed", - "name": "nestedFixed", - "size": 8, - "logicalType": "decimal", - "precision": 4 + "type": { + "type": "fixed", + "name": "nestedFixed", + "size": 8, + "logicalType": "decimal", + "precision": 4 + } } ] }); @@ -3773,19 +3766,10 @@ mod tests { ] } "#; - let expected = Details::GetDefaultRecordField( - "f1".to_string(), - "ns.record1".to_string(), - r#""int""#.to_string(), - ) - .to_string(); - let result = Schema::parse_str(schema_str); - assert!(result.is_err()); - let err = result - .map_err(|e| e.to_string()) - .err() - .unwrap_or_else(|| "unexpected".to_string()); - assert_eq!(expected, err); + assert_eq!( + Schema::parse_str(schema_str).unwrap_err().to_string(), + "`default`'s value type of field `f1` in `ns.record1` must be a `\"int\"`. Got: String(\"invalid\")" + ); Ok(()) } @@ -3815,20 +3799,10 @@ mod tests { ] } "#; - let expected = Details::GetDefaultRecordField( - "f1".to_string(), - "ns.record1".to_string(), - r#"{"name":"ns.record2","type":"record","fields":[{"name":"f1_1","type":"int"}]}"# - .to_string(), - ) - .to_string(); - let result = Schema::parse_str(schema_str); - assert!(result.is_err()); - let err = result - .map_err(|e| e.to_string()) - .err() - .unwrap_or_else(|| "unexpected".to_string()); - assert_eq!(expected, err); + assert_eq!( + Schema::parse_str(schema_str).unwrap_err().to_string(), + "`default`'s value type of field `f1` in `ns.record1` must be a `{\"name\":\"ns.record2\",\"type\":\"record\",\"fields\":[{\"name\":\"f1_1\",\"type\":\"int\"}]}`. Got: String(\"invalid\")" + ); Ok(()) } @@ -3853,19 +3827,10 @@ mod tests { ] } "#; - let expected = Details::GetDefaultRecordField( - "f1".to_string(), - "ns.record1".to_string(), - r#"{"name":"ns.enum1","type":"enum","symbols":["a","b","c"]}"#.to_string(), - ) - .to_string(); - let result = Schema::parse_str(schema_str); - assert!(result.is_err()); - let err = result - .map_err(|e| e.to_string()) - .err() - .unwrap_or_else(|| "unexpected".to_string()); - assert_eq!(expected, err); + assert_eq!( + Schema::parse_str(schema_str).unwrap_err().to_string(), + "`default`'s value type of field `f1` in `ns.record1` must be a `{\"name\":\"ns.enum1\",\"type\":\"enum\",\"symbols\":[\"a\",\"b\",\"c\"]}`. Got: String(\"invalid\")" + ); Ok(()) } @@ -3890,19 +3855,10 @@ mod tests { ] } "#; - let expected = Details::GetDefaultRecordField( - "f1".to_string(), - "ns.record1".to_string(), - r#"{"name":"ns.fixed1","type":"fixed","size":3}"#.to_string(), - ) - .to_string(); - let result = Schema::parse_str(schema_str); - assert!(result.is_err()); - let err = result - .map_err(|e| e.to_string()) - .err() - .unwrap_or_else(|| "unexpected".to_string()); - assert_eq!(expected, err); + assert_eq!( + Schema::parse_str(schema_str).unwrap_err().to_string(), + "`default`'s value type of field `f1` in `ns.record1` must be a `{\"name\":\"ns.fixed1\",\"type\":\"fixed\",\"size\":3}`. Got: Number(100)" + ); Ok(()) } @@ -3917,8 +3873,10 @@ mod tests { "fields": [ { "name": "f1", - "type": "array", - "items": "int", + "type": { + "type": "array", + "items": "int" + }, "default": "invalid" } ] @@ -3932,7 +3890,7 @@ mod tests { .err() .unwrap_or_else(|| "unexpected".to_string()); assert_eq!( - r#"Default value for an array must be an array! Got: "invalid""#, + r#"`default`'s value type of field `f1` in `ns.record1` must be a `{"type":"array","items":"int"}`. Got: String("invalid")"#, err ); @@ -3949,8 +3907,10 @@ mod tests { "fields": [ { "name": "f1", - "type": "map", - "values": "string", + "type": { + "type": "map", + "values": "string" + }, "default": "invalid" } ] @@ -3964,7 +3924,7 @@ mod tests { .err() .unwrap_or_else(|| "unexpected".to_string()); assert_eq!( - r#"Default value for a map must be an object! Got: "invalid""#, + r#"`default`'s value type of field `f1` in `ns.record1` must be a `{"type":"map","values":"string"}`. Got: String("invalid")"#, err ); @@ -3999,19 +3959,10 @@ mod tests { ] } "#; - let expected = Details::GetDefaultRecordField( - "f2".to_string(), - "ns.record1".to_string(), - r#""ns.record2""#.to_string(), - ) - .to_string(); - let result = Schema::parse_str(schema_str); - assert!(result.is_err()); - let err = result - .map_err(|e| e.to_string()) - .err() - .unwrap_or_else(|| "unexpected".to_string()); - assert_eq!(expected, err); + assert_eq!( + Schema::parse_str(schema_str).unwrap_err().to_string(), + "`default`'s value type of field `f2` in `ns.record1` must be a `{\"name\":\"ns.record2\",\"type\":\"record\",\"fields\":[{\"name\":\"f1_1\",\"type\":\"int\"}]}`. Got: Object {\"f1_1\": Bool(true)}" + ); Ok(()) } @@ -4772,7 +4723,7 @@ mod tests { "fields": [ {"name": "a", "type": "long", "default": 42, "doc": "The field a"}, {"name": "b", "type": "string", "namespace": "test.a"}, - {"name": "c", "type": "long", "logicalType": "timestamp-micros"} + {"name": "c", "type": {"type": "long", "logicalType": "timestamp-micros"}} ] }"#; @@ -4855,16 +4806,18 @@ mod tests { "fields": [ { "name": "bar", - "type": "array", - "items": { - "type": "record", - "name": "baz", - "fields": [ - { - "name": "quux", - "type": "int" - } - ] + "type": { + "type": "array", + "items": { + "type": "record", + "name": "baz", + "fields": [ + { + "name": "quux", + "type": "int" + } + ] + } } } ] @@ -4928,16 +4881,18 @@ mod tests { "fields": [ { "name": "bar", - "type": "map", - "values": { - "type": "record", - "name": "baz", - "fields": [ - { - "name": "quux", - "type": "int" - } - ] + "type": { + "type": "map", + "values": { + "type": "record", + "name": "baz", + "fields": [ + { + "name": "quux", + "type": "int" + } + ] + } } } ] @@ -5281,15 +5236,19 @@ mod tests { "fields": [ { "name": "one", - "type": "enum", - "name": "ABC", - "symbols": ["A", "B", "C"] + "type": { + "type": "enum", + "name": "ABC", + "symbols": ["A", "B", "C"] + } }, { "name": "two", - "type": "array", - "items": "ABC", - "default": ["A", "B", "C"] + "type": { + "type": "array", + "items": "ABC", + "default": ["A", "B", "C"] + } } ] }"#, @@ -5410,15 +5369,19 @@ mod tests { "fields": [ { "name": "one", - "type": "enum", - "name": "ABC", - "symbols": ["A", "B", "C"] + "type": { + "type": "enum", + "name": "ABC", + "symbols": ["A", "B", "C"] + } }, { "name": "two", - "type": "map", - "values": "ABC", - "default": {"foo": "A"} + "type": { + "type": "map", + "values": "ABC", + "default": {"foo": "A"} + } } ] }"#, @@ -5438,4 +5401,26 @@ mod tests { Ok(()) } + + #[test] + fn avro_rs_476_enum_cannot_be_directly_in_field() -> TestResult { + let schema_str = r#"{ + "type": "record", + "name": "ExampleEnum", + "namespace": "com.schema", + "fields": [ + { + "name": "wrong_enum", + "type": "enum", + "symbols": ["INSERT", "UPDATE"] + } + ] + }"#; + let result = Schema::parse_str(schema_str).unwrap_err(); + assert_eq!( + result.to_string(), + "Invalid schema: There is no type called 'enum', if you meant to define a non-primitive schema, it should be defined inside `type` attribute. Please review the specification" + ); + Ok(()) + } } diff --git a/avro/src/schema/parser.rs b/avro/src/schema/parser.rs index 6747d84..d8791e0 100644 --- a/avro/src/schema/parser.rs +++ b/avro/src/schema/parser.rs @@ -16,7 +16,6 @@ // under the License. use crate::error::Details; -use crate::schema::record::RecordSchemaParseLocation; use crate::schema::{ Alias, Aliases, ArraySchema, DecimalMetadata, DecimalSchema, EnumSchema, FixedSchema, MapSchema, Name, Names, Namespace, Precision, RecordField, RecordSchema, Scale, Schema, @@ -110,9 +109,7 @@ impl Parser { ) -> AvroResult<Schema> { match *value { Value::String(ref t) => self.parse_known_schema(t.as_str(), enclosing_namespace), - Value::Object(ref data) => { - self.parse_complex(data, enclosing_namespace, RecordSchemaParseLocation::Root) - } + Value::Object(ref data) => self.parse_complex(data, enclosing_namespace), Value::Array(ref data) => self.parse_union(data, enclosing_namespace), _ => Err(Details::ParseSchemaFromValidJson.into()), } @@ -252,7 +249,6 @@ impl Parser { &mut self, complex: &Map<String, Value>, enclosing_namespace: &Namespace, - parse_location: RecordSchemaParseLocation, ) -> AvroResult<Schema> { // Try to parse this as a native complex type. fn parse_as_native_complex( @@ -461,23 +457,14 @@ impl Parser { } match complex.get("type") { Some(Value::String(t)) => match t.as_str() { - "record" => match parse_location { - RecordSchemaParseLocation::Root => { - self.parse_record(complex, enclosing_namespace) - } - RecordSchemaParseLocation::FromField => { - self.fetch_schema_ref(t, enclosing_namespace) - } - }, + "record" => self.parse_record(complex, enclosing_namespace), "enum" => self.parse_enum(complex, enclosing_namespace), "array" => self.parse_array(complex, enclosing_namespace), "map" => self.parse_map(complex, enclosing_namespace), "fixed" => self.parse_fixed(complex, enclosing_namespace), other => self.parse_known_schema(other, enclosing_namespace), }, - Some(Value::Object(data)) => { - self.parse_complex(data, enclosing_namespace, RecordSchemaParseLocation::Root) - } + Some(Value::Object(data)) => self.parse_complex(data, enclosing_namespace), Some(Value::Array(variants)) => self.parse_union(variants, enclosing_namespace), Some(unknown) => Err(Details::GetComplexType(unknown.clone()).into()), None => Err(Details::GetComplexTypeField.into()), diff --git a/avro/src/schema/record/field.rs b/avro/src/schema/record/field.rs index bda8a56..a2a1df3 100644 --- a/avro/src/schema/record/field.rs +++ b/avro/src/schema/record/field.rs @@ -17,9 +17,7 @@ use crate::AvroResult; use crate::error::Details; -use crate::schema::{ - Documentation, Name, Names, Parser, RecordSchemaParseLocation, Schema, SchemaKind, -}; +use crate::schema::{Documentation, Name, Names, Parser, Schema, SchemaKind}; use crate::types; use crate::util::MapHelper; use crate::validator::validate_record_field_name; @@ -81,12 +79,8 @@ impl RecordField { validate_record_field_name(&name)?; - // TODO: "type" = "<record name>" - let schema = parser.parse_complex( - field, - &enclosing_record.namespace, - RecordSchemaParseLocation::FromField, - )?; + let ty = field.get("type").ok_or(Details::GetRecordFieldTypeField)?; + let schema = parser.parse(ty, &enclosing_record.namespace)?; let default = field.get("default").cloned(); Self::resolve_default_value( @@ -162,10 +156,14 @@ impl RecordField { .is_ok(); if !resolved { + let schemata = names.values().cloned().collect::<Vec<_>>(); return Err(Details::GetDefaultRecordField( field_name.to_string(), record_name.to_string(), - field_schema.canonical_form(), + field_schema + .independent_canonical_form(&schemata) + .unwrap_or("Unknown".to_string()), + value.clone(), ) .into()); } diff --git a/avro/src/schema/record/mod.rs b/avro/src/schema/record/mod.rs index 0d1c5a1..080d9fe 100644 --- a/avro/src/schema/record/mod.rs +++ b/avro/src/schema/record/mod.rs @@ -19,5 +19,4 @@ mod field; pub use field::{RecordField, RecordFieldBuilder, RecordFieldOrder}; mod schema; -pub(crate) use schema::RecordSchemaParseLocation; pub use schema::{RecordSchema, RecordSchemaBuilder}; diff --git a/avro/src/schema/record/schema.rs b/avro/src/schema/record/schema.rs index 297dde9..b14bad7 100644 --- a/avro/src/schema/record/schema.rs +++ b/avro/src/schema/record/schema.rs @@ -66,16 +66,6 @@ fn calculate_lookup_table(fields: &[RecordField]) -> BTreeMap<String, usize> { .collect() } -#[derive(Debug, Default)] -pub(crate) enum RecordSchemaParseLocation { - /// When the parse is happening at root level - #[default] - Root, - - /// When the parse is happening inside a record field - FromField, -} - #[cfg(test)] mod tests { use super::*; diff --git a/avro/src/schema_compatibility.rs b/avro/src/schema_compatibility.rs index 20bbef2..4cc8c2f 100644 --- a/avro/src/schema_compatibility.rs +++ b/avro/src/schema_compatibility.rs @@ -542,7 +542,7 @@ mod tests { } fn int_list_record_schema() -> Schema { - Schema::parse_str(r#"{"type":"record", "name":"List", "fields": [{"name": "head", "type": "int"},{"name": "tail", "type": "array", "items": "int"}]}"#).unwrap() + Schema::parse_str(r#"{"type":"record", "name":"List", "fields": [{"name": "head", "type": "int"},{"name": "tail", "type": {"type": "array", "items": "int"}}]}"#).unwrap() } fn long_list_record_schema() -> Schema { @@ -551,7 +551,7 @@ mod tests { { "type":"record", "name":"List", "fields": [ {"name": "head", "type": "long"}, - {"name": "tail", "type": "array", "items": "long"} + {"name": "tail", "type": {"type": "array", "items": "long"}} ]} "#, ) diff --git a/avro/src/serde/ser_schema.rs b/avro/src/serde/ser_schema.rs index 7fb52ab..61534b9 100644 --- a/avro/src/serde/ser_schema.rs +++ b/avro/src/serde/ser_schema.rs @@ -3014,7 +3014,7 @@ mod tests { {"name": "stringField", "type": "string"}, {"name": "intField", "type": "int"}, {"name": "bigDecimalField", "type": {"type": "bytes", "logicalType": "big-decimal"}}, - {"name": "uuidField", "type": "fixed", "size": 16, "logicalType": "uuid"}, + {"name": "uuidField", "type": {"name": "uuid", "type": "fixed", "size": 16, "logicalType": "uuid"}}, {"name": "innerRecord", "type": ["null", "TestRecord"]} ] }"#, diff --git a/avro/tests/avro-3787.rs b/avro/tests/avro-3787.rs index 1795fca..192c573 100644 --- a/avro/tests/avro-3787.rs +++ b/avro/tests/avro-3787.rs @@ -204,15 +204,17 @@ fn avro_3787_deserialize_union_with_unknown_symbol_no_ref() -> TestResult { "name": "BarParent", "fields": [ { - "type": "enum", "name": "Bar", - "symbols": - [ - "bar0", - "bar1", - "bar2" - ], - "default": "bar0" + "type": { + "type": "enum", + "name": "Bar", + "symbols": [ + "bar0", + "bar1", + "bar2" + ], + "default": "bar0" + } } ] } @@ -235,14 +237,16 @@ fn avro_3787_deserialize_union_with_unknown_symbol_no_ref() -> TestResult { "name": "BarParent", "fields": [ { - "type": "enum", "name": "Bar", - "symbols": - [ - "bar0", - "bar1" - ], - "default": "bar0" + "type": { + "name": "Bar", + "type": "enum", + "symbols": [ + "bar0", + "bar1" + ], + "default": "bar0" + } } ] } diff --git a/avro/tests/schema.rs b/avro/tests/schema.rs index 3bcc895..89bfd68 100644 --- a/avro/tests/schema.rs +++ b/avro/tests/schema.rs @@ -1850,8 +1850,10 @@ fn test_avro_3851_read_default_value_for_array_record_field() -> TestResult { "type": "int" }, { "name": "f2", - "type": "array", - "items": "int", + "type": { + "type": "array", + "items": "int" + }, "default": [1, 2, 3] } ] @@ -1892,8 +1894,10 @@ fn test_avro_3851_read_default_value_for_map_record_field() -> TestResult { "type": "int" }, { "name": "f2", - "type": "map", - "values": "string", + "type": { + "type": "map", + "values": "string" + }, "default": { "a": "A", "b": "B", "c": "C" } } ] diff --git a/avro/tests/validators.rs b/avro/tests/validators.rs index ab37248..55659be 100644 --- a/avro/tests/validators.rs +++ b/avro/tests/validators.rs @@ -72,9 +72,12 @@ fn avro_3900_custom_validator_with_spec_invalid_names() -> TestResult { "type": "int" }, { - "type": "enum", - "name": "Test", - "symbols": ["A-B", "B-A"] + "name": "field-name", + "type": { + "type": "enum", + "name": "Test", + "symbols": ["A-B", "B-A"] + } } ] }"#; diff --git a/avro_derive/tests/derive.rs b/avro_derive/tests/derive.rs index 9c26152..c9f5d44 100644 --- a/avro_derive/tests/derive.rs +++ b/avro_derive/tests/derive.rs @@ -2088,13 +2088,17 @@ fn avro_rs_401_supported_type_variants() { }, { "name":"six", - "type":"array", - "items":"int" + "type":{ + "type":"array", + "items":"int" + } }, { "name":"seven", - "type":"array", - "items":"int" + "type": { + "type":"array", + "items":"int" + } } ] } diff --git a/avro_test_helper/src/data.rs b/avro_test_helper/src/data.rs index 8f61777..3713e83 100644 --- a/avro_test_helper/src/data.rs +++ b/avro_test_helper/src/data.rs @@ -362,8 +362,8 @@ pub const OTHER_ATTRIBUTES_EXAMPLES: &[(&str, bool)] = &[ "cp_int": 1, "cp_array": [ 1, 2, 3, 4], "fields": [ - {"name": "f1", "type": "fixed", "size": 16, "cp_object": {"a":1,"b":2}}, - {"name": "f2", "type": "fixed", "size": 8, "cp_null": null} + {"name": "f1", "type": {"name": "f1", "type": "fixed", "size": 16}, "cp_object": {"a":1,"b":2}}, + {"name": "f2", "type": {"name": "f2", "type": "fixed", "size": 8}, "cp_null": null} ] }"#, true,
