[ 
https://issues.apache.org/jira/browse/AVRO-3772?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17732780#comment-17732780
 ] 

Evan Blackwell edited comment on AVRO-3772 at 6/14/23 10:33 PM:
----------------------------------------------------------------

[~mgrigorov] thanks for getting this together so quickly. It looks like the 
test cases I gave did pass, but I was doing some reading on defaults in avro 
and am now wondering if I had the default in the right spot. It looks like 
there's a difference between a field default and a symbol default according to 
this article 
[https://medium.com/expedia-group-tech/safety-considerations-when-using-enums-in-avro-schemas-82e18baaa081]
 

 

If the reader schema looked like this, I think the symbol default would be 
diamonds and the field default would be spades. It seems the symbol default is 
intended for use with forward compatibility if we read a symbol we don't 
recognize (e.g. if we read "ninjas," we would resolve to "diamonds"), and the 
field default is for backwards compatibility if we read a message doesn't have 
the field at all (e.g. if an old schema doesn't have field c, we default to 
"spades").
{code:java}
{
            "type": "record",
            "name": "test",
            "fields": [
                {"name": "a", "type": "long", "default": 42},
                {"name": "b", "type": "string"},
                {
                    "name": "c",
                    "type": {
                        "type": "enum",
                        "name": "suit",
                        "symbols": ["diamonds", "spades", "ninja", "hearts"],
                        "default": "diamonds"
                    },
                    "default": "spades"
                }
            ]
        } {code}
It seems like I might have asked you to make the field default operate the way 
that the symbol default is supposed to be used. Maybe this actually should have 
been the test
{code:java}
    #[test]
    fn avro_3772_enum_default() -> TestResult {
        let writer_raw_schema = r#"
        {
          "type": "record",
          "name": "test",
          "fields": [
            {"name": "a", "type": "long", "default": 42},
            {"name": "b", "type": "string"},
            {
              "name": "c",
              "type": {
                "type": "enum",
                "name": "suit",
                "symbols": ["diamonds", "spades", "clubs", "hearts"]
              },
              "default": "spades"
            }
          ]
        }
        "#;        let reader_raw_schema = r#"
        {
          "type": "record",
          "name": "test",
          "fields": [
            {"name": "a", "type": "long", "default": 42},
            {"name": "b", "type": "string"},
            {
              "name": "c",
              "type": {
                  "type": "enum",
                  "name": "suit",
                  "symbols": ["diamonds", "spades", "ninja", "hearts"],
                  "default": "spades"
               }
            }
          ]
        }
      "#;
        let writer_schema = Schema::parse_str(writer_raw_schema)?;
        let reader_schema = Schema::parse_str(reader_raw_schema)?;
        let mut writer = Writer::with_codec(&writer_schema, Vec::new(), 
Codec::Null);
        let mut record = Record::new(writer.schema()).unwrap();
        record.put("a", 27i64);
        record.put("b", "foo");
        record.put("c", "clubs");
        writer.append(record).unwrap();
        let input = writer.into_inner()?;
        let mut reader = Reader::with_schema(&reader_schema, &input[..])?;
        assert_eq!(
            reader.next().unwrap().unwrap(),
            Value::Record(vec![
                ("a".to_string(), Value::Long(27)),
                ("b".to_string(), Value::String("foo".to_string())),
                ("c".to_string(), Value::Enum(1, "spades".to_string())),
            ])
        );
        assert!(reader.next().is_none());
        Ok(())
    } {code}


was (Author: JIRAUSER300776):
[~mgrigorov] thanks for getting this together so quickly. It looks like the 
test cases I gave did pass, but I was doing some reading on defaults in avro 
and am now wondering if I had the default in the right spot. It looks like 
there's a difference between a field default and a symbol default according to 
this article 
[https://medium.com/expedia-group-tech/safety-considerations-when-using-enums-in-avro-schemas-82e18baaa081]
 

 

If the reader schema looked like this, I think the symbol default would be 
diamonds and the field default would be spades. It seems the symbol default is 
intended for use with forward compatibility if we read a symbol we don't 
recognize (e.g. if we read "ninjas," we would resolve to "diamonds"), and the 
field default is for backwards compatibility if we read a message doesn't have 
the field at all (e.g. if an old schema doesn't have field c, we default to 
"spades").
{code:java}
{
            "type": "record",
            "name": "test",
            "fields": [
                {"name": "a", "type": "long", "default": 42},
                {"name": "b", "type": "string"},
                {
                    "name": "c",
                    "type": {
                        "type": "enum",
                        "name": "suit",
                        "symbols": ["diamonds", "spades", "ninja", "hearts"],
                        "default": "diamonds"
                    },
                    "default": "spades"
                }
            ]
        } {code}
It seems like I might have asked you to make the field default operate the way 
that the symbol default is supposed to be used. Maybe this actually should have 
been the test
{code:java}
    #[test]
    fn avro_3772_enum_default() -> TestResult {
        let writer_raw_schema = r#"
        {
          "type": "record",
          "name": "test",
          "fields": [
            {"name": "a", "type": "long", "default": 42},
            {"name": "b", "type": "string"},
            {
              "name": "c",
              "type": {
                "type": "enum",
                "name": "suit",
                "symbols": ["diamonds", "spades", "clubs", "hearts"]
              },
              "default": "spades"
            }
          ]
        }
        "#;        let reader_raw_schema = r#"
        {
          "type": "record",
          "name": "test",
          "fields": [
            {"name": "a", "type": "long", "default": 42},
            {"name": "b", "type": "string"},
            {
              "name": "c",
              "type": {
                  "type": "enum",
                  "name": "suit",
                  "symbols": ["diamonds", "spades", "ninja", "hearts"],
                  "default": "spades"               }
            }
          ]
        }
      "#;
        let writer_schema = Schema::parse_str(writer_raw_schema)?;
        let reader_schema = Schema::parse_str(reader_raw_schema)?;
        let mut writer = Writer::with_codec(&writer_schema, Vec::new(), 
Codec::Null);
        let mut record = Record::new(writer.schema()).unwrap();
        record.put("a", 27i64);
        record.put("b", "foo");
        record.put("c", "clubs");
        writer.append(record).unwrap();
        let input = writer.into_inner()?;
        let mut reader = Reader::with_schema(&reader_schema, &input[..])?;
        assert_eq!(
            reader.next().unwrap().unwrap(),
            Value::Record(vec![
                ("a".to_string(), Value::Long(27)),
                ("b".to_string(), Value::String("foo".to_string())),
                ("c".to_string(), Value::Enum(1, "spades".to_string())),
            ])
        );
        assert!(reader.next().is_none());
        Ok(())
    } {code}

> [Rust] Deserialize Errors for an Unknown Enum Symbol instead of Returning 
> Default
> ---------------------------------------------------------------------------------
>
>                 Key: AVRO-3772
>                 URL: https://issues.apache.org/jira/browse/AVRO-3772
>             Project: Apache Avro
>          Issue Type: Bug
>          Components: rust
>    Affects Versions: 1.9.0, 1.10.0, 1.9.1, 1.9.2, 1.11.0, 1.10.1, 1.10.2, 
> 1.11.1
>            Reporter: Evan Blackwell
>            Assignee: Martin Tzvetanov Grigorov
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 1.12.0, 1.11.2
>
>          Time Spent: 0.5h
>  Remaining Estimate: 0h
>
> The rust implementation of avro appears to behave according to an old 
> specification when deserializing messages with a symbol in the writer's enum 
> that is not in the reader's enum. The deserializer is returning an error even 
> if the reader schema specifies a default. Starting in version 1.9.0, the 
> default should be used in this situation according to this section of the 
> documentation about schema resolution: 
> [https://avro.apache.org/docs/current/spec.html#Schema+Resolution]
> {code:java}
> if both are enums:
> if the writer's symbol is not present in the reader's enum and the reader has 
> a default value, then that value is used, otherwise an error is signalled. 
> {code}
>  
> This test currently in master expects the deserializer to error because it 
> was written 5 years ago before spec 1.9.0 was released. (summary of the test: 
> writer has symbol "clubs," the reader does not have the "clubs" symbol but 
> does have a default, record gets made with symbol "clubs," the result is an 
> error)
> [https://github.com/apache/avro/blob/6f4162e3d71e602bc398563b102d569846d5f39f/lang/rust/avro/src/lib.rs#L871]
>  
> {code:java}
> #[test]
> fn test_enum_resolution() {
>     let writer_raw_schema = r#"
>         {
>             "type": "record",
>             "name": "test",
>             "fields": [
>                 {"name": "a", "type": "long", "default": 42},
>                 {"name": "b", "type": "string"},
>                 {
>                     "name": "c",
>                     "type": {
>                         "type": "enum",
>                         "name": "suit",
>                         "symbols": ["diamonds", "spades", "clubs", "hearts"]
>                     },
>                     "default": "spades"
>                 }
>             ]
>         }
>     "#;
>     let reader_raw_schema = r#"
>         {
>             "type": "record",
>             "name": "test",
>             "fields": [
>                 {"name": "a", "type": "long", "default": 42},
>                 {"name": "b", "type": "string"},
>                 {
>                     "name": "c",
>                     "type": {
>                         "type": "enum",
>                         "name": "suit",
>                             "symbols": ["diamonds", "spades", "ninja", 
> "hearts"]
>                     },
>                     "default": "spades"
>                 }
>             ]
>         }
>     "#;
>     let writer_schema = Schema::parse_str(writer_raw_schema).unwrap();
>     let reader_schema = Schema::parse_str(reader_raw_schema).unwrap();
>     let mut writer = Writer::with_codec(&writer_schema, Vec::new(), 
> Codec::Null);
>     let mut record = Record::new(writer.schema()).unwrap();
>     record.put("a", 27i64);
>     record.put("b", "foo");
>     record.put("c", "clubs");
>     writer.append(record).unwrap();
>     let input = writer.into_inner().unwrap();
>     let mut reader = Reader::with_schema(&reader_schema, &input[..]).unwrap();
>     assert!(reader.next().unwrap().is_err());
>     assert!(reader.next().is_none());
> } {code}
>  
>  
> If the deserializer was correctly using the default, I would expect the last 
> two lines to instead assert the first two values were as expected with c 
> defaulting to "spades"
>  
> {code:java}
> assert_eq!(
>     reader.next().unwrap().unwrap(),
>     Value::Record(vec![
>         ("a".to_string(), Value::Long(27)),
>         ("b".to_string(), Value::String("foo".to_string())),
>         ("c".to_string(), Value::Enum(1, "spades".to_string())),
>     ])
> );
> assert!(reader.next().is_none()); {code}
>  
> The error returned is GetDefaultEnum, and it seems to be getting created here 
> when we recognize that the symbol is not in the list of symbols that the 
> reader has for the enum. 
> [https://github.com/apache/avro/blob/6f4162e3d71e602bc398563b102d569846d5f39f/lang/rust/avro/src/types.rs#L848]
>  
> {code:java}
> fn resolve_enum(self, symbols: &[String]) -> Result<Self, Error> {
>         let validate_symbol = |symbol: String, symbols: &[String]| {
>             if let Some(index) = symbols.iter().position(|item| item == 
> &symbol) {
>                 Ok(Value::Enum(index as u32, symbol))
>             } else {
>                 Err(Error::GetEnumDefault {
>                     symbol,
>                     symbols: symbols.into(),
>                 })
>             }
>         };
>         match self {
>             Value::Enum(raw_index, s) => {
>                 let index = usize::try_from(raw_index)
>                     .map_err(|e| Error::ConvertU32ToUsize(e, raw_index))?;
>                 if (0..=symbols.len()).contains(&index) {
>                     validate_symbol(s, symbols)
>                 } else {
>                     Err(Error::GetEnumValue {
>                         index,
>                         nsymbols: symbols.len(),
>                     })
>                 }
>             }
>             Value::String(s) => validate_symbol(s, symbols),
>             other => Err(Error::GetEnum(other.into())),
>         }
>     } {code}
>  
> To follow the specification, it seems that instead of always returning the 
> GetEnumDefault error if the symbol doesn't exist in the reader enum, we 
> should first check if there is a default to use and return it if possible. Or 
> the caller should check for this error and use the default if available.
>  
> I'm relatively new to avro, so if I'm misunderstanding either the 
> specification or the code behavior please let me know. Thanks!
>  
> (I'm also curious if there would be another issue if the record contained a 
> symbol that exists in the writer enum and had an index greater than the len 
> of the reader's symbols since it appears to be doing a check for that on line 
> 859).
>  
>  



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to