[ 
https://issues.apache.org/jira/browse/AVRO-2910?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Chris Coutinho updated AVRO-2910:
---------------------------------
    Description: 
The random Avro data generator tool doesn't generate data that conforms to a 
schema containing nullable fields.

For example, the following test.avdl can be used to define a schema for a 
single `test` record, where some of the fields are nullable based on `union`. 
There doesn't seem to be a difference between setting null first or last.

 

{code}
@namespace("com.example")
protocol test {
  record test {
    string name;
    union {null, string} null_name1;
    union {string, null} null_name2;
    int num;
    union {null, int} null_num1;
    union {int, null} null_num2;
  }
}
{code}

 

Using the `idl2schemata` tool, the previous .avdl file generates the following 
(.avsc) schema

 

{code}
{
  "type" : "record",
  "name" : "test",
  "namespace" : "com.example",
  "fields" : [
    { "name" : "name", "type" : "string" },
    { "name" : "null_name1", "type" : [ "null", "string" ] },
    { "name" : "null_name2", "type" : [ "string", "null" ] },
    { "name" : "num", "type" : "int" },
    { "name" : "null_num1", "type" : [ "null", "int" ] },
    { "name" : "null_num2", "type" : [ "int", "null" ] }
  ]
}
{code}

 

 

Using the `random` tool and piping the result into `tojson`, you can see that 
the generated data doesn't conform to the schema:

 

 

{{$ avro-tools random --count 10 --schema-file test.avsc - | avro-tools tojson 
--reader-schema-file test.avsc - > sample.json}}

 

Note how the non-null random values are unnecessarily nested in the following 
examples, where the generated value is nested within an object with the data 
type as a key (e.g. `"null_num2":\{"int":1517937645}` instead of 
`"null_num2":1517937645`)

 

{code}
{"name":"xgpmamxstwwehhluowytabwleviffo","null_name1":null,"null_name2":null,"num":565397409,"null_num1":{"int":1893963068},"null_num2":{"int":1517937645}}
{"name":"qexuommiixgjriduvqydexerym","null_name1":null,"null_name2":{"string":"oxflkduyjagrsrlnagnkxuwej"},"num":1709296377,"null_num1":null,"null_num2":{"int":1155695595}}
{"name":"adjlcnnewythkqsufythqj","null_name1":null,"null_name2":{"string":"qttntcfnuxvetji"},"num":-985375976,"null_num1":null,"null_num2":{"int":1473838855}}
{"name":"atvtwigrsgjsbqllinslarhayra","null_name1":null,"null_name2":null,"num":-829302451,"null_num1":null,"null_num2":null}
{"name":"mbshjkjmpoaotrrcmo","null_name1":null,"null_name2":{"string":"fpoaply"},"num":-528843282,"null_num1":null,"null_num2":null}
{"name":"swnvwyhl","null_name1":{"string":"eppuugalnegxqlntgsjyjipynhmgsha"},"null_name2":null,"num":1959434696,"null_num1":{"int":401147221},"null_num2":{"int":-1054848203}}
{"name":"nxejciyeghtsrparo","null_name1":{"string":"pdnhnhh"},"null_name2":null,"num":226278493,"null_num1":{"int":-1921928686},"null_num2":null}
{"name":"iqeoxaorhs","null_name1":null,"null_name2":{"string":"qjmjhy"},"num":-1562871043,"null_num1":{"int":819200163},"null_num2":null}
{"name":"qumynkqmmvubsrwhdkutcwuupddqalmlwek","null_name1":null,"null_name2":{"string":"s"},"num":-1023135104,"null_num1":null,"null_num2":{"int":-1707061745}}
{"name":"sajkrdfevvwokxpdrqhymjdvv","null_name1":null,"null_name2":{"string":"egiv"},"num":389828169,"null_num1":{"int":-511951083},"null_num2":null}
{code}

 

  was:
The random Avro data generator tool doesn't generate data that conforms to a 
schema containing nullable fields.

For example, the following test.avdl can be used to define a schema for a 
single `test` record, where some of the fields are nullable based on `union`. 
There doesn't seem to be a difference between setting null first or last.

 

```

@namespace("com.example")
protocol test {
 record test {
 string name;
 union \{null, string} null_name1;
 union \{string, null} null_name2;
 int num;
 union \{null, int} null_num1;
 union \{int, null} null_num2;
 }

}

```

Using the `idl2schemata` tool, the previous .avdl file generates the following 
(.avsc) schema

```json

{
 "type" : "record",
 "name" : "test",
 "namespace" : "com.example",
 "fields" : [ {
 "name" : "name",
 "type" : "string"
 }, {
 "name" : "null_name1",
 "type" : [ "null", "string" ]
 }, {
 "name" : "null_name2",
 "type" : [ "string", "null" ]
 }, {
 "name" : "num",
 "type" : "int"
 }, {
 "name" : "null_num1",
 "type" : [ "null", "int" ]
 }, {
 "name" : "null_num2",
 "type" : [ "int", "null" ]
 } ]
}

```

 

Using the `random` tool and piping the result into `tojson`, you can see that 
the generated data doesn't conform to the schema:

 

```shell

$ avro-tools random --count 10 --schema-file test.avsc - | avro-tools tojson 
--reader-schema-file test.avsc - > sample.json

```

Note how the non-null random values are unnecessarily nested in the following 
examples, where the generated value is nested within an object with the data 
type as a key (e.g. `"null_num2":\{"int":1517937645}` instead of 
`"null_num2":1517937645`)

```json

{"name":"xgpmamxstwwehhluowytabwleviffo","null_name1":null,"null_name2":null,"num":565397409,"null_num1":\{"int":1893963068},"null_num2":\{"int":1517937645}}
{"name":"qexuommiixgjriduvqydexerym","null_name1":null,"null_name2":\{"string":"oxflkduyjagrsrlnagnkxuwej"},"num":1709296377,"null_num1":null,"null_num2":\{"int":1155695595}}
{"name":"adjlcnnewythkqsufythqj","null_name1":null,"null_name2":\{"string":"qttntcfnuxvetji"},"num":-985375976,"null_num1":null,"null_num2":\{"int":1473838855}}
{"name":"atvtwigrsgjsbqllinslarhayra","null_name1":null,"null_name2":null,"num":-829302451,"null_num1":null,"null_num2":null}
{"name":"mbshjkjmpoaotrrcmo","null_name1":null,"null_name2":\{"string":"fpoaply"},"num":-528843282,"null_num1":null,"null_num2":null}
{"name":"swnvwyhl","null_name1":\{"string":"eppuugalnegxqlntgsjyjipynhmgsha"},"null_name2":null,"num":1959434696,"null_num1":\{"int":401147221},"null_num2":\{"int":-1054848203}}
{"name":"nxejciyeghtsrparo","null_name1":\{"string":"pdnhnhh"},"null_name2":null,"num":226278493,"null_num1":\{"int":-1921928686},"null_num2":null}
{"name":"iqeoxaorhs","null_name1":null,"null_name2":\{"string":"qjmjhy"},"num":-1562871043,"null_num1":\{"int":819200163},"null_num2":null}
{"name":"qumynkqmmvubsrwhdkutcwuupddqalmlwek","null_name1":null,"null_name2":\{"string":"s"},"num":-1023135104,"null_num1":null,"null_num2":\{"int":-1707061745}}
{"name":"sajkrdfevvwokxpdrqhymjdvv","null_name1":null,"null_name2":\{"string":"egiv"},"num":389828169,"null_num1":\{"int":-511951083},"null_num2":null}

```

 


> Random union(__, null) data doesn't conform to schema
> ------------------------------------------------------
>
>                 Key: AVRO-2910
>                 URL: https://issues.apache.org/jira/browse/AVRO-2910
>             Project: Apache Avro
>          Issue Type: Bug
>          Components: tools
>            Reporter: Chris Coutinho
>            Priority: Minor
>
> The random Avro data generator tool doesn't generate data that conforms to a 
> schema containing nullable fields.
> For example, the following test.avdl can be used to define a schema for a 
> single `test` record, where some of the fields are nullable based on `union`. 
> There doesn't seem to be a difference between setting null first or last.
>  
> {code}
> @namespace("com.example")
> protocol test {
>   record test {
>     string name;
>     union {null, string} null_name1;
>     union {string, null} null_name2;
>     int num;
>     union {null, int} null_num1;
>     union {int, null} null_num2;
>   }
> }
> {code}
>  
> Using the `idl2schemata` tool, the previous .avdl file generates the 
> following (.avsc) schema
>  
> {code}
> {
>   "type" : "record",
>   "name" : "test",
>   "namespace" : "com.example",
>   "fields" : [
>     { "name" : "name", "type" : "string" },
>     { "name" : "null_name1", "type" : [ "null", "string" ] },
>     { "name" : "null_name2", "type" : [ "string", "null" ] },
>     { "name" : "num", "type" : "int" },
>     { "name" : "null_num1", "type" : [ "null", "int" ] },
>     { "name" : "null_num2", "type" : [ "int", "null" ] }
>   ]
> }
> {code}
>  
>  
> Using the `random` tool and piping the result into `tojson`, you can see that 
> the generated data doesn't conform to the schema:
>  
>  
> {{$ avro-tools random --count 10 --schema-file test.avsc - | avro-tools 
> tojson --reader-schema-file test.avsc - > sample.json}}
>  
> Note how the non-null random values are unnecessarily nested in the following 
> examples, where the generated value is nested within an object with the data 
> type as a key (e.g. `"null_num2":\{"int":1517937645}` instead of 
> `"null_num2":1517937645`)
>  
> {code}
> {"name":"xgpmamxstwwehhluowytabwleviffo","null_name1":null,"null_name2":null,"num":565397409,"null_num1":{"int":1893963068},"null_num2":{"int":1517937645}}
> {"name":"qexuommiixgjriduvqydexerym","null_name1":null,"null_name2":{"string":"oxflkduyjagrsrlnagnkxuwej"},"num":1709296377,"null_num1":null,"null_num2":{"int":1155695595}}
> {"name":"adjlcnnewythkqsufythqj","null_name1":null,"null_name2":{"string":"qttntcfnuxvetji"},"num":-985375976,"null_num1":null,"null_num2":{"int":1473838855}}
> {"name":"atvtwigrsgjsbqllinslarhayra","null_name1":null,"null_name2":null,"num":-829302451,"null_num1":null,"null_num2":null}
> {"name":"mbshjkjmpoaotrrcmo","null_name1":null,"null_name2":{"string":"fpoaply"},"num":-528843282,"null_num1":null,"null_num2":null}
> {"name":"swnvwyhl","null_name1":{"string":"eppuugalnegxqlntgsjyjipynhmgsha"},"null_name2":null,"num":1959434696,"null_num1":{"int":401147221},"null_num2":{"int":-1054848203}}
> {"name":"nxejciyeghtsrparo","null_name1":{"string":"pdnhnhh"},"null_name2":null,"num":226278493,"null_num1":{"int":-1921928686},"null_num2":null}
> {"name":"iqeoxaorhs","null_name1":null,"null_name2":{"string":"qjmjhy"},"num":-1562871043,"null_num1":{"int":819200163},"null_num2":null}
> {"name":"qumynkqmmvubsrwhdkutcwuupddqalmlwek","null_name1":null,"null_name2":{"string":"s"},"num":-1023135104,"null_num1":null,"null_num2":{"int":-1707061745}}
> {"name":"sajkrdfevvwokxpdrqhymjdvv","null_name1":null,"null_name2":{"string":"egiv"},"num":389828169,"null_num1":{"int":-511951083},"null_num2":null}
> {code}
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to