[
https://issues.apache.org/jira/browse/ARROW-9336?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Steven Willis updated ARROW-9336:
---------------------------------
Description:
Using {{::Arrow::RecordBatch.new(schema, data)}} (which uses the
{{RecordBatchBuilder}}) appears to handle the case where a record is missing an
entry for a top-level column, but it doesn't handle the case where a record is
missing an entry within a struct column. For example, I'd expect the following
code to print {{true}} for each {{puts}}, but two of them (the lines marked
{{# False ?}}) print {{false}}:
{code:ruby}
require 'parquet'
require 'arrow'
schema = [
{name: "a", type: "string"},
{name: "b", type: "struct", fields: [
{name: "c", type: "string"},
{name: "d", type: "string"},
]
},
]
arrow_schema = ::Arrow::Schema.new(schema)
record_batch = ::Arrow::RecordBatch.new(
arrow_schema,
[
{"a" => "a", "b" => {"c" => "c", }},
{ "b" => {"c" => "c", }},
{ "b" => { "d" => "d"}},
]
)
table = record_batch.to_table
puts(table['a'][0] == 'a')
puts(table['a'][1].nil?)
puts(table['a'][2].nil?)
puts(table['b'][0].key?('c'))
puts(table['b'][0]['c'] == 'c')
puts(table['b'][0].key?('d'))
puts(table['b'][0]['d'].nil?) # False ?
puts(!table['b'][0].key?('e'))
puts(table['b'][1].key?('c'))
puts(table['b'][1]['c'] == 'c')
puts(table['b'][1].key?('d'))
puts(table['b'][1]['d'].nil?)
puts(!table['b'][1].key?('e'))
puts(table['b'][2].key?('c'))
puts(table['b'][2]['c'].nil?)
puts(table['b'][2].key?('d'))
puts(table['b'][2]['d'] == 'd') # False ?
puts(!table['b'][2].key?('e'))
{code}
I'd expect {{puts(table)}} to print this representation:
{noformat}
a b
0 a {"c"=>"c", "d"=>nil}
1 {"c"=>"c", "d"=>nil}
2 {"c"=>nil, "d"=>"d"}
{noformat}
But it prints this instead:
{noformat}
a b
0 a {"c"=>"c", "d"=>"d"}
1 {"c"=>"c", "d"=>nil}
2 {"c"=>nil, "d"=>nil}
{noformat}
Furthermore, trying to write that table out to a Parquet file results in the
following error:
{noformat}
Traceback (most recent call last):
7: from arrow_parquet2.rb:53:in `<main>'
6: from
/usr/local/lib/ruby/gems/2.6.0/gems/red-arrow-0.17.1/lib/arrow/block-closable.rb:25:in
`open'
5: from arrow_parquet2.rb:54:in `block in <main>'
4: from
/usr/local/lib/ruby/gems/2.6.0/gems/red-arrow-0.17.1/lib/arrow/block-closable.rb:25:in
`open'
3: from arrow_parquet2.rb:56:in `block (2 levels) in <main>'
2: from
/usr/local/lib/ruby/gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:514:in
`block in define_method'
1: from
/usr/local/lib/ruby/gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:600:in
`invoke'
/usr/local/lib/ruby/gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:600:in
`invoke': [parquet][arrow][file-writer][write-table]: Invalid: Column 1: In
chunk 0: Invalid: Struct child array #0 has length different from struct array
(2 != 3) (Arrow::Error::Invalid)
{noformat}
was:
Using {{::Arrow::RecordBatch.new(schema, data)}} (which uses the
{{RecordBatchBuilder}}) appears to handle when a record is missing an entry for
a top level column, but it doesn't handle when a record is missing an entry
within a struct column. For example, I'd expect the following code to print out
{{true}} for each {{puts}}, but 2 of them are {{false}}:
{code:ruby}
require 'parquet'
require 'arrow'
schema = [
{name: "a", type: "string"},
{name: "b", type: "struct", fields: [
{name: "c", type: "string"},
{name: "d", type: "string"},
]
},
]
arrow_schema = ::Arrow::Schema.new(schema)
record_batch = ::Arrow::RecordBatch.new(
arrow_schema,
[
{"a" => "a", "b" => {"c" => "c", }},
{ "b" => {"c" => "c", }},
{ "b" => { "d" => "d"}},
]
)
table = record_batch.to_table
puts(table['a'][0] == 'a')
puts(table['a'][1].nil?)
puts(table['a'][2].nil?)
puts(table['b'][0].key?('c'))
puts(table['b'][0]['c'] == 'c')
puts(table['b'][0].key?('d'))
puts(table['b'][0]['d'].nil?) # False ?
puts(!table['b'][0].key?('e'))
puts(table['b'][1].key?('c'))
puts(table['b'][1]['c'] == 'c')
puts(table['b'][1].key?('d'))
puts(table['b'][1]['d'].nil?)
puts(!table['b'][1].key?('e'))
puts(table['b'][2].key?('c'))
puts(table['b'][2]['c'].nil?)
puts(table['b'][2].key?('d'))
puts(table['b'][2]['d'] == 'd') # False ?
puts(!table['b'][2].key?('e'))
{code}
I'd expect {{puts(table)}} to print this representation:
{noformat}
a b
0 a {"c"=>"c", "d"=>nil}
1 {"c"=>"c", "d"=>nil}
2 {"c"=>nil, "d"=>"d"}
{noformat}
But it prints this instead:
{noformat}
a b
0 a {"c"=>"c", "d"=>"d"}
1 {"c"=>"c", "d"=>nil}
2 {"c"=>nil, "d"=>nil}
{noformat}
> [Ruby] Creating RecordBatch with structs missing keys results in a malformed
> table
> ----------------------------------------------------------------------------------
>
> Key: ARROW-9336
> URL: https://issues.apache.org/jira/browse/ARROW-9336
> Project: Apache Arrow
> Issue Type: Bug
> Components: Ruby
> Affects Versions: 0.17.1
> Reporter: Steven Willis
> Priority: Major
>
> Using {{::Arrow::RecordBatch.new(schema, data)}} (which uses the
> {{RecordBatchBuilder}}) appears to handle the case where a record is missing
> an entry for a top-level column, but it doesn't handle the case where a record
> is missing an entry within a struct column. For example, I'd expect the
> following code to print {{true}} for each {{puts}}, but two of them (the lines
> marked {{# False ?}}) print {{false}}:
> {code:ruby}
> require 'parquet'
> require 'arrow'
> schema = [
> {name: "a", type: "string"},
> {name: "b", type: "struct", fields: [
> {name: "c", type: "string"},
> {name: "d", type: "string"},
> ]
> },
> ]
> arrow_schema = ::Arrow::Schema.new(schema)
> record_batch = ::Arrow::RecordBatch.new(
> arrow_schema,
> [
> {"a" => "a", "b" => {"c" => "c", }},
> { "b" => {"c" => "c", }},
> { "b" => { "d" => "d"}},
> ]
> )
> table = record_batch.to_table
> puts(table['a'][0] == 'a')
> puts(table['a'][1].nil?)
> puts(table['a'][2].nil?)
> puts(table['b'][0].key?('c'))
> puts(table['b'][0]['c'] == 'c')
> puts(table['b'][0].key?('d'))
> puts(table['b'][0]['d'].nil?) # False ?
> puts(!table['b'][0].key?('e'))
> puts(table['b'][1].key?('c'))
> puts(table['b'][1]['c'] == 'c')
> puts(table['b'][1].key?('d'))
> puts(table['b'][1]['d'].nil?)
> puts(!table['b'][1].key?('e'))
> puts(table['b'][2].key?('c'))
> puts(table['b'][2]['c'].nil?)
> puts(table['b'][2].key?('d'))
> puts(table['b'][2]['d'] == 'd') # False ?
> puts(!table['b'][2].key?('e'))
> {code}
> I'd expect {{puts(table)}} to print this representation:
> {noformat}
> a b
> 0 a {"c"=>"c", "d"=>nil}
> 1 {"c"=>"c", "d"=>nil}
> 2 {"c"=>nil, "d"=>"d"}
> {noformat}
> But it prints this instead:
> {noformat}
> a b
> 0 a {"c"=>"c", "d"=>"d"}
> 1 {"c"=>"c", "d"=>nil}
> 2 {"c"=>nil, "d"=>nil}
> {noformat}
> Furthermore, trying to write that table out to a Parquet file results in the
> following error:
> {noformat}
> Traceback (most recent call last):
> 7: from arrow_parquet2.rb:53:in `<main>'
> 6: from
> /usr/local/lib/ruby/gems/2.6.0/gems/red-arrow-0.17.1/lib/arrow/block-closable.rb:25:in
> `open'
> 5: from arrow_parquet2.rb:54:in `block in <main>'
> 4: from
> /usr/local/lib/ruby/gems/2.6.0/gems/red-arrow-0.17.1/lib/arrow/block-closable.rb:25:in
> `open'
> 3: from arrow_parquet2.rb:56:in `block (2 levels) in <main>'
> 2: from
> /usr/local/lib/ruby/gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:514:in
> `block in define_method'
> 1: from
> /usr/local/lib/ruby/gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:600:in
> `invoke'
> /usr/local/lib/ruby/gems/2.6.0/gems/gobject-introspection-3.4.3/lib/gobject-introspection/loader.rb:600:in
> `invoke': [parquet][arrow][file-writer][write-table]: Invalid: Column 1: In
> chunk 0: Invalid: Struct child array #0 has length different from struct
> array (2 != 3) (Arrow::Error::Invalid)
> {noformat}
--
This message was sent by Atlassian Jira
(v8.3.4#803005)