Re: [PR] fix: avro_to_arrow: Handle avro nested nullable struct (union) [arrow-datafusion]

via GitHub Mon, 02 Oct 2023 10:10:38 -0700


sarutak commented on code in PR #7663:
URL: https://github.com/apache/arrow-datafusion/pull/7663#discussion_r1342876564



##########
datafusion/sqllogictest/test_files/avro.slt:
##########
@@ -101,11 +101,11 @@ SELECT id, CAST(string_col AS varchar) FROM 
alltypes_plain_multi_files
 1  1
 
 # test avro nested records
-query ??
-SELECT f1, f2 FROM nested_records
+query ????
+SELECT f1, f2, f3, f4 FROM nested_records
 ----
-{ns2.record2.f1_1: aaa, ns2.record2.f1_2: 10, ns2.record2.f1_3: 
{ns3.record3.f1_3_1: 3.14}} [{ns4.record4.f2_1: true, ns4.record4.f2_2: 1.2}, 
{ns4.record4.f2_1: true, ns4.record4.f2_2: 2.2}]
-{ns2.record2.f1_1: bbb, ns2.record2.f1_2: 20, ns2.record2.f1_3: 
{ns3.record3.f1_3_1: 3.14}} [{ns4.record4.f2_1: false, ns4.record4.f2_2: 10.2}]
+{f1_1: aaa, f1_2: 10, f1_3: {f1_3_1: 3.14}} [{f2_1: true, f2_2: 1.2}, {f2_1: 
true, f2_2: 2.2}] {f3_1: xyz} [{f4_1: 200}, {f4_1: }]

Review Comment:
   `[{f4_1: 200}, {f4_1: }]` seems a little bit weird. it should be like 
`[{f4_1: 200}, NULL]` or `[{f4_1: 200}, ]`?



##########
datafusion/core/src/datasource/avro_to_arrow/arrow_array_reader.rs:
##########
@@ -35,7 +35,7 @@ use crate::arrow::error::ArrowError;
 use crate::arrow::record_batch::RecordBatch;
 use crate::arrow::util::bit_util;
 use crate::error::{DataFusionError, Result};
-use apache_avro::schema::RecordSchema;
+use apache_avro::schema::{RecordSchema, UnionSchema};

Review Comment:
   `UnionSchema` is unused.



##########
datafusion/core/src/datasource/avro_to_arrow/schema.rs:
##########
@@ -129,8 +129,8 @@ fn schema_to_field_with_props(
                     }*/
                     schema_to_field_with_props(
                         &field.schema,
-                        Some(&format!("{}.{}", name.fullname(None), 
field.name)),
-                        false,
+                        Some(&field.name),
+                        nullable,

Review Comment:
   I don't think this change is correct because nullability of a record an its 
children are independent.



##########
datafusion/core/src/datasource/avro_to_arrow/arrow_array_reader.rs:
##########
@@ -97,15 +97,15 @@ impl<'a, R: Read> AvroArrowArrayReader<'a, R> {
         schema_lookup: &'b mut BTreeMap<String, usize>,
     ) -> Result<&'b BTreeMap<String, usize>> {
         match schema {
-            AvroSchema::Record(RecordSchema {
-                name,
-                fields,
-                lookup,
-                ..
-            }) => {
+            AvroSchema::Union(union_schema) => {
+                if union_schema.is_nullable() {
+                    let rec_schema = &union_schema.variants()[1];
+                    Self::child_schema_lookup(rec_schema, schema_lookup)?;
+                }
+            }
+            AvroSchema::Record(RecordSchema { fields, lookup, .. }) => {
                 lookup.iter().for_each(|(field_name, pos)| {
-                    schema_lookup
-                        .insert(format!("{}.{}", name.fullname(None), 
field_name), *pos);
+                    schema_lookup.insert(field_name.clone(), *pos);

Review Comment:
   This part looks good to me.
   I agree that it's better not to use full name of a record.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] fix: avro_to_arrow: Handle avro nested nullable struct (union) [arrow-datafusion]

Reply via email to