ngli-me commented on code in PR #6758:
URL: https://github.com/apache/arrow-rs/pull/6758#discussion_r1855247625


##########
arrow-schema/src/schema.rs:
##########
@@ -413,6 +413,93 @@ impl Schema {
         &self.metadata
     }
 
+    /// Returns a new schema, normalized based on the max_level
+    /// This carries metadata from the parent schema over as well
+    pub fn normalize(&self, separator: &str, mut max_level: usize) -> 
Result<Self, ArrowError> {
+        if max_level == 0 {
+            max_level = usize::MAX;
+        }
+        let mut new_fields: Vec<Field> = vec![];
+        for field in self.fields() {
+            match field.data_type() {
+                //DataType::List(f) => field,
+                //DataType::ListView(_) => field,
+                //DataType::FixedSizeList(_, _) => field,
+                //DataType::LargeList(_) => field,
+                //DataType::LargeListView(_) => field,
+                DataType::Struct(nested_fields) => {
+                    let field_name = field.name().as_str();
+                    new_fields = [
+                        new_fields,
+                        Self::normalizer(

Review Comment:
   I'm not sure whether it would be better for this to be recursive or iterative.



##########
arrow-array/src/record_batch.rs:
##########
@@ -403,6 +406,68 @@ impl RecordBatch {
         )
     }
 
+    /// Normalize a semi-structured RecordBatch into a flat table
+    /// If max_level is 0, normalizes all levels.
+    pub fn normalize(&self, separator: &str, mut max_level: usize) -> 
Result<Self, ArrowError> {
+        if max_level == 0 {
+            max_level = usize::MAX;
+        }
+        if self.num_rows() == 0 {
+            // No data, only need to normalize the schema
+            return Ok(Self::new_empty(Arc::new(
+                self.schema.normalize(separator, max_level)?,
+            )));
+        }
+        let mut queue: VecDeque<(usize, &Arc<dyn Array>, &FieldRef)> = 
VecDeque::new();
+
+        // push fields
+        for (c, f) in self.columns.iter().zip(self.schema.fields()) {
+            queue.push_front((0, c, f));
+        }
+
+        while !queue.is_empty() {
+            match queue.pop_front() {
+                Some((depth, c, f)) => {
+
+                    if depth < max_level {
+                        match (c.data_type(), f.data_type()) {
+                            //DataType::List(f) => field,

Review Comment:
   If I understand correctly, I only need to unwrap `StructArray` and the 
associated `DataType::Struct` `Field`, right?



##########
arrow-array/src/record_batch.rs:
##########
@@ -403,6 +406,68 @@ impl RecordBatch {
         )
     }
 
+    /// Normalize a semi-structured RecordBatch into a flat table
+    /// If max_level is 0, normalizes all levels.
+    pub fn normalize(&self, separator: &str, mut max_level: usize) -> 
Result<Self, ArrowError> {
+        if max_level == 0 {
+            max_level = usize::MAX;
+        }
+        if self.num_rows() == 0 {
+            // No data, only need to normalize the schema
+            return Ok(Self::new_empty(Arc::new(
+                self.schema.normalize(separator, max_level)?,
+            )));
+        }
+        let mut queue: VecDeque<(usize, &Arc<dyn Array>, &FieldRef)> = 
VecDeque::new();
+
+        // push fields
+        for (c, f) in self.columns.iter().zip(self.schema.fields()) {
+            queue.push_front((0, c, f));
+        }
+
+        while !queue.is_empty() {
+            match queue.pop_front() {
+                Some((depth, c, f)) => {
+
+                    if depth < max_level {
+                        match (c.data_type(), f.data_type()) {
+                            //DataType::List(f) => field,
+                            //DataType::ListView(_) => field,
+                            //DataType::FixedSizeList(_, _) => field,
+                            //DataType::LargeList(_) => field,
+                            //DataType::LargeListView(_) => field,
+                            (DataType::Struct(cf), DataType::Struct(ff)) => {
+                                let field_name = f.name().as_str();
+                                let new_key = 
format!("{key_string}{separator}{field_name}");
+                                
ff.iter().rev().zip(cf.iter().rev()).map(|(field, ())| {
+                                    let updated_field = Field::new(
+                                        format!("{key_string}{separator}{}", 
field.name()),
+                                        field.data_type().clone(),
+                                        field.is_nullable(),
+                                    );
+                                    queue.push_front((
+                                        depth + 1,
+                                        c, // TODO: need to modify c -- if 
it's a StructArray, it needs to have the fields modified.
+                                        &Arc::new(updated_field),
+                                    ))
+                                });
+                            }
+                            //DataType::Union(_, _) => field,
+                            //DataType::Dictionary(_, _) => field,
+                            //DataType::Map(_, _) => field,
+                            //DataType::RunEndEncoded(_, _) => field, // not 
sure how to support this field
+                            _ => queue.push_front((depth, c, f)),
+                        }
+                    } else {
+                        queue.push_front((depth, c, f));
+                    }
+                }
+                None => break,

Review Comment:
   This `None` arm should probably return an `Err` rather than `break`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to