ngli-me commented on code in PR #6758: URL: https://github.com/apache/arrow-rs/pull/6758#discussion_r1903200250
########## arrow-array/src/record_batch.rs: ########## @@ -394,6 +396,104 @@ impl RecordBatch { ) } + /// Normalize a semi-structured [`RecordBatch`] into a flat table. + /// + /// `separator`: Nested [`Field`]s will generate names separated by `separator`, e.g. for + /// separator= "." and the schema: + /// ```text + /// "foo": StructArray<"bar": Utf8> + /// ``` + /// will generate: + /// ```text + /// "foo.bar": Utf8 + /// ``` + /// `max_level`: The maximum number of levels (depth of the `Schema` and `Columns`) to + /// normalize. If `0`, normalizes all levels. + /// + /// # Example + /// + /// ``` + /// # use std::sync::Arc; + /// # use arrow_array::{ArrayRef, Int64Array, StringArray, StructArray, RecordBatch}; + /// # use arrow_schema::{DataType, Field, Fields, Schema}; + /// + /// let animals: ArrayRef = Arc::new(StringArray::from(vec!["Parrot", ""])); + /// let n_legs: ArrayRef = Arc::new(Int64Array::from(vec![Some(2), Some(4)])); + /// + /// let animals_field = Arc::new(Field::new("animals", DataType::Utf8, true)); + /// let n_legs_field = Arc::new(Field::new("n_legs", DataType::Int64, true)); + /// + /// let a = Arc::new(StructArray::from(vec![ + /// (animals_field.clone(), Arc::new(animals.clone()) as ArrayRef), + /// (n_legs_field.clone(), Arc::new(n_legs.clone()) as ArrayRef), + /// ])); + /// + /// let schema = Schema::new(vec![ + /// Field::new( + /// "a", + /// DataType::Struct(Fields::from(vec![animals_field, n_legs_field])), + /// false, + /// ) + /// ]); + /// + /// let normalized = RecordBatch::try_new(Arc::new(schema), vec![a]) + /// .expect("valid conversion") + /// .normalize(".", 0) + /// .expect("valid normalization"); + /// + /// let expected = RecordBatch::try_from_iter_with_nullable(vec![ + /// ("a.animals", animals.clone(), true), + /// ("a.n_legs", n_legs.clone(), true), + /// ]) + /// .expect("valid conversion"); + /// + /// assert_eq!(expected, normalized); + /// ``` + pub fn normalize(&self, separator: &str, mut max_level: usize) -> Result<Self, ArrowError> { + if max_level == 0 { + max_level = usize::MAX; + } + let mut queue: VecDeque<(usize, &ArrayRef, Vec<&str>, &DataType, bool)> = VecDeque::new(); + for (c, f) in self.columns.iter().zip(self.schema.fields()) { + let name_vec: Vec<&str> = vec![f.name()]; + queue.push_back((0, c, name_vec, f.data_type(), f.is_nullable())); + } + let mut columns: Vec<ArrayRef> = Vec::new(); + let mut fields: Vec<FieldRef> = Vec::new(); + + while let Some((depth, c, name, data_type, nullable)) = queue.pop_front() { + if depth < max_level { + match data_type { + DataType::Struct(ff) => { + // Need to zip these in reverse to maintain original order + for (cff, fff) in c.as_struct().columns().iter().zip(ff.into_iter()).rev() { + let mut name = name.clone(); + name.push(separator); + name.push(fff.name().as_str()); Review Comment: Yep this works, no idea why I added `as_str()`! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org