alamb commented on code in PR #5149:
URL: https://github.com/apache/arrow-rs/pull/5149#discussion_r1409732355
##########
arrow-schema/src/fields.rs:
##########
@@ -99,6 +100,93 @@ impl Fields {
.all(|(a, b)| Arc::ptr_eq(a, b) || a.contains(b))
}
+ /// Performs a depth-first scan of [`Fields`] filtering the [`FieldRef`]
with no children
Review Comment:
Here is a proposal of how to better phrase what this is doing (as well as
give a usecase that might not be obvious to the casual reader)
```suggestion
/// Returns a copy of this [`Fields`], with only those non nested (leaf)
[`FieldRef`]s that
/// pass a predicate.
///
/// This can be used to select only a subset of fields in nested types
such as
/// [`DataType::Struct`].
/// Leaf [`FieldRef`]s `DataType`s have no nested `FieldRef`s`. For
example
/// a field with [`DatatType::Int32`] would be a leaf.
```
##########
arrow-schema/src/fields.rs:
##########
@@ -99,6 +100,93 @@ impl Fields {
.all(|(a, b)| Arc::ptr_eq(a, b) || a.contains(b))
}
+ /// Performs a depth-first scan of [`Fields`] filtering the [`FieldRef`]
with no children
+ ///
+ /// Invokes `filter` with each leaf [`FieldRef`], i.e. one containing no
children, and a
+ /// count of the number of previous calls to `filter` - i.e. the leaf's
index.
+ ///
+ /// Returns a new [`Fields`] comprising the [`FieldRef`] for which
`filter` returned `true`
+ ///
+ /// ```
+ /// # use arrow_schema::{DataType, Field, Fields};
+ /// let fields = Fields::from(vec![
+ /// Field::new("a", DataType::Int32, true),
+ /// Field::new("b", DataType::Struct(Fields::from(vec![
+ /// Field::new("c", DataType::Float32, false),
+ /// Field::new("d", DataType::Float64, false),
+ /// ])), false)
+ /// ]);
+ /// let filtered = fields.filter_leaves(|idx, _| idx == 0 || idx == 2);
+ /// let expected = Fields::from(vec![
+ /// Field::new("a", DataType::Int32, true),
+ /// Field::new("b", DataType::Struct(Fields::from(vec![
+ /// Field::new("d", DataType::Float64, false),
+ /// ])), false)
+ /// ]);
+ /// assert_eq!(filtered, expected);
+ /// ```
+ pub fn filter_leaves<F: FnMut(usize, &FieldRef) -> bool>(&self, mut
filter: F) -> Self {
+ fn filter_field<F: FnMut(&FieldRef) -> bool>(
+ f: &FieldRef,
+ filter: &mut F,
+ ) -> Option<FieldRef> {
+ use DataType::*;
+
+ let (k, v) = match f.data_type() {
+ Dictionary(k, v) => (Some(k.clone()), v.as_ref()),
Review Comment:
this presumes the key itself doesn't have any leaves which I think is
reasonable, but figured I would point it out.
##########
arrow-schema/src/fields.rs:
##########
@@ -99,6 +100,93 @@ impl Fields {
.all(|(a, b)| Arc::ptr_eq(a, b) || a.contains(b))
}
+ /// Performs a depth-first scan of [`Fields`] filtering the [`FieldRef`]
with no children
+ ///
+ /// Invokes `filter` with each leaf [`FieldRef`], i.e. one containing no
children, and a
+ /// count of the number of previous calls to `filter` - i.e. the leaf's
index.
Review Comment:
```suggestion
/// Invokes `filter` with each leaf [`FieldRef`] and a
/// count of the depth to the `field` (i.e. the number of previous calls
to `filter`, and the leaf's index.)
```
##########
arrow-schema/src/fields.rs:
##########
@@ -99,6 +100,90 @@ impl Fields {
.all(|(a, b)| Arc::ptr_eq(a, b) || a.contains(b))
}
+ /// Performs a depth-first scan of [`Fields`] filtering the [`FieldRef`]
with no children
+ ///
+ /// Returns a new [`Fields`] comprising the [`FieldRef`] for which
`filter` returned `true`
+ ///
+ /// ```
+ /// # use arrow_schema::{DataType, Field, Fields};
+ /// let fields = Fields::from(vec![
+ /// Field::new("a", DataType::Int32, true),
+ /// Field::new("b", DataType::Struct(Fields::from(vec![
+ /// Field::new("c", DataType::Float32, false),
+ /// Field::new("d", DataType::Float64, false),
+ /// ])), false)
+ /// ]);
+ /// let filtered = fields.filter_leaves(|idx, _| idx == 0 || idx == 2);
+ /// let expected = Fields::from(vec![
+ /// Field::new("a", DataType::Int32, true),
+ /// Field::new("b", DataType::Struct(Fields::from(vec![
+ /// Field::new("d", DataType::Float64, false),
+ /// ])), false)
+ /// ]);
+ /// assert_eq!(filtered, expected);
+ /// ```
+ pub fn filter_leaves<F: FnMut(usize, &FieldRef) -> bool>(&self, mut
filter: F) -> Self {
Review Comment:
I wonder how we would support selecting a subfield that is itself a field?
For example:
```
{
a : {
b: {
c: 1,
},
d: 2
}
```
Could I use this API to support only selecting a.b?
```
{
a : {
b: {
c: 1,
},
}
```
Maybe we could if the parent FieldRef was also passed
```rust
/// calls filter(depth, parent_field, child_field)`
pub fn filter_leaves<F: FnMut(usize, &FieldRef, &FieldRef) ->
bool>(&self, mut filter: F) -> Self {
...
}
```
🤔
##########
arrow-schema/src/fields.rs:
##########
@@ -99,6 +100,90 @@ impl Fields {
.all(|(a, b)| Arc::ptr_eq(a, b) || a.contains(b))
}
+ /// Performs a depth-first scan of [`Fields`] filtering the [`FieldRef`]
with no children
+ ///
+ /// Returns a new [`Fields`] comprising the [`FieldRef`] for which
`filter` returned `true`
+ ///
+ /// ```
+ /// # use arrow_schema::{DataType, Field, Fields};
+ /// let fields = Fields::from(vec![
+ /// Field::new("a", DataType::Int32, true),
+ /// Field::new("b", DataType::Struct(Fields::from(vec![
+ /// Field::new("c", DataType::Float32, false),
+ /// Field::new("d", DataType::Float64, false),
+ /// ])), false)
+ /// ]);
+ /// let filtered = fields.filter_leaves(|idx, _| idx == 0 || idx == 2);
+ /// let expected = Fields::from(vec![
+ /// Field::new("a", DataType::Int32, true),
+ /// Field::new("b", DataType::Struct(Fields::from(vec![
+ /// Field::new("d", DataType::Float64, false),
+ /// ])), false)
+ /// ]);
+ /// assert_eq!(filtered, expected);
+ /// ```
+ pub fn filter_leaves<F: FnMut(usize, &FieldRef) -> bool>(&self, mut
filter: F) -> Self {
Review Comment:
I wonder how we would support selecting a subfield that is itself a field?
For example:
```
{
a : {
b: {
c: 1,
},
d: 2
}
```
Could I use this API to support only selecting a.b?
```
{
a : {
b: {
c: 1,
},
}
```
Maybe we could if the parent FieldRef was also passed
```rust
/// calls filter(depth, parent_field, child_field)`
pub fn filter_leaves<F: FnMut(usize, &FieldRef, &FieldRef) ->
bool>(&self, mut filter: F) -> Self {
...
}
```
🤔
##########
arrow-schema/src/fields.rs:
##########
@@ -99,6 +100,93 @@ impl Fields {
.all(|(a, b)| Arc::ptr_eq(a, b) || a.contains(b))
}
+ /// Performs a depth-first scan of [`Fields`] filtering the [`FieldRef`]
with no children
+ ///
+ /// Invokes `filter` with each leaf [`FieldRef`], i.e. one containing no
children, and a
+ /// count of the number of previous calls to `filter` - i.e. the leaf's
index.
+ ///
+ /// Returns a new [`Fields`] comprising the [`FieldRef`] for which
`filter` returned `true`
+ ///
+ /// ```
+ /// # use arrow_schema::{DataType, Field, Fields};
+ /// let fields = Fields::from(vec![
+ /// Field::new("a", DataType::Int32, true),
+ /// Field::new("b", DataType::Struct(Fields::from(vec![
+ /// Field::new("c", DataType::Float32, false),
+ /// Field::new("d", DataType::Float64, false),
+ /// ])), false)
+ /// ]);
+ /// let filtered = fields.filter_leaves(|idx, _| idx == 0 || idx == 2);
+ /// let expected = Fields::from(vec![
+ /// Field::new("a", DataType::Int32, true),
+ /// Field::new("b", DataType::Struct(Fields::from(vec![
+ /// Field::new("d", DataType::Float64, false),
+ /// ])), false)
+ /// ]);
+ /// assert_eq!(filtered, expected);
+ /// ```
+ pub fn filter_leaves<F: FnMut(usize, &FieldRef) -> bool>(&self, mut
filter: F) -> Self {
+ fn filter_field<F: FnMut(&FieldRef) -> bool>(
+ f: &FieldRef,
+ filter: &mut F,
+ ) -> Option<FieldRef> {
+ use DataType::*;
+
+ let (k, v) = match f.data_type() {
+ Dictionary(k, v) => (Some(k.clone()), v.as_ref()),
+ d => (None, d),
+ };
+ let d = match v {
Review Comment:
[Map](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#variant.Map)
and
[RunEndEncoded](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#variant.RunEndEncoded)
appear to have embedded fields too. It seems like they might also need to be
handled 🤔
The usecase might be "I want only the `values` of the map? Or maybe Arrow
can't express that concept (Apply the filter to only the values 🤔 )
##########
arrow-schema/src/fields.rs:
##########
@@ -99,6 +100,93 @@ impl Fields {
.all(|(a, b)| Arc::ptr_eq(a, b) || a.contains(b))
}
+ /// Performs a depth-first scan of [`Fields`] filtering the [`FieldRef`]
with no children
+ ///
+ /// Invokes `filter` with each leaf [`FieldRef`], i.e. one containing no
children, and a
+ /// count of the number of previous calls to `filter` - i.e. the leaf's
index.
+ ///
+ /// Returns a new [`Fields`] comprising the [`FieldRef`] for which
`filter` returned `true`
+ ///
+ /// ```
+ /// # use arrow_schema::{DataType, Field, Fields};
+ /// let fields = Fields::from(vec![
+ /// Field::new("a", DataType::Int32, true),
+ /// Field::new("b", DataType::Struct(Fields::from(vec![
+ /// Field::new("c", DataType::Float32, false),
+ /// Field::new("d", DataType::Float64, false),
+ /// ])), false)
+ /// ]);
+ /// let filtered = fields.filter_leaves(|idx, _| idx == 0 || idx == 2);
+ /// let expected = Fields::from(vec![
+ /// Field::new("a", DataType::Int32, true),
+ /// Field::new("b", DataType::Struct(Fields::from(vec![
+ /// Field::new("d", DataType::Float64, false),
+ /// ])), false)
+ /// ]);
+ /// assert_eq!(filtered, expected);
+ /// ```
+ pub fn filter_leaves<F: FnMut(usize, &FieldRef) -> bool>(&self, mut
filter: F) -> Self {
+ fn filter_field<F: FnMut(&FieldRef) -> bool>(
+ f: &FieldRef,
+ filter: &mut F,
+ ) -> Option<FieldRef> {
+ use DataType::*;
+
+ let (k, v) = match f.data_type() {
+ Dictionary(k, v) => (Some(k.clone()), v.as_ref()),
+ d => (None, d),
+ };
+ let d = match v {
Review Comment:
[Map](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#variant.Map)
and
[RunEndEncoded](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#variant.RunEndEncoded)
appear to have embedded fields too. It seems like they might also need to be
handled 🤔
The usecase might be "I want only the `values` of the map? Or maybe Arrow
can't express that concept (Apply the filter to only the values 🤔 )
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]