tustvold commented on code in PR #1716:
URL: https://github.com/apache/arrow-rs/pull/1716#discussion_r879404146
##########
parquet/src/arrow/schema.rs:
##########
@@ -51,74 +52,18 @@ pub fn parquet_to_arrow_schema(
) -> Result<Schema> {
parquet_to_arrow_schema_by_columns(
parquet_schema,
- 0..parquet_schema.columns().len(),
+ ProjectionMask::all(),
key_value_metadata,
)
}
-/// Convert parquet schema to arrow schema including optional metadata,
-/// only preserving some root columns.
-/// This is useful if we have columns `a.b`, `a.c.e` and `a.d`,
-/// and want `a` with all its child fields
-pub fn parquet_to_arrow_schema_by_root_columns<T>(
- parquet_schema: &SchemaDescriptor,
- column_indices: T,
- key_value_metadata: Option<&Vec<KeyValue>>,
-) -> Result<Schema>
-where
- T: IntoIterator<Item = usize>,
-{
- // Reconstruct the index ranges of the parent columns
- // An Arrow struct gets represented by 1+ columns based on how many child
fields the
- // struct has. This means that getting fields 1 and 2 might return the
struct twice,
- // if field 1 is the struct having say 3 fields, and field 2 is a
primitive.
- //
- // The below gets the parent columns, and counts the number of child
fields in each parent,
- // such that we would end up with:
- // - field 1 - columns: [0, 1, 2]
- // - field 2 - columns: [3]
- let mut parent_columns = vec![];
- let mut curr_name = "";
- let mut prev_name = "";
- let mut indices = vec![];
- (0..(parquet_schema.num_columns())).for_each(|i| {
- let p_type = parquet_schema.get_column_root(i);
- curr_name = p_type.get_basic_info().name();
- if prev_name.is_empty() {
- // first index
- indices.push(i);
- prev_name = curr_name;
- } else if curr_name != prev_name {
- prev_name = curr_name;
- parent_columns.push((curr_name.to_string(), indices.clone()));
- indices = vec![i];
- } else {
- indices.push(i);
- }
- });
- // push the last column if indices has values
- if !indices.is_empty() {
- parent_columns.push((curr_name.to_string(), indices));
- }
-
- // gather the required leaf columns
- let leaf_columns = column_indices
- .into_iter()
- .flat_map(|i| parent_columns[i].1.clone());
-
- parquet_to_arrow_schema_by_columns(parquet_schema, leaf_columns,
key_value_metadata)
-}
-
/// Convert parquet schema to arrow schema including optional metadata,
/// only preserving some leaf columns.
-pub fn parquet_to_arrow_schema_by_columns<T>(
+pub fn parquet_to_arrow_schema_by_columns(
parquet_schema: &SchemaDescriptor,
- column_indices: T,
+ mask: ProjectionMask,
Review Comment:
Currently yes, it gets moved into the Visitor. Theoretically it could borrow
and have lifetimes, but in most cases I suspect we have the mask by value
anyway.
Edit: This might be an argument for moving to the arrow Bitmap, as that is
internally refcounted... A future PR, methinks.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]