alamb commented on code in PR #8715: URL: https://github.com/apache/arrow-rs/pull/8715#discussion_r2519554440
########## parquet/src/arrow/array_reader/row_number.rs: ########## @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::arrow::array_reader::ArrayReader; +use crate::errors::{ParquetError, Result}; +use crate::file::metadata::{ParquetMetaData, RowGroupMetaData}; +use arrow_array::{ArrayRef, Int64Array}; +use arrow_schema::DataType; +use std::any::Any; +use std::collections::HashSet; +use std::sync::Arc; + +pub(crate) struct RowNumberReader { + buffered_row_numbers: Vec<i64>, + remaining_row_numbers: std::iter::Flatten<std::vec::IntoIter<std::ops::Range<i64>>>, +} + +impl RowNumberReader { + pub(crate) fn try_new<'a>( + parquet_metadata: &'a ParquetMetaData, + row_groups: impl Iterator<Item = &'a RowGroupMetaData>, + ) -> Result<Self> { + // Collect ordinals from the selected row groups + let selected_ordinals: HashSet<i16> = row_groups + .map(|rg| { + rg.ordinal().ok_or_else(|| { Review Comment: looks good ########## parquet/src/arrow/arrow_reader/mod.rs: ########## @@ -537,6 +543,73 @@ impl ArrowReaderOptions { } } + /// Include virtual columns in the output. + /// + /// Virtual columns are columns that are not part of the Parquet schema, but are added to the output by the reader. 
Review Comment: ```suggestion /// Virtual columns are columns that are not part of the Parquet schema, but are added to the output by the reader such as row numbers. ``` ########## parquet/src/arrow/array_reader/mod.rs: ########## @@ -42,12 +42,13 @@ mod map_array; mod null_array; mod primitive_array; mod row_group_cache; +mod row_number; mod struct_array; #[cfg(test)] mod test_util; -// Note that this crate is public under the `experimental` feature flag. Review Comment: Fixed in 030525123aa ########## parquet/src/arrow/schema/complex.rs: ########## @@ -77,10 +78,18 @@ impl ParquetField { match &self.field_type { ParquetFieldType::Primitive { .. } => None, ParquetFieldType::Group { children } => Some(children), + ParquetFieldType::Virtual(_) => None, } } } +/// Types of virtual columns that can be computed at read time +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum VirtualColumnType { + /// Row number within the file + RowNumber, +} + #[derive(Debug, Clone)] pub enum ParquetFieldType { Review Comment: I double checked that this enum is not publicly exposed: https://docs.rs/parquet/latest/parquet/?search=ParquetFieldType Thus this is a backwards-compatible change. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
