vustef commented on code in PR #8715:
URL: https://github.com/apache/arrow-rs/pull/8715#discussion_r2501028305
##########
parquet/src/arrow/arrow_reader/mod.rs:
##########
@@ -516,6 +518,73 @@ impl ArrowReaderOptions {
}
}
+ /// Include virtual columns in the output.
+ ///
+ /// Virtual columns are columns that are not part of the Parquet schema,
but are added to the output by the reader.
+ ///
+ /// # Example
+ /// ```
+ /// # use std::sync::Arc;
+ /// # use arrow_array::{ArrayRef, Int64Array, RecordBatch};
+ /// # use arrow_schema::{DataType, Field, Schema};
+ /// # use parquet::arrow::{ArrowWriter, RowNumber};
+ /// # use parquet::arrow::arrow_reader::{ArrowReaderOptions,
ParquetRecordBatchReaderBuilder};
+ /// # use tempfile::tempfile;
+ /// #
+ /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+ /// // Create a simple record batch with some data
+ /// let values = Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef;
+ /// let batch = RecordBatch::try_from_iter(vec![("value", values)])?;
+ ///
+ /// // Write the batch to a temporary parquet file
+ /// let file = tempfile()?;
+ /// let mut writer = ArrowWriter::try_new(
+ /// file.try_clone()?,
+ /// batch.schema(),
+ /// None
+ /// )?;
+ /// writer.write(&batch)?;
+ /// writer.close()?;
+ ///
+ /// // Create a virtual column for row numbers
+ /// let row_number_field = Field::new("row_number", DataType::Int64, false)
+ /// .with_extension_type(RowNumber::default());
+ ///
+ /// // Configure options with virtual columns
+ /// let options = ArrowReaderOptions::new()
+ /// .with_virtual_columns(vec![row_number_field]);
+ ///
+ /// // Create a reader with the options
+ /// let mut reader = ParquetRecordBatchReaderBuilder::try_new_with_options(
+ /// file,
+ /// options
+ /// )?
+ /// .build()?;
+ ///
+ /// // Read the batch - it will include both the original column and the
virtual row_number column
+ /// let result_batch = reader.next().unwrap()?;
+ /// assert_eq!(result_batch.num_columns(), 2); // "value" + "row_number"
+ /// assert_eq!(result_batch.num_rows(), 3);
+ /// #
+ /// # Ok(())
+ /// # }
+ /// ```
+ pub fn with_virtual_columns(self, virtual_columns: Vec<Field>) -> Self {
Review Comment:
> I guess one benefit of the current API is that the user can choose the
output column name vs my proposal would have to hard code it
Think this is a big benefit though. In addition to it, they can also attach
some more metadata to the `Field`, e.g. field ID. Iceberg for some reason
reserve specific field IDs for its metadata fields (e.g. `_pos`, a column name
that would correspond to row number, should have field ID 2147483645, according
to [the spec](https://iceberg.apache.org/spec/#reserved-field-ids)).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]