vustef commented on code in PR #1824:
URL: https://github.com/apache/iceberg-rust/pull/1824#discussion_r2533534707
##########
crates/iceberg/src/scan/mod.rs:
##########
@@ -124,6 +125,47 @@ impl<'a> TableScanBuilder<'a> {
self
}
+ /// Include the _file metadata column in the scan.
+ ///
+ /// This is a convenience method that adds the _file column to the current
selection.
+ /// If no columns are currently selected (select_all), this will select
all columns plus _file.
+ /// If specific columns are selected, this adds _file to that selection.
+ ///
+ /// # Example
+ /// ```no_run
+ /// # use iceberg::table::Table;
+ /// # async fn example(table: Table) -> iceberg::Result<()> {
+ /// // Select id, name, and _file
+ /// let scan = table
+ /// .scan()
+ /// .select(["id", "name"])
+ /// .with_file_path_column()
+ /// .build()?;
+ /// # Ok(())
+ /// # }
+ /// ```
+ pub fn with_file_path_column(mut self) -> Self {
Review Comment:
nit: given that the column is named `_file`, maybe name this
`with_file_column`?
##########
crates/iceberg/src/arrow/record_batch_transformer.rs:
##########
@@ -539,83 +593,164 @@ impl RecordBatchTransformer {
prim_lit: &Option<PrimitiveLiteral>,
num_rows: usize,
) -> Result<ArrayRef> {
- Ok(match (target_type, prim_lit) {
- (DataType::Boolean, Some(PrimitiveLiteral::Boolean(value))) => {
- Arc::new(BooleanArray::from(vec![*value; num_rows]))
- }
- (DataType::Boolean, None) => {
- let vals: Vec<Option<bool>> = vec![None; num_rows];
- Arc::new(BooleanArray::from(vals))
- }
- (DataType::Int32, Some(PrimitiveLiteral::Int(value))) => {
- Arc::new(Int32Array::from(vec![*value; num_rows]))
- }
- (DataType::Int32, None) => {
- let vals: Vec<Option<i32>> = vec![None; num_rows];
- Arc::new(Int32Array::from(vals))
- }
- (DataType::Date32, Some(PrimitiveLiteral::Int(value))) => {
- Arc::new(Date32Array::from(vec![*value; num_rows]))
+ // All added columns use Run-End Encoding for memory efficiency
+ let DataType::RunEndEncoded(_, values_field) = target_type else {
+ return Err(Error::new(
+ ErrorKind::Unexpected,
+ format!(
+ "Expected RunEndEncoded type for added column, got: {}",
+ target_type
+ ),
+ ));
+ };
+
+ // Helper to create a Run-End Encoded array
+ let create_ree_array = |values_array: ArrayRef| -> Result<ArrayRef> {
+ let run_ends = if num_rows == 0 {
+ Int32Array::from(Vec::<i32>::new())
+ } else {
+ Int32Array::from(vec![num_rows as i32])
+ };
+ Ok(Arc::new(
+ RunArray::try_new(&run_ends, &values_array).map_err(|e| {
+ Error::new(
+ ErrorKind::Unexpected,
+ "Failed to create RunArray for constant value",
+ )
+ .with_source(e)
+ })?,
+ ))
+ };
+
+ // Create the values array based on the literal value
+ let values_array: ArrayRef = match (values_field.data_type(),
prim_lit) {
+ (DataType::Boolean, Some(PrimitiveLiteral::Boolean(v))) => {
+ Arc::new(BooleanArray::from(vec![*v]))
}
- (DataType::Date32, None) => {
- let vals: Vec<Option<i32>> = vec![None; num_rows];
- Arc::new(Date32Array::from(vals))
+ (DataType::Boolean, None) =>
Arc::new(BooleanArray::from(vec![Option::<bool>::None])),
+ (DataType::Int32, Some(PrimitiveLiteral::Int(v))) => {
+ Arc::new(Int32Array::from(vec![*v]))
}
- (DataType::Int64, Some(PrimitiveLiteral::Long(value))) => {
- Arc::new(Int64Array::from(vec![*value; num_rows]))
+ (DataType::Int32, None) =>
Arc::new(Int32Array::from(vec![Option::<i32>::None])),
+ (DataType::Date32, Some(PrimitiveLiteral::Int(v))) => {
+ Arc::new(Date32Array::from(vec![*v]))
}
- (DataType::Int64, None) => {
- let vals: Vec<Option<i64>> = vec![None; num_rows];
- Arc::new(Int64Array::from(vals))
+ (DataType::Date32, None) =>
Arc::new(Date32Array::from(vec![Option::<i32>::None])),
+ (DataType::Int64, Some(PrimitiveLiteral::Long(v))) => {
+ Arc::new(Int64Array::from(vec![*v]))
}
- (DataType::Float32, Some(PrimitiveLiteral::Float(value))) => {
- Arc::new(Float32Array::from(vec![value.0; num_rows]))
+ (DataType::Int64, None) =>
Arc::new(Int64Array::from(vec![Option::<i64>::None])),
+ (DataType::Float32, Some(PrimitiveLiteral::Float(v))) => {
+ Arc::new(Float32Array::from(vec![v.0]))
}
- (DataType::Float32, None) => {
- let vals: Vec<Option<f32>> = vec![None; num_rows];
- Arc::new(Float32Array::from(vals))
+ (DataType::Float32, None) =>
Arc::new(Float32Array::from(vec![Option::<f32>::None])),
+ (DataType::Float64, Some(PrimitiveLiteral::Double(v))) => {
+ Arc::new(Float64Array::from(vec![v.0]))
}
- (DataType::Float64, Some(PrimitiveLiteral::Double(value))) => {
- Arc::new(Float64Array::from(vec![value.0; num_rows]))
+ (DataType::Float64, None) =>
Arc::new(Float64Array::from(vec![Option::<f64>::None])),
+ (DataType::Utf8, Some(PrimitiveLiteral::String(v))) => {
+ Arc::new(StringArray::from(vec![v.as_str()]))
}
- (DataType::Float64, None) => {
- let vals: Vec<Option<f64>> = vec![None; num_rows];
- Arc::new(Float64Array::from(vals))
+ (DataType::Utf8, None) =>
Arc::new(StringArray::from(vec![Option::<&str>::None])),
+ (DataType::Binary, Some(PrimitiveLiteral::Binary(v))) => {
+ Arc::new(BinaryArray::from_vec(vec![v.as_slice()]))
}
- (DataType::Utf8, Some(PrimitiveLiteral::String(value))) => {
- Arc::new(StringArray::from(vec![value.clone(); num_rows]))
+ (DataType::Binary, None) => {
+
Arc::new(BinaryArray::from_opt_vec(vec![Option::<&[u8]>::None]))
}
- (DataType::Utf8, None) => {
- let vals: Vec<Option<String>> = vec![None; num_rows];
- Arc::new(StringArray::from(vals))
+ (DataType::Decimal128(_, _), Some(PrimitiveLiteral::Int128(v))) =>
{
+ Arc::new(arrow_array::Decimal128Array::from(vec![{ *v }]))
}
- (DataType::Binary, Some(PrimitiveLiteral::Binary(value))) => {
- Arc::new(BinaryArray::from_vec(vec![value; num_rows]))
+ (DataType::Decimal128(_, _), Some(PrimitiveLiteral::UInt128(v)))
=> {
+ Arc::new(arrow_array::Decimal128Array::from(vec![*v as i128]))
}
- (DataType::Binary, None) => {
- let vals: Vec<Option<&[u8]>> = vec![None; num_rows];
- Arc::new(BinaryArray::from_opt_vec(vals))
+ (DataType::Decimal128(_, _), None) => {
+ Arc::new(arrow_array::Decimal128Array::from(vec![
+ Option::<i128>::None,
+ ]))
}
(DataType::Struct(fields), None) => {
- // Create a StructArray filled with nulls. Per Iceberg spec,
optional struct fields
- // default to null when added to the schema. We defer non-null
default struct values
- // and leave them as not implemented yet.
+ // Create a single-element StructArray with nulls
let null_arrays: Vec<ArrayRef> = fields
.iter()
- .map(|field| Self::create_column(field.data_type(), &None,
num_rows))
- .collect::<Result<Vec<_>>>()?;
-
- Arc::new(StructArray::new(
+ .map(|f| {
+ // Recursively create null arrays for struct fields
+ // For primitive fields in structs, use simple null
arrays (not REE within struct)
+ match f.data_type() {
+ DataType::Boolean => {
+
Arc::new(BooleanArray::from(vec![Option::<bool>::None])) as ArrayRef
+ }
+ DataType::Int32 | DataType::Date32 => {
+
Arc::new(Int32Array::from(vec![Option::<i32>::None]))
+ }
+ DataType::Int64 => {
+
Arc::new(Int64Array::from(vec![Option::<i64>::None]))
+ }
+ DataType::Float32 => {
+
Arc::new(Float32Array::from(vec![Option::<f32>::None]))
+ }
+ DataType::Float64 => {
+
Arc::new(Float64Array::from(vec![Option::<f64>::None]))
+ }
+ DataType::Utf8 => {
+
Arc::new(StringArray::from(vec![Option::<&str>::None]))
+ }
+ DataType::Binary => {
+
Arc::new(BinaryArray::from_opt_vec(vec![Option::<&[u8]>::None]))
+ }
+ _ => panic!("Unsupported struct field type: {:?}",
f.data_type()),
+ }
+ })
+ .collect();
+ Arc::new(arrow_array::StructArray::new(
fields.clone(),
null_arrays,
- Some(NullBuffer::new_null(num_rows)),
+ Some(arrow_buffer::NullBuffer::new_null(1)),
))
}
- (DataType::Null, _) => Arc::new(NullArray::new(num_rows)),
- (dt, _) => {
+ _ => {
+ return Err(Error::new(
+ ErrorKind::Unexpected,
+ format!(
+ "Unsupported constant type combination: {:?} with
{:?}",
+ values_field.data_type(),
+ prim_lit
+ ),
+ ));
+ }
+ };
+
+ // Wrap in Run-End Encoding
+ create_ree_array(values_array)
+ }
+
+ /// Converts a PrimitiveLiteral to its corresponding Arrow DataType.
+ /// This is used for constant fields to determine the Arrow type.
+ /// For constant values, we use Run-End Encoding for all types to save
memory.
+ fn primitive_literal_to_arrow_type(literal: &PrimitiveLiteral) ->
Result<DataType> {
+ // Helper to create REE type with the given values type
+ // Note: values field is nullable as Arrow expects this when building
the
+ // final Arrow schema with `RunArray::try_new`.
+ let make_ree = |values_type: DataType| -> DataType {
Review Comment:
I'd limit REEs to this method only; the others are not really related to the PR.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]