jecsand838 commented on PR #9605:
URL: https://github.com/apache/arrow-rs/pull/9605#issuecomment-4172638199
@ariel-miculas
I think the correctness fix here is right.
One improvement I'd recommend is pushing the writer-wire planning fully into
`codec.rs`, instead of carrying full `AvroDataType`s down into `record.rs` and
having `Skipper::from_avro` reconstruct the skip tree there.
That would keep the projection layer working with a precomputed writer-side
plan and should shave some decoder-construction work.
Maybe something like this in `codec.rs`:
```rust
#[derive(Debug, Clone, PartialEq)]
pub(crate) enum WriterSkipPlan {
Null,
Boolean,
Int32,
Int64,
Float32,
Float64,
Bytes,
String,
TimeMicros,
TimestampMillis,
TimestampMicros,
TimestampNanos,
Fixed(usize),
Decimal(Option<usize>),
UuidString,
Enum,
List(Box<WriterSkipPlan>),
Map(Box<WriterSkipPlan>),
Struct(Arc<[WriterSkipPlan]>),
Union(Arc<[WriterSkipPlan]>),
Nullable(Nullability, Box<WriterSkipPlan>),
DurationFixed12,
#[cfg(feature = "avro_custom_types")]
RunEndEncoded(Box<WriterSkipPlan>),
}
#[derive(Debug, Clone, PartialEq)]
pub(crate) enum ResolvedField {
ToReader(usize, WriterSkipPlan),
Skip(WriterSkipPlan),
}
````
and then materialize the writer-side skip tree in `codec.rs` once:
```rust
impl AvroDataType {
fn writer_skip_plan(&self) -> Result<WriterSkipPlan, ArrowError> {
let mut plan = match self.codec() {
Codec::Null => WriterSkipPlan::Null,
Codec::Boolean => WriterSkipPlan::Boolean,
Codec::Int32 | Codec::Date32 | Codec::TimeMillis =>
WriterSkipPlan::Int32,
Codec::Int64 => WriterSkipPlan::Int64,
Codec::TimeMicros => WriterSkipPlan::TimeMicros,
Codec::TimestampMillis(_) => WriterSkipPlan::TimestampMillis,
Codec::TimestampMicros(_) => WriterSkipPlan::TimestampMicros,
Codec::TimestampNanos(_) => WriterSkipPlan::TimestampNanos,
#[cfg(feature = "avro_custom_types")]
Codec::DurationNanos
| Codec::DurationMicros
| Codec::DurationMillis
| Codec::DurationSeconds => WriterSkipPlan::Int64,
#[cfg(feature = "avro_custom_types")]
Codec::Int8 | Codec::Int16 | Codec::UInt8 | Codec::UInt16 |
Codec::Time32Secs => {
WriterSkipPlan::Int32
}
#[cfg(feature = "avro_custom_types")]
Codec::UInt32 | Codec::Date64 | Codec::TimeNanos |
Codec::TimestampSecs(_) => {
WriterSkipPlan::Int64
}
#[cfg(feature = "avro_custom_types")]
Codec::UInt64 => WriterSkipPlan::Fixed(8),
#[cfg(feature = "avro_custom_types")]
Codec::Float16 => WriterSkipPlan::Fixed(2),
#[cfg(feature = "avro_custom_types")]
Codec::IntervalYearMonth => WriterSkipPlan::Fixed(4),
#[cfg(feature = "avro_custom_types")]
Codec::IntervalMonthDayNano => WriterSkipPlan::Fixed(16),
#[cfg(feature = "avro_custom_types")]
Codec::IntervalDayTime => WriterSkipPlan::Fixed(8),
Codec::Float32 => WriterSkipPlan::Float32,
Codec::Float64 => WriterSkipPlan::Float64,
Codec::Binary => WriterSkipPlan::Bytes,
Codec::Utf8 | Codec::Utf8View => WriterSkipPlan::String,
Codec::Fixed(sz) => WriterSkipPlan::Fixed(*sz as usize),
Codec::Decimal(_, _, size) => WriterSkipPlan::Decimal(*size),
Codec::Uuid => WriterSkipPlan::UuidString,
Codec::Enum(_) => WriterSkipPlan::Enum,
Codec::List(item) =>
WriterSkipPlan::List(Box::new(item.writer_skip_plan()?)),
Codec::Map(values) =>
WriterSkipPlan::Map(Box::new(values.writer_skip_plan()?)),
Codec::Struct(fields) => {
let children = if let Some(ResolutionInfo::Record(rec)) =
self.resolution.as_ref() {
rec.writer_fields
.iter()
.map(|f| match f {
ResolvedField::ToReader { writer_plan, .. }
| ResolvedField::Skip(writer_plan) =>
Ok(writer_plan.clone()),
})
.collect::<Result<Arc<[_]>, ArrowError>>()?
} else {
fields
.iter()
.map(|f| f.data_type().writer_skip_plan())
.collect::<Result<Arc<[_]>, ArrowError>>()?
};
WriterSkipPlan::Struct(children)
}
Codec::Interval => WriterSkipPlan::DurationFixed12,
Codec::Union(encodings, _, _) => {
let children = encodings
.iter()
.map(|e| e.writer_skip_plan())
.collect::<Result<Arc<[_]>, ArrowError>>()?;
WriterSkipPlan::Union(children)
}
#[cfg(feature = "avro_custom_types")]
Codec::RunEndEncoded(values, _) => {
WriterSkipPlan::RunEndEncoded(Box::new(values.writer_skip_plan()?))
}
};
if let Some(nullability) = self.nullability() {
plan = WriterSkipPlan::Nullable(nullability, Box::new(plan));
}
Ok(plan)
}
}
```
`resolve_records` can also stash the precomputed writer plan instead of a
full `AvroDataType`:
```rust
let writer_fields = writer_record
.fields
.iter()
.enumerate()
.map(|(writer_index, writer_field)| {
let dt = self.parse_type(&writer_field.r#type, writer_ns)?;
let writer_plan = dt.writer_skip_plan()?;
Ok(match writer_to_reader[writer_index] {
Some(reader_index) => ResolvedField::ToReader(
reader_index,
writer_plan,
),
None => ResolvedField::Skip(writer_plan),
})
})
.collect::<Result<_, ArrowError>>()?;
```
That way `record.rs` can consume a prebuilt writer-side plan instead of
reconstructing it from `AvroDataType` + `ResolutionInfo::Record`.
We can also implement these improvements in a follow-up as well.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]