scovich commented on code in PR #8274: URL: https://github.com/apache/arrow-rs/pull/8274#discussion_r2322847044
########## arrow-avro/src/writer/encoder.rs: ########## @@ -275,3 +384,434 @@ impl F64Encoder<'_> { .map_err(|e| ArrowError::IoError(format!("write f64: {e}"), e)) } } + +struct Utf8GenericEncoder<'a, O: OffsetSizeTrait>(&'a GenericStringArray<O>); + +impl<'a, O: OffsetSizeTrait> Utf8GenericEncoder<'a, O> { + #[inline] + fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> { + write_len_prefixed(out, self.0.value(idx).as_bytes()) + } +} + +type Utf8Encoder<'a> = Utf8GenericEncoder<'a, i32>; +type Utf8LargeEncoder<'a> = Utf8GenericEncoder<'a, i64>; + +/// Unified field encoder: +/// - Holds the inner `Encoder` (by value) +/// - Tracks the column/site null buffer and whether any nulls exist +/// - Carries per-site Avro `Nullability` and precomputed union branch (fast path) +pub struct FieldEncoder<'a> { + encoder: Encoder<'a>, + nulls: Option<NullBuffer>, + has_nulls: bool, + nullability: Option<Nullability>, + /// Precomputed constant branch byte if the site is nullable but contains no nulls + pre: Option<u8>, +} + +impl<'a> FieldEncoder<'a> { + fn make_encoder( + array: &'a dyn Array, + field: &Field, + plan: PlanRef<'_>, + ) -> Result<Self, ArrowError> { + let nulls = array.nulls().cloned(); + let has_nulls = array.null_count() > 0; + let encoder = match plan { + FieldPlan::Struct { encoders } => { + let arr = array + .as_any() + .downcast_ref::<StructArray>() + .ok_or_else(|| ArrowError::SchemaError("Expected StructArray".into()))?; + Encoder::Struct(Box::new(StructEncoder::try_new(arr, encoders)?)) + } + FieldPlan::List { + items_nullability, + item_plan, + } => match array.data_type() { + DataType::List(_) => { + let arr = array + .as_any() + .downcast_ref::<ListArray>() + .ok_or_else(|| ArrowError::SchemaError("Expected ListArray".into()))?; + Encoder::List(Box::new(ListEncoder32::try_new( + arr, + *items_nullability, + item_plan.as_ref(), + )?)) + } + DataType::LargeList(_) => { + let arr = array + .as_any() + .downcast_ref::<LargeListArray>() + .ok_or_else(|| ArrowError::SchemaError("Expected LargeListArray".into()))?; + Encoder::LargeList(Box::new(ListEncoder64::try_new( + arr, + *items_nullability, + item_plan.as_ref(), + )?)) + } + other => { + return Err(ArrowError::SchemaError(format!( + "Avro array site requires Arrow List/LargeList, found: {other:?}" + ))) + } + }, + FieldPlan::Scalar => match array.data_type() { + DataType::Boolean => Encoder::Boolean(BooleanEncoder(array.as_boolean())), + DataType::Utf8 => { + Encoder::Utf8(Utf8GenericEncoder::<i32>(array.as_string::<i32>())) + } + DataType::LargeUtf8 => { + Encoder::Utf8Large(Utf8GenericEncoder::<i64>(array.as_string::<i64>())) + } + DataType::Int32 => Encoder::Int(IntEncoder(array.as_primitive::<Int32Type>())), + DataType::Int64 => Encoder::Long(LongEncoder(array.as_primitive::<Int64Type>())), + DataType::Float32 => { + Encoder::Float32(F32Encoder(array.as_primitive::<Float32Type>())) + } + DataType::Float64 => { + Encoder::Float64(F64Encoder(array.as_primitive::<Float64Type>())) + } + DataType::Binary => Encoder::Binary(BinaryEncoder(array.as_binary::<i32>())), + DataType::LargeBinary => { + Encoder::LargeBinary(BinaryEncoder(array.as_binary::<i64>())) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => Encoder::Timestamp(LongEncoder( + array.as_primitive::<TimestampMicrosecondType>(), + )), + other => { + return Err(ArrowError::NotYetImplemented(format!( + "Avro scalar type not yet supported: {other:?}" + ))); + } + }, + other => { + return Err(ArrowError::NotYetImplemented( + "Avro writer: {other:?} not yet supported".into(), + )); + } + }; + Ok(Self { + encoder, + nulls, + has_nulls, + nullability: None, + pre: None, + }) + } + + #[inline] + fn has_nulls(&self) -> bool { + self.has_nulls + } + + #[inline] + fn is_null(&self, idx: usize) -> bool { + self.nulls + .as_ref() + .is_some_and(|null_buffer| null_buffer.is_null(idx)) + } + + #[inline] + fn with_effective_nullability(mut self, order: Option<Nullability>) -> Self { + self.nullability = order; + self.pre = self.precomputed_union_value_branch(order); + self + } + + #[inline] + fn precomputed_union_value_branch(&self, order: Option<Nullability>) -> Option<u8> { + match (order, self.has_nulls()) { + (Some(Nullability::NullFirst), false) => Some(0x02), // value branch index 1 + (Some(Nullability::NullSecond), false) => Some(0x00), // value branch index 0 + _ => None, + } + } + + #[inline] + fn encode_inner<W: Write + ?Sized>( + &mut self, + idx: usize, + out: &mut W, + ) -> Result<(), ArrowError> { + self.encoder.encode(idx, out) + } + + #[inline] + fn encode<W: Write + ?Sized>(&mut self, idx: usize, out: &mut W) -> Result<(), ArrowError> { + if let Some(b) = self.pre { + return out + .write_all(&[b]) + .map_err(|e| ArrowError::IoError(format!("write union value branch: {e}"), e)) + .and_then(|_| self.encode_inner(idx, out)); + } + if let Some(order) = self.nullability { + let is_null = self.is_null(idx); + write_optional_index(out, is_null, order)?; + if is_null { + return Ok(()); + } + } + self.encode_inner(idx, out) Review Comment: One thought that struck me this morning -- do we actually need `has_nulls` at all? Or is it actually just an intermediate calculation? In other words, if the constructor sees `!has_nulls` then: * set `nulls = None` even if the input array had a null buffer (because we just proved it's a no-op) * generate `pre` |nulls|nullability|pre|note |:-:|:-:|:-:|- |None|None|None|Non-nullable data with non-nullable writer |None|None|Some|**INVALID** -- Some `pre` implies Some `nullability` |None|Some|None|**Missed optimization** -- we should have Some `pre` |None|Some|Some|Non-nullable data with nullable writer (leverage `pre`) |Some|None|None|**INVALID** -- Some `nulls` but writer is non-nullable |Some|None|Some|**INVALID** -- Some `pre` implies Some `nullability`<br>**INVALID** -- Some `nulls` but writer is non-nullable<br>**INVALID** -- Some `pre` implies None `nulls` |Some|Some|None|Nullable data with nulls and nullable writer (cannot leverage `pre`) |Some|Some|Some|**INVALID** -- Some `pre` implies None `nulls` ... but it's still a huge truth table with only three valid states... Maybe better to just define an enum with three variants? ```rust enum NullState { // non-nullable writer with non-nullable data NonNullable, // nullable writer with non-nullable data NullableButNoNulls(NullOrdering, u8), // nullable writer with nullable data Nullable(NullBuffer, NullOrdering), } ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org