jecsand838 commented on code in PR #7451:
URL: https://github.com/apache/arrow-rs/pull/7451#discussion_r2069363906
##########
arrow-avro/src/reader/record.rs:
##########
@@ -267,10 +305,83 @@ impl Decoder {
.collect::<Result<Vec<_>, _>>()?;
Arc::new(StructArray::new(fields.clone(), arrays, nulls))
}
+ Self::Map(map_field, k_off, m_off, kdata, valdec) => {
+ let moff = flush_offsets(m_off);
+ let koff = flush_offsets(k_off);
+ let kd = flush_values(kdata).into();
+ let val_arr = valdec.flush(None)?;
+ let key_arr = StringArray::new(koff, kd, None);
+ if key_arr.len() != val_arr.len() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Map keys length ({}) != map values length ({})",
+ key_arr.len(),
+ val_arr.len()
+ )));
+ }
+ let final_len = moff.len() - 1;
+ if let Some(n) = &nulls {
+ if n.len() != final_len {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Map array null buffer length {} != final map
length {final_len}",
+ n.len()
+ )));
+ }
+ }
+ let entries_struct = StructArray::new(
+ Fields::from(vec![
+ Arc::new(ArrowField::new("key", DataType::Utf8,
false)),
+ Arc::new(ArrowField::new("value",
val_arr.data_type().clone(), true)),
+ ]),
+ vec![Arc::new(key_arr), val_arr],
+ None,
+ );
+ let map_arr = MapArray::new(map_field.clone(), moff,
entries_struct, nulls, false);
+ Arc::new(map_arr)
+ }
})
}
}
+
+fn read_map_blocks(
+ buf: &mut AvroCursor,
+ decode_entry: impl FnMut(&mut AvroCursor) -> Result<(), ArrowError>,
+) -> Result<usize, ArrowError> {
+ read_blockwise_items(buf, true, decode_entry)
+}
+
+fn read_blockwise_items(
+ buf: &mut AvroCursor,
+ read_size_after_negative: bool,
+ mut decode_fn: impl FnMut(&mut AvroCursor) -> Result<(), ArrowError>,
+) -> Result<usize, ArrowError> {
+ let mut total = 0usize;
+ loop {
+ let blk = buf.get_long()?;
Review Comment:
@klion26
`blk` is the block count and there are cases where `blk` will be negative.
A negative `blk` is expected by the Avro spec for block-encoded arrays &
maps, indicating that the count is `-blk` items. Avro decoders usually handle
this by reading the size marker and then proceeding to decode `|blk|` entries
in that block. After finishing the block, decoding continues with the next
block count, until a 0 count terminates the sequence.
Here's the text from the [Avro
Specification](https://avro.apache.org/docs/1.11.1/specification/#maps)
regarding maps with negative block counts:
> If a block's count is negative, its absolute value is used, and the count
is followed immediately by a long block size indicating the number of bytes in
the block. This block size permits fast skipping through data, e.g., when
projecting a record to a subset of its fields.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]