tustvold commented on code in PR #3212:
URL: https://github.com/apache/arrow-rs/pull/3212#discussion_r1033973275
##########
arrow/src/row/mod.rs:
##########
@@ -332,8 +340,99 @@ mod variable;
#[derive(Debug)]
pub struct RowConverter {
fields: Arc<[SortField]>,
- /// interning state for column `i`, if column`i` is a dictionary
- interners: Vec<Option<Box<OrderPreservingInterner>>>,
+ /// State for codecs
+ codecs: Vec<Codec>,
+}
+
+#[derive(Debug)]
+enum Codec {
+ /// No additional codec state is necessary
+ Stateless,
+ // The interner used to encode dictionary values
+ Dictionary(OrderPreservingInterner),
+ // A row converter for the child fields
+ // and the encoding of a row containing only nulls
+ Struct(RowConverter, OwnedRow),
+}
+
+impl Codec {
+ fn new(sort_field: &SortField) -> Result<Self> {
+ match &sort_field.data_type {
+ DataType::Dictionary(_, _) =>
Ok(Self::Dictionary(Default::default())),
+ d if !d.is_nested() => Ok(Self::Stateless),
+ DataType::Struct(f) => {
+ let sort_fields = f
+ .iter()
+ .map(|x| {
+ SortField::new_with_options(
+ x.data_type().clone(),
+ sort_field.options,
+ )
+ })
+ .collect();
+
+ let mut converter = RowConverter::new(sort_fields)?;
+ let nulls: Vec<_> =
+ f.iter().map(|x| new_null_array(x.data_type(),
1)).collect();
+
+ let nulls = converter.convert_columns(&nulls)?;
+ let owned = OwnedRow {
+ data: nulls.buffer,
+ config: nulls.config,
+ };
+
+ Ok(Self::Struct(converter, owned))
+ }
+ _ => Err(ArrowError::NotYetImplemented(format!(
+ "not yet implemented: {:?}",
+ sort_field.data_type
+ ))),
+ }
+ }
+
+ fn encoder(&mut self, array: &dyn Array) -> Result<Encoder<'_>> {
+ match self {
+ Codec::Stateless => Ok(Encoder::Stateless),
+ Codec::Dictionary(interner) => {
+ let values = downcast_dictionary_array! {
+ array => array.values(),
+ _ => unreachable!()
+ };
+
+ let mapping = compute_dictionary_mapping(interner, values)
+ .into_iter()
+ .map(|maybe_interned| {
+ maybe_interned.map(|interned|
interner.normalized_key(interned))
+ })
+ .collect();
+
+ Ok(Encoder::Dictionary(mapping))
+ }
+ Codec::Struct(converter, null) => {
+ let v = as_struct_array(array);
+ let rows = converter.convert_columns(v.columns())?;
Review Comment:
Yup - we just flatten the schema, so a `Struct{Int32, Int32},
Struct{Float32}` is encoded similarly to `Int32, Int32, Float32`, albeit with
additional nullability for the structs.
##########
arrow/src/row/mod.rs:
##########
@@ -783,54 +881,64 @@ fn null_sentinel(options: SortOptions) -> u8 {
}
/// Computes the length of each encoded [`Rows`] and returns an empty [`Rows`]
-fn new_empty_rows(
- cols: &[ArrayRef],
- dictionaries: &[Option<Vec<Option<&[u8]>>>],
- config: RowConfig,
-) -> Rows {
+fn new_empty_rows(cols: &[ArrayRef], encoders: &[Encoder], config: RowConfig)
-> Rows {
use fixed::FixedLengthEncoding;
let num_rows = cols.first().map(|x| x.len()).unwrap_or(0);
let mut lengths = vec![0; num_rows];
- for (array, dict) in cols.iter().zip(dictionaries) {
- downcast_primitive_array! {
- array => lengths.iter_mut().for_each(|x| *x +=
fixed::encoded_len(array)),
- DataType::Null => {},
- DataType::Boolean => lengths.iter_mut().for_each(|x| *x +=
bool::ENCODED_LEN),
- DataType::Binary => as_generic_binary_array::<i32>(array)
- .iter()
- .zip(lengths.iter_mut())
- .for_each(|(slice, length)| *length +=
variable::encoded_len(slice)),
- DataType::LargeBinary => as_generic_binary_array::<i64>(array)
- .iter()
- .zip(lengths.iter_mut())
- .for_each(|(slice, length)| *length +=
variable::encoded_len(slice)),
- DataType::Utf8 => as_string_array(array)
- .iter()
- .zip(lengths.iter_mut())
- .for_each(|(slice, length)| {
- *length += variable::encoded_len(slice.map(|x|
x.as_bytes()))
- }),
- DataType::LargeUtf8 => as_largestring_array(array)
- .iter()
- .zip(lengths.iter_mut())
- .for_each(|(slice, length)| {
- *length += variable::encoded_len(slice.map(|x|
x.as_bytes()))
- }),
- DataType::Dictionary(_, _) => downcast_dictionary_array! {
- array => {
- let dict = dict.as_ref().unwrap();
- for (v, length) in
array.keys().iter().zip(lengths.iter_mut()) {
- match v.and_then(|v| dict[v as usize]) {
- Some(k) => *length += k.len() + 1,
- None => *length += 1,
+ for (array, encoder) in cols.iter().zip(encoders) {
+ match encoder {
+ Encoder::Stateless => {
+ downcast_primitive_array! {
+ array => lengths.iter_mut().for_each(|x| *x +=
fixed::encoded_len(array)),
+ DataType::Null => {},
+ DataType::Boolean => lengths.iter_mut().for_each(|x| *x +=
bool::ENCODED_LEN),
+ DataType::Binary => as_generic_binary_array::<i32>(array)
+ .iter()
+ .zip(lengths.iter_mut())
+ .for_each(|(slice, length)| *length +=
variable::encoded_len(slice)),
+ DataType::LargeBinary =>
as_generic_binary_array::<i64>(array)
+ .iter()
+ .zip(lengths.iter_mut())
+ .for_each(|(slice, length)| *length +=
variable::encoded_len(slice)),
+ DataType::Utf8 => as_string_array(array)
Review Comment:
Oh yes, most definitely :grin:
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]