alamb commented on code in PR #3251:
URL: https://github.com/apache/arrow-rs/pull/3251#discussion_r1037251752
##########
arrow/src/row/mod.rs:
##########
@@ -1671,6 +1796,200 @@ mod tests {
let _ = converter.convert_rows(&rows);
}
+ fn test_single_list<O: OffsetSizeTrait>() {
+ let mut builder = GenericListBuilder::<O, _>::new(Int32Builder::new());
+ builder.values().append_value(32);
+ builder.values().append_value(52);
+ builder.values().append_value(32);
+ builder.append(true);
+ builder.values().append_value(32);
+ builder.values().append_value(52);
+ builder.values().append_value(12);
+ builder.append(true);
+ builder.values().append_value(32);
+ builder.values().append_value(52);
+ builder.append(true);
+ builder.values().append_value(32); // MASKED
Review Comment:
masked means this row is NULL so these values should be igored
##########
arrow/src/row/mod.rs:
##########
@@ -381,13 +430,29 @@ enum Codec {
/// A row converter for the child fields
/// and the encoding of a row containing only nulls
Struct(RowConverter, OwnedRow),
+ /// A row converter for the child field
+ List(RowConverter),
}
impl Codec {
fn new(sort_field: &SortField) -> Result<Self> {
match &sort_field.data_type {
DataType::Dictionary(_, _) =>
Ok(Self::Dictionary(Default::default())),
d if !d.is_nested() => Ok(Self::Stateless),
+ DataType::List(f) | DataType::LargeList(f) => {
+ // The encoded contents will be inverted if descending is set
to true
+ // As such we set `descending` to false and negate nulls first
if it
+ // it set to true
+ let options = SortOptions {
+ descending: false,
+ nulls_first: sort_field.options.nulls_first
+ != sort_field.options.descending,
+ };
+
+ let field = SortField::new_with_options(f.data_type().clone(),
options);
Review Comment:
I don't understand how descending sort is achieved if the list is always
encoded descending false 🤔 )
However, I see it is tested below, so 👍
```rust
let options = SortOptions {
descending: true,
nulls_first: false,
};
```
##########
arrow/src/row/list.rs:
##########
@@ -0,0 +1,178 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::compute::SortOptions;
+use crate::row::{RowConverter, Rows, SortField};
+use arrow_array::builder::BufferBuilder;
+use arrow_array::{Array, GenericListArray, OffsetSizeTrait};
+use arrow_data::ArrayDataBuilder;
+use arrow_schema::ArrowError;
+use std::ops::Range;
+
+pub fn compute_lengths<O: OffsetSizeTrait>(
+ lengths: &mut [usize],
+ rows: &Rows,
+ array: &GenericListArray<O>,
+) {
+ let offsets = array.value_offsets().windows(2);
+ lengths
+ .iter_mut()
+ .zip(offsets)
+ .enumerate()
+ .for_each(|(idx, (length, offsets))| {
+ let start = offsets[0].as_usize();
+ let end = offsets[1].as_usize();
+ let range = array.is_valid(idx).then_some(start..end);
+ *length += encoded_len(rows, range);
+ });
+}
+
+fn encoded_len(rows: &Rows, range: Option<Range<usize>>) -> usize {
+ match range {
+ None => 1,
+ Some(range) if range.start == range.end => 1,
+ Some(range) => {
+ let element_count = range.end - range.start;
+ let row_bytes = range.map(|i|
rows.row(i).as_ref().len()).sum::<usize>();
+ let total = (1 + element_count) * std::mem::size_of::<u32>() +
row_bytes;
+ super::variable::padded_length(Some(total))
+ }
+ }
+}
+
+/// Encodes the provided `GenericListArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded child elements
+pub fn encode<O: OffsetSizeTrait>(
+ out: &mut Rows,
+ rows: &Rows,
+ opts: SortOptions,
+ array: &GenericListArray<O>,
+) {
+ let mut temporary = vec![];
+ let offsets = array.value_offsets().windows(2);
+ out.offsets
+ .iter_mut()
+ .skip(1)
+ .zip(offsets)
+ .enumerate()
+ .for_each(|(idx, (offset, offsets))| {
+ let start = offsets[0].as_usize();
+ let end = offsets[1].as_usize();
+ let range = array.is_valid(idx).then_some(start..end);
+ let out = &mut out.buffer[*offset..];
+ *offset += encode_one(out, &mut temporary, rows, range, opts)
+ });
+}
+
+#[inline]
+fn encode_one(
+ out: &mut [u8],
+ temporary: &mut Vec<u8>,
+ rows: &Rows,
+ range: Option<Range<usize>>,
+ opts: SortOptions,
+) -> usize {
+ temporary.clear();
+
+ match range {
+ None => super::variable::encode_one(out, None, opts),
+ Some(range) if range.start == range.end => {
+ super::variable::encode_one(out, Some(&[]), opts)
+ }
+ Some(range) => {
+ for row in range.clone().map(|i| rows.row(i)) {
+ temporary.extend_from_slice(row.as_ref());
+ }
+ for row in range.clone().map(|i| rows.row(i)) {
+ let len: u32 = row
+ .as_ref()
+ .len()
+ .try_into()
+ .expect("lists with elements larger than u32::MAX not
supported");
Review Comment:
👍 I was wondering what would happen given here are only 32 bits for length
👍
```suggestion
.expect("ListArray or LargeListArray containing a list
of more than u32::MAX items is not supported");
```
##########
arrow/src/row/mod.rs:
##########
@@ -1671,6 +1796,200 @@ mod tests {
let _ = converter.convert_rows(&rows);
}
+ fn test_single_list<O: OffsetSizeTrait>() {
+ let mut builder = GenericListBuilder::<O, _>::new(Int32Builder::new());
+ builder.values().append_value(32);
+ builder.values().append_value(52);
+ builder.values().append_value(32);
+ builder.append(true);
+ builder.values().append_value(32);
+ builder.values().append_value(52);
+ builder.values().append_value(12);
+ builder.append(true);
+ builder.values().append_value(32);
+ builder.values().append_value(52);
+ builder.append(true);
+ builder.values().append_value(32); // MASKED
+ builder.values().append_value(52); // MASKED
+ builder.append(false);
+ builder.values().append_value(32);
+ builder.values().append_null();
+ builder.append(true);
+ builder.append(true);
+
+ let list = Arc::new(builder.finish()) as ArrayRef;
+ let d = list.data_type().clone();
+
+ let mut converter =
RowConverter::new(vec![SortField::new(d.clone())]).unwrap();
+
+ let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();
+ assert!(rows.row(0) > rows.row(1)); // [32, 52, 32] > [32, 52, 12]
+ assert!(rows.row(2) < rows.row(1)); // [32, 42] < [32, 52, 12]
+ assert!(rows.row(3) < rows.row(2)); // null < [32, 42]
+ assert!(rows.row(4) < rows.row(2)); // [32, null] < [32, 42]
+ assert!(rows.row(5) < rows.row(2)); // [] < [32, 42]
+ assert!(rows.row(3) < rows.row(5)); // null < []
+
+ let back = converter.convert_rows(&rows).unwrap();
+ assert_eq!(back.len(), 1);
+ back[0].data().validate_full().unwrap();
+ assert_eq!(&back[0], &list);
+
+ let options = SortOptions {
+ descending: false,
+ nulls_first: false,
+ };
+ let field = SortField::new_with_options(d.clone(), options);
+ let mut converter = RowConverter::new(vec![field]).unwrap();
+ let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();
+
+ assert!(rows.row(0) > rows.row(1)); // [32, 52, 32] > [32, 52, 12]
+ assert!(rows.row(2) < rows.row(1)); // [32, 42] < [32, 52, 12]
+ assert!(rows.row(3) > rows.row(2)); // null > [32, 42]
+ assert!(rows.row(4) > rows.row(2)); // [32, null] > [32, 42]
+ assert!(rows.row(5) < rows.row(2)); // [] < [32, 42]
+ assert!(rows.row(3) > rows.row(5)); // null > []
+
+ let back = converter.convert_rows(&rows).unwrap();
+ assert_eq!(back.len(), 1);
+ back[0].data().validate_full().unwrap();
+ assert_eq!(&back[0], &list);
+
+ let options = SortOptions {
+ descending: true,
+ nulls_first: false,
+ };
+ let field = SortField::new_with_options(d.clone(), options);
+ let mut converter = RowConverter::new(vec![field]).unwrap();
+ let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();
+
+ assert!(rows.row(0) < rows.row(1)); // [32, 52, 32] < [32, 52, 12]
+ assert!(rows.row(2) > rows.row(1)); // [32, 42] > [32, 52, 12]
+ assert!(rows.row(3) > rows.row(2)); // null > [32, 42]
+ assert!(rows.row(4) > rows.row(2)); // [32, null] > [32, 42]
+ assert!(rows.row(5) > rows.row(2)); // [] > [32, 42]
+ assert!(rows.row(3) > rows.row(5)); // null > []
+
+ let back = converter.convert_rows(&rows).unwrap();
+ assert_eq!(back.len(), 1);
+ back[0].data().validate_full().unwrap();
+ assert_eq!(&back[0], &list);
+
+ let options = SortOptions {
+ descending: true,
+ nulls_first: true,
+ };
+ let field = SortField::new_with_options(d, options);
+ let mut converter = RowConverter::new(vec![field]).unwrap();
+ let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();
+
+ assert!(rows.row(0) < rows.row(1)); // [32, 52, 32] < [32, 52, 12]
+ assert!(rows.row(2) > rows.row(1)); // [32, 42] > [32, 52, 12]
+ assert!(rows.row(3) < rows.row(2)); // null < [32, 42]
Review Comment:
👍
Thank you for the comments -- they make the tests easy to follow
##########
arrow/src/row/mod.rs:
##########
@@ -1671,6 +1796,200 @@ mod tests {
let _ = converter.convert_rows(&rows);
}
+ fn test_single_list<O: OffsetSizeTrait>() {
+ let mut builder = GenericListBuilder::<O, _>::new(Int32Builder::new());
+ builder.values().append_value(32);
+ builder.values().append_value(52);
+ builder.values().append_value(32);
+ builder.append(true);
+ builder.values().append_value(32);
+ builder.values().append_value(52);
+ builder.values().append_value(12);
+ builder.append(true);
+ builder.values().append_value(32);
+ builder.values().append_value(52);
+ builder.append(true);
+ builder.values().append_value(32); // MASKED
+ builder.values().append_value(52); // MASKED
+ builder.append(false);
+ builder.values().append_value(32);
+ builder.values().append_null();
+ builder.append(true);
+ builder.append(true);
+
+ let list = Arc::new(builder.finish()) as ArrayRef;
+ let d = list.data_type().clone();
+
+ let mut converter =
RowConverter::new(vec![SortField::new(d.clone())]).unwrap();
+
+ let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();
+ assert!(rows.row(0) > rows.row(1)); // [32, 52, 32] > [32, 52, 12]
+ assert!(rows.row(2) < rows.row(1)); // [32, 42] < [32, 52, 12]
+ assert!(rows.row(3) < rows.row(2)); // null < [32, 42]
+ assert!(rows.row(4) < rows.row(2)); // [32, null] < [32, 42]
+ assert!(rows.row(5) < rows.row(2)); // [] < [32, 42]
+ assert!(rows.row(3) < rows.row(5)); // null < []
+
+ let back = converter.convert_rows(&rows).unwrap();
+ assert_eq!(back.len(), 1);
+ back[0].data().validate_full().unwrap();
+ assert_eq!(&back[0], &list);
+
+ let options = SortOptions {
+ descending: false,
+ nulls_first: false,
+ };
+ let field = SortField::new_with_options(d.clone(), options);
+ let mut converter = RowConverter::new(vec![field]).unwrap();
+ let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();
+
+ assert!(rows.row(0) > rows.row(1)); // [32, 52, 32] > [32, 52, 12]
+ assert!(rows.row(2) < rows.row(1)); // [32, 42] < [32, 52, 12]
+ assert!(rows.row(3) > rows.row(2)); // null > [32, 42]
+ assert!(rows.row(4) > rows.row(2)); // [32, null] > [32, 42]
+ assert!(rows.row(5) < rows.row(2)); // [] < [32, 42]
+ assert!(rows.row(3) > rows.row(5)); // null > []
+
+ let back = converter.convert_rows(&rows).unwrap();
+ assert_eq!(back.len(), 1);
+ back[0].data().validate_full().unwrap();
+ assert_eq!(&back[0], &list);
+
+ let options = SortOptions {
+ descending: true,
+ nulls_first: false,
+ };
+ let field = SortField::new_with_options(d.clone(), options);
+ let mut converter = RowConverter::new(vec![field]).unwrap();
+ let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();
+
+ assert!(rows.row(0) < rows.row(1)); // [32, 52, 32] < [32, 52, 12]
+ assert!(rows.row(2) > rows.row(1)); // [32, 42] > [32, 52, 12]
+ assert!(rows.row(3) > rows.row(2)); // null > [32, 42]
+ assert!(rows.row(4) > rows.row(2)); // [32, null] > [32, 42]
+ assert!(rows.row(5) > rows.row(2)); // [] > [32, 42]
+ assert!(rows.row(3) > rows.row(5)); // null > []
+
+ let back = converter.convert_rows(&rows).unwrap();
+ assert_eq!(back.len(), 1);
+ back[0].data().validate_full().unwrap();
+ assert_eq!(&back[0], &list);
+
+ let options = SortOptions {
+ descending: true,
+ nulls_first: true,
+ };
+ let field = SortField::new_with_options(d, options);
+ let mut converter = RowConverter::new(vec![field]).unwrap();
+ let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();
+
+ assert!(rows.row(0) < rows.row(1)); // [32, 52, 32] < [32, 52, 12]
+ assert!(rows.row(2) > rows.row(1)); // [32, 42] > [32, 52, 12]
+ assert!(rows.row(3) < rows.row(2)); // null < [32, 42]
+ assert!(rows.row(4) < rows.row(2)); // [32, null] < [32, 42]
+ assert!(rows.row(5) > rows.row(2)); // [] > [32, 42]
+ assert!(rows.row(3) < rows.row(5)); // null < []
+
+ let back = converter.convert_rows(&rows).unwrap();
+ assert_eq!(back.len(), 1);
+ back[0].data().validate_full().unwrap();
+ assert_eq!(&back[0], &list);
+ }
+
+ fn test_nested_list<O: OffsetSizeTrait>() {
+ let mut builder = GenericListBuilder::<O, _>::new(
+ GenericListBuilder::<O, _>::new(Int32Builder::new()),
+ );
+
+ builder.values().values().append_value(1);
+ builder.values().values().append_value(2);
+ builder.values().append(true);
+ builder.values().values().append_value(1);
+ builder.values().values().append_null();
+ builder.values().append(true);
+ builder.append(true);
+
+ builder.values().values().append_value(1);
+ builder.values().values().append_null();
+ builder.values().append(true);
+ builder.values().values().append_value(1);
+ builder.values().values().append_null();
+ builder.values().append(true);
+ builder.append(true);
+
+ builder.values().values().append_value(1);
+ builder.values().values().append_null();
+ builder.values().append(true);
+ builder.values().append(false);
+ builder.append(true);
+ builder.append(false);
+
+ builder.values().values().append_value(1);
+ builder.values().values().append_value(2);
+ builder.values().append(true);
+ builder.append(true);
+
+ let list = Arc::new(builder.finish()) as ArrayRef;
+ let d = list.data_type().clone();
+
+ // [
+ // [[1, 2], [1, null]],
+ // [[1, null], [1, null]],
+ // [[1, null], null]
+ // null
+ // [[1, 2]]
+ // ]
+ let options = SortOptions {
+ descending: false,
+ nulls_first: true,
Review Comment:
I recommend adding a test in nested lists for `nulls_first: false`, and
verify that
```rust
assert!(rows.row(0) < rows.row(1));
```
##########
arrow/src/row/mod.rs:
##########
@@ -343,6 +344,54 @@ mod variable;
/// └───────┴───────────────┴───────┴─────────┴───────┘
/// ```
///
+/// ## List Encoding
+///
+/// Lists are encoded by first encoding all child elements to the row format.
+///
+/// A "canonical byte array" is then constructed by concatenating the row
+/// encodings of all their elements into a single binary array, followed
+/// by the lengths of each encoded row, and the number of elements, encoded
+/// as big endian `u32`.
+///
+/// This canonical byte array is then encoded using the variable length byte
+/// encoding described above.
+///
+/// _The lengths are not strictly necessary but greatly simplify decode, they
+/// may be removed in a future iteration_.
+///
+/// For example given:
+///
+/// ```text
+/// [1_u8, 2_u8, 3_u8]
+/// [1_u8, null]
+/// []
+/// null
+/// ```
+///
+/// The elements would be converted to:
+///
+/// ```text
+/// ┌──┬──┐ ┌──┬──┐ ┌──┬──┐ ┌──┬──┐ ┌──┬──┐
+/// 1 │01│01│ 2 │01│02│ 3 │01│02│ 1 │01│01│ null │00│00│
+/// └──┴──┘ └──┴──┘ └──┴──┘ └──┴──┘ └──┴──┘
+///```
+///
+/// Which would be grouped into the following canonical byte arrays:
+///
+/// ```text
+///
┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┐
+/// [1_u8, 2_u8, 3_u8]
│01│01│01│02│01│03│00│00│00│02│00│00│00│02│00│00│00│02│00│00│00│03│
Review Comment:
So basically `00│00│00│02│00│00│00│02│00│00│00│02│00│00│00│03` represents
`[element0_len, element1_len, element2_len, element count]` --> `[2, 2, 2, 3]`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]