This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 6859c877cd Handle primitive REPEATED field not contained in LIST annotated group (#6649)
6859c877cd is described below
commit 6859c877cd44cff63c46a1c2e14592538bd09934
Author: Ze'ev Maor <[email protected]>
AuthorDate: Sat Nov 2 12:30:30 2024 +0200
Handle primitive REPEATED field not contained in LIST annotated group (#6649)
* Handle primitive REPEATED field not contained in LIST annotated group
* cargo fmt
* Add UT
* cargo fmt
* comment
* clippy
* clippy
* update parquet-testing module
* cargo fmt
---------
Co-authored-by: Ze'ev Maor <[email protected]>
---
parquet-testing | 2 +-
parquet/src/record/reader.rs | 137 ++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 137 insertions(+), 2 deletions(-)
diff --git a/parquet-testing b/parquet-testing
index 50af3d8ce2..550368ca77 160000
--- a/parquet-testing
+++ b/parquet-testing
@@ -1 +1 @@
-Subproject commit 50af3d8ce206990d81014b1862e5ce7380dc3e08
+Subproject commit 550368ca77b97231efead39251a96bd6f8f08c6e
diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs
index 1f9128a8b4..fd6ca7cdd5 100644
--- a/parquet/src/record/reader.rs
+++ b/parquet/src/record/reader.rs
@@ -138,7 +138,17 @@ impl TreeBuilder {
.column_descr_ptr();
let col_reader = row_group_reader.get_column_reader(orig_index)?;
let column = TripletIter::new(col_descr, col_reader,
self.batch_size);
- Reader::PrimitiveReader(field, Box::new(column))
+ let reader = Reader::PrimitiveReader(field.clone(),
Box::new(column));
+ if repetition == Repetition::REPEATED {
+ Reader::RepeatedReader(
+ field,
+ curr_def_level - 1,
+ curr_rep_level - 1,
+ Box::new(reader),
+ )
+ } else {
+ reader
+ }
} else {
match field.get_basic_info().converted_type() {
// List types
@@ -1688,6 +1698,131 @@ mod tests {
assert_eq!(rows, expected_rows);
}
+ #[test]
+ fn test_tree_reader_handle_primitive_repeated_fields_with_no_annotation() {
+ // In this test the REPEATED fields are primitives
+ let rows = test_file_reader_rows("repeated_primitive_no_list.parquet",
None).unwrap();
+ let expected_rows = vec![
+ row![
+ (
+ "Int32_list".to_string(),
+ Field::ListInternal(make_list([0, 1, 2,
3].map(Field::Int).to_vec()))
+ ),
+ (
+ "String_list".to_string(),
+ Field::ListInternal(make_list(
+ ["foo", "zero", "one", "two"]
+ .map(|s| Field::Str(s.to_string()))
+ .to_vec()
+ ))
+ ),
+ (
+ "group_of_lists".to_string(),
+ group![
+ (
+ "Int32_list_in_group".to_string(),
+ Field::ListInternal(make_list([0, 1, 2,
3].map(Field::Int).to_vec()))
+ ),
+ (
+ "String_list_in_group".to_string(),
+ Field::ListInternal(make_list(
+ ["foo", "zero", "one", "two"]
+ .map(|s| Field::Str(s.to_string()))
+ .to_vec()
+ ))
+ )
+ ]
+ )
+ ],
+ row![
+ (
+ "Int32_list".to_string(),
+ Field::ListInternal(make_list(vec![]))
+ ),
+ (
+ "String_list".to_string(),
+ Field::ListInternal(make_list(
+ ["three"].map(|s| Field::Str(s.to_string())).to_vec()
+ ))
+ ),
+ (
+ "group_of_lists".to_string(),
+ group![
+ (
+ "Int32_list_in_group".to_string(),
+ Field::ListInternal(make_list(vec![]))
+ ),
+ (
+ "String_list_in_group".to_string(),
+ Field::ListInternal(make_list(
+ ["three"].map(|s|
Field::Str(s.to_string())).to_vec()
+ ))
+ )
+ ]
+ )
+ ],
+ row![
+ (
+ "Int32_list".to_string(),
+ Field::ListInternal(make_list(vec![Field::Int(4)]))
+ ),
+ (
+ "String_list".to_string(),
+ Field::ListInternal(make_list(
+ ["four"].map(|s| Field::Str(s.to_string())).to_vec()
+ ))
+ ),
+ (
+ "group_of_lists".to_string(),
+ group![
+ (
+ "Int32_list_in_group".to_string(),
+ Field::ListInternal(make_list(vec![Field::Int(4)]))
+ ),
+ (
+ "String_list_in_group".to_string(),
+ Field::ListInternal(make_list(
+ ["four"].map(|s|
Field::Str(s.to_string())).to_vec()
+ ))
+ )
+ ]
+ )
+ ],
+ row![
+ (
+ "Int32_list".to_string(),
+ Field::ListInternal(make_list([5, 6, 7,
8].map(Field::Int).to_vec()))
+ ),
+ (
+ "String_list".to_string(),
+ Field::ListInternal(make_list(
+ ["five", "six", "seven", "eight"]
+ .map(|s| Field::Str(s.to_string()))
+ .to_vec()
+ ))
+ ),
+ (
+ "group_of_lists".to_string(),
+ group![
+ (
+ "Int32_list_in_group".to_string(),
+ Field::ListInternal(make_list([5, 6, 7,
8].map(Field::Int).to_vec()))
+ ),
+ (
+ "String_list_in_group".to_string(),
+ Field::ListInternal(make_list(
+ ["five", "six", "seven", "eight"]
+ .map(|s| Field::Str(s.to_string()))
+ .to_vec()
+ ))
+ )
+ ]
+ )
+ ],
+ ];
+ assert_eq!(rows, expected_rows);
+ }
+
fn test_file_reader_rows(file_name: &str, schema: Option<Type>) ->
Result<Vec<Row>> {
let file = get_test_file(file_name);
let file_reader: Box<dyn FileReader> =
Box::new(SerializedFileReader::new(file)?);