alamb commented on code in PR #7971: URL: https://github.com/apache/arrow-rs/pull/7971#discussion_r2258189640
########## parquet/tests/arrow_reader/io/async_reader.rs: ########## @@ -0,0 +1,430 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests for the async reader ([`ParquetRecordBatchStreamBuilder`]) + +use crate::io::{ + filter_a_175_b_625, filter_b_575_625, filter_b_false, test_file, test_options, LogEntry, + OperationLog, TestParquetFile, +}; +use bytes::Bytes; +use futures::future::BoxFuture; +use futures::{FutureExt, StreamExt}; +use parquet::arrow::arrow_reader::{ArrowReaderOptions, RowSelection, RowSelector}; +use parquet::arrow::async_reader::AsyncFileReader; +use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask}; +use parquet::errors::Result; +use parquet::file::metadata::ParquetMetaData; +use std::ops::Range; +use std::sync::Arc; + +#[tokio::test] +async fn test_read_entire_file() { + let test_file = test_file(); + // Expect to see IO for all data pages for each row group and column + let builder = async_builder(&test_file, test_options()).await; + run_test( + &test_file, + builder, + [ + "Get Provided Metadata", Review Comment: The whole point of this PR is to get tests in this style: human readable descriptions of what IO the decoders are doing. 
I am quite pleased with how it came out, though it took a lot of work 😅 ########## parquet/tests/arrow_reader/io/async_reader.rs: ########## @@ -0,0 +1,434 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests for the async reader ([`ParquetRecordBatchStreamBuilder`]) + +use crate::io::{ + filter_a_175_b_625, filter_b_575_625, filter_b_false, test_file, test_options, LogEntry, + OperationLog, TestParquetFile, +}; +use bytes::Bytes; +use futures::future::BoxFuture; +use futures::{FutureExt, StreamExt}; +use parquet::arrow::arrow_reader::{ArrowReaderOptions, RowSelection, RowSelector}; +use parquet::arrow::async_reader::AsyncFileReader; +use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask}; +use parquet::errors::Result; +use parquet::file::metadata::ParquetMetaData; +use std::ops::Range; +use std::sync::Arc; + +#[tokio::test] +async fn test_read_entire_file() { + // read entire file without any filtering or projection + let test_file = test_file(); + let builder = async_builder(&test_file, test_options()).await; + run_test( + &test_file, + builder, + [ + "Get Provided Metadata", + "Event: Builder Configured", + "Event: Reader Built", + "Read Multi:", + " Row Group 0, column 'a': 
MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + " Row Group 0, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + " Row Group 0, column 'c': MultiPage(dictionary_page: true, data_pages: [0, 1]) (7346 bytes, 1 requests) [data]", + "Read Multi:", + " Row Group 1, column 'a': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + " Row Group 1, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + " Row Group 1, column 'c': MultiPage(dictionary_page: true, data_pages: [0, 1]) (7456 bytes, 1 requests) [data]", + ] + ) + .await; +} + +#[tokio::test] +async fn test_read_single_group() { + let test_file = test_file(); + let builder = async_builder(&test_file, test_options()) + .await + // read only second row group + .with_row_groups(vec![1]); + + // Expect to see only IO for Row Group 1. Should see no IO for Row Group 0. + run_test( + &test_file, + builder, + [ + "Get Provided Metadata", + "Event: Builder Configured", + "Event: Reader Built", + "Read Multi:", + " Row Group 1, column 'a': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + " Row Group 1, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + " Row Group 1, column 'c': MultiPage(dictionary_page: true, data_pages: [0, 1]) (7456 bytes, 1 requests) [data]", + ] + ).await; +} + +#[tokio::test] +async fn test_read_single_column() { + let test_file = test_file(); + let builder = async_builder(&test_file, test_options()).await; + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let builder = builder.with_projection(ProjectionMask::columns(&schema_descr, ["b"])); + // Expect to see only IO for column "b". Should see no IO for columns "a" or "c". 
+ run_test( + &test_file, + builder, + [ + "Get Provided Metadata", + "Event: Builder Configured", + "Event: Reader Built", + "Read Multi:", + " Row Group 0, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + "Read Multi:", + " Row Group 1, column 'b': MultiPage(dictionary_page: true, data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + ] + ).await; +} + +#[tokio::test] +async fn test_read_row_selection() { + // There are 400 total rows spread across 4 data pages (100 rows each) + // select rows 175..225 (i.e. DataPage(1) of row group 0 and DataPage(0) of row group 1) + let test_file = test_file(); + let builder = async_builder(&test_file, test_options()).await; + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let builder = builder + .with_projection(ProjectionMask::columns(&schema_descr, ["a", "b"])) + .with_row_selection(RowSelection::from(vec![ + RowSelector::skip(175), + RowSelector::select(50), + ])); + + // Expect to see only data IO for one page for each column for each row group + run_test( + &test_file, + builder, + [ + "Get Provided Metadata", + "Event: Builder Configured", + "Event: Reader Built", + "Read Multi:", + " Row Group 0, column 'a': DictionaryPage (1617 bytes, 1 requests) [data]", + " Row Group 0, column 'a': DataPage(1) (126 bytes , 1 requests) [data]", + " Row Group 0, column 'b': DictionaryPage (1617 bytes, 1 requests) [data]", + " Row Group 0, column 'b': DataPage(1) (126 bytes , 1 requests) [data]", + "Read Multi:", + " Row Group 1, column 'a': DictionaryPage (1617 bytes, 1 requests) [data]", + " Row Group 1, column 'a': DataPage(0) (113 bytes , 1 requests) [data]", + " Row Group 1, column 'b': DictionaryPage (1617 bytes, 1 requests) [data]", + " Row Group 1, column 'b': DataPage(0) (113 bytes , 1 requests) [data]", + ], + ) + .await; +} + +#[tokio::test] +async fn test_read_limit() { + // There are 400 total rows spread across 4 data pages (100 rows each) + 
// a limit of 125 rows should only fetch the first two data pages (DataPage(0) and DataPage(1)) from row group 0 + let test_file = test_file(); + let builder = async_builder(&test_file, test_options()).await; + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let builder = builder + .with_projection(ProjectionMask::columns(&schema_descr, ["a"])) + .with_limit(125); + + run_test( + &test_file, + builder, + [ + "Get Provided Metadata", + "Event: Builder Configured", + "Event: Reader Built", + "Read Multi:", + " Row Group 0, column 'a': DictionaryPage (1617 bytes, 1 requests) [data]", + " Row Group 0, column 'a': DataPage(0) (113 bytes , 1 requests) [data]", + " Row Group 0, column 'a': DataPage(1) (126 bytes , 1 requests) [data]", + ], + ) + .await; +} + +#[tokio::test] +async fn test_read_single_row_filter() { + // Values from column "b" range 400..799 + // filter "b" > 575 and < than 625 + // (last data page in Row Group 0 and first DataPage in Row Group 1) + let test_file = test_file(); + let builder = async_builder(&test_file, test_options()).await; + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + + let builder = builder + .with_projection(ProjectionMask::columns(&schema_descr, ["a", "b"])) + .with_row_filter(filter_b_575_625(&schema_descr)); + + // Expect to see I/O for column b in both row groups to evaluate filter, + // then a single pages for the "a" column in each row group + run_test( Review Comment: Here you can see the async reader does IO during read, not during reader construction, which is different than the sync reader ########## parquet/tests/arrow_reader/io/sync_reader.rs: ########## @@ -0,0 +1,454 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests for the sync reader - [`ParquetRecordBatchReaderBuilder`] + +use crate::io::{ + filter_a_175_b_625, filter_b_575_625, filter_b_false, test_file, test_options, LogEntry, + OperationLog, TestParquetFile, +}; + +use bytes::Bytes; +use parquet::arrow::arrow_reader::{ + ArrowReaderOptions, ParquetRecordBatchReaderBuilder, RowSelection, RowSelector, +}; +use parquet::arrow::ProjectionMask; +use parquet::file::reader::{ChunkReader, Length}; +use std::io::Read; +use std::sync::Arc; + +#[test] +fn test_read_entire_file() { + // read entire file without any filtering or projection + let test_file = test_file(); + // Expect to see IO for all data pages for each row group and column + let builder = sync_builder(&test_file, test_options()); + run_test( + &test_file, + builder, + [ + "Footer: 8 bytes", + "Metadata: 1162", + "UNKNOWN: 22230..22877 (maybe Page Index)", + "Event: Builder Configured", + "Event: Reader Built", + "Row Group 0, column 'a': DictionaryPage (1617 bytes, 1 requests) [data]", + "Row Group 0, column 'a': DataPage(0) (113 bytes , 1 requests) [data]", + "Row Group 0, column 'a': DataPage(1) (126 bytes , 1 requests) [data]", + "Row Group 1, column 'a': DictionaryPage (1617 bytes, 1 requests) [data]", + "Row Group 1, column 'a': DataPage(0) (113 bytes , 1 requests) [data]", + "Row Group 1, column 'a': DataPage(1) (126 bytes , 1 requests) [data]", + "Row Group 0, column 
'b': DictionaryPage (1617 bytes, 1 requests) [data]", + "Row Group 0, column 'b': DataPage(0) (113 bytes , 1 requests) [data]", + "Row Group 0, column 'b': DataPage(1) (126 bytes , 1 requests) [data]", + "Row Group 1, column 'b': DictionaryPage (1617 bytes, 1 requests) [data]", + "Row Group 1, column 'b': DataPage(0) (113 bytes , 1 requests) [data]", + "Row Group 1, column 'b': DataPage(1) (126 bytes , 1 requests) [data]", + "Row Group 0, column 'c': DictionaryPage (7107 bytes, 1 requests) [data]", + "Row Group 0, column 'c': DataPage(0) (113 bytes , 1 requests) [data]", + "Row Group 0, column 'c': DataPage(1) (126 bytes , 1 requests) [data]", + "Row Group 1, column 'c': DictionaryPage (7217 bytes, 1 requests) [data]", + "Row Group 1, column 'c': DataPage(0) (113 bytes , 1 requests) [data]", + "Row Group 1, column 'c': DataPage(1) (126 bytes , 1 requests) [data]", + ], + ); +} + +#[test] +fn test_read_single_group() { + let test_file = test_file(); + let builder = sync_builder(&test_file, test_options()).with_row_groups(vec![1]); // read only second row group + + // Expect to see only IO for Row Group 1. Should see no IO for Row Group 0. 
+ run_test( + &test_file, + builder, + [ + "Footer: 8 bytes", + "Metadata: 1162", + "UNKNOWN: 22230..22877 (maybe Page Index)", + "Event: Builder Configured", + "Event: Reader Built", + "Row Group 1, column 'a': DictionaryPage (1617 bytes, 1 requests) [data]", + "Row Group 1, column 'a': DataPage(0) (113 bytes , 1 requests) [data]", + "Row Group 1, column 'a': DataPage(1) (126 bytes , 1 requests) [data]", + "Row Group 1, column 'b': DictionaryPage (1617 bytes, 1 requests) [data]", + "Row Group 1, column 'b': DataPage(0) (113 bytes , 1 requests) [data]", + "Row Group 1, column 'b': DataPage(1) (126 bytes , 1 requests) [data]", + "Row Group 1, column 'c': DictionaryPage (7217 bytes, 1 requests) [data]", + "Row Group 1, column 'c': DataPage(0) (113 bytes , 1 requests) [data]", + "Row Group 1, column 'c': DataPage(1) (126 bytes , 1 requests) [data]", + ], + ); +} + +#[test] +fn test_read_single_column() { + let test_file = test_file(); + let builder = sync_builder(&test_file, test_options()); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let builder = builder.with_projection(ProjectionMask::columns(&schema_descr, ["b"])); + // Expect to see only IO for column "b". Should see no IO for columns "a" or "c". 
+ run_test( + &test_file, + builder, + [ + "Footer: 8 bytes", + "Metadata: 1162", + "UNKNOWN: 22230..22877 (maybe Page Index)", + "Event: Builder Configured", + "Event: Reader Built", + "Row Group 0, column 'b': DictionaryPage (1617 bytes, 1 requests) [data]", + "Row Group 0, column 'b': DataPage(0) (113 bytes , 1 requests) [data]", + "Row Group 0, column 'b': DataPage(1) (126 bytes , 1 requests) [data]", + "Row Group 1, column 'b': DictionaryPage (1617 bytes, 1 requests) [data]", + "Row Group 1, column 'b': DataPage(0) (113 bytes , 1 requests) [data]", + "Row Group 1, column 'b': DataPage(1) (126 bytes , 1 requests) [data]", + ], + ); +} + +#[test] +fn test_read_single_column_no_page_index() { + let test_file = test_file(); + let options = test_options().with_page_index(false); + let builder = sync_builder(&test_file, options); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let builder = builder.with_projection(ProjectionMask::columns(&schema_descr, ["b"])); + // Expect to see only IO for column "b", should see no IO for columns "a" or "c". 
+ // + // Note that we need to read all data page headers to find the pages for column b + // so there are many more small reads than in the test_read_single_column test above + run_test( + &test_file, + builder, + [ + "Footer: 8 bytes", + "Metadata: 1162", + "Event: Builder Configured", + "Event: Reader Built", + "Row Group 0, column 'b': DictionaryPage (17 bytes , 17 requests) [header]", + "Row Group 0, column 'b': DictionaryPage (1600 bytes, 1 requests) [data]", + "Row Group 0, column 'b': DataPage(0) (20 bytes , 20 requests) [header]", + "Row Group 0, column 'b': DataPage(0) (93 bytes , 1 requests) [data]", + "Row Group 0, column 'b': DataPage(1) (20 bytes , 20 requests) [header]", + "Row Group 0, column 'b': DataPage(1) (106 bytes , 1 requests) [data]", + "Row Group 1, column 'b': DictionaryPage (17 bytes , 17 requests) [header]", + "Row Group 1, column 'b': DictionaryPage (1600 bytes, 1 requests) [data]", + "Row Group 1, column 'b': DataPage(0) (20 bytes , 20 requests) [header]", + "Row Group 1, column 'b': DataPage(0) (93 bytes , 1 requests) [data]", + "Row Group 1, column 'b': DataPage(1) (20 bytes , 20 requests) [header]", + "Row Group 1, column 'b': DataPage(1) (106 bytes , 1 requests) [data]", + ], + ); +} + +#[test] +fn test_read_row_selection() { + // There are 400 total rows spread across 4 data pages (100 rows each) + // select rows 175..225 (i.e. 
DataPage(1) of row group 0 and DataPage(0) of row group 1) + let test_file = test_file(); + let builder = sync_builder(&test_file, test_options()); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let builder = builder + .with_projection( + // read both "a" and "b" + ProjectionMask::columns(&schema_descr, ["a", "b"]), + ) + .with_row_selection(RowSelection::from(vec![ + RowSelector::skip(175), + RowSelector::select(50), + ])); + + // Expect to see only data IO for one page for each column for each row group + // Note the data page headers for all pages need to be read to find the correct pages + run_test( + &test_file, + builder, + [ + "Footer: 8 bytes", + "Metadata: 1162", + "UNKNOWN: 22230..22877 (maybe Page Index)", + "Event: Builder Configured", + "Event: Reader Built", + "Row Group 0, column 'a': DictionaryPage (1617 bytes, 1 requests) [data]", + "Row Group 0, column 'a': DataPage(1) (126 bytes , 1 requests) [data]", + "Row Group 0, column 'b': DictionaryPage (1617 bytes, 1 requests) [data]", + "Row Group 0, column 'b': DataPage(1) (126 bytes , 1 requests) [data]", + "Row Group 1, column 'a': DictionaryPage (1617 bytes, 1 requests) [data]", + "Row Group 1, column 'a': DataPage(0) (113 bytes , 1 requests) [data]", + "Row Group 1, column 'b': DictionaryPage (1617 bytes, 1 requests) [data]", + "Row Group 1, column 'b': DataPage(0) (113 bytes , 1 requests) [data]", + ], + ); +} + +#[test] +fn test_read_limit() { + // There are 400 total rows spread across 4 data pages (100 rows each) + // a limit of 125 rows should only fetch the first two data pages (DataPage(0) and DataPage(1)) from row group 0 + let test_file = test_file(); + let builder = sync_builder(&test_file, test_options()); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let builder = builder + .with_projection(ProjectionMask::columns(&schema_descr, ["a"])) + .with_limit(125); + + run_test( + &test_file, + builder, + [ + "Footer: 8 bytes", + 
"Metadata: 1162", + "UNKNOWN: 22230..22877 (maybe Page Index)", + "Event: Builder Configured", + "Event: Reader Built", + "Row Group 0, column 'a': DictionaryPage (1617 bytes, 1 requests) [data]", + "Row Group 0, column 'a': DataPage(0) (113 bytes , 1 requests) [data]", + "Row Group 0, column 'a': DataPage(1) (126 bytes , 1 requests) [data]", + ], + ); +} + +#[test] +fn test_read_single_row_filter() { + // Values from column "b" range 400..799 + // filter "b" > 575 and < 625 + // (last data page in Row Group 0 and first DataPage in Row Group 1) + let test_file = test_file(); + let builder = sync_builder(&test_file, test_options()); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + + let builder = builder + .with_projection( + // read both "a" and "b" + ProjectionMask::columns(&schema_descr, ["a", "b"]), + ) + // "b" > 575 and "b" < 625 + .with_row_filter(filter_b_575_625(&schema_descr)); + + // Expect to see I/O for column b in both row groups and then reading just a + // single pages for a in each row group + // + // Note there is significant IO that happens during the construction of the + // reader (between "Builder Configured" and "Reader Built") + run_test( Review Comment: This is also pretty cool -- it shows that the IO for evaluating the filter with the sync reader actually happens during the construction ########## parquet/tests/arrow_reader/io/async_reader.rs: ########## @@ -0,0 +1,424 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests for the async reader ([`ParquetRecordBatchStreamBuilder`]) + +use crate::io::{ + filter_a_175_b_625, filter_b_575_625, filter_b_false, test_file, test_options, LogEntry, + OperationLog, TestParquetFile, +}; +use bytes::Bytes; +use futures::future::BoxFuture; +use futures::{FutureExt, StreamExt}; +use parquet::arrow::arrow_reader::{ArrowReaderOptions, RowSelection, RowSelector}; +use parquet::arrow::async_reader::AsyncFileReader; +use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask}; +use parquet::errors::Result; +use parquet::file::metadata::ParquetMetaData; +use std::ops::Range; +use std::sync::Arc; + +#[tokio::test] +async fn test_read_entire_file() { + let test_file = test_file(); + // Expect to see IO for all data pages for each row group and column + let builder = async_builder(&test_file, test_options()).await; + run_test( + &test_file, + builder, + [ + "Get Provided Metadata", + "Event: Builder Configured", + "Event: Reader Built", + "Read Multi:", + " Row Group 0, column 'a': MultiPage(dictionary_page: true, : data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + " Row Group 0, column 'b': MultiPage(dictionary_page: true, : data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + " Row Group 0, column 'c': MultiPage(dictionary_page: true, : data_pages: [0, 1]) (7346 bytes, 1 requests) [data]", + "Read Multi:", + " Row Group 1, column 'a': MultiPage(dictionary_page: true, : data_pages: [0, 1]) (1856 bytes, 1 requests) [data]", + " Row Group 1, column 'b': MultiPage(dictionary_page: true, : data_pages: [0, 1]) 
(1856 bytes, 1 requests) [data]", + " Row Group 1, column 'c': MultiPage(dictionary_page: true, : data_pages: [0, 1]) (7456 bytes, 1 requests) [data]", + ] + ) + .await; +} + +#[tokio::test] +async fn test_read_single_group() { + let test_file = test_file(); + let builder = async_builder(&test_file, test_options()) + .await + .with_row_groups(vec![1]); // read only second row group + + // Expect to see only IO for Row Group 1. + // Should see no IO for Row Group 0. + run_test( Review Comment: I find it pretty cool to see the IO patterns visible and tested -- like here is the IO pattern showing that projection pushdown actually reduces IO! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org