alamb commented on code in PR #7850:
URL: https://github.com/apache/arrow-rs/pull/7850#discussion_r2257175036


##########
parquet/src/arrow/async_reader/mod.rs:
##########
@@ -1832,6 +1882,7 @@ mod tests {
         assert_eq!(total_rows, 730);
     }
 
+    #[ignore]

Review Comment:
   I don't think we can merge this PR without un-ignoring this test
   
   I think it is showing a regression. When I I looked into it more, and it 
seems like the new cache, even when disabled, is changing the behavior and 
fetching more pages. Specifically it looks like we now fetch all the pages, 
even those that they are supposed to be skipped:
   
   ```
   Expected page requests: [
       113..222,
       331..440,
       573..682,
       791..900,
       1033..1142,
       1251..1360,
   ...
   ```
   
   ```
   Actual page requests: [
       4..113,
       113..222,
       222..331,
       331..440,
       440..573,
       573..682,
       682..791,
       791..900,
       900..1033,
       1033..1142,
       1142..1251,
       1251..1360,
   ...
   ```
   
   
   
   Here is the diff I was using to investigate:
   
   <details><summary>Details</summary>
   <p>
   
   ```diff
   diff --git a/parquet/src/arrow/async_reader/mod.rs 
b/parquet/src/arrow/async_reader/mod.rs
   index 843ad766e9..b3da39c48e 100644
   --- a/parquet/src/arrow/async_reader/mod.rs
   +++ b/parquet/src/arrow/async_reader/mod.rs
   @@ -1884,7 +1884,6 @@ mod tests {
            assert_eq!(total_rows, 730);
        }
   
   -    #[ignore]
        #[tokio::test]
        async fn test_in_memory_row_group_sparse() {
            let testdata = arrow::util::test_util::parquet_test_data();
   @@ -1925,8 +1924,6 @@ mod tests {
            )
            .unwrap();
   
   -        let _schema_desc = metadata.file_metadata().schema_descr();
   -
            let projection = 
ProjectionMask::leaves(metadata.file_metadata().schema_descr(), vec![0]);
   
            let reader_factory = ReaderFactory {
   @@ -1946,19 +1943,25 @@ mod tests {
            // Setup `RowSelection` so that we can skip every other page, 
selecting the last page
            let mut selectors = vec![];
            let mut expected_page_requests: Vec<Range<usize>> = vec![];
   +        let mut page_idx = 0;
            while let Some(page) = pages.next() {
   +
                let num_rows = if let Some(next_page) = pages.peek() {
                    next_page.first_row_index - page.first_row_index
                } else {
                    num_rows - page.first_row_index
                };
   +            println!("page {page_idx}: first_row_index={} offset={} 
compressed_page_size={}, num_rows={num_rows}, skip={skip}", 
page.first_row_index, page.offset, page.compressed_page_size);
   +            page_idx += 1;
   
   +            let start = page.offset as usize;
   +            let end = start + page.compressed_page_size as usize;
                if skip {
                    selectors.push(RowSelector::skip(num_rows as usize));
   +                println!("  skipping page with {num_rows} rows : 
{start}..{end}");
                } else {
                    selectors.push(RowSelector::select(num_rows as usize));
   -                let start = page.offset as usize;
   -                let end = start + page.compressed_page_size as usize;
   +                println!("  selecting page with {num_rows} rows: 
{start}..{end}");
                    expected_page_requests.push(start..end);
                }
                skip = !skip;
   @@ -1973,7 +1976,13 @@ mod tests {
   
            let requests = requests.lock().unwrap();
   
   -        assert_eq!(&requests[..], &expected_page_requests)
   +        println!("Expected page requests: {:#?}", &expected_page_requests);
   +        println!("Actual page requests: {:#?}", &requests[..]);
   +
   +        assert_eq!(
   +            format!("{:#?}",&expected_page_requests),
   +            format!("{:#?}", &requests[..]),
   +        );
        }
   
        #[tokio::test]
   ```
   
   </p>
   </details> 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to