alamb commented on code in PR #9847:
URL: https://github.com/apache/arrow-rs/pull/9847#discussion_r3173211519


##########
parquet/src/arrow/array_reader/test_util.rs:
##########
@@ -91,112 +88,51 @@ pub fn byte_array_all_encodings(
     (pages, encoded_dictionary)
 }
 
-/// Array reader for test.
-pub struct InMemoryArrayReader {
-    data_type: ArrowType,
-    array: ArrayRef,
-    def_levels: Option<Vec<i16>>,
-    rep_levels: Option<Vec<i16>>,
-    last_idx: usize,
-    cur_idx: usize,
-    need_consume_records: usize,
-}
-
-impl InMemoryArrayReader {
-    pub fn new(
-        data_type: ArrowType,
-        array: ArrayRef,
-        def_levels: Option<Vec<i16>>,
-        rep_levels: Option<Vec<i16>>,
-    ) -> Self {
-        assert!(
-            def_levels
-                .as_ref()
-                .map(|d| d.len() == array.len())
-                .unwrap_or(true)
-        );
-
-        assert!(
-            rep_levels
-                .as_ref()
-                .map(|r| r.len() == array.len())
-                .unwrap_or(true)
-        );
-
-        Self {
-            data_type,
-            array,
-            def_levels,
-            rep_levels,
-            cur_idx: 0,
-            last_idx: 0,
-            need_consume_records: 0,
-        }
-    }
-}
-
-impl ArrayReader for InMemoryArrayReader {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn get_data_type(&self) -> &ArrowType {
-        &self.data_type
-    }
-
-    fn read_records(&mut self, batch_size: usize) -> Result<usize> {
-        assert_ne!(batch_size, 0);
-        // This replicates the logical normally performed by
-        // RecordReader to delimit semantic records
-        let read = match &self.rep_levels {
-            Some(rep_levels) => {
-                let rep_levels = &rep_levels[self.cur_idx..];
-                let mut levels_read = 0;
-                let mut records_read = 0;
-                while levels_read < rep_levels.len() && records_read < 
batch_size {
-                    if rep_levels[levels_read] == 0 {
-                        records_read += 1; // Start of new record
-                    }
-                    levels_read += 1;
-                }
-
-                // Find end of current record
-                while levels_read < rep_levels.len() && 
rep_levels[levels_read] != 0 {
-                    levels_read += 1
-                }
-                levels_read
-            }
-            None => batch_size.min(self.array.len() - self.cur_idx),
-        };
-        self.need_consume_records += read;
-        Ok(read)
-    }
-
-    fn consume_batch(&mut self) -> Result<ArrayRef> {
-        let batch_size = self.need_consume_records;
-        assert_ne!(batch_size, 0);
-        self.last_idx = self.cur_idx;
-        self.cur_idx += batch_size;
-        self.need_consume_records = 0;
-        Ok(self.array.slice(self.last_idx, batch_size))
-    }
+/// Build a real `PrimitiveArrayReader<Int32Type>` from raw non-null values
+/// and definition/repetition levels. This exercises the full production
+/// `RecordReader` code path (including selective padding when a parent
+/// `ListArrayReader` calls `enable_selective_padding`).
+///
+/// `values` must contain only the non-null values (entries where
+/// `def_levels[i] == max_def_level`), in order, as Parquet encodes them.
+pub fn make_int32_page_reader(
+    values: &[i32],

Review Comment:
   I double checked that all callsites actually used `DataType::Int32` which 
makes sense to me 
   
   



##########
parquet/src/arrow/array_reader/test_util.rs:
##########
@@ -91,112 +88,51 @@ pub fn byte_array_all_encodings(
     (pages, encoded_dictionary)
 }
 
-/// Array reader for test.
-pub struct InMemoryArrayReader {
-    data_type: ArrowType,
-    array: ArrayRef,
-    def_levels: Option<Vec<i16>>,
-    rep_levels: Option<Vec<i16>>,
-    last_idx: usize,
-    cur_idx: usize,
-    need_consume_records: usize,
-}
-
-impl InMemoryArrayReader {
-    pub fn new(
-        data_type: ArrowType,
-        array: ArrayRef,
-        def_levels: Option<Vec<i16>>,
-        rep_levels: Option<Vec<i16>>,
-    ) -> Self {
-        assert!(
-            def_levels
-                .as_ref()
-                .map(|d| d.len() == array.len())
-                .unwrap_or(true)
-        );
-
-        assert!(
-            rep_levels
-                .as_ref()
-                .map(|r| r.len() == array.len())
-                .unwrap_or(true)
-        );
-
-        Self {
-            data_type,
-            array,
-            def_levels,
-            rep_levels,
-            cur_idx: 0,
-            last_idx: 0,
-            need_consume_records: 0,
-        }
-    }
-}
-
-impl ArrayReader for InMemoryArrayReader {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn get_data_type(&self) -> &ArrowType {
-        &self.data_type
-    }
-
-    fn read_records(&mut self, batch_size: usize) -> Result<usize> {
-        assert_ne!(batch_size, 0);
-        // This replicates the logical normally performed by
-        // RecordReader to delimit semantic records
-        let read = match &self.rep_levels {
-            Some(rep_levels) => {
-                let rep_levels = &rep_levels[self.cur_idx..];
-                let mut levels_read = 0;
-                let mut records_read = 0;
-                while levels_read < rep_levels.len() && records_read < 
batch_size {
-                    if rep_levels[levels_read] == 0 {
-                        records_read += 1; // Start of new record
-                    }
-                    levels_read += 1;
-                }
-
-                // Find end of current record
-                while levels_read < rep_levels.len() && 
rep_levels[levels_read] != 0 {
-                    levels_read += 1
-                }
-                levels_read
-            }
-            None => batch_size.min(self.array.len() - self.cur_idx),
-        };
-        self.need_consume_records += read;
-        Ok(read)
-    }
-
-    fn consume_batch(&mut self) -> Result<ArrayRef> {
-        let batch_size = self.need_consume_records;
-        assert_ne!(batch_size, 0);
-        self.last_idx = self.cur_idx;
-        self.cur_idx += batch_size;
-        self.need_consume_records = 0;
-        Ok(self.array.slice(self.last_idx, batch_size))
-    }
+/// Build a real `PrimitiveArrayReader<Int32Type>` from raw non-null values
+/// and definition/repetition levels. This exercises the full production
+/// `RecordReader` code path (including selective padding when a parent
+/// `ListArrayReader` calls `enable_selective_padding`).
+///
+/// `values` must contain only the non-null values (entries where

Review Comment:
   👍 



##########
parquet/src/arrow/array_reader/struct_array.rs:
##########
@@ -224,39 +224,27 @@ impl ArrayReader for StructArrayReader {
 mod tests {
     use super::*;
     use crate::arrow::array_reader::ListArrayReader;
-    use crate::arrow::array_reader::test_util::InMemoryArrayReader;
+    use crate::arrow::array_reader::test_util::make_int32_page_reader;
     use arrow::buffer::Buffer;
     use arrow::datatypes::Field;
     use arrow_array::cast::AsArray;
-    use arrow_array::{Array, Int32Array, ListArray};
+    use arrow_array::{Array, ListArray};
     use arrow_schema::Fields;
 
     #[test]
     fn test_struct_array_reader() {
-        let array_1 = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]));
-        let array_reader_1 = InMemoryArrayReader::new(
-            ArrowType::Int32,
-            array_1.clone(),
-            Some(vec![0, 1, 2, 3, 1]),
-            Some(vec![0, 1, 1, 1, 1]),
-        );
+        let array_reader_1 = make_int32_page_reader(&[4], &[0, 1, 2, 3, 1], 
&[0, 1, 1, 1, 1], 3, 1);

Review Comment:
   As an API suggestion (maybe for another PR, not required):
   
   I would personally find this much easier to understand if the fields were 
named so I didn't have to refer to  `make_int32_page_reader` to figure out 
what`&[0, 1, 2, 3, 1]` and `&[0, 1, 1, 1, 1]` meant. 
   
   Perhaps we could do this with an builder in the test. Something like this 
perhaps:
   
   ```rust
   let array_reader_1 = TestArrayReaderBuilder::new()
    .with_non_null_i32_values(&[4]) // <-- this is especially non obvous if you 
are used to normal Arrow arrays
    .with_def_levels(&[0, 1, 2, 3, 1])
    .with_rep_levels(&[0, 1, 1, 1, 1])
    .with_max_def_level(3)
    .with_max_rep_level(1)
    .build()
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to