marvinlanhenke commented on code in PR #10852:
URL: https://github.com/apache/datafusion/pull/10852#discussion_r1640748152
##########
datafusion/core/tests/parquet/arrow_statistics.rs:
##########
@@ -39,102 +40,102 @@ use arrow_array::{
 use arrow_schema::{DataType, Field, Schema};
 use datafusion::datasource::physical_plan::parquet::StatisticsConverter;
 use half::f16;
-use parquet::arrow::arrow_reader::{ArrowReaderBuilder, ParquetRecordBatchReaderBuilder};
+use parquet::arrow::arrow_reader::{
+    ArrowReaderBuilder, ArrowReaderOptions, ParquetRecordBatchReaderBuilder,
+};
 use parquet::arrow::ArrowWriter;
 use parquet::file::properties::{EnabledStatistics, WriterProperties};

 use super::make_test_file_rg;

-// TEST HELPERS
-
-/// Return a record batch with i64 with Null values
-fn make_int64_batches_with_null(
+#[derive(Debug, Default, Clone)]
+struct Int64Case {
+    /// Number of nulls in the column
     null_values: usize,
+    /// Non null values in the range `[no_null_values_start,
+    /// no_null_values_end]`, one value for each row
     no_null_values_start: i64,
     no_null_values_end: i64,
-) -> RecordBatch {
-    let schema = Arc::new(Schema::new(vec![Field::new("i64", DataType::Int64, true)]));
-
-    let v64: Vec<i64> = (no_null_values_start as _..no_null_values_end as _).collect();
-
-    RecordBatch::try_new(
-        schema,
-        vec![make_array(
-            Int64Array::from_iter(
-                v64.into_iter()
-                    .map(Some)
-                    .chain(std::iter::repeat(None).take(null_values)),
-            )
-            .to_data(),
-        )],
-    )
-    .unwrap()
-}
-
-// Create a parquet file with one column for data type i64
-// Data of the file include
-//   . Number of null rows is the given num_null
-//   . There are non-null values in the range [no_null_values_start, no_null_values_end], one value each row
-//   . The file is divided into row groups of size row_per_group
-pub fn parquet_file_one_column(
-    num_null: usize,
-    no_null_values_start: i64,
-    no_null_values_end: i64,
+    /// Number of rows per row group
     row_per_group: usize,
-) -> ParquetRecordBatchReaderBuilder<File> {
-    parquet_file_one_column_stats(
-        num_null,
-        no_null_values_start,
-        no_null_values_end,
-        row_per_group,
-        EnabledStatistics::Chunk,
-    )
+    /// if specified, overrides default statistics settings
+    enable_stats: Option<EnabledStatistics>,
+    /// If specified, the number of values in each page
+    data_page_row_count_limit: Option<usize>,
 }

-// Create a parquet file with one column for data type i64
-// Data of the file include
-//   . Number of null rows is the given num_null
-//   . There are non-null values in the range [no_null_values_start, no_null_values_end], one value each row
-//   . The file is divided into row groups of size row_per_group
-//   . Statistics are enabled/disabled based on the given enable_stats
-pub fn parquet_file_one_column_stats(
-    num_null: usize,
-    no_null_values_start: i64,
-    no_null_values_end: i64,
-    row_per_group: usize,
-    enable_stats: EnabledStatistics,
-) -> ParquetRecordBatchReaderBuilder<File> {
-    let mut output_file = tempfile::Builder::new()
-        .prefix("parquert_statistics_test")
-        .suffix(".parquet")
-        .tempfile()
-        .expect("tempfile creation");
-
-    let props = WriterProperties::builder()
-        .set_max_row_group_size(row_per_group)
-        .set_statistics_enabled(enable_stats)
-        .build();
-
-    let batches = vec![make_int64_batches_with_null(
-        num_null,
-        no_null_values_start,
-        no_null_values_end,
-    )];
-
-    let schema = batches[0].schema();
-
-    let mut writer = ArrowWriter::try_new(&mut output_file, schema, Some(props)).unwrap();
+impl Int64Case {
+    /// Return a record batch with i64 with Null values
+    /// The first no_null_values_end - no_null_values_start values
+    /// are non-null with the specified range, the rest are null
+    fn make_int64_batches_with_null(&self) -> RecordBatch {
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("i64", DataType::Int64, true)]));
+
+        let v64: Vec<i64> =
+            (self.no_null_values_start as _..self.no_null_values_end as _).collect();
+
+        RecordBatch::try_new(
+            schema,
+            vec![make_array(
+                Int64Array::from_iter(
+                    v64.into_iter()
+                        .map(Some)
+                        .chain(std::iter::repeat(None).take(self.null_values)),
+                )
+                .to_data(),
+            )],
+        )
+        .unwrap()
+    }
+
+    // Create a parquet file with the specified settings
+    pub fn build(&self) -> ParquetRecordBatchReaderBuilder<File> {

Review Comment:
Do we really need an extra struct here? Couldn't we just create another helper fn like `make_test_file_rg`, or pass the param `data_page_row_count_limit: Option<usize>` into the existing helper `make_test_file_rg`? This would make the test setup a lot easier, also for supporting other data types.

EDIT: I tested this and it works fine. But perhaps you had something else in mind when introducing `Int64Case`?
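To make the suggestion concrete, here is roughly what I tried: the removed `parquet_file_one_column_stats` (left side of the hunk above) with the page limit threaded through as an extra parameter. The hunk cuts off before the end of that function, so the write/close/reopen lines at the bottom are my reconstruction, not code from this PR; the sketch also reuses the file's existing imports and the pre-PR `make_int64_batches_with_null` shown above.

```rust
// Sketch only: old helper plus an optional data page row count limit.
pub fn parquet_file_one_column_stats(
    num_null: usize,
    no_null_values_start: i64,
    no_null_values_end: i64,
    row_per_group: usize,
    enable_stats: EnabledStatistics,
    data_page_row_count_limit: Option<usize>,
) -> ParquetRecordBatchReaderBuilder<File> {
    let mut output_file = tempfile::Builder::new()
        .prefix("parquet_statistics_test")
        .suffix(".parquet")
        .tempfile()
        .expect("tempfile creation");

    let mut props = WriterProperties::builder()
        .set_max_row_group_size(row_per_group)
        .set_statistics_enabled(enable_stats);
    if let Some(limit) = data_page_row_count_limit {
        // The page row count limit is best-effort and only checked between
        // write batches, so shrink the write batch size as well if small
        // pages are actually wanted.
        props = props
            .set_data_page_row_count_limit(limit)
            .set_write_batch_size(limit);
    }

    let batches = vec![make_int64_batches_with_null(
        num_null,
        no_null_values_start,
        no_null_values_end,
    )];
    let schema = batches[0].schema();

    let mut writer =
        ArrowWriter::try_new(&mut output_file, schema, Some(props.build())).unwrap();
    for batch in &batches {
        writer.write(batch).expect("writing batch");
    }
    writer.close().unwrap();

    // Reopen the temp file and hand back a reader builder over it
    let file = output_file.reopen().expect("reopening tempfile");
    ParquetRecordBatchReaderBuilder::try_new(file).unwrap()
}
```

With something like that in place, other data types could reuse the same path by swapping in a different batch-making function, rather than adding a case struct per type.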